In [3]:
# !pip install imblearn

In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.impute import MissingIndicator, SimpleImputer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel

# plot_confusion_matrix is a handy visual tool, added in the latest version of scikit-learn
# if you are running an older version, comment out this line and just use confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [5]:
df = pd.read_csv('../../Data/music_subset.csv')
df = df.drop('Unnamed: 0', axis=1)

In [6]:
def encode_and_concat_feature(X, feature_name, ohe):
    """
    Helper function for transforming a feature into multiple columns of 1s and 0s. Used
    in both training and testing steps.  Takes in the full X dataframe, feature name, 
    and encoder, and returns the dataframe with that feature transformed into multiple
    columns of 1s and 0s
    """
    # create new one-hot encoded df based on the feature
    single_feature_df = X[[feature_name]]
    feature_array = ohe.transform(single_feature_df).toarray()
    ohe_df = pd.DataFrame(feature_array, columns=ohe.categories_[0])
    # drop the old feature from X and concat the new one-hot encoded df
    X = pd.concat([X, ohe_df], axis=1)
    return X

In [7]:
def encode_and_concat_feature_train(X_train_all_features, feature_name):
    """
    Helper function for transforming training data.  It takes in the full X dataframe and
    feature name, makes a one-hot encoder, and returns the encoder as well as the dataframe
    with that feature transformed into multiple columns of 1s and 0s
    """
    # make a one-hot encoder and fit it to the training data
    ohe = OneHotEncoder(categories="auto", handle_unknown="ignore")
    single_feature_df = X_train_all_features[[feature_name]]
    ohe.fit(single_feature_df)
    # call helper function that actually encodes the feature and concats it
    X_train_all_features = encode_and_concat_feature(X_train_all_features, feature_name, ohe)
    return ohe, X_train_all_features

# WEDNESDAY (SMOTE artificial sample production)

In [48]:
df = pd.read_csv('../../Data/music_subset.csv')
df = df.drop('Unnamed: 0', axis=1).set_index('track_id')
df = df[df['genre'] != 'classic pop and rock']
df['genre'].value_counts()

folk                     13192
dance and electronica     4935
jazz and blues            4334
soul and reggae           4016
punk                      3200
metal                     2103
classical                 1874
pop                       1617
hip-hop                    434
Name: genre, dtype: int64

In [49]:
X = df.drop('genre', axis=1)
y = df['genre']

In [50]:
X = encode_artist_name(X)

In [51]:
# train test split with stratified target('genre')
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [52]:
x_test = x_test.drop('title', axis=1)
x_train = x_train.drop('title', axis=1)

In [53]:
print(len(x_train), len(y_train), len(x_test), len(y_test))

26778 26778 8927 8927


In [61]:
sm = SMOTE()

In [62]:
x_train_resamp, y_train_resamp = sm.fit_resample(x_train, y_train)

In [71]:
str(sm)

"SMOTE(k_neighbors=5, n_jobs=None, random_state=None, sampling_strategy='auto')"

In [55]:
print(y_train_resamp.value_counts(), y_test_resamp.value_counts())


NameError: name 'y_test_resamp' is not defined

In [None]:
rf = RandomForestClassifier(n_jobs=-1)
rf.fit(x_train_resamp, y_train_resamp)

In [None]:
print(classification_report(y_train_resamp, rf.predict(x_train_resamp)))

In [None]:
print(classification_report(y_test, rf.predict(x_test)))

higher score than our Random Forest Classifier using weighted classes

### Pickling

In [57]:
import pickle

In [67]:
with open("../../src/pickles/{}_classifier.pickle".format(type(sm).__name__), "wb") as output_file: # specify file name for model
    pickle.dump(sm, output_file)                     # call model by its name in code
output_file.close()

# load a model from file
# model_file = open("sm_classifier.pickle", "rb")
# loaded_model = pickle.load(model_file)
# model_file.close()

In [85]:
def pickle_it(model_name):
    import pickle
    
    output_file = open('../../src/pickles/{}_classifier.pickle'.format(type(model_name).__name__), 'wb')
    pickle.dump(model_name, output_file)
    output_file.close()
    
    print('I Kid You Not Jeff, He Turns Himself Into a Pickle')
    pass

In [86]:
pickle_it(sm)

I Kid You Not Jeff, He Turns Himself Into a Pickle
