In [None]:
# !pip install imblearn

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.impute import MissingIndicator, SimpleImputer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel

# plot_confusion_matrix is a handy visual tool, added in the latest version of scikit-learn
# if you are running an older version, comment out this line and just use confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [3]:
df = pd.read_csv('../../Data/music_subset.csv')
df = df.drop('Unnamed: 0', axis=1)

In [4]:
def encode_and_concat_feature(X, feature_name, ohe):
    """
    Helper function for transforming a feature into multiple columns of 1s and 0s. Used
    in both training and testing steps.  Takes in the full X dataframe, feature name, 
    and encoder, and returns the dataframe with that feature transformed into multiple
    columns of 1s and 0s
    """
    # create new one-hot encoded df based on the feature
    single_feature_df = X[[feature_name]]
    feature_array = ohe.transform(single_feature_df).toarray()
    ohe_df = pd.DataFrame(feature_array, columns=ohe.categories_[0])
    # drop the old feature from X and concat the new one-hot encoded df
    X = pd.concat([X, ohe_df], axis=1)
    return X

In [5]:
def encode_and_concat_feature_train(X_train_all_features, feature_name):
    """
    Helper function for transforming training data.  It takes in the full X dataframe and
    feature name, makes a one-hot encoder, and returns the encoder as well as the dataframe
    with that feature transformed into multiple columns of 1s and 0s
    """
    # make a one-hot encoder and fit it to the training data
    ohe = OneHotEncoder(categories="auto", handle_unknown="ignore")
    single_feature_df = X_train_all_features[[feature_name]]
    ohe.fit(single_feature_df)
    # call helper function that actually encodes the feature and concats it
    X_train_all_features = encode_and_concat_feature(X_train_all_features, feature_name, ohe)
    return ohe, X_train_all_features

In [6]:
ohe, df = encode_and_concat_feature_train(df, 'genre')
ohe_genres = ohe.categories_[0]

In [8]:
# df.dtypes

In [9]:
key_dict = {0:'C', 1:'C#/Db', 2:'D', 
            3:'D#/Eb', 4:'E', 5:'F', 
            6:'F#/Gb', 7:'G', 8:'G#/Ab', 
            9:'A', 10:'A#/Bb', 11:'B'}
df['key']
genre_columns = ['classic pop and rock', 'classical', 
                 'dance and electronica', 'folk', 'hip-hop',
                'jazz and blues', 'metal', 'pop', 'punk',
                'soul and reggae']


In [10]:
X = df.drop((['genre', 'title', 'track_id', 'artist_name']+genre_columns), axis=1)
track_ids = df['track_id']
y = df['genre']

In [11]:
X_columns = X.columns
ss = StandardScaler()
X = pd.DataFrame(ss.fit_transform(X))
X.columns = X_columns
X.head()

Unnamed: 0,loudness,tempo,time_signature,key,mode,duration,avg_timbre1,avg_timbre2,avg_timbre3,avg_timbre4,...,var_timbre3,var_timbre4,var_timbre5,var_timbre6,var_timbre7,var_timbre8,var_timbre9,var_timbre10,var_timbre11,var_timbre12
0,0.508735,0.923956,-1.96521,1.048934,0.657427,-0.024294,0.835503,0.420007,0.160998,-0.02991,...,-0.651247,-0.959705,-0.709239,-0.606494,-0.987079,-0.675561,-0.828315,-0.910889,-0.651066,-0.965628
1,0.148327,0.737095,-1.96521,-0.348494,-1.521081,-0.476913,0.367161,-1.487053,0.872575,0.429374,...,-0.061616,-0.526899,-0.737697,-0.626681,-0.403496,-0.407307,-0.237706,-0.59047,-0.560248,-0.386641
2,-0.372447,-0.27795,-1.96521,1.32842,-1.521081,-0.729996,-0.572071,-0.998178,0.541128,-0.869605,...,-0.69096,1.054225,-0.416093,-0.446295,-0.324081,-0.27353,0.30511,-0.420348,0.854475,-0.384887
3,-0.242391,-0.148903,0.384558,0.489963,0.657427,0.006871,0.200291,0.46872,1.20784,0.053538,...,-1.000695,-0.974762,-0.944384,-0.957597,-0.829458,-0.982475,-0.7676,-1.02586,-1.199868,0.124363
4,-0.482479,0.539356,0.384558,1.048934,-1.521081,0.462,-0.045677,-0.107932,0.832519,-0.377376,...,-0.778112,-1.180547,-0.062467,-1.243304,-1.040457,-1.165665,0.098408,-1.186592,-1.223835,-0.636719


# WEDNESDAY (SMOTE artificial sample production)

1. read in csv (drop unneeded columns, encode artist name)
2. perform train test split
3. Preprocessing
4. Transformations
5. Model

In [60]:
df = pd.read_csv('../../Data/music_subset.csv')
df = df.drop('Unnamed: 0', axis=1).set_index('track_id')
df = df.drop('title', axis=1)
df = df[df['genre'] != 'classic pop and rock']

# df = encode_artist_name(df)
df = df.drop('artist_name', axis=1)

In [61]:
X = df.drop('genre', axis=1)
y = df['genre']

In [49]:
# train test split with stratified target('genre')
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=2020, stratify=y)

### << PIPELINE STARTS AFTER TRAIN-TEST SPLIT >>

In [74]:
from imblearn.pipeline import Pipeline

# sm = SMOTE()
# x_train_resamp, y_train_resamp = sm.fit_resample(x_train, y_train)
# x_train_resamp, y_train_resamp = sm.fit_resample(x_train, y_train)
# rf_model = RandomForestClassifier(n_jobs=-1)

pipe = Pipeline(steps=[
    ('smote', SMOTE(random_state=2020)),
    ('model', RandomForestClassifier(n_jobs=-1))
])

In [75]:
pipe.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('smote',
                 SMOTE(k_neighbors=5, n_jobs=None, random_state=2020,
                       sampling_strategy='auto')),
                ('model',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=-1,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
 

In [38]:
print(len(x_train), len(y_train), len(x_test), len(y_test))

26778 26778 8927 8927


In [66]:
from sklearn.base import BaseEstimator

# sm = SMOTE()
# x_train_resamp, y_train_resamp = sm.fit_resample(x_train, y_train)

class SMOTE_Transformer(BaseEstimator):
    sm = SMOTE()
    def fit(self, X, y):
        X_new, y_new = sm.fit_resample(X, y)
        return X_new, y_new

    def transform(self, X, y=None):
        return X

In [40]:
# print(y_train_resamp.value_counts(), y_test_resamp.value_counts())


In [41]:
rf = RandomForestClassifier(n_jobs=-1)
rf.fit(x_train_resamp, y_train_resamp)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [16]:
# # Step 3, 4, 5: perform all preprocessing steps on X_train and fit model
# pipe = Pipeline(steps=[
#     ("transform_precip", PrecipitationTransformer()),
#     ("encode_winter", ColumnTransformer(transformers=[
#         ("ohe", OneHotEncoder(sparse=False, handle_unknown="ignore"),
#          ["winter_severity_index"])
#     ], remainder="passthrough"
#     )),
#     ("linreg_model", LinearRegression())
# ])
# pipe.fit(X_train, y_train)

### << PIPELINE ENDS HERE >>

In [37]:
pipe.fit(x_train, y_train)

In [73]:
print(classification_report(y_train_resamp, rf.predict(x_train_resamp)))
print(classification_report(y_test, pipe.predict(x_test)))

                       precision    recall  f1-score   support

            classical       0.99      0.96      0.97      9894
dance and electronica       0.95      0.91      0.93      9894
                 folk       0.91      1.00      0.96      9894
              hip-hop       0.99      0.99      0.99      9894
       jazz and blues       0.95      0.93      0.94      9894
                metal       0.97      0.97      0.97      9894
                  pop       0.94      0.97      0.95      9894
                 punk       0.96      0.92      0.94      9894
      soul and reggae       0.92      0.93      0.92      9894

             accuracy                           0.95     89046
            macro avg       0.95      0.95      0.95     89046
         weighted avg       0.95      0.95      0.95     89046

                       precision    recall  f1-score   support

            classical       0.68      0.79      0.73       469
dance and electronica       0.66      0.64      0.6

In [44]:
print(classification_report(y_test, rf.predict(x_test)))

                       precision    recall  f1-score   support

            classical       0.70      0.78      0.74       469
dance and electronica       0.66      0.63      0.64      1234
                 folk       0.78      0.76      0.77      3298
              hip-hop       0.36      0.38      0.37       108
       jazz and blues       0.62      0.57      0.59      1084
                metal       0.78      0.76      0.77       526
                  pop       0.37      0.46      0.41       404
                 punk       0.65      0.63      0.64       800
      soul and reggae       0.55      0.60      0.57      1004

             accuracy                           0.67      8927
            macro avg       0.61      0.62      0.61      8927
         weighted avg       0.68      0.67      0.67      8927



higher score than our Random Forest Classifier using weighted classes

## hyperparameter tuning

In [25]:
rf = RandomForestClassifier(n_jobs=-1, min_samples_leaf=2, max_features=20)
rf.fit(x_train_resamp, y_train_resamp)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=20,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [26]:
print(classification_report(y_train_resamp, rf.predict(x_train_resamp)))

                       precision    recall  f1-score   support

            classical       0.85      0.96      0.90      9894
dance and electronica       0.92      0.79      0.85      9894
                 folk       0.86      0.77      0.82      9894
              hip-hop       0.95      1.00      0.97      9894
       jazz and blues       0.91      0.85      0.88      9894
                metal       0.90      0.98      0.94      9894
                  pop       0.95      0.98      0.96      9894
                 punk       0.90      0.91      0.90      9894
      soul and reggae       0.93      0.95      0.94      9894

             accuracy                           0.91     89046
            macro avg       0.91      0.91      0.91     89046
         weighted avg       0.91      0.91      0.91     89046



In [27]:
print(classification_report(y_test, rf.predict(x_test)))

                       precision    recall  f1-score   support

            classical       0.48      0.91      0.63       469
dance and electronica       0.85      0.60      0.71      1234
                 folk       0.88      0.72      0.79      3298
              hip-hop       0.42      0.89      0.57       108
       jazz and blues       0.72      0.69      0.71      1084
                metal       0.69      0.92      0.79       526
                  pop       0.66      0.85      0.74       404
                 punk       0.72      0.79      0.75       800
      soul and reggae       0.78      0.83      0.80      1004

             accuracy                           0.75      8927
            macro avg       0.69      0.80      0.72      8927
         weighted avg       0.78      0.75      0.75      8927

