In [33]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.impute import MissingIndicator, SimpleImputer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel

from sklearn.pipeline import Pipeline

# plot_confusion_matrix is a handy visual tool, added in the latest version of scikit-learn
# if you are running an older version, comment out this line and just use confusion_matrix
# from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

In [75]:
# import data set
df = pd.read_csv('../../Data/music_subset.csv')

Steps
1. Import data from csv
2. Encode genre and concatenate
3. Drop non-numerical columns
4. Train test split
5. Fit training data to selected model
       a. Build pipeline to include all transforms plus final fit
6. Score model on training data
7. Score model on test data
8. Predict genre for a test row
9. Predict probability of genre for a test row

In [129]:
df['artist_name'].value_counts()

Daniel Johnston                        155
Carly Simon                            154
Gregory Isaacs                         143
Jim Reeves                             137
Pete Seeger                            131
                                      ... 
Lost Kids                                1
Nicki Minaj / Sean Garrett               1
Boyz II Men / Driver                     1
bloodsimple (featuring Chad Gray)        1
The Ruby Suns feat. The Penan tribe      1
Name: artist_name, Length: 3605, dtype: int64

In [89]:
df['genre'].value_counts()

classic pop and rock     23895
folk                     13192
dance and electronica     4935
jazz and blues            4334
soul and reggae           4016
punk                      3200
metal                     2103
classical                 1874
pop                       1617
hip-hop                    434
Name: genre, dtype: int64

In [77]:
# for quick reference
key_dict = {0:'C', 1:'C#/Db', 2:'D', 
            3:'D#/Eb', 4:'E', 5:'F', 
            6:'F#/Gb', 7:'G', 8:'G#/Ab', 
            9:'A', 10:'A#/Bb', 11:'B'}

In [34]:
def encode_and_concat_feature(X, feature_name, ohe):
    """
    Helper function for transforming a feature into multiple columns of 1s and 0s. Used
    in both training and testing steps.  Takes in the full X dataframe, feature name, 
    and encoder, and returns the dataframe with that feature transformed into multiple
    columns of 1s and 0s
    """
    # create new one-hot encoded df based on the feature
    single_feature_df = X[[feature_name]]
    feature_array = ohe.transform(single_feature_df).toarray()
    ohe_df = pd.DataFrame(feature_array, columns=ohe.categories_[0])
    # drop the old feature from X and concat the new one-hot encoded df
    X = pd.concat([X, ohe_df], axis=1)
    return X

In [39]:
def encode_and_concat_feature_train(X_train_all_features, feature_name):
    """
    Helper function for transforming training data.  It takes in the full X dataframe and
    feature name, makes a one-hot encoder, and returns the encoder as well as the dataframe
    with that feature transformed into multiple columns of 1s and 0s
    """
    # make a one-hot encoder and fit it to the training data
    ohe = OneHotEncoder(categories="auto", handle_unknown="ignore")
    single_feature_df = X_train_all_features[[feature_name]]
    ohe.fit(single_feature_df)
    # call helper function that actually encodes the feature and concats it
    X_train_all_features = encode_and_concat_feature(X_train_all_features, feature_name, ohe)
    X_train_all_features = X_train_all_features.drop(['Unnamed: 0'], axis=1)
    return ohe, X_train_all_features

In [76]:
# one hot encode the genre column and concatenate those columns to the dataframe
ohe, df_encoded = encode_and_concat_feature_train(df, 'genre')

In [78]:
# drop non-number, and one hot encoded columns
genre_columns = list(df_encoded['genre'].unique())

X = df_encoded.drop((['genre', 'track_id', 'title', 'artist_name']+genre_columns), axis=1)
y = df_encoded['genre']

In [11]:
X_timbres = X.drop(['loudness', 'tempo', 'time_signature', 'key', 'mode', 'duration'], axis=1)
X_no_timbres = X.drop(X_timbres.columns, axis=1)
X_genres = df[genre_columns]
X_no_timbres = pd.concat([X_no_timbres, X_genres], sort=True, axis=1)
X_no_timbres.columns

Index(['loudness', 'tempo', 'time_signature', 'key', 'mode', 'duration',
       'classic pop and rock', 'classical', 'dance and electronica', 'folk',
       'hip-hop', 'jazz and blues', 'metal', 'pop', 'punk', 'soul and reggae'],
      dtype='object')

In [117]:
df_encoded1 = df_encoded[df_encoded['genre']!='classic pop and rock']
X1 = df_encoded1.drop((['genre', 'track_id', 'title', 'artist_name']+genre_columns), axis=1)
y = df_encoded1['genre']

In [118]:
df_encoded1['genre'].value_counts()

folk                     13192
dance and electronica     4935
jazz and blues            4334
soul and reggae           4016
punk                      3200
metal                     2103
classical                 1874
pop                       1617
hip-hop                    434
Name: genre, dtype: int64

#### pairplots w genre encoding

In [10]:
# sns.pairplot(X_no_timbres)

#### modeling for predicting genre

In [119]:
x1_train, x1_test, y1_train, y1_test = train_test_split(X1, y)

In [125]:
rf = RandomForestClassifier(class_weight='balanced')
rf1 = RandomForestClassifier()
rf.fit(x1_train, y1_train)
rf1.fit(x1_train, y1_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [91]:
# encode_concat = Pipeline(steps=[
#     ('encoded_concated', encode_and_concat_feature_train(df, 'genre')),
#     ('rf', rf)
# ])

In [126]:
print(rf.score(x1_train, y1_train))
print(rf1.score(x1_train, y1_train))

0.9880125476137127
0.9888341175591904


In [112]:
y.value_counts()

classic pop and rock     23895
folk                     13192
dance and electronica     4935
jazz and blues            4334
soul and reggae           4016
punk                      3200
metal                     2103
classical                 1874
pop                       1617
hip-hop                    434
Name: genre, dtype: int64

In [127]:
print(rf.score(x1_test, y1_test))
print(rf1.score(x1_test, y1_test))

0.6266382883387476
0.6307830178111348


In [97]:
from sklearn.metrics import classification_report

In [128]:
print(classification_report(y1_test, rf.predict(x1_test)))
print(classification_report(y1_test, rf1.predict(x1_test)))

                       precision    recall  f1-score   support

            classical       0.73      0.67      0.70       464
dance and electronica       0.57      0.57      0.57      1239
                 folk       0.64      0.87      0.74      3321
              hip-hop       0.38      0.14      0.21        84
       jazz and blues       0.62      0.40      0.49      1094
                metal       0.75      0.67      0.71       504
                  pop       0.38      0.12      0.18       414
                 punk       0.69      0.51      0.59       838
      soul and reggae       0.53      0.43      0.48       969

             accuracy                           0.63      8927
            macro avg       0.59      0.49      0.52      8927
         weighted avg       0.62      0.63      0.61      8927

                       precision    recall  f1-score   support

            classical       0.69      0.67      0.68       464
dance and electronica       0.54      0.58      0.5

# cross_val_score(rf, x_test, y_test)

In [96]:
confusion_matrix(y_test, rf.predict(x_test))

array([[4677,   18,  148,  757,    4,   85,   46,   14,   75,  109],
       [  92,  310,   18,   33,    0,   14,    2,    1,    0,    1],
       [ 677,   11,  381,   88,    3,   40,    6,    5,   10,   33],
       [1647,   15,   36, 1496,    2,   62,    4,    5,    2,   15],
       [  65,    1,   12,    5,    9,    0,    0,    0,    2,   19],
       [ 489,   38,   52,  178,    0,  335,    0,    0,    0,   11],
       [ 161,    1,    9,    7,    0,    2,  315,    1,   30,    0],
       [ 323,    0,   13,   52,    0,    2,    1,   14,    4,   12],
       [ 431,    2,   14,   26,    2,    5,   41,    0,  233,   10],
       [ 674,    0,   46,   79,    3,    9,    1,    4,    4,  211]])

#### modeling for predicting whether or not a song is 'classic pop and rock'

In [17]:
df.columns

Index(['genre', 'track_id', 'artist_name', 'title', 'loudness', 'tempo',
       'time_signature', 'key', 'mode', 'duration', 'avg_timbre1',
       'avg_timbre2', 'avg_timbre3', 'avg_timbre4', 'avg_timbre5',
       'avg_timbre6', 'avg_timbre7', 'avg_timbre8', 'avg_timbre9',
       'avg_timbre10', 'avg_timbre11', 'avg_timbre12', 'var_timbre1',
       'var_timbre2', 'var_timbre3', 'var_timbre4', 'var_timbre5',
       'var_timbre6', 'var_timbre7', 'var_timbre8', 'var_timbre9',
       'var_timbre10', 'var_timbre11', 'var_timbre12', 'classic pop and rock',
       'classical', 'dance and electronica', 'folk', 'hip-hop',
       'jazz and blues', 'metal', 'pop', 'punk', 'soul and reggae'],
      dtype='object')

In [18]:
# this is the 'accuracy' of a model (test on our whole dataset) that would just predict a genre of classic rock for every song in the data set
df['classic pop and rock'].value_counts()[1]/df['classic pop and rock'].value_counts()[0]

0.6692340008402184

In [19]:
X = df.drop(['genre', 'track_id', 'artist_name', 'title'], axis=1)
y = df['classic pop and rock']
X = X.drop(genre_columns, axis=1)
X.columns

Index(['loudness', 'tempo', 'time_signature', 'key', 'mode', 'duration',
       'avg_timbre1', 'avg_timbre2', 'avg_timbre3', 'avg_timbre4',
       'avg_timbre5', 'avg_timbre6', 'avg_timbre7', 'avg_timbre8',
       'avg_timbre9', 'avg_timbre10', 'avg_timbre11', 'avg_timbre12',
       'var_timbre1', 'var_timbre2', 'var_timbre3', 'var_timbre4',
       'var_timbre5', 'var_timbre6', 'var_timbre7', 'var_timbre8',
       'var_timbre9', 'var_timbre10', 'var_timbre11', 'var_timbre12'],
      dtype='object')

In [20]:
classic_x_train, classic_x_test, classic_y_train, classic_y_test = train_test_split(X, y)

In [21]:
rf_classic = RandomForestClassifier()
rf_classic.fit(classic_x_train, classic_y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [22]:
rf_classic.score(classic_x_train, classic_y_train)

0.9851006711409396

In [23]:
rf_classic.score(classic_x_test, classic_y_test)

0.6714093959731544

In [24]:
cross_val_score(rf_classic, classic_x_test, classic_y_test)



array([0.65633179, 0.65592913, 0.64760371])