In [1]:
#Import libraries:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_validate
from sklearn import metrics  
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

In [2]:
#Load Test and Training Data
#The data folder defaults to the original local path, but can be overridden by setting the
#CS98X_DATA_DIR environment variable so the notebook can be run from another machine.
import os
DATA_DIR = os.environ.get('CS98X_DATA_DIR', 'C:\\Users\\Inno Mvula\\Desktop\\MSc Quantitative Finance\\S2.CS985 - Machine Learning and Data Analytics\\Assignment\\cs98xspotifyclassification')
test_data = pd.read_csv(os.path.join(DATA_DIR, 'CS98XClassificationTest.csv'))
train_data = pd.read_csv(os.path.join(DATA_DIR, 'CS98XClassificationTrain.csv'))

In [3]:
#Observation of the training dataset: column names, dtypes and non-null counts
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453 entries, 0 to 452
Data columns (total 15 columns):
Id           453 non-null int64
title        453 non-null object
artist       453 non-null object
year         453 non-null int64
bpm          453 non-null int64
nrgy         453 non-null int64
dnce         453 non-null int64
dB           453 non-null int64
live         453 non-null int64
val          453 non-null int64
dur          453 non-null int64
acous        453 non-null int64
spch         453 non-null int64
pop          453 non-null int64
top genre    438 non-null object
dtypes: int64(12), object(3)
memory usage: 53.2+ KB


In [4]:
#Count the missing values in each column
train_data.isnull().sum()

Id            0
title         0
artist        0
year          0
bpm           0
nrgy          0
dnce          0
dB            0
live          0
val           0
dur           0
acous         0
spch          0
pop           0
top genre    15
dtype: int64

In [5]:
#Drop rows where genre is missing. Only 15 were missing out of 453 so dropping 15 shouldn't have a huge effect on the dataset
#The drop is restricted to the 'top genre' column so that it does exactly what is described above:
#a missing value in any other column would not silently remove a labelled row.
train_data = train_data.dropna(subset=['top genre'])
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 438 entries, 0 to 452
Data columns (total 15 columns):
Id           438 non-null int64
title        438 non-null object
artist       438 non-null object
year         438 non-null int64
bpm          438 non-null int64
nrgy         438 non-null int64
dnce         438 non-null int64
dB           438 non-null int64
live         438 non-null int64
val          438 non-null int64
dur          438 non-null int64
acous        438 non-null int64
spch         438 non-null int64
pop          438 non-null int64
top genre    438 non-null object
dtypes: int64(12), object(3)
memory usage: 54.8+ KB


In [6]:
#Preview the first rows of the training data after the rows with a missing genre were dropped
train_data.head()

Unnamed: 0,Id,title,artist,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop,top genre
0,1,My Happiness,Connie Francis,1996,107,31,45,-8,13,28,150,75,3,44,adult standards
2,3,How Deep Is Your Love,Bee Gees,1979,105,36,63,-9,13,67,245,11,3,77,adult standards
3,4,Woman in Love,Barbra Streisand,1980,170,28,47,-16,13,33,232,25,3,67,adult standards
4,5,Goodbye Yellow Brick Road - Remastered 2014,Elton John,1973,121,47,56,-8,15,40,193,45,3,63,glam rock
5,6,Grenade,Bruno Mars,2010,110,56,71,-7,12,23,223,15,6,74,pop


In [7]:
#Observation of the distribution of genres (number of songs per genre)
train_data['top genre'].value_counts()

adult standards       68
album rock            66
dance pop             61
glam rock             16
brill building pop    16
                      ..
britpop                1
italian pop            1
classic danish pop     1
rock-and-roll          1
german dance           1
Name: top genre, Length: 86, dtype: int64

In [8]:
#Encode the dependent variable: replace each genre name with an integer code
#factor is the (codes, unique genre names) pair returned by pd.factorize;
#definitions keeps the genre names so the codes can be mapped back later
factor = pd.factorize(train_data['top genre'])
codes, definitions = factor
train_data['top genre'] = codes
print(train_data['top genre'].head())
print(definitions)

0    0
2    0
3    0
4    1
5    2
Name: top genre, dtype: int64
Index(['adult standards', 'glam rock', 'pop', 'album rock', 'boy band',
       'brill building pop', 'rock-and-roll', 'country rock', 'canadian pop',
       'dance pop', 'europop', 'bebop', 'art rock', 'disco', 'blues',
       'barbadian pop', 'deep adult standards', 'deep house', 'atl hip hop',
       'dance rock', 'disco house', 'bubblegum dance', 'classic soul',
       'classic uk pop', 'east coast hip hop', 'doo-wop', 'britpop',
       'british blues', 'soft rock', 'australian talent show', 'art pop',
       'british soul', 'british invasion', 'belgian pop', 'big room',
       'german dance', 'italian pop', 'british folk', 'brit funk',
       'chicago soul', 'neo mellow', 'british comedy', 'hip hop',
       'new wave pop', 'eurodance', 'detroit hip hop', 'classic rock',
       'uk garage', 'afrobeat', 'r&b', 'chicago rap', 'classic danish pop',
       'british dance band', 'drone folk', 'permanent wave', 'merseybeat',

In [9]:
#Preview the training data again now that 'top genre' holds integer codes
train_data.head()

Unnamed: 0,Id,title,artist,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop,top genre
0,1,My Happiness,Connie Francis,1996,107,31,45,-8,13,28,150,75,3,44,0
2,3,How Deep Is Your Love,Bee Gees,1979,105,36,63,-9,13,67,245,11,3,77,0
3,4,Woman in Love,Barbra Streisand,1980,170,28,47,-16,13,33,232,25,3,67,0
4,5,Goodbye Yellow Brick Road - Remastered 2014,Elton John,1973,121,47,56,-8,15,40,193,45,3,63,1
5,6,Grenade,Bruno Mars,2010,110,56,71,-7,12,23,223,15,6,74,2


In [10]:
#Distribution of the genres, now keyed by their integer code
train_data['top genre'].value_counts()

0     68
3     66
9     61
1     16
5     16
      ..
61     1
27     1
26     1
47     1
85     1
Name: top genre, Length: 86, dtype: int64

In [11]:
#Remove every genre that occurs only once: Synthetic Minority Over-sampling Technique (SMOTE)
#has issues handling classes with a single instance
counts = train_data['top genre'].value_counts()
single_instance_genres = counts[counts < 2].index
is_single_instance = train_data['top genre'].isin(single_instance_genres)
rtrain_data = train_data[~is_single_instance]
rtrain_data['top genre'].value_counts()

0     68
3     66
9     61
1     16
5     16
10    14
19    13
4     10
32     8
12     7
13     7
21     7
18     6
15     6
16     6
44     6
28     5
22     5
2      5
31     5
23     4
24     4
25     4
20     3
56     3
43     3
71     2
54     2
64     2
62     2
58     2
57     2
7      2
8      2
55     2
50     2
11     2
30     2
46     2
45     2
42     2
39     2
37     2
34     2
33     2
81     2
Name: top genre, dtype: int64

In [12]:
#Reduced dataframe: only the genres that have 2 or more instances remain
rtrain_data

Unnamed: 0,Id,title,artist,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop,top genre
0,1,My Happiness,Connie Francis,1996,107,31,45,-8,13,28,150,75,3,44,0
2,3,How Deep Is Your Love,Bee Gees,1979,105,36,63,-9,13,67,245,11,3,77,0
3,4,Woman in Love,Barbra Streisand,1980,170,28,47,-16,13,33,232,25,3,67,0
4,5,Goodbye Yellow Brick Road - Remastered 2014,Elton John,1973,121,47,56,-8,15,40,193,45,3,63,1
5,6,Grenade,Bruno Mars,2010,110,56,71,-7,12,23,223,15,6,74,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448,449,But Not For Me,Ella Fitzgerald,1959,80,22,18,-17,10,16,214,92,4,45,0
449,450,Surf City,Jan & Dean,2010,148,81,53,-13,23,96,147,50,3,50,5
450,451,Dilemma,Nelly,2002,168,55,73,-8,20,61,289,23,14,77,9
451,452,It's Gonna Be Me,*NSYNC,2000,165,87,64,-5,6,88,191,5,8,62,4


In [13]:
#Split the reduced data into the independent variables (features) and the dependent variable (target)
#Columns 4 to 13 are the numeric audio features, column 14 is the encoded genre
feature_frame = rtrain_data.iloc[:, 4:14]
target_series = rtrain_data.iloc[:, 14]
X, Y = feature_frame.values, target_series.values
print('The independent features set: ')
print(X[:5,:])
print('The dependent variable: ')
print(Y[:5])

The independent features set: 
[[107  31  45  -8  13  28 150  75   3  44]
 [105  36  63  -9  13  67 245  11   3  77]
 [170  28  47 -16  13  33 232  25   3  67]
 [121  47  56  -8  15  40 193  45   3  63]
 [110  56  71  -7  12  23 223  15   6  74]]
The dependent variable: 
[0 0 0 1 2]


In [14]:
#Sanity check: the feature matrix and the target vector have the same number of rows
len(X), len(Y)

(398, 398)

In [15]:
#Count how many songs each encoded genre has in the reduced training data
gen_dict = {}
for genre_code in rtrain_data['top genre']:
    gen_dict[genre_code] = gen_dict.get(genre_code, 0) + 1
print(gen_dict)

{0: 68, 1: 16, 2: 5, 3: 66, 4: 10, 5: 16, 7: 2, 8: 2, 9: 61, 10: 14, 11: 2, 12: 7, 13: 7, 15: 6, 16: 6, 18: 6, 19: 13, 20: 3, 21: 7, 22: 5, 23: 4, 24: 4, 25: 4, 28: 5, 30: 2, 31: 5, 32: 8, 33: 2, 34: 2, 37: 2, 39: 2, 42: 2, 43: 3, 44: 6, 45: 2, 46: 2, 50: 2, 54: 2, 55: 2, 56: 3, 57: 2, 58: 2, 62: 2, 64: 2, 71: 2, 81: 2}


In [16]:
#Create the sampling strategy dict for SMOTE: every class with fewer than MINORITY_THRESHOLD
#instances is given SYNTHETIC_PER_CLASS extra synthetic samples.
#Reason for choosing 4 is because after experimenting with values, adding 4 instances gave me the best results
#The dataset is still relatively imbalanced but there is slight improvement.
MINORITY_THRESHOLD = 60
SYNTHETIC_PER_CLASS = 4
#The class counts are recomputed from rtrain_data rather than updated in place, so running this
#cell more than once does not keep adding 4 on top of the previous run.
gen_dict = {}
for genre_code in rtrain_data['top genre']:
    gen_dict[genre_code] = gen_dict.get(genre_code, 0) + 1
gen_dict = {
    genre_code: count + SYNTHETIC_PER_CLASS if count < MINORITY_THRESHOLD else count
    for genre_code, count in gen_dict.items()
}
print(gen_dict)

{0: 68, 1: 20, 2: 9, 3: 66, 4: 14, 5: 20, 7: 6, 8: 6, 9: 61, 10: 18, 11: 6, 12: 11, 13: 11, 15: 10, 16: 10, 18: 10, 19: 17, 20: 7, 21: 11, 22: 9, 23: 8, 24: 8, 25: 8, 28: 9, 30: 6, 31: 9, 32: 12, 33: 6, 34: 6, 37: 6, 39: 6, 42: 6, 43: 7, 44: 10, 45: 6, 46: 6, 50: 6, 54: 6, 55: 6, 56: 7, 57: 6, 58: 6, 62: 6, 64: 6, 71: 6, 81: 6}


In [17]:
#Improving the balance of the dataset using SMOTE.
#gen_dict gives the target number of samples per class.
#k_neighbors = 1 because the smallest classes kept in the data have only 2 instances,
#so each of their samples has a single same-class neighbour to interpolate with.
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 42, k_neighbors = 1, sampling_strategy = gen_dict )
#fit_resample is the supported name of this method; fit_sample was a deprecated alias
#that newer imbalanced-learn releases no longer provide.
x_train_res, y_train_res = sm.fit_resample(X, Y)

In [18]:
#Number of rows in the features and the target after oversampling
len(x_train_res), len(y_train_res)

(570, 570)

In [19]:
#Hold out 10% of the resampled data as a test set, stratified on the genre code
#NOTE(review): the split happens after SMOTE, so synthetic rows interpolated from held-out
#samples can end up in the training set and make the evaluation optimistic - consider
#splitting first and oversampling only the training part.
from sklearn.model_selection import train_test_split
split_options = dict(test_size = 0.10, random_state = 42, stratify = y_train_res)
X_train, X_test, Y_train, Y_test = train_test_split(x_train_res, y_train_res, **split_options)

In [20]:
#Number of rows in the training split
len(X_train)

513

In [21]:
#Number of training rows per encoded genre after the split
import collections
train_class_counts = collections.Counter(Y_train)
print(train_class_counts)

Counter({0: 61, 3: 59, 9: 55, 5: 18, 1: 18, 10: 16, 19: 15, 4: 13, 32: 11, 21: 10, 13: 10, 12: 10, 15: 9, 16: 9, 18: 9, 44: 9, 31: 8, 28: 8, 2: 8, 22: 8, 25: 7, 23: 7, 24: 7, 20: 6, 8: 6, 34: 6, 7: 6, 55: 6, 30: 6, 42: 6, 43: 6, 81: 6, 56: 6, 33: 6, 50: 6, 58: 6, 11: 5, 71: 5, 45: 5, 39: 5, 46: 5, 64: 5, 62: 5, 57: 5, 54: 5, 37: 5})


In [22]:
#Number of distinct genres present in the training split
len(collections.Counter(Y_train))

46

In [23]:
#Candidate values for the random forest hyper-parameters we want to tune:
#number of trees, maximum tree depth (None = unlimited), split criterion and
#number of features considered at each split
rfc = RandomForestClassifier(random_state = 42)
num_est = list(range(5, 100, 5))
max_dep = list(range(2, 16, 2)) + [None]
crit = ['gini', 'entropy']
max_feat = list(range(1, 11))
parameters = dict(
    n_estimators = num_est,
    max_depth = max_dep,
    criterion = crit,
    max_features = max_feat,
)
print(parameters)

{'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95], 'max_depth': [2, 4, 6, 8, 10, 12, 14, None], 'criterion': ['gini', 'entropy'], 'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}


In [24]:
#Tune the hyper-parameters with GridSearchCV: every combination in `parameters` is
#evaluated with 3-fold cross validation on the training split and the best one is kept
from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(estimator = rfc, param_grid = parameters, cv = 3)
cv.fit(X_train, Y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=42,
                                  

In [38]:
#Function used to print out different parameter combinations and their scores
#NOTE(review): the name shadows IPython's own `display` helper inside the notebook;
#it is kept so that existing calls keep working.
def display(results, min_score=0.57):
    """Print the best parameters and every combination scoring at least ``min_score``.

    Parameters
    ----------
    results : object
        A fitted search object exposing ``best_params_`` and a ``cv_results_``
        mapping with the keys ``'mean_test_score'``, ``'std_test_score'`` and
        ``'params'``.
    min_score : float, default 0.57
        A combination is printed only when its mean test score, rounded to
        three decimals, is greater than or equal to this value.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(f'Best parameters are: {results.best_params_}')
    print("\n")
    cv_results = results.cv_results_
    rows = zip(
        cv_results['mean_test_score'],
        cv_results['std_test_score'],
        cv_results['params'],
    )
    # A distinct loop name avoids shadowing the list of parameter combinations.
    for mean, std, combination in rows:
        if round(mean, 3) >= min_score:
            print(f'{round(mean,3)} + or -{round(std,3)} for the {combination}')

In [39]:
#Print the best parameters found by the grid search and the top scoring combinations
display(cv)

Best parameters are: {'criterion': 'entropy', 'max_depth': 14, 'max_features': 3, 'n_estimators': 80}


0.571 + or -0.044 for the {'criterion': 'entropy', 'max_depth': 14, 'max_features': 3, 'n_estimators': 80}
0.571 + or -0.044 for the {'criterion': 'entropy', 'max_depth': None, 'max_features': 3, 'n_estimators': 80}


In [40]:
#Fit the final random forest on the training split using the parameter values
#recommended by the grid search
from sklearn.ensemble import RandomForestClassifier
best_settings = {
    'criterion': 'entropy',
    'max_depth': 14,
    'max_features': 3,
    'n_estimators': 80,
}
ranfor = RandomForestClassifier(random_state = 42, **best_settings)
ranfor.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=14, max_features=3,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=80,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [41]:
#evaluation
Y_pred = ranfor.predict(X_test)
#Reverse factorize: map the integer codes back to the genre names.
#The lookup is built from every entry of `definitions` instead of a hard-coded range(86),
#so no code is silently mapped to None if the number of genres changes.
reversefactor = dict(enumerate(definitions))
#NOTE: Y_test and Y_pred are overwritten with genre names here, so the train/test split
#cell has to be run again before this cell is re-run.
Y_test = np.vectorize(reversefactor.get)(Y_test)
Y_pred = np.vectorize(reversefactor.get)(Y_pred)
# Making the Confusion Matrix
print(pd.crosstab(Y_test, Y_pred, rownames=['Actual genres'], colnames=['Predicted genres']))

Predicted genres      adult standards  album rock  alternative metal  \
Actual genres                                                          
adult standards                     3           4                  0   
album rock                          0           5                  0   
alternative metal                   0           0                  1   
art rock                            1           0                  0   
atl hip hop                         0           0                  0   
barbadian pop                       0           0                  0   
bebop                               0           0                  0   
blues rock                          0           0                  0   
boy band                            0           0                  0   
brill building pop                  1           0                  0   
british folk                        0           0                  0   
british invasion                    0           1               

In [42]:
#Print a classification report depicting the precision, recall, and f1-score of the different classes and overall model
from sklearn.metrics import classification_report
#Some genres are never predicted, which makes their precision undefined. zero_division=0
#reports those scores as 0 explicitly instead of emitting a warning for them.
class_rep_forest = classification_report(Y_test, Y_pred, zero_division=0)
print(class_rep_forest)

                      precision    recall  f1-score   support

     adult standards       0.30      0.43      0.35         7
          album rock       0.38      0.71      0.50         7
   alternative metal       1.00      1.00      1.00         1
            art rock       0.00      0.00      0.00         1
         atl hip hop       0.00      0.00      0.00         1
       barbadian pop       0.00      0.00      0.00         1
               bebop       1.00      1.00      1.00         1
          blues rock       1.00      1.00      1.00         1
            boy band       1.00      1.00      1.00         1
  brill building pop       1.00      0.50      0.67         2
        british folk       0.00      0.00      0.00         1
    british invasion       0.00      0.00      0.00         1
        british soul       1.00      1.00      1.00         1
     bubblegum dance       1.00      1.00      1.00         1
             chanson       1.00      1.00      1.00         1
       

  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
#Feature importances
#Pair each feature column name (columns 4 to 13 of train_data) with the importance the fitted random forest assigned to it.
#NOTE(review): the model itself is not stored anywhere in this cell; add an explicit save step (e.g. joblib) if it needs to be persisted.
print(list(zip(train_data.columns[4:14], ranfor.feature_importances_)))

[('bpm', 0.08981873811722724), ('nrgy', 0.10878154932831144), ('dnce', 0.10249459980229532), ('dB', 0.08968351884835), ('live', 0.07271296173982128), ('val', 0.0747656589218342), ('dur', 0.1529996730315351), ('acous', 0.10846830858945702), ('spch', 0.07813118007587419), ('pop', 0.12214381154529423)]


In [44]:
# Preview of the test data whose genres are going to be predicted
test_data.head()

Unnamed: 0,Id,title,artist,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
0,454,Pump It,The Black Eyed Peas,2005,154,93,65,-3,75,74,213,1,18,72
1,455,"Circle of Life - From ""The Lion King""/Soundtra...",Elton John,1994,161,39,30,-15,11,14,292,26,3,59
2,456,We Are The Champions - Remastered 2011,Queen,1977,64,46,27,-7,12,18,179,38,3,76
3,457,Insomnia - Radio Edit,Faithless,2010,127,92,71,-9,37,53,216,6,4,50
4,458,This Eve of Parting,John Hartford,2018,115,46,56,-12,21,34,153,18,3,44


In [45]:
#Extracting our features from the test data and predicting their genres
X1 = test_data.iloc[:, 4:14].values
#Code -> genre name lookup built from every entry of `definitions` instead of a hard-coded
#range(86), so no code is silently mapped to None if the number of genres changes.
reversefactor = dict(enumerate(definitions))
predicted_codes = ranfor.predict(X1)
test_data['top genre'] = np.vectorize(reversefactor.get)(predicted_codes)

In [46]:
#Preview the first ten test rows together with their predicted genre
test_data.head(10)

Unnamed: 0,Id,title,artist,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop,top genre
0,454,Pump It,The Black Eyed Peas,2005,154,93,65,-3,75,74,213,1,18,72,alternative metal
1,455,"Circle of Life - From ""The Lion King""/Soundtra...",Elton John,1994,161,39,30,-15,11,14,292,26,3,59,album rock
2,456,We Are The Champions - Remastered 2011,Queen,1977,64,46,27,-7,12,18,179,38,3,76,british invasion
3,457,Insomnia - Radio Edit,Faithless,2010,127,92,71,-9,37,53,216,6,4,50,dance pop
4,458,This Eve of Parting,John Hartford,2018,115,46,56,-12,21,34,153,18,3,44,adult standards
5,459,You Keep Me Hangin On,Kim Wilde,1986,123,68,69,-12,14,71,255,8,3,60,album rock
6,460,Mandy,Barry Manilow,1974,104,46,31,-9,14,32,213,42,3,69,dance pop
7,461,Collette,Billy Fury,1982,155,44,55,-10,10,73,112,82,4,27,deep adult standards
8,462,Against All Odds (Take a Look at Me Now) - 201...,Phil Collins,2016,116,50,58,-7,13,11,206,20,3,74,dance pop
9,463,Highway to Hell,AC/DC,1979,116,91,57,-5,16,42,208,6,13,83,album rock


In [47]:
#Distribution of the predicted genres over the test set
test_data['top genre'].value_counts()

album rock              31
adult standards         27
dance pop               24
glam rock                3
new wave pop             3
british invasion         3
disco                    2
atl hip hop              2
glam metal               2
deep adult standards     2
boy band                 2
bronx hip hop            1
art pop                  1
chicago soul             1
disco house              1
permanent wave           1
brill building pop       1
g funk                   1
europop                  1
bubblegum dance          1
merseybeat               1
alternative metal        1
dance rock               1
Name: top genre, dtype: int64

In [48]:
#Keep only the song Id and its predicted genre and write them to a csv for submission on kaggle
submission_columns = ['Id', 'top genre']
prediction = test_data[submission_columns]
prediction.to_csv("SamplingRF29_submission.csv", index=False)
prediction.tail(10)

Unnamed: 0,Id,top genre
103,558,deep adult standards
104,559,dance pop
105,560,europop
106,561,new wave pop
107,562,adult standards
108,563,dance pop
109,564,dance pop
110,565,dance pop
111,566,dance pop
112,567,glam metal
