In [14]:
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import numpy as np
%matplotlib inline

In [2]:
genre = pd.read_excel('genres.xlsx')
songs = pd.read_csv('songs.csv')
artists = pd.read_csv('artist.csv')

In [3]:
#needed functions for preprocessing
def normalize_column(df_column: pd.Series) -> pd.Series:
    """
    This method should recalculate all values of a numeric column and normalise it between 0 and 1.
    :param df_column: Dataset's column
    :return: The column normalized
    """
    min_ = df_column.min()
    max_ = df_column.max()
    return (df_column - min_) / (max_ - min_)


def generate_label_encoder(df_column: pd.Series) -> LabelEncoder:
    """
    This method should generate a (sklearn version of a) label encoder of a categorical column
    :param df_column: Dataset's column
    :return: A label encoder of the column
    """

    #generate and fit the encoder
    label_encoder = LabelEncoder()
    return label_encoder.fit(df_column)



def replace_with_label_encoder(df: pd.DataFrame, column: str, le: LabelEncoder) -> pd.DataFrame:
    """
    This method should replace the column of df with the label encoder's version of the column
    :param df: Dataset
    :param column: column to be replaced
    :param le: the label encoder to be used to replace the column
    :return: The df with the column replaced with the one from label encoder
    """

    #transform the encoder in order to replace with the previous column
    df1 = df.copy() #making a copy of df and returning the new updated one in order not to change df because it will be called in main function
    df1[column] = le.transform(df1[column])
    return df1

In [4]:
genre.drop(['popularity'],axis=1,inplace=True)

In [5]:
genre['duration_ms'] = normalize_column(genre['duration_ms'])
genre['loudness'] = normalize_column(genre['loudness'])
genre['key'] = normalize_column(genre['key'])
genre['tempo'] = normalize_column(genre['tempo'])

In [6]:
genre.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,key,mode
count,993.0,993.0,993.0,993.0,993.0,993.0,993.0,993.0,993.0,993.0,993.0,993.0
mean,0.378522,0.552452,0.210013,0.562817,0.288778,0.192803,0.748251,0.06805,0.371563,0.47882,0.551863,0.823766
std,0.314923,0.139433,0.0825,0.227355,0.296682,0.097128,0.153539,0.053207,0.109188,0.193495,0.29623,0.381211
min,1.3e-05,0.0878,0.0,0.00395,0.0,0.0165,0.0,0.0258,0.0,0.0326,0.0,0.0
25%,0.104579,0.476421,0.161651,0.398871,0.0331,0.135071,0.699422,0.045459,0.315281,0.349233,0.363636,1.0
50%,0.287462,0.564644,0.192111,0.612272,0.160567,0.180736,0.787832,0.05647,0.383509,0.488772,0.636364,1.0
75%,0.636397,0.6406,0.244734,0.731036,0.502,0.221526,0.849001,0.07555,0.424658,0.615604,0.818182,1.0
max,0.996,0.94,1.0,0.999,0.968,0.873,1.0,0.946248,1.0,0.969,1.0,1.0


In [7]:
#label encoding the genre
le = generate_label_encoder(genre['genres'])
genre['genres'].unique()

array(['acoustic', 'ambient', 'blues', 'chill', 'classical', 'dance',
       'pop', 'rock', 'jazz', 'techno', 'house'], dtype=object)

In [8]:
df = genre.copy()
df['genres'] = le.fit_transform(genre.genres)

In [9]:
df

Unnamed: 0,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,key,mode
0,0,0.740820,0.602674,0.147095,0.365927,0.105188,0.168647,0.703573,0.067375,0.336886,0.635845,0.818182,1
1,0,0.000013,0.393000,0.015508,0.706000,0.000006,0.873000,0.903204,0.048800,0.430209,0.155000,0.090909,1
2,0,0.977000,0.397000,0.184918,0.275000,0.912000,0.119000,0.573193,0.049300,0.706965,0.499000,0.363636,0
3,0,0.438118,0.552803,0.175781,0.512679,0.092907,0.181124,0.790551,0.046735,0.381409,0.392058,0.181818,1
4,0,0.117354,0.569396,0.149886,0.718587,0.401008,0.188727,0.761440,0.069765,0.354678,0.549017,1.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,6,0.147585,0.658971,0.176350,0.739907,0.175272,0.188220,0.878881,0.079463,0.395056,0.458863,0.636364,1
989,6,0.027995,0.712273,0.199983,0.858485,0.284335,0.144903,0.906275,0.053197,0.406091,0.486303,0.090909,0
990,6,0.000334,0.673000,0.375662,0.911000,0.854500,0.071800,0.861591,0.039750,0.404727,0.408000,0.545455,0
991,6,0.037839,0.726576,0.256566,0.784644,0.310123,0.154810,0.843280,0.066680,0.412614,0.620237,0.454545,0


In [10]:
#x y split
X = df.drop('genres',axis=1)
y = df['genres']

In [11]:
X

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,key,mode
0,0.740820,0.602674,0.147095,0.365927,0.105188,0.168647,0.703573,0.067375,0.336886,0.635845,0.818182,1
1,0.000013,0.393000,0.015508,0.706000,0.000006,0.873000,0.903204,0.048800,0.430209,0.155000,0.090909,1
2,0.977000,0.397000,0.184918,0.275000,0.912000,0.119000,0.573193,0.049300,0.706965,0.499000,0.363636,0
3,0.438118,0.552803,0.175781,0.512679,0.092907,0.181124,0.790551,0.046735,0.381409,0.392058,0.181818,1
4,0.117354,0.569396,0.149886,0.718587,0.401008,0.188727,0.761440,0.069765,0.354678,0.549017,1.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
988,0.147585,0.658971,0.176350,0.739907,0.175272,0.188220,0.878881,0.079463,0.395056,0.458863,0.636364,1
989,0.027995,0.712273,0.199983,0.858485,0.284335,0.144903,0.906275,0.053197,0.406091,0.486303,0.090909,0
990,0.000334,0.673000,0.375662,0.911000,0.854500,0.071800,0.861591,0.039750,0.404727,0.408000,0.545455,0
991,0.037839,0.726576,0.256566,0.784644,0.310123,0.154810,0.843280,0.066680,0.412614,0.620237,0.454545,0


In [12]:
#train-test split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.25, random_state=42)

Training model for predicting Genre

Random Forest

In [27]:
#gridsearch for random forest
est= RandomForestClassifier()
param_grid = { 
            "n_estimators"      : [50,100],
            "max_features"      : ["auto", "sqrt", "log2"],
            "min_samples_split" : [2,4,8],
            "bootstrap": [True],
            }

RF_genre= GridSearchCV(est, param_grid,refit=True,verbose=3, cv=2)
RF_genre.fit(X_train, y_train)

Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] bootstrap=True, max_features=auto, min_samples_split=2, n_estimators=50 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  bootstrap=True, max_features=auto, min_samples_split=2, n_estimators=50, score=0.575, total=   0.4s
[CV] bootstrap=True, max_features=auto, min_samples_split=2, n_estimators=50 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  bootstrap=True, max_features=auto, min_samples_split=2, n_estimators=50, score=0.559, total=   0.4s
[CV] bootstrap=True, max_features=auto, min_samples_split=2, n_estimators=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.7s remaining:    0.0s


[CV]  bootstrap=True, max_features=auto, min_samples_split=2, n_estimators=100, score=0.559, total=   0.8s
[CV] bootstrap=True, max_features=auto, min_samples_split=2, n_estimators=100 
[CV]  bootstrap=True, max_features=auto, min_samples_split=2, n_estimators=100, score=0.554, total=   0.7s
[CV] bootstrap=True, max_features=auto, min_samples_split=4, n_estimators=50 
[CV]  bootstrap=True, max_features=auto, min_samples_split=4, n_estimators=50, score=0.556, total=   0.4s
[CV] bootstrap=True, max_features=auto, min_samples_split=4, n_estimators=50 
[CV]  bootstrap=True, max_features=auto, min_samples_split=4, n_estimators=50, score=0.527, total=   0.4s
[CV] bootstrap=True, max_features=auto, min_samples_split=4, n_estimators=100 
[CV]  bootstrap=True, max_features=auto, min_samples_split=4, n_estimators=100, score=0.556, total=   0.7s
[CV] bootstrap=True, max_features=auto, min_samples_split=4, n_estimators=100 
[CV]  bootstrap=True, max_features=auto, min_samples_split=4, n_estimators

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   20.6s finished


GridSearchCV(cv=2, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [29]:
RF_genre.best_params_

{'bootstrap': True,
 'max_features': 'auto',
 'min_samples_split': 2,
 'n_estimators': 50}

In [33]:
rf_model = RandomForestClassifier(bootstrap= True,max_features= 'auto',min_samples_split= 2,n_estimators= 50).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.5020080321285141

KNN

elbow method for choosing K in KNN

In [None]:
error_rate = []

# Will take some time
for i in range(1,40):
    
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))

In [None]:
plt.figure(figsize=(10,6))
plt.plot(range(1,40),error_rate,color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

In [27]:
#traing KNN
knn_model = KNeighborsClassifier(n_neighbors=13).fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.4939759036144578

SVM

In [18]:
param_grid = {'C': [ 10, 100, 1000], 'gamma': [1,0.1,0.01], 'kernel': ['rbf','poly','linear']} 
SVM_genre = GridSearchCV(SVC(),param_grid,refit=True,verbose=3,cv=2)
SVM_genre.fit(X_train,y_train)

Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV] ........... C=10, gamma=1, kernel=rbf, score=0.532, total=   0.1s
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV] ........... C=10, gamma=1, kernel=rbf, score=0.551, total=   0.1s

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s



[CV] C=10, gamma=1, kernel=poly ......................................
[CV] .......... C=10, gamma=1, kernel=poly, score=0.503, total=   0.1s
[CV] C=10, gamma=1, kernel=poly ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] .......... C=10, gamma=1, kernel=poly, score=0.530, total=   0.2s
[CV] C=10, gamma=1, kernel=linear ....................................
[CV] ........ C=10, gamma=1, kernel=linear, score=0.583, total=   0.1s
[CV] C=10, gamma=1, kernel=linear ....................................
[CV] ........ C=10, gamma=1, kernel=linear, score=0.556, total=   0.1s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.548, total=   0.1s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.551, total=   0.1s
[CV] C=10, gamma=0.1, kernel=poly ....................................
[CV] ........ C=10, gamma=0.1, kernel=poly, score=0.457, total=   0.1s
[CV] C=10, gamma=0.1, kernel=poly ....................................
[CV] ........ C=10, gamma=0.1, kernel=poly, score=0.492, total=   0.1s
[CV] C=10, gamma=0.1, kernel=linear ..................................
[CV] .

[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:   11.3s finished


GridSearchCV(cv=2, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [10, 100, 1000], 'gamma': [1, 0.1, 0.01],
                         'kernel': ['rbf', 'poly', 'linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [19]:
SVM_artists.best_params_

{'C': 10, 'gamma': 1, 'kernel': 'linear'}

In [26]:
svm_model = SVC(C=10, gamma=1, kernel='linear').fit(X_train,y_train)
y_pred = svm_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.4859437751004016

Predicting genre for songs

In [185]:
temp =songs.drop(['artists','explicit','id','name','popularity','release_date','year'],axis=1)
result = songs.copy()

In [186]:
temp['loudness'] = normalize_column(temp['loudness'])
temp['tempo'] = normalize_column(temp['tempo'])
temp['duration_ms'] = normalize_column(temp['duration_ms'])
temp['key'] = normalize_column(temp['key'])

In [187]:
result['rf_label'] = list(le.inverse_transform(rf_model.predict(temp)))
result['svm_label'] = list(le.inverse_transform(svm_model.predict(temp)))
result['knn_label'] = list(le.inverse_transform(knn_model.predict(temp)))

In [188]:
result

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,...,name,popularity,release_date,speechiness,tempo,valence,year,rf_label,svm_label,knn_label
0,0.9950,['Carl Woitschach'],0.708,158648,0.1950,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563000,10,0.1510,...,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.7790,1928,jazz,classical,classical
1,0.9940,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901000,8,0.0763,...,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928,classical,classical,classical
2,0.6040,['Seweryn Goszczyński'],0.749,104300,0.2200,0,6L63VW0PibdM1HDSBoqnoM,0.000000,5,0.1190,...,Chapter 1.18 - Zamek kaniowski,0,1928,0.9290,107.177,0.8800,1928,jazz,pop,pop
3,0.9950,['Francisco Canaro'],0.781,180760,0.1300,0,6M94FkXd15sOAOQYRnWPN8,0.887000,1,0.1110,...,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.7200,1928,classical,classical,classical
4,0.9900,"['Frédéric Chopin', 'Vladimir Horowitz']",0.210,687733,0.2040,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908000,11,0.0980,...,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928,classical,classical,classical
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169904,0.1730,"['DripReport', 'Tyga']",0.875,163800,0.4430,1,4KppkflX7I3vJQk7urOJaS,0.000032,1,0.0891,...,Skechers (feat. Tyga) - Remix,75,2020-05-15,0.1430,100.012,0.3060,2020,pop,pop,pop
169905,0.0167,"['Leon Bridges', 'Terrace Martin']",0.719,167468,0.3850,0,1ehhGlTvjtHo2e4xJFB0SZ,0.031300,8,0.1110,...,Sweeter (feat. Terrace Martin),64,2020-06-08,0.0403,128.000,0.2700,2020,pop,pop,pop
169906,0.5380,"['Kygo', 'Oh Wonder']",0.514,180700,0.5390,0,52eycxprLhK3lPcRLbQiVk,0.002330,7,0.1080,...,How Would I Know,70,2020-05-29,0.1050,123.700,0.1530,2020,classical,pop,pop
169907,0.0714,"['Cash Cash', 'Andy Grammer']",0.646,167308,0.7610,0,3wYOGJYD31sLRmBgCvWxa4,0.000000,1,0.2220,...,I Found You,70,2020-02-28,0.0385,129.916,0.4720,2020,pop,pop,pop


In [189]:
result.to_csv('songs_with_genre.csv')  

Predicting genre for artists

In [176]:
temp =artists.drop(['count','artists','popularity'],axis=1)
result = artists.copy()

In [178]:
temp['loudness'] = normalize_column(temp['loudness'])
temp['tempo'] = normalize_column(temp['tempo'])
temp['duration_ms'] = normalize_column(temp['duration_ms'])
temp['key'] = normalize_column(temp['key'])

In [180]:
result['rf_label'] = list(le.inverse_transform(rf_model.predict(temp)))
result['svm_label'] = list(le.inverse_transform(svm_model.predict(temp)))
result['knn_label'] = list(le.inverse_transform(knn_model.predict(temp)))

In [181]:
result

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,rf_label,svm_label,knn_label
0,"""Cats"" 1981 Original London Cast",0.575083,0.442750,247260.000000,0.386336,0.022717,0.287708,-14.205417,0.180675,115.983500,0.334433,38.000000,5,1,12,pop,pop,pop
1,"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.000000,0.406808,0.081158,0.315215,-10.690000,0.176212,103.044154,0.268865,33.076923,5,1,26,pop,pop,pop
2,"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.000000,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.285714,0,1,7,classical,pop,pop
3,"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.245770,0.073587,0.275481,-15.639370,0.123200,88.667630,0.372030,34.444444,0,1,27,pop,pop,pop
4,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.605444,0.437333,232428.111111,0.429333,0.037534,0.216111,-11.447222,0.086000,120.329667,0.458667,42.555556,11,1,9,pop,pop,pop
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27616,鳳飛飛,0.884000,0.358000,259387.000000,0.208000,0.000002,0.150000,-9.524000,0.033900,131.261000,0.278000,35.000000,10,1,2,pop,pop,pop
27617,黃品源,0.541000,0.578000,293840.000000,0.334000,0.000006,0.067500,-11.974000,0.026700,135.934000,0.243000,47.000000,9,0,2,pop,pop,pop
27618,黃國隆,0.785455,0.570818,174582.727273,0.148400,0.000083,0.142191,-21.610091,0.054355,119.586273,0.741273,20.000000,5,1,11,pop,pop,blues
27619,黃蜀娟,0.925143,0.322262,212989.857143,0.141490,0.000189,0.196757,-19.256714,0.055571,111.685381,0.267033,23.000000,10,1,42,classical,classical,pop


In [184]:
result.to_csv('artists_with_genre.csv')  