In [2]:
import pandas as pd 
import numpy as np 
from imblearn.over_sampling import BorderlineSMOTE 

In [3]:
# Read the labelled training data and report its (rows, columns) shape.
df1 = pd.read_csv(filepath_or_buffer='train.csv')
print(df1.shape)

(25383, 13)


In [4]:
# Read the test data (to be predicted later) and report its (rows, columns) shape.
df2 = pd.read_csv(filepath_or_buffer='test.csv')
print(df2.shape)

(16922, 12)


In [5]:
# Show the distinct column labels of the training frame.
df1.columns.unique()

Index(['ID', 'danceability', 'energy', 'key', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration', 'genre'],
      dtype='object')

In [95]:
# Target: the 'genre' column.
Y = df1['genre']
# Features: every training column except the row ID, the target and 'key'.
X = df1.drop(columns=['ID', 'genre', 'key'])

In [96]:
# Oversample the training data with Borderline-SMOTE.
# random_state is fixed so the synthetic rows are identical on every run; the
# separate sampler2.fit(X, Y) call was removed because fit_resample already
# fits the sampler before resampling.
# NOTE(review): resampling happens before the train/test split further down,
# so synthetic points derived from training rows can land in the hold-out set
# and likely inflate the hold-out score -- consider splitting first and
# resampling only the training part.
sampler2 = BorderlineSMOTE(random_state=42)
X_up, Y_up = sampler2.fit_resample(X, Y)

In [97]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute   import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [98]:
# Hold out 5% of the resampled rows for evaluation; the split is seeded.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_up,
    Y_up,
    random_state=42,
    test_size=0.05,
)

In [99]:
# Two-step pipeline: standardise the features, then fit gradient boosting.
pipe_list = [('scaler', StandardScaler())]
pipe_list.append(('model', GradientBoostingClassifier()))
pipe_model = Pipeline(steps=pipe_list)
pipe_model

In [100]:
# Hyper-parameter grid for the 'model' step of the pipeline. Each list holds a
# single value, so the search cross-validates exactly one configuration.
# The explicit 'model__loss': ['deviance'] entry was dropped: 'deviance' was
# deprecated in scikit-learn 1.1 and removed in 1.3 (renamed 'log_loss'), and
# it is the estimator's default loss under either name, so relying on the
# default keeps the same objective on old and new releases.
hypeer_parameter = {'model__learning_rate': [0.05],
                    'model__max_depth': [10],
                    'model__random_state': [42],
                    'model__max_features': ['sqrt'],
                    }
# 'genre' is a multiclass string label, so the binary-only 'f1' scorer cannot
# score the folds. Use macro-averaged F1, which matches the
# f1_score(..., average="macro") evaluation used later in the notebook.
grid_model = GridSearchCV(pipe_model, param_grid=hypeer_parameter, cv=7,
                          n_jobs=-1, scoring='f1_macro')
grid_model.fit(X_train, Y_train)



In [101]:
# Keep the pipeline that the grid search refit with its best parameters,
# and display it.
best_model = grid_model.best_estimator_
best_model

In [102]:
# Predict labels for the training part first, then for the hold-out part.
Y_train_pred, Y_test_pred = (
    best_model.predict(features) for features in (X_train, X_test)
)

In [103]:
# Pair each feature name with the importance reported by the fitted 'model' step.
df_result = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_model['model'].feature_importances_,
})

In [104]:
# Display the feature-importance table.
df_result

Unnamed: 0,Feature,Importance
0,danceability,0.100373
1,energy,0.069452
2,loudness,0.075301
3,speechiness,0.071072
4,acousticness,0.062566
5,instrumentalness,0.09753
6,liveness,0.040533
7,valence,0.068666
8,tempo,0.266506
9,duration,0.148001


In [105]:
# Macro-averaged F1 on the rows the model was trained on.
f1_score(y_true=Y_train, y_pred=Y_train_pred, average="macro")

0.974981005938194

In [106]:
# Macro-averaged F1 on the 5% hold-out rows.
f1_score(y_true=Y_test, y_pred=Y_test_pred, average="macro")

0.805663898211168

In [107]:
# Per-genre precision / recall / F1 on the training rows.
print(classification_report(y_true=Y_train, y_pred=Y_train_pred))

                 precision    recall  f1-score   support

      Dark Trap       0.95      0.91      0.93      3313
            Emo       0.99      1.00      1.00      3300
         Hiphop       0.95      0.95      0.95      3301
            Pop       0.99      1.00      1.00      3301
            Rap       0.94      0.97      0.96      3285
            RnB       0.97      0.97      0.97      3335
     Trap Metal       0.93      0.96      0.95      3318
Underground Rap       0.90      0.86      0.88      3308
            dnb       1.00      1.00      1.00      3311
      hardstyle       1.00      1.00      1.00      3311
      psytrance       1.00      1.00      1.00      3315
      techhouse       1.00      1.00      1.00      3293
         techno       1.00      1.00      1.00      3308
         trance       1.00      1.00      1.00      3311
           trap       1.00      1.00      1.00      3308

       accuracy                           0.98     49618
      macro avg       0.98   

In [108]:
# Per-genre precision / recall / F1 on the hold-out rows.
print(classification_report(y_true=Y_test, y_pred=Y_test_pred))

                 precision    recall  f1-score   support

      Dark Trap       0.61      0.56      0.59       169
            Emo       0.93      0.83      0.88       182
         Hiphop       0.66      0.65      0.66       181
            Pop       0.88      0.94      0.91       181
            Rap       0.72      0.74      0.73       197
            RnB       0.62      0.65      0.64       147
     Trap Metal       0.64      0.68      0.66       164
Underground Rap       0.43      0.44      0.43       174
            dnb       0.98      0.99      0.98       171
      hardstyle       0.97      0.96      0.97       171
      psytrance       0.98      0.95      0.96       167
      techhouse       0.96      0.92      0.94       189
         techno       0.92      0.90      0.91       174
         trance       0.89      0.93      0.91       171
           trap       0.92      0.94      0.93       174

       accuracy                           0.81      2612
      macro avg       0.81   

In [109]:
# Test-set features: drop the same non-feature columns ('ID', 'key') that were
# removed from the training frame.
X1 = df2.drop(columns=['ID', 'key'])


In [110]:
# Predict a genre for every row of the test-set features.
X1_test_pred = best_model.predict(X=X1)

In [111]:
# Wrap the predictions in a DataFrame; the single column gets the default label 0.
X1_test_pred = pd.DataFrame(data=X1_test_pred)

In [112]:
# Append the test-set IDs (aligned on the row index) and copy the prediction
# column 0 into a column named 'genre', in that order.
X1_test_pred = X1_test_pred.assign(
    ID=df2['ID'],
    genre=X1_test_pred[0],
)

In [113]:
# Remove the unnamed prediction column 0 and index the frame by 'ID',
# leaving 'genre' as the only data column.
X1_test_pred = X1_test_pred.drop(columns=[0]).set_index('ID')

In [114]:
# Write the ID-indexed predictions to CSV, overwriting any existing file.
X1_test_pred.to_csv(path_or_buf='GB_cv7_std_rmK_230120.csv', mode='w')