In [1]:
import pandas as pd 
import numpy as np 
from imblearn.over_sampling import BorderlineSMOTE 

In [2]:
df1 = pd.read_csv('train.csv')
print(df1.shape)

(25383, 13)


In [3]:
df2 = pd.read_csv('test.csv')
print(df2.shape)

(16922, 12)


In [7]:
df1.columns.unique()

Index(['ID', 'danceability', 'energy', 'key', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration', 'genre'],
      dtype='object')

In [98]:
X = df1.drop(columns=['ID','genre'])
Y = df1['genre']

In [147]:
sampler2 = BorderlineSMOTE()
sampler2.fit(X,Y)

In [148]:
X_up, Y_up = sampler2.fit_resample(X, Y)

In [128]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute   import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [149]:
X_train, X_test, Y_train, Y_test = train_test_split(X_up, Y_up, test_size=0.4, random_state=42)

In [10]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [150]:
pipe_list = [('scaler', StandardScaler()),
            ('model', GradientBoostingClassifier())]
pipe_model = Pipeline(pipe_list)
pipe_model

In [156]:
hypeer_parameter = {'model__subsample':[0.01,0.1,0.5],
                    'model__n_estimators': [100, 500],
                    'model__learning_rate':[0.05,0.1]}
grid_model = GridSearchCV(pipe_model, param_grid=hypeer_parameter, cv=5, 
                         n_jobs = -1 ,scoring='f1')
grid_model.fit(X_train, Y_train)

In [152]:
best_model = grid_model.best_estimator_
best_model

In [153]:
Y_train_pred = best_model.predict(X_train)
Y_test_pred  = best_model.predict(X_test)

In [154]:
f1_score(Y_train, Y_train_pred,average = "macro")

0.7704295057213896

In [155]:
f1_score(Y_test, Y_test_pred,average = "macro")

0.7221170293191846

In [48]:
df_test = pd.read_csv('RF1_230116(0.63.csv')

In [49]:
XX = df_test['genre']

In [50]:
X1 = df2.drop(["ID"], axis = 1)


In [51]:
X1_test_pred = best_model.predict(X1)

In [52]:
print(classification_report( XX, X1_test_pred))

                 precision    recall  f1-score   support

      Dark Trap       0.85      0.83      0.84      1530
            Emo       0.90      0.90      0.90       677
         Hiphop       0.80      0.79      0.80      1153
            Pop       0.67      0.64      0.65        80
            Rap       0.83      0.84      0.84       413
            RnB       0.76      0.76      0.76       759
     Trap Metal       0.83      0.84      0.83       525
Underground Rap       0.88      0.88      0.88      3293
            dnb       0.99      0.99      0.99      1242
      hardstyle       0.97      0.95      0.96      1269
      psytrance       0.98      0.98      0.98      1192
      techhouse       0.97      0.96      0.97      1215
         techno       0.97      0.96      0.97      1201
         trance       0.92      0.95      0.94      1222
           trap       0.95      0.97      0.96      1151

       accuracy                           0.90     16922
      macro avg       0.88   

In [53]:
X1_test_pred = pd.DataFrame(X1_test_pred)

In [54]:
X1_test_pred['ID'] = df2['ID']
X1_test_pred['genre'] = X1_test_pred[0]

In [55]:
X1_test_pred = X1_test_pred.drop(columns=[0])
X1_test_pred = X1_test_pred.set_index('ID')

In [56]:
X1_test_pred.to_csv('RF_230117(0.639).csv',mode='w')