In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# Load the training table; 'train.csv' is resolved relative to the working directory.
df1 = pd.read_csv('train.csv')
# Print (rows, columns) as a quick sanity check on the load.
print(df1.shape)

(25383, 13)


In [3]:
# Load the table to be predicted on; 'test.csv' is resolved relative to the working directory.
df2 = pd.read_csv('test.csv')
# Print (rows, columns) as a quick sanity check on the load.
print(df2.shape)

(16922, 12)


In [4]:
# Display the column labels of the training frame.
df1.columns.unique()

Index(['ID', 'danceability', 'energy', 'key', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration', 'genre'],
      dtype='object')

In [5]:
# Feature matrix: every training column except the row identifier and the label.
X = df1.drop(columns=["ID", "genre"])
# Target vector: the genre label.
Y = df1["genre"]

In [6]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

In [7]:
# Rebalance the classes with SMOTETomek, with the Tomek-link step restricted
# to the majority class. The fixed seed makes the resampled data, and every
# score computed from it, repeatable across runs.
# NOTE(review): all of X/Y is resampled here, before any hold-out split, so a
# split taken from X_smt/Y_smt will contain resampled rows in its evaluation
# part; consider resampling only the training part of a split instead.
smoteto = SMOTETomek(
    tomek=TomekLinks(sampling_strategy='majority'),
    random_state=42,
)
X_smt, Y_smt = smoteto.fit_resample(X, Y)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute   import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [9]:
# Hold out 30% of the resampled rows for evaluation; the seed fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_smt,
    Y_smt,
    test_size=0.3,
    random_state=42,
)

In [10]:
# Modelling pipeline: impute missing values, min-max scale the features,
# then fit a random forest. The forest is seeded (same seed as the
# train/test split) so repeated runs produce the same model and scores.
pipe_list = [
    ('impute', SimpleImputer()),
    ('scaler', MinMaxScaler()),
    ('model', RandomForestClassifier(random_state=42)),
]
pipe_model = Pipeline(pipe_list)
# Last expression: display the assembled pipeline.
pipe_model

In [11]:
# Single-point grid (class-weighted forest) scored by 7-fold cross-validation
# on all available cores.
# The scorer must handle a multiclass target: plain 'f1' is the binary-only
# scorer and cannot score the genre labels, so every fold would fail to score.
# 'f1_macro' also matches the macro-averaged F1 used for the train/test
# evaluation later in this notebook.
hypeer_parameter = {'model__class_weight': ['balanced']}
grid_model = GridSearchCV(
    pipe_model,
    param_grid=hypeer_parameter,
    cv=7,
    n_jobs=-1,
    scoring='f1_macro',
)
grid_model.fit(X_train, Y_train)



In [12]:
# Take the estimator selected by the grid search for all later predictions.
best_model = grid_model.best_estimator_
# Last expression: display the selected estimator.
best_model

In [13]:
# Predict on both partitions (train first, then test) so fit and
# generalisation can be compared side by side.
Y_train_pred, Y_test_pred = (
    best_model.predict(features) for features in (X_train, X_test)
)

In [14]:
# Macro-averaged F1 on the training partition (unweighted mean of per-class F1).
f1_score(Y_train, Y_train_pred,average = "macro")

0.9824414328768868

In [22]:
# Macro-averaged F1 on the held-out partition; compare with the training score to gauge overfitting.
f1_score(Y_test, Y_test_pred,average = "macro")

0.7829134902589927

In [24]:
# Per-class precision / recall / F1 on the training partition.
print(classification_report(Y_train, Y_train_pred))

                 precision    recall  f1-score   support

      Dark Trap       0.94      0.95      0.95      2218
            Emo       1.00      1.00      1.00      2438
         Hiphop       0.97      0.97      0.97      2428
            Pop       1.00      1.00      1.00      2389
            Rap       0.98      0.97      0.97      2438
            RnB       0.98      0.99      0.98      2442
     Trap Metal       0.96      0.97      0.96      2473
Underground Rap       0.93      0.90      0.91      2447
            dnb       1.00      1.00      1.00      2403
      hardstyle       1.00      1.00      1.00      2484
      psytrance       1.00      1.00      1.00      2441
      techhouse       1.00      1.00      1.00      2444
         techno       1.00      1.00      1.00      2422
         trance       1.00      1.00      1.00      2408
           trap       1.00      1.00      1.00      2466

       accuracy                           0.98     36341
      macro avg       0.98   

In [25]:
# Per-class precision / recall / F1 on the held-out partition.
print(classification_report(Y_test, Y_test_pred))

                 precision    recall  f1-score   support

      Dark Trap       0.60      0.48      0.54       950
            Emo       0.82      0.86      0.84      1044
         Hiphop       0.62      0.58      0.60      1054
            Pop       0.85      0.90      0.87      1093
            Rap       0.69      0.74      0.72      1044
            RnB       0.65      0.67      0.66      1040
     Trap Metal       0.61      0.71      0.66      1009
Underground Rap       0.42      0.35      0.38      1035
            dnb       0.98      0.98      0.98      1079
      hardstyle       0.91      0.93      0.92       998
      psytrance       0.94      0.95      0.95      1041
      techhouse       0.95      0.94      0.94      1038
         techno       0.91      0.91      0.91      1060
         trance       0.87      0.89      0.88      1074
           trap       0.91      0.88      0.90      1016

       accuracy                           0.79     15575
      macro avg       0.78   

In [16]:
# Features for the final predictions: the test table without its identifier column.
X1 = df2.drop(columns=["ID"])


In [17]:
# Predict a genre for every row of the test table with the selected estimator.
X1_test_pred = best_model.predict(X1)

In [18]:
# Wrap the predictions in a DataFrame; they end up in the default column 0.
# Note: the name X1_test_pred is rebound here from the raw predictions to a frame.
X1_test_pred = pd.DataFrame(X1_test_pred)

In [19]:
# Attach the test-set identifiers (the assignment aligns on the row index).
X1_test_pred['ID'] = df2['ID']
# Copy the predictions from the default column 0 into a named 'genre' column.
X1_test_pred['genre'] = X1_test_pred[0]

In [20]:
# Remove the now-duplicated default column 0 and index the frame by ID,
# leaving 'genre' as the only data column.
X1_test_pred = X1_test_pred.drop(columns=[0]).set_index('ID')

In [21]:
# Write the ID-indexed predictions to CSV; mode='w' overwrites any existing file of that name.
X1_test_pred.to_csv('RF_smt_230116.csv',mode='w')