In [3]:
import numpy as np
import pandas as pd

In [4]:
data = pd.read_csv('music-genre-cleaned-2.csv')
data

Unnamed: 0,Column,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,1,31.0,0.01270,0.622,218293.0,0.890,0.950000,D,0.124,-7.043,Minor,0.0300,115.002,0.531,Electronic
1,2,28.0,0.00306,0.620,215613.0,0.755,0.011800,G#,0.534,-4.617,Major,0.0345,127.994,0.333,Electronic
2,3,34.0,0.02540,0.774,166875.0,0.700,0.002530,C#,0.157,-4.498,Major,0.2390,128.014,0.270,Electronic
3,4,32.0,0.00465,0.638,222369.0,0.587,0.909000,F#,0.157,-6.266,Major,0.0413,145.036,0.323,Electronic
4,6,46.0,0.02890,0.572,214408.0,0.803,0.000008,B,0.106,-4.294,Major,0.3510,149.995,0.230,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40555,49999,56.0,0.13300,0.849,237667.0,0.660,0.000008,C,0.296,-7.195,Major,0.0516,99.988,0.629,Hip-Hop
40556,50001,72.0,0.15700,0.709,251860.0,0.362,0.000000,B,0.109,-9.814,Major,0.0550,122.043,0.113,Hip-Hop
40557,50002,51.0,0.00597,0.693,189483.0,0.763,0.000000,D,0.143,-5.443,Major,0.1460,131.079,0.395,Hip-Hop
40558,50003,65.0,0.08310,0.782,262773.0,0.472,0.000000,G,0.106,-5.016,Minor,0.0441,75.886,0.354,Hip-Hop


In [5]:
# encode categorical data
from sklearn.preprocessing import LabelEncoder
data["key"] = LabelEncoder().fit_transform(data["key"])
data["mode"] = LabelEncoder().fit_transform(data["mode"])
data.head()

Unnamed: 0,Column,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,1,31.0,0.0127,0.622,218293.0,0.89,0.95,5,0.124,-7.043,1,0.03,115.002,0.531,Electronic
1,2,28.0,0.00306,0.62,215613.0,0.755,0.0118,11,0.534,-4.617,0,0.0345,127.994,0.333,Electronic
2,3,34.0,0.0254,0.774,166875.0,0.7,0.00253,4,0.157,-4.498,0,0.239,128.014,0.27,Electronic
3,4,32.0,0.00465,0.638,222369.0,0.587,0.909,9,0.157,-6.266,0,0.0413,145.036,0.323,Electronic
4,6,46.0,0.0289,0.572,214408.0,0.803,8e-06,2,0.106,-4.294,0,0.351,149.995,0.23,Electronic


In [6]:
features = data.drop("music_genre", axis=1).drop("Column", axis=1)
targets = data['music_genre']
print(features)
print(targets)

       popularity  acousticness  danceability  duration_ms  energy  \
0            31.0       0.01270         0.622     218293.0   0.890   
1            28.0       0.00306         0.620     215613.0   0.755   
2            34.0       0.02540         0.774     166875.0   0.700   
3            32.0       0.00465         0.638     222369.0   0.587   
4            46.0       0.02890         0.572     214408.0   0.803   
...           ...           ...           ...          ...     ...   
40555        56.0       0.13300         0.849     237667.0   0.660   
40556        72.0       0.15700         0.709     251860.0   0.362   
40557        51.0       0.00597         0.693     189483.0   0.763   
40558        65.0       0.08310         0.782     262773.0   0.472   
40559        67.0       0.10200         0.862     267267.0   0.642   

       instrumentalness  key  liveness  loudness  mode  speechiness    tempo  \
0              0.950000    5     0.124    -7.043     1       0.0300  115.002   

In [7]:
# scale features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [8]:
# split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(scaled_features,targets,test_size=0.1,stratify=targets)

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

param_grid = {    
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [5, 10, 15, 20, 25],
    "min_samples_split": [2, 3, 4, 5],    
    "min_samples_leaf": [1, 2, 3, 4, 5],
    "max_features": ["auto", "log2"]
}
estimator = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator,param_grid=param_grid,cv=5)
grid_search.fit(X_train,y_train)
grid_search.best_params_



{'criterion': 'log_loss',
 'max_depth': 10,
 'max_features': 'log2',
 'min_samples_leaf': 3,
 'min_samples_split': 3}

In [None]:
{'criterion': 'log_loss',
 'max_depth': 10,
 'max_features': 'log2',
 'min_samples_leaf': 3,
 'min_samples_split': 3}
estimator = DecisionTreeClassifier()



In [11]:
# f1 score
from sklearn.metrics import f1_score
model = DecisionTreeClassifier(criterion='log_loss',
                               max_depth=10, 
                               max_features='log2',
                               min_samples_leaf=3,
                               min_samples_split=3)
model.fit(X_train,y_train)
predictions = model.predict(X_test)
print(f"F1 score: {f1_score(y_test, predictions, average = 'weighted')}")

F1 score: 0.46891236190469876


In [13]:
from sklearn.ensemble import BaggingClassifier

bagging = BaggingClassifier(model, random_state=0)
parameters = {'max_features': [i for i in np.arange(0.1, 1, 0.1)]}
grid_search = GridSearchCV(bagging,param_grid=parameters,cv=5,n_jobs=-1)
grid_search.fit(X_train,y_train)
grid_search.best_params_

{'max_features': 0.9}

In [15]:
bagging_model = BaggingClassifier(model, random_state=0, max_features=0.9)
bagging_model.fit(X_train,y_train)
predictions = bagging_model.predict(X_test)
print(f"F1 score: {f1_score(y_test, predictions, average = 'weighted')}")

F1 score: 0.5538836172566607
