In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd

# Load the dataset
df = pd.read_csv('music_genre.csv')
print(df.head())

   instance_id           artist_name            track_name  popularity  \
0      32894.0              Röyksopp  Röyksopp's Night Out        27.0   
1      46652.0  Thievery Corporation      The Shining Path        31.0   
2      30097.0        Dillon Francis             Hurricane        28.0   
3      62177.0              Dubloadz                 Nitro        34.0   
4      24907.0           What So Not      Divide & Conquer        32.0   

   acousticness  danceability  duration_ms  energy  instrumentalness key  \
0       0.00468         0.652         -1.0   0.941           0.79200  A#   
1       0.01270         0.622     218293.0   0.890           0.95000   D   
2       0.00306         0.620     215613.0   0.755           0.01180  G#   
3       0.02540         0.774     166875.0   0.700           0.00253  C#   
4       0.00465         0.638     222369.0   0.587           0.90900  F#   

   liveness  loudness   mode  speechiness               tempo obtained_date  \
0     0.115    -5.2

By quickly taking a look at the tables above, we can immediatly observe some issues:

1)  duration_ms has a negative value -1
2)  key needs to be encoded not
3)  mode needs to be encoded as a binary
4)  tempo was read as a string, so needs to be converted to a numerical value
5)  obtained_data is irrelevant and can be dropped


In [3]:
# Drop rows with NaN values across the dataset
df.dropna(inplace=True)

# No need to handle 'duration_ms' and 'tempo' with apply and fillna now since NaNs are dropped

# Drop 'obtained_date' column because irrelevant
df.drop('obtained_date', axis=1, inplace=True)

# Check for NaN values after all cleaning
print("\nNaN values after cleaning:")
print(df.isnull().sum())

# Separate features and target
X = df.drop(['instance_id', 'artist_name', 'track_name', 'music_genre'], axis=1)
Y = df['music_genre']

# Encode categorical variables ('key', 'mode')
categorical_features = ['key', 'mode']
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Create a pipeline for numerical features
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),  # This will fill NaNs with the median value of the feature
    StandardScaler()
)

# Create a pipeline for categorical features
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),  # This fills NaNs with the most frequent value of the feature
    OneHotEncoder(handle_unknown='ignore')  # This handles any unknown categories encountered during transformation
)

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ]
)

# Update the pipeline
pipeline = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))

# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# After setting up the pipeline and before fitting the model, perform a final check for NaNs
print("\nFinal check for NaN values in X_train before fitting the model:")
print(X_train.isnull().sum())

# Train the model
pipeline.fit(X_train, Y_train)

# Predict and evaluate the model
predictions = pipeline.predict(X_test)
print('Accuracy:', accuracy_score(Y_test, predictions))
print(classification_report(Y_test, predictions))


NaN values after cleaning:
instance_id         0
artist_name         0
track_name          0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
valence             0
music_genre         0
dtype: int64

Final check for NaN values in X_train before fitting the model:
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
valence             0
dtype: int64
Accuracy: 0.5276
              precision    recall  f1-score   support

 Alternative       0.39      0.29      0.33      1008
       Anime       0.61      0.60      0.61      1034
       Blues       0.53      0.48      0.

This gives an accuracy of 53% which isn't that great, let's move on to some strategies to improve the accuracy.

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Update the pipeline to use a Random Forest classifier
pipeline = make_pipeline(preprocessor, RandomForestClassifier(random_state=42))

# Fit the model
pipeline.fit(X_train, Y_train)

# Predict and evaluate the model
predictions = pipeline.predict(X_test)
print('Accuracy:', accuracy_score(Y_test, predictions))
print(classification_report(Y_test, predictions))

Accuracy: 0.5321
              precision    recall  f1-score   support

 Alternative       0.40      0.33      0.36      1008
       Anime       0.76      0.74      0.75      1034
       Blues       0.59      0.54      0.56      1021
   Classical       0.81      0.85      0.83       955
     Country       0.54      0.58      0.56       986
  Electronic       0.64      0.61      0.63      1009
     Hip-Hop       0.32      0.35      0.33       995
        Jazz       0.52      0.48      0.50       985
         Rap       0.30      0.28      0.29      1030
        Rock       0.45      0.58      0.51       977

    accuracy                           0.53     10000
   macro avg       0.53      0.53      0.53     10000
weighted avg       0.53      0.53      0.53     10000



In [13]:
from sklearn.model_selection import GridSearchCV

# Define a set of parameters to test
param_grid = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_depth': [None, 10],
    'randomforestclassifier__min_samples_split': [2, 5],
    'randomforestclassifier__min_samples_leaf': [1, 2]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=2, n_jobs=4)

# Fit the model on the training data
grid_search.fit(X_train, Y_train)

# Print the best parameters and the best score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Predict and evaluate the model using the best found parameters
best_predictions = grid_search.predict(X_test)
print('Accuracy:', accuracy_score(Y_test, best_predictions))
print(classification_report(Y_test, best_predictions))


Fitting 5 folds for each of 16 candidates, totalling 80 fits


KeyboardInterrupt: 

Using these parameters:

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Update the pipeline to use a Random Forest classifier with the best parameters found
pipeline = make_pipeline(
    preprocessor,
    RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=1,
        random_state=42
    )
)

# Fit the model
pipeline.fit(X_train, Y_train)

# Predict and evaluate the model
predictions = pipeline.predict(X_test)
print('Accuracy:', accuracy_score(Y_test, predictions))
print(classification_report(Y_test, predictions))


Accuracy: 0.5563
              precision    recall  f1-score   support

 Alternative       0.52      0.26      0.35      1008
       Anime       0.76      0.70      0.73      1034
       Blues       0.60      0.49      0.54      1021
   Classical       0.80      0.86      0.83       955
     Country       0.52      0.57      0.54       986
  Electronic       0.59      0.63      0.61      1009
     Hip-Hop       0.42      0.57      0.48       995
        Jazz       0.52      0.46      0.49       985
         Rap       0.39      0.25      0.31      1030
        Rock       0.47      0.81      0.60       977

    accuracy                           0.56     10000
   macro avg       0.56      0.56      0.55     10000
weighted avg       0.56      0.56      0.55     10000



In [6]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Define the XGBoost classifier with some default parameters
xgb_classifier = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='mlogloss',  # For multiclass classification
    random_state=42
)

# Encode string class labels to integers
label_encoder = LabelEncoder()
Y_train_encoded = label_encoder.fit_transform(Y_train)
Y_test_encoded = label_encoder.transform(Y_test)

# Update your pipeline
pipeline = make_pipeline(preprocessor, xgb_classifier)

# Fit the model on the encoded training data
pipeline.fit(X_train, Y_train_encoded)

# Predict and evaluate the model on the encoded test data
predictions_encoded = pipeline.predict(X_test)
predictions = label_encoder.inverse_transform(predictions_encoded)  # Decode the predictions back to original labels

# Predict and evaluate the model
predictions = pipeline.predict(X_test)
print('Accuracy:', accuracy_score(Y_test_encoded, predictions_encoded))
print(classification_report(Y_test_encoded, predictions_encoded, target_names=label_encoder.classes_))



Accuracy: 0.5851
              precision    recall  f1-score   support

 Alternative       0.47      0.36      0.41      1008
       Anime       0.81      0.74      0.77      1034
       Blues       0.64      0.55      0.59      1021
   Classical       0.85      0.84      0.84       955
     Country       0.56      0.59      0.58       986
  Electronic       0.67      0.63      0.65      1009
     Hip-Hop       0.42      0.46      0.44       995
        Jazz       0.54      0.53      0.53       985
         Rap       0.43      0.41      0.42      1030
        Rock       0.51      0.75      0.61       977

    accuracy                           0.59     10000
   macro avg       0.59      0.59      0.58     10000
weighted avg       0.59      0.59      0.58     10000



In [7]:
from sklearn.model_selection import GridSearchCV

# Define a new set of parameters to test
param_grid = {
    'xgbclassifier__n_estimators': [100, 300, 500],
    'xgbclassifier__learning_rate': [0.05, 0.1, 0.2],
    'xgbclassifier__max_depth': [3, 5, 7],
    'xgbclassifier__subsample': [0.7, 0.8, 0.9],
    'xgbclassifier__colsample_bytree': [0.7, 0.8, 0.9],
    'xgbclassifier__reg_lambda': [1, 1.5, 2],
    'xgbclassifier__reg_alpha': [0, 0.5, 1]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the model on the encoded training data
grid_search.fit(X_train, Y_train_encoded)

# Best parameters
print("Best parameters:", grid_search.best_params_)

# Best score
print("Best cross-validation score:", grid_search.best_score_)

# Predict and evaluate the model using the best found parameters
best_predictions_encoded = grid_search.predict(X_test)
best_predictions = label_encoder.inverse_transform(best_predictions_encoded)

print('Accuracy:', accuracy_score(Y_test_encoded, best_predictions_encoded))
print(classification_report(Y_test_encoded, best_predictions_encoded, target_names=label_encoder.classes_))



Fitting 5 folds for each of 2187 candidates, totalling 10935 fits


KeyboardInterrupt: 