In [7]:
!pip install rich



In [8]:
!pip install xgboost



In [9]:
import numpy as np
import pandas as pd
from pandas.core.dtypes.api import is_numeric_dtype, is_string_dtype, is_categorical_dtype
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots

from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz, pickle
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

In [10]:
from rich.console import Console
console = Console()

## **Loading Data**

In [11]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("purumalgi/music-genre-classification")

console.print("Path to dataset files:", path)

In [12]:
df_data_music = pd.read_csv(f"/{path}/train.csv",header=0,sep=',')

console.print(df_data_music.shape)
df_data_music.head()

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
0,Bruno Mars,That's What I Like (feat. Gucci Mane),60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.0171,,0.0849,0.899,134.071,234596.0,4,5
1,Boston,Hitch a Ride,54.0,0.382,0.814,3.0,-7.23,1,0.0406,0.0011,0.00401,0.101,0.569,116.454,251733.0,4,10
2,The Raincoats,No Side to Fall In,35.0,0.434,0.614,6.0,-8.334,1,0.0525,0.486,0.000196,0.394,0.787,147.681,109667.0,4,6
3,Deno,Lingo (feat. J.I & Chunkz),66.0,0.853,0.597,10.0,-6.528,0,0.0555,0.0212,,0.122,0.569,107.033,173968.0,4,5
4,Red Hot Chili Peppers,Nobody Weird Like Me - Remastered,53.0,0.167,0.975,2.0,-4.279,1,0.216,0.000169,0.0161,0.172,0.0918,199.06,229960.0,4,10


## **Creating Pipeline for Cleaning**

In [13]:
X = df_data_music.drop(['Class'],axis=1)
Y = df_data_music['Class'].values

In [14]:
from sklearn.compose import ColumnTransformer

drop_columns_processor = ColumnTransformer(
    transformers=[
        ('drop_cols', 'drop', ["Artist Name", "Track Name", "energy", "key", "tempo"]),
    ],
    remainder='passthrough',
    force_int_remainder_cols=False,
    verbose_feature_names_out=False
)

In [15]:
def convert_duration_to_seconds(df):
    df_copy = df.copy()
    df_copy["duration_in_min_ms"] = df_copy["duration_in_min_ms"] / 1000
    df_copy = df_copy.rename(columns={"duration_in_min_ms": "duration_in_min_sg"})
    return df_copy

In [16]:
def clean_column_names(df):
    df_copy = df.copy()
    df_copy.columns = df_copy.columns.str.lower().str.replace(' ', '_').str.replace('/', '_')
    return df_copy

In [17]:
def remove_duplicates(df):
    df_copy = df.copy()
    df_copy.drop_duplicates(inplace=True)
    return df_copy

In [36]:
def impute_missing_values(df):
  df_copy = df.copy()

  df_copy['popularity'] = df_copy['popularity'].fillna(df_copy['popularity'].median())
  df_copy['instrumentalness'] = df_copy['instrumentalness'].fillna(df_copy['instrumentalness'].median())

  return df_copy

In [19]:
def remove_outliers_iqr(df): # Removed 'column' argument
  df_copy = df.copy()

  for col in df_copy.columns: # Iterate through all columns
    if is_numeric_dtype(df_copy[col]): # Apply only to numeric columns
      Q1 = df_copy[col].quantile(0.25)
      Q3 = df_copy[col].quantile(0.75)
      IQR = Q3 - Q1
      lower_bound = Q1 - 1.5 * IQR
      upper_bound = Q3 + 1.5 * IQR

      df_copy[col] = df_copy[col].clip(lower=lower_bound, upper=upper_bound)

  return df_copy # Return the modified DataFrame

In [20]:
def transform_log_columns(df):
  df_copy = df.copy()
  log_columns = ['speechiness', 'acousticness', 'liveness', 'instrumentalness', 'duration_in_min_sg']
  for col in log_columns:
    df_copy[f'{col}_log'] = np.log1p(df_copy[col])

  df_copy.drop(log_columns, axis=1, inplace=True)
  return df_copy

In [21]:
def remove_variance_columns(df):
  df_copy = df.copy()
  cols_to_drop = df_copy.columns[df_copy.nunique() == 1].tolist()
  df_copy = df_copy.drop(columns=cols_to_drop) # Drop the columns
  return df_copy # Return the modified DataFrame

In [22]:
def remove_columns_high_collinearity(df):
  df_copy = df.copy()
  numeric_cols = df_copy.select_dtypes(include=np.number).columns
  corr_matrix = df_copy[numeric_cols].corr()
  upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
  to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.60)]
  df_copy = df_copy.drop(columns=to_drop)
  return df_copy

In [37]:
from sklearn.preprocessing import FunctionTransformer

pipeline_clean_data = Pipeline(steps=[
    ('drop_columns', drop_columns_processor.set_output(transform='pandas')),
    ('clean_column_names', FunctionTransformer(clean_column_names).set_output(transform='pandas')),
    ('convert_duration_to_seconds', FunctionTransformer(convert_duration_to_seconds).set_output(transform='pandas')),
    ('remove_duplicates', FunctionTransformer(remove_duplicates).set_output(transform='pandas')),
    ('impute_missing_values', FunctionTransformer(impute_missing_values).set_output(transform='pandas')),
    ('remove_outliers_iqr', FunctionTransformer(remove_outliers_iqr).set_output(transform='pandas')),
    ('remove_columns_high_collinearity', FunctionTransformer(remove_columns_high_collinearity).set_output(transform='pandas')),
    ('transform_log_columns', FunctionTransformer(transform_log_columns).set_output(transform='pandas')),
    ('remove_variance_columns', FunctionTransformer(remove_variance_columns).set_output(transform='pandas')),
])

pipeline_clean_data

## **Split Data**

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2, stratify = Y, random_state=90)

## **Model Creation**

In [39]:
import xgboost as xgb
from scipy.stats import uniform, randint

param_grid_xgb = {
    'classifier__n_estimators': randint(50, 300),
    'classifier__learning_rate': uniform(0.01, 0.3),
    'classifier__max_depth': randint(3, 10),
    'classifier__min_child_weight': randint(1, 10),
    'classifier__gamma': uniform(0, 0.5),
    'classifier__subsample': uniform(0.6, 0.4),
    'classifier__colsample_bytree': uniform(0.6, 0.4),
    'classifier__reg_alpha': uniform(0, 0.5),
    'classifier__reg_lambda': uniform(1, 0.5)
}

In [40]:
num_classes = len(np.unique(Y))

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',
                                   use_label_encoder=False,
                                   eval_metric='mlogloss',
                                   num_class=num_classes,
                                   random_state=42)

In [41]:
preprocessor_standard_scaler = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ["popularity", "danceability", "loudness", "mode", "valence", "speechiness_log", "acousticness_log", "liveness_log", "instrumentalness_log", "duration_in_min_sg_log"])
    ])

In [49]:
pipeline_xgb = ImbPipeline(steps=[
    ('drop_columns', drop_columns_processor.set_output(transform='pandas')),
    ('clean_column_names', FunctionTransformer(clean_column_names).set_output(transform='pandas')),
    ('convert_duration_to_seconds', FunctionTransformer(convert_duration_to_seconds).set_output(transform='pandas')),
    ('impute_missing_values', FunctionTransformer(impute_missing_values).set_output(transform='pandas')),
    ('remove_outliers_iqr', FunctionTransformer(remove_outliers_iqr).set_output(transform='pandas')),
    ('transform_log_columns', FunctionTransformer(transform_log_columns).set_output(transform='pandas')),
    ('remove_variance_columns', FunctionTransformer(remove_variance_columns).set_output(transform='pandas')),
    ('balanced_data', SMOTE(random_state=90)),
    ('scaler', StandardScaler()),
    ('classifier', xgb_classifier)
])

In [50]:
pipeline_xgb

In [61]:
random_search_xgb = RandomizedSearchCV(
    pipeline_xgb,
    param_distributions=param_grid_xgb,
    n_iter=1,
    cv=2,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

In [62]:
import pandas as pd

# Convert y_train to a pandas Series with the same index as X_train
y_train_series = pd.Series(y_train, index=X_train.index, name='class')

# Combine X_train and y_train_series into a single DataFrame for consistent duplicate removal
train_df = pd.concat([X_train, y_train_series], axis=1)

# Apply duplicate removal to the combined DataFrame
train_df_processed = remove_duplicates(train_df)

# Split back into X_train_processed and y_train_processed and reset index
X_train_processed = train_df_processed.drop(columns=['class']).reset_index(drop=True)
y_train_processed = train_df_processed['class'].reset_index(drop=True)

In [63]:
random_search_xgb.fit(X_train_processed, y_train_processed)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


Parameters: { "use_label_encoder" } are not used.



In [46]:
console.print("Best parameters found: ", random_search_xgb.best_params_)
console.print("Best accuracy found: ", random_search_xgb.best_score_)

xgb_classifier_best_model = random_search_xgb.best_estimator_

In [47]:
console.print(f'Accuracy xgb: {xgb_classifier_best_model.score(X_test, y_test) * 100} %')

In [135]:
scores_xgb_cv = cross_val_score(estimator=pipeline_xgb,
                              X=X,
                              y=Y,
                              cv=4,
                              scoring='accuracy',
                              n_jobs=-1)

console.print(f'Accuracy CV XGBoost (on full dataset with best model): {scores_xgb_cv}')
console.print(f'Accuracy CV XGBoost (on full dataset with best model): {np.mean(scores_xgb_cv):.3f}' f' +/- {np.std(scores_xgb_cv):.3f}')

In [138]:
import cloudpickle

with open('xgb_classifier_best_model_pipeline.pkl', 'wb') as f:
    cloudpickle.dump(xgb_classifier_best_model, f)

console.print("Pipeline exported")

# Documentation and Process Summary

This notebook demonstrates a process for music genre classification using an XGBoost model with a data cleaning and preprocessing pipeline.

## 1. Setup and Data Loading

- Necessary libraries such as `pandas`, `numpy`, `sklearn`, `imblearn`, and `xgboost` are imported.
- The dataset is downloaded from Kaggle using `kagglehub` and loaded into a pandas DataFrame (`df_data_music`).

## 2. Data Preparation and Cleaning Pipeline

- The target variable (`Class`) is separated from the features (`X`).
- A series of custom functions and scikit-learn transformers are defined for data cleaning and preprocessing:
    - `clean_column_names`: Converts column names to lowercase and replaces spaces/slashes with underscores.
    - `convert_duration_to_seconds`: Converts duration from milliseconds to seconds and renames the column.
    - `remove_duplicates`: Removes duplicate rows from the DataFrame.
    - `impute_missing_values`: Fills missing values in 'popularity' with the mean and 'instrumentalness' with the median.
    - `remove_outliers_iqr`: Clips numerical outliers based on the Interquartile Range (IQR).
    - `remove_columns_high_collinearity`: Removes columns with high correlation.
    - `transform_log_columns`: Applies log transformation to specified columns.
    - `remove_variance_columns`: Removes columns with zero variance.
- These steps are assembled into an `ImbPipeline` (`pipeline_xgb`) which includes preprocessing, SMOTE for handling class imbalance, a `StandardScaler` for feature scaling, and the `XGBClassifier`.

## 3. Data Splitting

- The data is split into training (`X_train`, `y_train`) and testing (`X_test`, `y_test`) sets using `train_test_split`, with stratification to maintain class distribution.

## 4. Model Setup and Tuning

- An XGBoost classifier (`xgb_classifier`) is initialized with parameters suitable for multi-class classification.
- A parameter grid (`param_grid_xgb`) is defined for hyperparameter tuning using `RandomizedSearchCV`.
- `RandomizedSearchCV` is set up with the `pipeline_xgb` and the parameter grid to find the best hyperparameters based on accuracy using cross-validation.

## 5. Model Training and Evaluation

- The `RandomizedSearchCV.fit()` method is called with the training data (`X_train_processed`, `y_train_processed`) to perform the hyperparameter tuning and model training with cross-validation.

## 6. Exporting Models

- Code is included to export the best trained XGBoost model with the pipeline and the fitted StandardScaler using `joblib`.