In [None]:
!pip install catboost



In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report

In [None]:
# Read the train and test CSV files (paths are relative to the working directory)
train_set, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train (1).csv", "test (2).csv")
)

In [None]:
# Remove the identifier and free-text columns from both frames;
# they are not used as model features
train_set = train_set.drop(columns=["Id", "Track Name", "Artist Name"])
test_data = test_data.drop(columns=["Id", "Track Name", "Artist Name"])

In [None]:
# Impute missing values in the training frame:
#   Popularity       -> column mean
#   key              -> most frequent value
#   instrumentalness -> 0
# The filled column is assigned back explicitly. Calling
# train_set[col].fillna(..., inplace=True) operates on an intermediate object:
# it raises a FutureWarning on recent pandas 2.x and no longer updates the
# frame once copy-on-write is enabled.
train_set["Popularity"] = train_set["Popularity"].fillna(train_set["Popularity"].mean())
train_set["key"] = train_set["key"].fillna(train_set["key"].mode()[0])
train_set["instrumentalness"] = train_set["instrumentalness"].fillna(0)

In [None]:
# Impute missing values in the test frame with statistics taken from the
# TRAINING data, so the test set is preprocessed with values learned from
# training data only (mean()/mode() skip NaN, so this works whether or not
# train_set has already been imputed).
# Columns are assigned back explicitly instead of using
# test_data[col].fillna(..., inplace=True), which is chained in-place assignment
# and stops updating the frame under pandas copy-on-write.
test_data["Popularity"] = test_data["Popularity"].fillna(train_set["Popularity"].mean())
test_data["key"] = test_data["key"].fillna(train_set["key"].mode()[0])
test_data["instrumentalness"] = test_data["instrumentalness"].fillna(0)

In [None]:
# Sanity check: number of missing values remaining in each column
train_set.isnull().sum()

Popularity            0
danceability          0
energy                0
key                   0
loudness              0
mode                  0
speechiness           0
acousticness          0
instrumentalness      0
liveness              0
valence               0
tempo                 0
duration_in min/ms    0
time_signature        0
Class                 0
dtype: int64

In [None]:

# The "Class" column is the target; every remaining column is a feature
y = train_set["Class"]
X = train_set.drop(columns="Class")


In [None]:

# Split the data into training and testing sets (70% / 30%).
# stratify=y keeps the class proportions the same in both parts: the classes
# are imbalanced and the models are scored with macro-averaged F1, so each
# class should be represented proportionally in the held-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

In [None]:

# Feature engineering: ratio of loudness to energy.
# .assign() returns new frames instead of writing a column into the objects
# returned by train_test_split, which can trigger pandas' SettingWithCopyWarning.
# NOTE(review): a row with energy == 0 would yield +/-inf here, which the
# scaling step is expected to reject -- confirm the data never contains it.
X_train = X_train.assign(loudness_energy_ratio=X_train['loudness'] / X_train['energy'])
X_test = X_test.assign(loudness_energy_ratio=X_test['loudness'] / X_test['energy'])


In [None]:

# Standardise the features; the scaler statistics are learned from the
# training split only and then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Initialize classifiers with hyperparameter tuning
def tune_with_grid_search(estimator, param_grid, X_fit, y_fit):
    """Run an exhaustive grid search for `estimator` and return the fitted search.

    Parameters
    ----------
    estimator : scikit-learn compatible classifier
        Unfitted model whose hyperparameters are searched.
    param_grid : dict
        Maps parameter names to the list of values to try.
    X_fit, y_fit : array-like
        Training features and labels used for the cross-validated search.

    Returns
    -------
    GridSearchCV
        The fitted search object (3-fold CV, scored with macro-averaged F1);
        the refitted winner is available as `.best_estimator_`.
    """
    search = GridSearchCV(estimator, param_grid, cv=3, scoring='f1_macro')
    search.fit(X_fit, y_fit)
    return search


# Random forest (class_weight='balanced' re-weights classes by inverse frequency)
param_grid_rf = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 10, 20, 30]
}
grid_rf = tune_with_grid_search(
    RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    param_grid_rf,
    X_train_scaled,
    y_train,
)
rf_clf = grid_rf.best_estimator_

# LightGBM: -1 is its documented "no depth limit" value (None is not a
# documented max_depth value for LightGBM, unlike for the sklearn forests)
param_grid_lgbm = {
    'n_estimators': [100, 150, 200],
    'max_depth': [-1, 10, 20, 30]
}
grid_lgbm = tune_with_grid_search(
    LGBMClassifier(random_state=42), param_grid_lgbm, X_train_scaled, y_train
)
lgbm_clf = grid_lgbm.best_estimator_

# CatBoost is used with its default hyperparameters (no grid search)
cat_clf = CatBoostClassifier(random_state=42, verbose=0)

# Extra trees
param_grid_extra_trees = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 10, 20, 30]
}
grid_extra_trees = tune_with_grid_search(
    ExtraTreesClassifier(random_state=42), param_grid_extra_trees, X_train_scaled, y_train
)
extra_trees_clf = grid_extra_trees.best_estimator_


In [None]:

# Two-level stacked ensemble: the RandomForest, LGBM, CatBoost and ExtraTrees
# models are the base learners, and their predicted class probabilities are the
# inputs of a RandomForest meta-classifier
stacking_clf = StackingClassifier(
    estimators=list(
        {
            'rf': rf_clf,
            'lgbm': lgbm_clf,
            'cat': cat_clf,
            'extra_trees': extra_trees_clf,
        }.items()
    ),
    stack_method='predict_proba',  # meta-features are class probabilities
    final_estimator=RandomForestClassifier(random_state=42),
)

# Fit the base learners and the meta-classifier on the scaled training split
stacking_clf.fit(X_train_scaled, y_train)

In [None]:
# Predict class labels for the held-out split (the scaled X_test produced by
# train_test_split, not the separately loaded test_data frame)
y_pred = stacking_clf.predict(X_test_scaled)



In [None]:
# Evaluate on the held-out split: macro-averaged F1 and the per-class report
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
class_report = classification_report(y_true=y_test, y_pred=y_pred)

print("F1 score:", f1)
print("Classification Report:\n", class_report)

F1 score: 0.6266756138209956
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.81      0.75       160
           1       0.48      0.25      0.33       315
           2       0.58      0.43      0.49       327
           3       0.86      0.72      0.78       100
           4       0.71      0.62      0.66       105
           5       0.75      0.73      0.74       361
           6       0.47      0.43      0.45       610
           7       0.92      0.94      0.93       125
           8       0.65      0.59      0.62       435
           9       0.56      0.55      0.56       595
          10       0.52      0.67      0.58      1186

    accuracy                           0.58      4319
   macro avg       0.65      0.61      0.63      4319
weighted avg       0.58      0.58      0.57      4319



* **Summary:** The highest accuracy I achieved was with the ensemble stacking method (see the classification report above).