In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, confusion_matrix, classification_report
from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer

In [4]:
# Read the labelled training data from disk into a DataFrame.
TRAIN_CSV = 'train (1).csv'
train_data = pd.read_csv(TRAIN_CSV)



In [5]:
# Impute missing values in the training data:
#   - "Popularity"       -> column mean
#   - "key"              -> most frequent value (mode)
#   - "instrumentalness" -> 0
# The filled frame is assigned back instead of calling
# `train_data[col].fillna(..., inplace=True)`: that form is chained in-place
# assignment, which raises a warning in pandas 2.x and silently leaves the
# frame unchanged once Copy-on-Write is enabled.
popularity_mean = train_data["Popularity"].mean()
most_frequent_key = train_data["key"].mode()[0]
train_data = train_data.fillna(
    {
        "Popularity": popularity_mean,
        "key": most_frequent_key,
        "instrumentalness": 0,
    }
)

In [None]:
# "Artist Name" and "Track Name" are later passed to CatBoost as categorical
# features, so make sure both columns hold plain strings.
cat_cols = ['Artist Name','Track Name']
train_data = train_data.astype({name: 'str' for name in cat_cols})

In [None]:
# Separate the target column from the feature columns.
# NOTE(review): "Id" stays in X and is therefore used as a feature by the
# CatBoost model fitted below - confirm this is intended.
target_col = 'Class'
y = train_data[target_col]
X = train_data.drop(columns=[target_col])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a multiclass CatBoost model; the columns listed in cat_cols are handled
# as categorical features, training output is silenced.
model = CatBoostClassifier(
    loss_function='MultiClass',
    verbose=False,
)
model.fit(X=X_train, y=y_train, cat_features=cat_cols)

<catboost.core.CatBoostClassifier at 0x7d417a062e00>

In [None]:
# Multiclass log loss of the CatBoost model on the training and held-out splits.
# `labels=model.classes_` pins the label set to the model's own class order, so
# the probability columns stay correctly aligned even if one of the splits
# happens to contain no sample of some class (log_loss otherwise infers the
# labels from y_true alone).
y_pred_train = model.predict_proba(X_train)
y_pred_test = model.predict_proba(X_test)
train_log_loss = log_loss(y_train, y_pred_train, labels=model.classes_)
test_log_loss = log_loss(y_test, y_pred_test, labels=model.classes_)
print(f"Train log loss: {train_log_loss}")
print(f"Test log loss: {test_log_loss}")

Train log loss: 1.4056001653830368
Test log loss: 0.8601866602119906


In [None]:
# Per-class precision / recall / F1 of the CatBoost model on the held-out split.
y_pred_test_labels = model.predict(X_test)
test_classification_report = classification_report(
    y_true=y_test,
    y_pred=y_pred_test_labels,
)
print("Test Classification Report:", test_classification_report, sep="\n")

Test Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.87       104
           1       0.44      0.21      0.28       204
           2       0.72      0.62      0.67       235
           3       0.90      0.93      0.91        68
           4       0.85      0.81      0.83        68
           5       0.79      0.81      0.79       236
           6       0.54      0.51      0.53       403
           7       0.98      0.94      0.96        87
           8       0.74      0.74      0.74       292
           9       0.70      0.67      0.68       406
          10       0.62      0.76      0.68       777

    accuracy                           0.68      2880
   macro avg       0.74      0.72      0.72      2880
weighted avg       0.67      0.68      0.67      2880



In [None]:
#using random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Columns removed from the feature set before fitting the random forest.
rf_excluded_cols = ["Id", "Track Name", "Artist Name"]

tree = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight=None,  # Adjust based on class imbalance
    bootstrap=True,  # Use bootstrap sampling
    oob_score=True,  # Enable out-of-bag (OOB) scoring
    random_state=42,  # Fixed seed so the forest (and its metrics) are reproducible
)

# Rebind to new frames instead of dropping in place: the in-place version
# mutated the objects returned by train_test_split and raised a KeyError when
# the cell was executed a second time. errors="ignore" makes the cell safe to
# re-run once the columns are already gone.
# Note: from here on X_train / X_test no longer contain the columns the
# CatBoost model was fitted on.
X_train = X_train.drop(columns=rf_excluded_cols, errors="ignore")
X_test = X_test.drop(columns=rf_excluded_cols, errors="ignore")

tree.fit(X_train, y_train)



In [None]:
# Quick inspection of the training labels produced by the train/test split.
print(y_train)

6913     10
13624     9
13587     7
8140      2
11691     6
         ..
5191      5
13418     1
5390     10
860       5
7270      6
Name: Class, Length: 11516, dtype: int64


In [None]:
# Multiclass log loss of the random forest on the training and held-out splits.
# `labels=tree.classes_` pins the label set to the forest's own class order, so
# the probability columns stay correctly aligned even if one of the splits
# happens to contain no sample of some class.
y_pred_train = tree.predict_proba(X_train)
y_pred_test = tree.predict_proba(X_test)
train_log_loss = log_loss(y_train, y_pred_train, labels=tree.classes_)
test_log_loss = log_loss(y_test, y_pred_test, labels=tree.classes_)
print(f"Train log loss: {train_log_loss}")
print(f"Test log loss: {test_log_loss}")


Train log loss: 0.3269464971273684
Test log loss: 1.5836106416725007


In [None]:


# Per-class precision / recall / F1 of the random forest on the held-out split.
y_pred_test_labels = tree.predict(X_test)
test_classification_report = classification_report(
    y_true=y_test,
    y_pred=y_pred_test_labels,
)
print("Test Classification Report:", test_classification_report, sep="\n")


Test Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.82      0.73       104
           1       0.03      0.01      0.02       204
           2       0.60      0.28      0.38       235
           3       0.88      0.72      0.79        68
           4       0.69      0.68      0.68        68
           5       0.74      0.67      0.70       236
           6       0.35      0.33      0.34       403
           7       0.95      0.94      0.95        87
           8       0.64      0.54      0.58       292
           9       0.53      0.55      0.54       406
          10       0.45      0.61      0.52       777

    accuracy                           0.51      2880
   macro avg       0.59      0.56      0.57      2880
weighted avg       0.51      0.51      0.50      2880



In [None]:

# Read the data set that will be scored for the submission file.
TEST_CSV = 'test (2).csv'
test_data = pd.read_csv(TEST_CSV)



In [None]:
# Apply the same preprocessing to the test data that the training data received.
#
# Missing values are imputed with statistics taken from train_data rather than
# from the test set itself, so the model sees features prepared exactly like
# the ones it was fitted on. (train_data was already imputed with its own mean
# and mode, which leaves both statistics unchanged.)
# The result is assigned back instead of using `fillna(..., inplace=True)` on a
# selected column, which is chained assignment and does not update the frame
# under pandas Copy-on-Write.
test_data = test_data.fillna(
    {
        "Popularity": train_data["Popularity"].mean(),
        "key": train_data["key"].mode()[0],
        "instrumentalness": 0,
    }
)

# The categorical columns were cast to str before fitting; mirror that here so
# missing names become the same string value the model saw during training.
test_data = test_data.astype({name: "str" for name in cat_cols})


In [None]:
# Score the test data with the CatBoost model and write the submission file.

# Leave out a "Class" column added by a previous run of this cell, so the cell
# can be re-executed without feeding its own predictions back in as a feature.
test_features = test_data.drop(columns=["Class"], errors="ignore")

# Class probabilities for every test row.
y_pred = model.predict_proba(test_features)

# predict() may return the labels as a column vector for a MultiClass model;
# flatten it so a 1-D array is stored in the new column.
test_data["Class"] = np.ravel(model.predict(test_features))

# Select only the 'Id' and 'Class' columns
selected_columns = ['Id','Class']
submission = test_data[selected_columns]

# Save the predictions to a CSV file
submission.to_csv("best model.csv", index=False)



In [None]:
# Distribution of the predicted classes over the test data.
predicted_class_counts = test_data['Class'].value_counts()
print(predicted_class_counts)

10    1237
6      555
9      404
8      309
5      302
2      233
1      156
0      145
7      111
3       76
4       72
Name: Class, dtype: int64


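* Summary:
The highest accuracy was achieved using the CatBoost classifier: on the held-out split it reached 0.68 accuracy (test log loss ≈ 0.86), compared with 0.51 accuracy (test log loss ≈ 1.58) for the random forest. The CatBoost model was therefore used to generate the submission file.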