In [10]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [11]:
import pandas as pd
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, confusion_matrix, classification_report

In [12]:
from google.colab import files
uploaded = files.upload()

Saving spotify_cleaned.csv to spotify_cleaned.csv


In [13]:
# --- LOAD DATA ---
try:
    df = pd.read_csv('spotify_cleaned.csv')
    print("File loaded successfully.")

    # Bucket 'popularity'
    df['popularity_bucket'] = pd.cut(
        df['popularity'],
        bins=[-1, 20, 40, 60, 80, 100],
        labels=[0, 1, 2, 3, 4]
    ).astype(int)

    print("Bucketing completed. Here's a sample:")
    print(df[['popularity', 'popularity_bucket']].head())

except FileNotFoundError:
    print("Error: File 'spotify_cleaned.csv' not found.")
except pd.errors.EmptyDataError:
    print("Error: 'spotify_cleaned.csv' is empty.")
except pd.errors.ParserError:
    print("Error: Unable to parse 'spotify_cleaned.csv'. Check file format.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

File loaded successfully.
Bucketing completed. Here's a sample:
   popularity  popularity_bucket
0          22                  1
1          16                  0
2           0                  0
3          36                  1
4          26                  1


Use a tree model like XGBoost. Better for tabular data and known features.

In [18]:
print(y_train.value_counts(normalize=True)) # Check if classes are balanced

popularity_bucket
0    0.415938
2    0.322750
1    0.218750
3    0.041688
4    0.000875
Name: proportion, dtype: float64


In [15]:
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

In [22]:
# --- MODEL TRAINING ---
n_runs = 10
models = {
    "Random Forest": RandomForestClassifier(n_estimators=400, random_state=69),
    "XGBoost": xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=5,
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=69),
    "CatBoost": CatBoostClassifier(
        iterations=400,
        depth=8,
        learning_rate=0.1,
        loss_function='MultiClass',
        verbose=False,
        random_state=69),
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        multi_class='ovr',
        class_weight='balanced')
}
all_results = {model_name: [] for model_name in models.keys()}

for run in range(n_runs):
    print("Starting run", run + 1)
    df = df.sample(20000, random_state=run)

if 'df' in locals():
    try:
        X = df.drop(['popularity', 'popularity_bucket'], axis=1)
        y = df['popularity_bucket']

        # Identify and label encode categorical columns
        categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

        le = LabelEncoder()
        for col in categorical_cols:
            X[col] = le.fit_transform(X[col])

        print(f"Categorical columns encoded: {categorical_cols}")

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


        results = {}

        # Train and evaluate
        for name, model in models.items():
            print(f"\nTraining {name}...")
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            results[name] = acc
            all_results[name].append(acc)
            print("name:", name)

            print("Classification Report:")
            print(classification_report(y_test, y_pred))


    except KeyError as e:
        print(f"Error: Target column '{e}' not found in the DataFrame.")
    except Exception as e:
        print(f"An error occurred during model training or evaluation: {e}")

Starting run 1
Starting run 2
Starting run 3
Starting run 4
Starting run 5
Starting run 6
Starting run 7
Starting run 8
Starting run 9
Starting run 10
Categorical columns encoded: ['artist_name', 'track_name', 'track_id', 'genre', 'decade']

Training Random Forest...
name: Random Forest
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.93      0.83      1659
           1       0.69      0.31      0.43       905
           2       0.90      0.98      0.94      1272
           3       0.88      0.99      0.93       163
           4       0.00      0.00      0.00         1

    accuracy                           0.81      4000
   macro avg       0.65      0.64      0.63      4000
weighted avg       0.79      0.81      0.78      4000


Training XGBoost...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.



name: XGBoost
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.91      0.86      1659
           1       0.73      0.48      0.58       905
           2       0.90      0.98      0.94      1272
           3       0.89      0.97      0.93       163
           4       0.00      0.00      0.00         1

    accuracy                           0.84      4000
   macro avg       0.67      0.67      0.66      4000
weighted avg       0.83      0.84      0.82      4000


Training CatBoost...
name: CatBoost
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.90      0.84      1659
           1       0.69      0.41      0.51       905
           2       0.91      0.98      0.94      1272
           3       0.89      1.00      0.94       163
           4       0.00      0.00      0.00         1

    accuracy                           0.82      4000
   macro avg       0.65      0.66 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS R

name: Logistic Regression
Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.47      0.52      1659
           1       0.31      0.29      0.30       905
           2       0.51      0.27      0.35      1272
           3       0.09      0.28      0.14       163
           4       0.00      0.00      0.00         1

    accuracy                           0.35      4000
   macro avg       0.30      0.26      0.26      4000
weighted avg       0.48      0.35      0.40      4000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
import numpy as np
average_results = {name: np.mean(accs) for name, accs in all_results.items()}
avg_df = pd.DataFrame(average_results.items(), columns=["Model", "Average Accuracy"])
avg_df = avg_df.sort_values("Average Accuracy", ascending=False)

print("Average Accuracy for Each Model:")
print(avg_df)

Average Accuracy for Each Model:
                 Model  Average Accuracy
1              XGBoost           0.83600
2             CatBoost           0.81875
0        Random Forest           0.80675
3  Logistic Regression           0.35500
