In [None]:
!pip install catboost

In [16]:
import pandas as pd
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, confusion_matrix, classification_report

In [13]:
# --- LOAD DATA ---
try:
    df = pd.read_csv('spotify_cleaned.csv')
    print("File loaded successfully.")

    # Bucket 'popularity'
    df['popularity_bucket'] = pd.cut(
        df['popularity'],
        bins=[-1, 20, 40, 60, 80, 100],
        labels=[0, 1, 2, 3, 4]
    ).astype(int)

    print("Bucketing completed. Here's a sample:")
    print(df[['popularity', 'popularity_bucket']].head())

except FileNotFoundError:
    print("Error: File 'spotify_cleaned.csv' not found.")
except pd.errors.EmptyDataError:
    print("Error: 'spotify_cleaned.csv' is empty.")
except pd.errors.ParserError:
    print("Error: Unable to parse 'spotify_cleaned.csv'. Check file format.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
File loaded successfully.
Bucketing completed. Here's a sample:
   popularity  popularity_bucket
0          22                  1
1          16                  0
2           0                  0
3          36                  1
4          26                  1


In [14]:
print(y_train.value_counts(normalize=True)) # Check if classes are balanced

popularity_bucket
0    0.416100
2    0.321678
1    0.221284
3    0.039897
4    0.001041
Name: proportion, dtype: float64


Use a tree model like XGBoost. Better for tabular data and known features.

In [22]:
# --- MODEL TRAINING ---
if 'df' in locals():
    try:
        X = df.drop(['popularity', 'popularity_bucket'], axis=1)
        y = df['popularity_bucket']

        # Identify and label encode categorical columns
        categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

        le = LabelEncoder()
        for col in categorical_cols:
            X[col] = le.fit_transform(X[col])

        print(f"Categorical columns encoded: {categorical_cols}")

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Models to try
        models = {
            "XGBoost": xgb.XGBClassifier(
                objective='multi:softmax',
                num_class=5,
                eval_metric='mlogloss',
                use_label_encoder=False,
                random_state=42
            )
        }

        results = {}

        # Train and evaluate
        for name, model in models.items():
            print(f"\nTraining {name}...")
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            results[name] = acc

            print("Classification Report:")
            print(classification_report(y_test, y_pred))


    except KeyError as e:
        print(f"Error: Target column '{e}' not found in the DataFrame.")
    except Exception as e:
        print(f"An error occurred during model training or evaluation: {e}")

Categorical columns encoded: ['artist_name', 'track_name', 'track_id', 'genre', 'decade']

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.92      0.87     27998
           1       0.78      0.52      0.62     15117
           2       0.91      0.98      0.94     21813
           3       0.85      0.99      0.92      2719
           4       1.00      0.25      0.40        84

    accuracy                           0.85     67731
   macro avg       0.88      0.73      0.75     67731
weighted avg       0.85      0.85      0.84     67731

