In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
import pandas as pd
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, confusion_matrix, classification_report

In [3]:
from google.colab import files
uploaded = files.upload()

Saving spotify_cleaned.csv to spotify_cleaned.csv


In [4]:
# --- LOAD DATA ---
try:
    df = pd.read_csv('spotify_cleaned.csv')
    print("File loaded successfully.")

    # Bucket 'popularity'
    df['popularity_bucket'] = pd.cut(
        df['popularity'],
        bins=[-1, 20, 40, 60, 80, 100],
        labels=[0, 1, 2, 3, 4]
    ).astype(int)

    print("Bucketing completed. Here's a sample:")
    print(df[['popularity', 'popularity_bucket']].head())

except FileNotFoundError:
    print("Error: File 'spotify_cleaned.csv' not found.")
except pd.errors.EmptyDataError:
    print("Error: 'spotify_cleaned.csv' is empty.")
except pd.errors.ParserError:
    print("Error: Unable to parse 'spotify_cleaned.csv'. Check file format.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

File loaded successfully.
Bucketing completed. Here's a sample:
   popularity  popularity_bucket
0          22                  1
1          16                  0
2           0                  0
3          36                  1
4          26                  1


Use a tree model like XGBoost. Better for tabular data and known features.

In [11]:
# Re-bucket popularity into High / Moderate / Low
q90 = df['popularity'].quantile(0.90)
q40 = df['popularity'].quantile(0.40)

def bucket_popularity(p):
    if p >= q90:
        return 'High'
    elif p >= q40:
        return 'Moderate'
    else:
        return 'Low'

df['success_bucket'] = df['popularity'].apply(bucket_popularity)


In [13]:
print(y_train.value_counts(normalize=True)) # Check if classes are balanced

NameError: name 'y_train' is not defined

In [12]:
X = df.drop(['popularity', 'popularity_bucket', 'success_bucket'], axis=1)
y = df['success_bucket']

In [14]:
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

le = LabelEncoder()
for col in categorical_cols:
    X[col] = le.fit_transform(X[col])

In [15]:
from sklearn.preprocessing import StandardScaler

numeric_cols = ['loudness', 'tempo', 'danceability', 'energy', 'valence']

scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [6]:
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

In [23]:
n_runs = 5
models = {
    "Random Forest": RandomForestClassifier(n_estimators=400, random_state=69),
    "XGBoost": xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=5,
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=69),
    "CatBoost": CatBoostClassifier(
        iterations=400,
        depth=8,
        learning_rate=0.1,
        loss_function='MultiClass',
        verbose=False,
        random_state=69),
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        multi_class='ovr',
        class_weight='balanced')
}

all_results = {model_name : [] for model_name in models.keys()}

In [24]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler

# Create Cross-Validation object
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)

for run in range(n_runs):
    print("Starting run", run + 1)

    # Sample new 20,000 rows each time
    df_sample = df.sample(10000, random_state=run)

    if 'df_sample' in locals():
        try:
            # 1. Create X and y
            X = df_sample.drop(['popularity', 'popularity_bucket'], axis=1)
            y = df_sample['popularity_bucket']

            # 2. Label encode categorical columns
            categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
            le = LabelEncoder()
            for col in categorical_cols:
                X[col] = le.fit_transform(X[col])

            print(f"Categorical columns encoded: {categorical_cols}")

            # 3. Normalize numeric columns
            numeric_cols = ['loudness', 'tempo', 'danceability', 'energy', 'valence']
            scaler = StandardScaler()
            X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

            # 4. Train and cross-validate models
            for name, model in models.items():
                print("Cross-validating", name)
                scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
                mean_acc = scores.mean()
                all_results[name].append(mean_acc)
                print(f"Mean Accuracy across 5 folds: {mean_acc:.4f}")

        except KeyError as e:
            print(f"Error: Target column '{e}' not found in the DataFrame.")
        except Exception as e:
            print(f"An error occurred during model training or evaluation: {e}")


Starting run 1
Categorical columns encoded: ['artist_name', 'track_name', 'track_id', 'genre', 'decade', 'success_bucket']
Cross-validating Random Forest
Mean Accuracy across 5 folds: 0.9369
Cross-validating XGBoost


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Mean Accuracy across 5 folds: 0.9341
Cross-validating CatBoost
Mean Accuracy across 5 folds: 0.9371
Cross-validating Logistic Regression


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mean Accuracy across 5 folds: 0.8178
Starting run 2
Categorical columns encoded: ['artist_name', 'track_name', 'track_id', 'genre', 'decade', 'success_bucket']
Cross-validating Random Forest
Mean Accuracy across 5 folds: 0.9327
Cross-validating XGBoost


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Mean Accuracy across 5 folds: 0.9323
Cross-validating CatBoost
Mean Accuracy across 5 folds: 0.9340
Cross-validating Logistic Regression


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mean Accuracy across 5 folds: 0.8009
Starting run 3
Categorical columns encoded: ['artist_name', 'track_name', 'track_id', 'genre', 'decade', 'success_bucket']
Cross-validating Random Forest
Mean Accuracy across 5 folds: 0.9369
Cross-validating XGBoost


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Mean Accuracy across 5 folds: 0.9353
Cross-validating CatBoost
Mean Accuracy across 5 folds: 0.9375
Cross-validating Logistic Regression


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mean Accuracy across 5 folds: 0.7959
Starting run 4
Categorical columns encoded: ['artist_name', 'track_name', 'track_id', 'genre', 'decade', 'success_bucket']
Cross-validating Random Forest
Mean Accuracy across 5 folds: 0.9331
Cross-validating XGBoost


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Mean Accuracy across 5 folds: 0.9308
Cross-validating CatBoost
Mean Accuracy across 5 folds: 0.9334
Cross-validating Logistic Regression


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mean Accuracy across 5 folds: 0.7715
Starting run 5
Categorical columns encoded: ['artist_name', 'track_name', 'track_id', 'genre', 'decade', 'success_bucket']
Cross-validating Random Forest
Mean Accuracy across 5 folds: 0.9373
Cross-validating XGBoost


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Mean Accuracy across 5 folds: 0.9360
Cross-validating CatBoost
Mean Accuracy across 5 folds: 0.9383
Cross-validating Logistic Regression


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mean Accuracy across 5 folds: 0.7380


In [25]:
import numpy as np
average_results = {name: np.mean(accs) for name, accs in all_results.items()}
avg_df = pd.DataFrame(average_results.items(), columns=["Model", "Average Accuracy"])
avg_df = avg_df.sort_values("Average Accuracy", ascending=False)

print("Average Accuracy for Each Model:")
print(avg_df)

Average Accuracy for Each Model:
                 Model  Average Accuracy
2             CatBoost           0.93606
0        Random Forest           0.93538
1              XGBoost           0.93370
3  Logistic Regression           0.78482
