<a href="https://colab.research.google.com/github/junurisreeja/Python_project/blob/main/python_project_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
"""
train_all_classifiers.py

Implements multiple supervised classifiers and outputs classification reports
for the "Artist Collaboration and Popularity Boost" problem using the
'spotify 2023.csv.bz2' dataset.

Outputs:
 - results/classification_metrics_summary.csv
 - results/classification_reports_taskA.txt
 - results/classification_reports_taskB.txt
 - results/*.png (plots)
 - results/rf_feature_importance_*.csv
 - results/lr_coefficients_*.csv
"""

import os
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

# stats for EDA
from scipy.stats import spearmanr, f_oneway, chi2_contingency

# -----------------------
# Config
# -----------------------
DATA_PATH = "/content/spotify-2023.csv"   # update if file is elsewhere
OUT_DIR = Path("results")
OUT_DIR.mkdir(parents=True, exist_ok=True)
RANDOM_STATE = 42
TEST_SIZE = 0.25

# -----------------------
# 1) Load dataset robustly
# -----------------------
def load_df(path):
    try:
        df = pd.read_csv(path, encoding="latin1")
    except Exception:
        df = pd.read_csv(path, encoding="latin1", engine="python", on_bad_lines="skip")
    return df

df = load_df(DATA_PATH)
print("Loaded data shape:", df.shape)

# normalize column names
df.columns = [c.strip() for c in df.columns]

# -----------------------
# 2) Clean & coerce types
# -----------------------
# numeric columns we expect; some may be missing depending on CSV
numeric_cols = [
    "artist_count","released_year","released_month","released_day",
    "in_spotify_playlists","in_spotify_charts",
    "in_apple_playlists","in_apple_charts",
    "in_deezer_playlists","in_deezer_charts","in_shazam_charts",
    "bpm","danceability_%","valence_%","energy_%",
    "acousticness_%","instrumentalness_%","liveness_%","speechiness_%","streams"
]

# sanitize streams (remove commas/non-digits) and coerce numeric
if "streams" in df.columns:
    df["streams"] = df["streams"].astype(str).str.replace(",", "", regex=False).str.replace(r"[^\d.]", "", regex=True).replace("", np.nan)

for c in numeric_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# basic cleaning for categorical cols
if "mode" in df.columns:
    df["mode"] = df["mode"].astype(str).str.strip()
if "key" in df.columns:
    df["key"] = df["key"].astype(str).str.strip()

# drop rows without artist_count (can't study collaboration)
if "artist_count" in df.columns:
    df = df[~df["artist_count"].isna()].copy()

print("After cleaning shape:", df.shape)

# -----------------------
# 3) Targets (two tasks)
# -----------------------
# Task A: chart presence on ANY platform
df["chart_any"] = 0
for c in ["in_spotify_charts","in_apple_charts","in_deezer_charts","in_shazam_charts"]:
    if c in df.columns:
        df["chart_any"] = df["chart_any"] | (df[c].fillna(0) > 0)
df["chart_any"] = df["chart_any"].astype(int)

# Task B: top quartile of total playlist inclusions (spotify+apple+deezer)
df["total_playlists"] = 0
for c in ["in_spotify_playlists","in_apple_playlists","in_deezer_playlists"]:
    if c in df.columns:
        df["total_playlists"] = df["total_playlists"] + df[c].fillna(0)
df["playlist_top_quartile"] = (df["total_playlists"] >= df["total_playlists"].quantile(0.75)).astype(int)

print("Target distributions:")
print(" chart_any:", df["chart_any"].value_counts(normalize=True).to_dict())
print(" playlist_top_quartile:", df["playlist_top_quartile"].value_counts(normalize=True).to_dict())

# -----------------------
# 4) Exploratory checks (optional)
# -----------------------
# Spearman correlation (artist_count vs streams) if streams present
if "streams" in df.columns:
    df_tmp = df.dropna(subset=["artist_count","streams"])
    if not df_tmp.empty:
        r, p = spearmanr(df_tmp["artist_count"], df_tmp["streams"])
        print(f"Spearman(artist_count, streams): r={r:.4f}, p={p:.4g}")

# -----------------------
# 5) Modeling pipeline
# -----------------------
# Feature candidates (artist_count is included - the independent variable of interest)
feature_candidates = [
    "artist_count", "released_year", "released_month", "released_day",
    "bpm", "danceability_%", "valence_%", "energy_%",
    "acousticness_%", "instrumentalness_%", "liveness_%", "speechiness_%"
]
features = [c for c in feature_candidates if c in df.columns]
categorical = [c for c in ["key","mode"] if c in df.columns]

print("Features (numeric):", features)
print("Categorical:", categorical)

# create X (features) once; will be sliced per train/test
X_all = df[features + categorical].copy()

# preprocessing
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
preprocessor = ColumnTransformer([
    ("num", num_pipeline, features),
    ("cat", cat_pipeline, categorical)
])

# models to run
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE),
    "LinearSVC": LinearSVC(class_weight="balanced", max_iter=10000, random_state=RANDOM_STATE),
    "SVC_rbf": SVC(kernel="rbf", class_weight="balanced", random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=RANDOM_STATE),
    "RandomForest": RandomForestClassifier(n_estimators=300, class_weight="balanced_subsample", random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "GaussianNB": GaussianNB()
}

tasks = {
    "TaskA_chart_any": "chart_any",
    "TaskB_playlist_top_quartile": "playlist_top_quartile"
}

# storage for metrics and reports
metrics_rows = []
reports = {task: [] for task in tasks.keys()}

# we will also save RF importances and LR coefficients per task
for task_label, target_col in tasks.items():
    print(f"\n=== Running {task_label} ===")
    y = df[target_col].copy()
    if y.nunique() < 2:
        print(f"Skipping {task_label} (only one class present).")
        continue

    # split
    X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)

    for model_name, model in models.items():
        # For GaussianNB we will ensure dense arrays via FunctionTransformer (inside pipeline)
        if model_name == "GaussianNB":
            pipe = Pipeline([("pre", preprocessor),
                             ("to_dense", FunctionTransformer(lambda x: x.toarray() if hasattr(x, "toarray") else x)),
                             ("clf", model)])
        else:
            pipe = Pipeline([("pre", preprocessor), ("clf", model)])

        # fit & predict
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        # metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, zero_division=0)
        rec = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        metrics_rows.append({
            "task": task_label,
            "model": model_name,
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1": f1
        })

        # textual classification report
        rep = classification_report(y_test, y_pred, digits=4, zero_division=0)
        header = f"=== {task_label} | {model_name} ===\n"
        reports[task_label].append(header + rep)
        print(header)
        print(rep)

    # Feature importance (RandomForest) and LR coefficients
    if "RandomForest" in models:
        rf_pipe = Pipeline([("pre", preprocessor), ("rf", RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE))])
        rf_pipe.fit(X_train, y_train)
        # get feature names
        num_names = features
        ohe_names = []
        if categorical:
            ohe = rf_pipe.named_steps["pre"].named_transformers_["cat"].named_steps["onehot"]
            ohe_names = list(ohe.get_feature_names_out(categorical))
        final_feature_names = num_names + ohe_names
        importances = rf_pipe.named_steps["rf"].feature_importances_
        fi_df = pd.DataFrame({"feature": final_feature_names, "importance": importances}).sort_values("importance", ascending=False)
        fi_df.to_csv(OUT_DIR / f"rf_feature_importance_{task_label}.csv", index=False)
        print(f"Saved RF importances: {OUT_DIR / f'rf_feature_importance_{task_label}.csv'}")

    if "LogisticRegression" in models:
        lr_pipe = Pipeline([("pre", preprocessor), ("lr", LogisticRegression(max_iter=2000, random_state=RANDOM_STATE))])
        lr_pipe.fit(X_train, y_train)
        ohe_names = []
        if categorical:
            ohe = lr_pipe.named_steps["pre"].named_transformers_["cat"].named_steps["onehot"]
            ohe_names = list(ohe.get_feature_names_out(categorical))
        final_feature_names = features + ohe_names
        coefs = lr_pipe.named_steps["lr"].coef_[0]
        coef_df = pd.DataFrame({"feature": final_feature_names, "coefficient": coefs}).sort_values("coefficient", key=abs, ascending=False)
        coef_df.to_csv(OUT_DIR / f"lr_coefficients_{task_label}.csv", index=False)
        print(f"Saved LR coefficients: {OUT_DIR / f'lr_coefficients_{task_label}.csv'}")

# -----------------------
# 6) Save metrics & reports
# -----------------------
metrics_df = pd.DataFrame(metrics_rows).sort_values(["task", "f1"], ascending=[True, False])
metrics_df.to_csv(OUT_DIR / "classification_metrics_summary.csv", index=False)
print("Saved metrics summary to:", OUT_DIR / "classification_metrics_summary.csv")

# save textual classification reports
for task_label, reps in reports.items():
    if reps:
        with open(OUT_DIR / f"classification_reports_{task_label}.txt", "w", encoding="utf-8") as f:
            f.write("\n\n".join(reps))
        print("Saved report:", OUT_DIR / f"classification_reports_{task_label}.txt")

# -----------------------
# 7) Basic visualization
# -----------------------
if not metrics_df.empty:
    plt.figure(figsize=(10,6))
    sns.barplot(data=metrics_df, x="model", y="f1", hue="task")
    plt.title("F1 by model and task")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(OUT_DIR / "f1_by_model_task.png")
    plt.close()
    print("Saved plot:", OUT_DIR / "f1_by_model_task.png")

print("\nDone. All outputs are in the 'results' folder.")


Loaded data shape: (953, 24)
After cleaning shape: (953, 24)
Target distributions:
 chart_any: {1: 0.9412381951731374, 0: 0.05876180482686254}
 playlist_top_quartile: {0: 0.7492130115424974, 1: 0.25078698845750264}
Spearman(artist_count, streams): r=-0.1565, p=1.2e-06
Features (numeric): ['artist_count', 'released_year', 'released_month', 'released_day', 'bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%']
Categorical: ['key', 'mode']

=== Running TaskA_chart_any ===
=== TaskA_chart_any | LogisticRegression ===

              precision    recall  f1-score   support

           0     0.0619    0.4286    0.1081        14
           1     0.9437    0.5956    0.7302       225

    accuracy                         0.5858       239
   macro avg     0.5028    0.5121    0.4192       239
weighted avg     0.8920    0.5858    0.6938       239

=== TaskA_chart_any | LinearSVC ===

              precision    recall  f1-score   supp