# 🎵 Section C - Model Training: Single vs Album Classification

## 📥 Load data and prepare features

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Load the cleaned dataset (path is relative to the notebook's directory).
df = pd.read_csv("../output/Processed_Spotify_Youtube.csv")

# Binary target: 1 = single, 0 = album/compilation.
# A vectorised comparison replaces the per-row Python lambda; anything that
# is not exactly 'single' (including a missing value) maps to 0, as before.
df['Target'] = df['Album_type'].eq('single').astype('int64')

# Features/labels: drop both the raw label and the derived target so that
# neither leaks into the feature matrix.
X = df.drop(columns=['Album_type', 'Target'])
y = df['Target']

# One-hot encode categoricals (drop_first removes one level per column).
# NOTE(review): every remaining non-numeric column is expanded here — confirm
# that free-text / high-cardinality columns were removed upstream.
X = pd.get_dummies(X, drop_first=True)

# Split to train/val/test: 80/10/10, stratified on the target.
# Step 1: hold out 10% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42, stratify=y
)
# Step 2: carve the validation set out of the remaining 90%. Passing an
# absolute row count makes it exactly as large as the test set, instead of
# approximating the required 1/9 fraction with 0.1111.
# NOTE: X_val / y_val are not used by the cells visible in this section.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=len(X_test), random_state=42, stratify=y_temp
)

## 🧪 Evaluation helper

In [None]:
def evaluate_model(name, model, X_eval=None, y_eval=None):
    """Print a classification report and the accuracy of a fitted model.

    Parameters
    ----------
    name : str
        Label printed in the header and returned with the score.
    model : object
        Fitted estimator exposing ``predict``.
    X_eval, y_eval : optional
        Features and true labels to score against. When both are omitted,
        the module-level ``X_test`` / ``y_test`` split is used, which is the
        original behaviour of this helper.

    Returns
    -------
    tuple
        ``(name, accuracy)``.

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``y_eval`` is supplied.
    """
    # Features and labels must come from the same split; refuse a half-specified pair.
    if (X_eval is None) != (y_eval is None):
        raise ValueError("X_eval and y_eval must be provided together")
    if X_eval is None:
        X_eval, y_eval = X_test, y_test

    print(f"\n📊 Results for: {name}")
    y_pred = model.predict(X_eval)
    print(classification_report(y_eval, y_pred))
    acc = accuracy_score(y_eval, y_pred)
    print(f"Accuracy: {acc:.3f}")
    return name, acc

## 🌳 Random Forest

In [None]:
# Base estimator; the fixed seed keeps the forest reproducible across runs.
rf = RandomForestClassifier(random_state=42)
# Search space: number of trees x maximum tree depth (None = no depth limit).
params_rf = {'n_estimators': [50, 100], 'max_depth': [None, 10, 20]}
# 3-fold cross-validated grid search, fitted on the training split only.
# The candidate fits are independent of each other, so n_jobs=-1 runs them
# on all available cores; the selected parameters are unaffected.
gs_rf = GridSearchCV(estimator=rf, param_grid=params_rf, cv=3, n_jobs=-1)
gs_rf.fit(X_train, y_train)
# Winning configuration (GridSearchCV refits it on the full training split by default).
best_rf = gs_rf.best_estimator_

## 📈 Logistic Regression

In [None]:
# Base estimator with a raised iteration cap (max_iter=500).
# NOTE(review): no feature scaling is applied in this section — if the CSV is
# not already standardised the solver may still hit the cap; confirm upstream.
lr = LogisticRegression(max_iter=500)
# Search space: inverse regularisation strength C.
params_lr = {'C': [0.1, 1, 10]}
# 3-fold cross-validated grid search, fitted on the training split only.
# The candidate fits are independent of each other, so n_jobs=-1 runs them
# on all available cores; the selected parameters are unaffected.
gs_lr = GridSearchCV(estimator=lr, param_grid=params_lr, cv=3, n_jobs=-1)
gs_lr.fit(X_train, y_train)
# Winning configuration (GridSearchCV refits it on the full training split by default).
best_lr = gs_lr.best_estimator_

## 🧠 Support Vector Machine (SVM)

In [None]:
# Base estimator with library defaults; C and kernel are tuned below.
# NOTE(review): no feature scaling is applied in this section — SVMs are
# sensitive to feature scale, so confirm the CSV is already standardised.
svc = SVC()
# Search space: regularisation strength C x kernel type.
params_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
# 3-fold cross-validated grid search, fitted on the training split only.
# The candidate fits are independent of each other, so n_jobs=-1 runs them
# on all available cores; the selected parameters are unaffected.
gs_svm = GridSearchCV(estimator=svc, param_grid=params_svm, cv=3, n_jobs=-1)
gs_svm.fit(X_train, y_train)
# Winning configuration (GridSearchCV refits it on the full training split by default).
best_svm = gs_svm.best_estimator_

## 📉 Evaluation & Comparison

In [None]:
# Tuned estimators keyed by display name; insertion order fixes the order in
# which they are evaluated and later plotted.
candidates = {
    "Random Forest": best_rf,
    "Logistic Regression": best_lr,
    "SVM": best_svm,
}
# One (name, accuracy) tuple per model, as returned by evaluate_model.
results = [evaluate_model(label, fitted) for label, fitted in candidates.items()]

In [None]:
# Unpack the (name, accuracy) pairs into two parallel tuples.
names, scores = zip(*results)

# Draw on an explicit Axes instead of going through the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=list(names), y=list(scores), ax=ax)
ax.set_title("Test Accuracy of Models")
ax.set_ylabel("Accuracy")
ax.set_ylim(0, 1)  # fixed 0-1 axis so the bars are read against the full accuracy scale
plt.show()