# Section C - Predicting Single vs Album
We will evaluate the performance of three models:
- Logistic Regression
- Random Forest
- AdaBoost

All models are tuned with a 3-fold cross-validated grid search on the training set and evaluated on the held-out test set of an 80/10/10 train/validation/test split.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the cleaned modeling table written by the previous section.
df = pd.read_csv("../output/B_cleaned_ready_for_modeling.csv")

# Drop Album_type if it is still present; errors='ignore' makes this a no-op
# when the column is already gone.
# NOTE(review): presumably Album_type is the raw column Target was derived
# from, which would leak the label if kept - confirm against Section B.
df = df.drop(columns=['Album_type'], errors='ignore')

# Split features and label
X = df.drop(columns=["Target"])
y = df["Target"]

# Train/val/test split (80/10/10).
# Both splits are stratified on the label so that every partition keeps the
# class proportions of the full dataset; without this, the small 10% val/test
# partitions can end up with a noticeably different class balance than the
# training set, which distorts the reported test metrics.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Sanity check: the three partitions must cover every row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

# Scale features. The scaler is fit on the training partition only, and the
# learned mean/std are then applied to val and test, so no statistics from
# the held-out partitions leak into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

## Model 1: Logistic Regression

In [None]:
# Tune the inverse regularisation strength C of a logistic regression with a
# 3-fold cross-validated grid search on the standardized training features.
# NOTE(review): the scaler was fit on the full training set before this
# search, so each CV fold's held-out part contributed to the scaling
# statistics; wrap scaler + model in a Pipeline if strict fold isolation
# is required.
lr = LogisticRegression(max_iter=1000)
params_lr = {
    'C': [0.01, 0.1, 1, 10],
}
gs_lr = GridSearchCV(estimator=lr, param_grid=params_lr, cv=3)
gs_lr.fit(X_train_scaled, y_train)

# best_estimator_ is the winning configuration refit on the whole training
# set (GridSearchCV's default refit behaviour); score it on the scaled
# test partition.
best_lr = gs_lr.best_estimator_
y_pred_lr = best_lr.predict(X_test_scaled)

print("Best Logistic Regression Params:", gs_lr.best_params_)
print(classification_report(y_test, y_pred_lr))

## Model 2: Random Forest

In [None]:
# Tune forest size and tree depth with a 3-fold cross-validated grid search.
# The forest is fit on the unscaled features: tree splits do not depend on
# feature scale, so standardization is not needed here.
rf = RandomForestClassifier(random_state=42)
params_rf = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],  # None lets the trees grow without a depth cap
}
gs_rf = GridSearchCV(estimator=rf, param_grid=params_rf, cv=3)
gs_rf.fit(X_train, y_train)

# Evaluate the refit best configuration on the untouched test partition.
best_rf = gs_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

print("Best Random Forest Params:", gs_rf.best_params_)
print(classification_report(y_test, y_pred_rf))

## Model 3: AdaBoost

In [None]:
# Tune the number of boosting rounds and the learning rate of AdaBoost with a
# 3-fold cross-validated grid search on the unscaled training features.
ada = AdaBoostClassifier(random_state=42)
params_ada = {
    'n_estimators': [50, 100],
    'learning_rate': [0.5, 1.0, 1.5],
}
gs_ada = GridSearchCV(estimator=ada, param_grid=params_ada, cv=3)
gs_ada.fit(X_train, y_train)

# Evaluate the refit best configuration on the untouched test partition.
best_ada = gs_ada.best_estimator_
y_pred_ada = best_ada.predict(X_test)

print("Best AdaBoost Params:", gs_ada.best_params_)
print(classification_report(y_test, y_pred_ada))

## Comparison of Results

In [None]:
# Compare the tuned models by accuracy on the held-out test set.
models = {
    'Logistic Regression': best_lr,
    'Random Forest': best_rf,
    'AdaBoost': best_ada
}

# Each model must be scored on the same representation it was trained on:
# logistic regression was fit on standardized features, the tree ensembles on
# the raw ones. The pairing is spelled out explicitly instead of being
# inferred from a substring of the display label, so renaming a label or
# adding a model raises a KeyError here rather than silently scoring on the
# wrong feature matrix.
test_features = {
    'Logistic Regression': X_test_scaled,
    'Random Forest': X_test,
    'AdaBoost': X_test,
}

scores = {
    name: accuracy_score(y_test, model.predict(test_features[name]))
    for name, model in models.items()
}

# Explicit figure/axes interface keeps the plot self-contained in this cell.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=list(scores.keys()), y=list(scores.values()), ax=ax)
ax.set_title("Model Comparison - Accuracy on Test Set")
ax.set_ylabel("Accuracy")
ax.set_ylim(0, 1)

# Write each accuracy above its bar; categorical bars sit at x = 0, 1, 2, ...
for position, accuracy in enumerate(scores.values()):
    ax.text(position, accuracy + 0.01, f"{accuracy:.3f}", ha="center", va="bottom")

plt.show()