In [4]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


In [5]:

# Read the cleaned modelling table; the path is relative to the notebook's working directory.
df = pd.read_csv("../output/B_cleaned_ready_for_modeling.csv")

# Binary label: rows whose Album_type equals 'single' become 1, every other row 0.
df['Target'] = df['Album_type'].eq('single').astype('int64')

# Columns excluded from the feature matrix. Because errors='ignore' is passed to
# drop(), any name listed here that is not present in the file is skipped silently.
drop_cols = ['Album_type', 'Album', 'Track', 'Artist', 'Title', 'Url_spotify', 'Url_youtube', 'Channel', 'Description']
non_feature_cols = [*drop_cols, 'Target']
X = df.drop(columns=non_feature_cols, errors='ignore')
y = df['Target']


In [6]:

# Hold-out strategy: roughly 80% train / 10% validation / 10% test,
# stratified on the label at both stages and driven by one shared seed.
SPLIT_SEED = 42

# Stage 1: set aside 10% of all rows as the test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    stratify=y,
    random_state=SPLIT_SEED,
)

# Stage 2: take 11.11% (about one ninth) of the remaining 90% for validation,
# which works out to roughly 10% of the original data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.1111,
    stratify=y_train_full,
    random_state=SPLIT_SEED,
)


In [7]:

# Learn the standardisation statistics from the training split only, then apply
# that same fitted transform to the training, validation and test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)


In [9]:
# Sanity checks on the training split: shapes of features and labels, then feature names.
train_shapes = (X_train.shape, y_train.shape)
print(*train_shapes)
print(X_train.columns)

# Columns present in the loaded frame but absent from the features (label excluded).
excluded_cols = set(df.columns) - set(X_train.columns) - {'Target'}
print(excluded_cols)

(16574, 18) (16574,)
Index(['Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness',
       'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Duration_ms',
       'Views', 'Likes', 'Comments', 'Stream', 'Key_Code', 'Loudness_norm',
       'Fitness_for_Clubs', 'Song_Name_Length'],
      dtype='object')
{'Album_type', 'Album', 'Artist'}


In [10]:
# Count how many feature columns there are of each dtype.
dtype_counts = X_train.dtypes.value_counts()
print(dtype_counts)

float64    16
int64       2
Name: count, dtype: int64


In [12]:
# Validation-split score of the tuned random forest.
# NOTE: `gs_rf` is created by the grid-search cell (execution count In [8]) that
# appears further down in this file, so that cell has to be run before this one.
best_rf = gs_rf.best_estimator_
best_rf.score(X_val, y_val)

0.8296332046332047

In [13]:
from sklearn.metrics import f1_score

# F1 score of the tuned random forest on the validation split.
# NOTE: `gs_rf` is created by the grid-search cell (execution count In [8]) that
# appears further down in this file, so that cell has to be run before this one.
y_pred = gs_rf.predict(X_val)
f1 = f1_score(y_val, y_pred)

# Trailing expression so the notebook actually displays the score; previously the
# value was computed and stored but the cell produced no output.
f1

In [8]:

# Best fitted estimator and validation score per model, keyed by display name.
models = {}
results = {}


def _tune_and_score(name, estimator, param_grid, X_fit, X_eval):
    """Grid-search ``estimator`` and record its best model and validation score.

    Runs a 3-fold ``GridSearchCV`` over ``param_grid`` on ``(X_fit, y_train)``,
    then stores the refitted best estimator in ``models[name]`` and the value of
    its ``score(X_eval, y_val)`` in ``results[name]``. ``y_train``, ``y_val``,
    ``models`` and ``results`` are read from the notebook's global namespace.

    Parameters
    ----------
    name : str
        Key under which the outcome is stored in ``models`` and ``results``.
    estimator : estimator object
        Unfitted scikit-learn estimator to tune.
    param_grid : dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    X_fit : array-like
        Training features; rows must line up with ``y_train``.
    X_eval : array-like
        Validation features; rows must line up with ``y_val``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # n_jobs=-1 lets the candidate/fold fits run in parallel on all available
    # cores. The previous serial run of this cell was interrupted by hand, and
    # parallelising changes only the wall-clock time, not which model is chosen.
    search = GridSearchCV(estimator, param_grid, cv=3, n_jobs=-1)
    search.fit(X_fit, y_train)
    models[name] = search.best_estimator_
    results[name] = search.best_estimator_.score(X_eval, y_val)
    return search


# NOTE(review): the scaled matrices come from a scaler fitted on the whole
# training split before cross-validation, so each CV fold is standardised with
# statistics that include its own held-out rows. Wrapping scaler + model in a
# Pipeline would avoid that, but downstream cells that feed pre-scaled data to
# these models would have to change as well.

# Logistic Regression (standardised features)
lr = LogisticRegression()
params_lr = {'C': [0.1, 1, 10]}
gs_lr = _tune_and_score('Logistic Regression', lr, params_lr, X_train_scaled, X_val_scaled)

# Random Forest (raw, unscaled features)
rf = RandomForestClassifier(random_state=42)
params_rf = {'n_estimators': [50, 100], 'max_depth': [None, 10]}
gs_rf = _tune_and_score('Random Forest', rf, params_rf, X_train, X_val)

# SVM (standardised features). This is the most expensive search in the cell:
# 6 parameter combinations x 3 folds, each fitting an SVC on the training data.
svm = SVC()
params_svm = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']}
gs_svm = _tune_and_score('SVM', svm, params_svm, X_train_scaled, X_val_scaled)


KeyboardInterrupt: 

In [None]:

# Report test-set metrics for every fitted model in `models`.
# The random forest was fitted on unscaled features, the other models on the
# standardised ones, so each model is given the matching version of the test set.
unscaled_models = {'Random Forest'}

for name, model in models.items():
    test_features = X_test if name in unscaled_models else X_test_scaled
    y_pred = model.predict(test_features)

    print(f"--- {name} ---")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))


In [None]:

# Bar chart comparing the validation score stored in `results` for each model.
model_names = list(results.keys())
val_scores = list(results.values())

plt.figure(figsize=(8, 5))
ax = sns.barplot(x=model_names, y=val_scores)
ax.set_ylim(0.5, 1)  # y-axis starts at 0.5, not 0
ax.set_title("Validation Accuracy of Models")
ax.set_ylabel("Accuracy")
plt.show()
