In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ---------------------------------------------------------------------------
# Song-popularity classifier: load the CSV, clean it, reshape a few skewed
# features, bucket popularity into three classes, then fit and evaluate a
# random forest on an 80/10/10 train/validation/test split.
# ---------------------------------------------------------------------------

# NOTE(review): absolute, machine-specific location -- the script only runs
# where this exact file exists.
file_path = r'C:\Users\aryan\OneDrive\Desktop\Work\archive\spotify_songs.csv'
df = pd.read_csv(file_path)

# Cleaning: remove exact duplicate rows first, then every row that still has
# at least one missing value.
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

# Element-wise transforms. They need no fitted statistics, so applying them
# before the split does not let validation/test rows influence training.
for log_col in ('acousticness', 'instrumentalness', 'liveness', 'duration_ms'):
    df[log_col] = np.log1p(df[log_col])
for sqrt_col in ('danceability', 'energy', 'tempo'):
    df[sqrt_col] = np.sqrt(df[sqrt_col])

# Identifier and playlist-level columns that are not used as features.
df = df.drop(
    columns=['track_id', 'track_album_id', 'playlist_name', 'playlist_id', 'playlist_genre']
)

# Target: split track_popularity into three quantile-based buckets.
popularity_labels = ['Low', 'Medium', 'High']
df['popularity_class'] = pd.qcut(
    df['track_popularity'],
    q=len(popularity_labels),
    labels=popularity_labels,
)

# Feature groups; each group gets its own preprocessing further down.
numeric_columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms',
]
# NOTE(review): the name/artist/album columns are free-text and presumably
# high-cardinality; one-hot encoding them may mostly memorise individual
# tracks -- confirm this is intended.
categorical_columns = ['track_name', 'track_artist', 'track_album_name', 'playlist_subgenre']

feature_columns = [*numeric_columns, *categorical_columns]
X = df[feature_columns]
y = df['popularity_class']

# 80% train; the held-out 20% is halved into validation and test (10% each).
# Both splits are stratified on the class label and use the same seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Preprocessing: standardise numeric features; one-hot encode categorical
# ones, with categories unseen during fit ignored at transform time.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
])

# Preprocessing and model live in one pipeline, so the scaler and encoder
# are fitted on the training rows only.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100)),
])
clf.fit(X_train, y_train)


def _print_breakdown(y_true, y_pred):
    """Print the per-class report followed by the confusion matrix."""
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))


# Validation split.
y_val_pred = clf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy}')
_print_breakdown(y_val, y_val_pred)

# Test split.
y_test_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy}')
_print_breakdown(y_test, y_test_pred)
