### Define Data

In [86]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("data_train.csv", index_col=0)

In [88]:
#Overview of the top 'n' genres within each cluster, and the number of songs within each cluster (training set)
n = 5

song_count = len(df)
genre_dict = {cluster: list(df['track_genre'][df['cluster'] == cluster].value_counts().index[:n]) for cluster in df['cluster'].unique()}
cluster_df = pd.DataFrame({"cluster":df['cluster'], "genre": df['track_genre']})
count_dict_unformatted = cluster_df['cluster'].value_counts().to_dict()
count_dict = pd.DataFrame({k: [f"{v/song_count*100:.0f}%"] for k,v in count_dict_unformatted.items()})

summary_df = pd.concat([pd.DataFrame(genre_dict), pd.DataFrame(count_dict)], ignore_index = True)
summary_df.rename({i: i+1 for i in range(len(summary_df))}).rename({len(summary_df): "Percent of Data"})

Unnamed: 0,1,2,4,0,7,3,5,6
1,tango,kids,comedy,pagode,k-pop,metalcore,new-age,minimal-techno
2,romance,latin,show-tunes,samba,pop-film,heavy-metal,classical,detroit-techno
3,honky-tonk,party,funk,sertanejo,forro,death-metal,sleep,techno
4,cantopop,salsa,kids,mpb,deep-house,hardstyle,ambient,chicago-house
5,jazz,reggae,children,gospel,turkish,grunge,piano,trance
Percent of Data,18%,19%,1%,6%,21%,18%,6%,10%


In [92]:
#Example for binary cluster classification
selected_cluster = 3


#Coerce boolean to binary
df[f'cluster{selected_cluster}_binary'] = (df['cluster'] == selected_cluster) *1

#Drop columns (should we ddrop track_genre??) and create training/testing 
#NOTE a testing csv has already been witheld
X = df.drop(labels = ['track_genre', 'cluster'], axis=1)
y = X.pop(f'cluster{selected_cluster}_binary')

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size = 0.2)

### Assess Models

In [93]:
#Negative Model
y_test.value_counts()[0]/len(y_test)

0.8212171052631579

In [94]:
#Logistic model
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression(random_state=42)
log_clf.fit(X_train, y_train)
yhat = log_clf.predict(X_test)
(yhat == y_test).mean()

0.8212171052631579

In [95]:
#Adaboost model
from sklearn.ensemble import AdaBoostClassifier

abc_clf = AdaBoostClassifier(n_estimators=100, algorithm="SAMME", random_state=42)
abc_clf.fit(X_train, y_train)
yhat = abc_clf.predict(X_test)
(y_test == yhat).mean()

0.950109649122807

In [51]:
#SVC model
from sklearn import svm

svc_clf = svm.SVC()
svc_clf.fit(X_train, y_train)
yhat = svc_clf.predict(X_test)
(y_test == yhat).mean()