In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statistics import mean
from scipy import stats
from scipy.stats import chi2_contingency 
import time
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import PrecisionRecallDisplay

In [None]:
# Read the cleaned dataset; every feature matrix below is a column subset of it.
df3 = pd.read_csv('cleaned_spotify.csv')
print("number of samples: " + str(len(df3)))

# Columns common to all three candidate feature sets.
shared_cols = ['loudness', 'energy', 'instrumentalness', 'acousticness', 'liveness',
               'duration_ms', 'speechiness', 'valence', 'danceability', 'tempo']
# Extra columns used by X2 and X3 on top of the shared ones.
extra_cols = ['explicit', 'key', 'mode']

X1 = df3[shared_cols]                          # shared columns only
X2 = df3[shared_cols + extra_cols + ['year']]  # shared + extra + 'year'
X3 = df3[shared_cols + extra_cols]             # shared + extra (no 'year')

# Target column and its mean over the whole dataset.
# NOTE: this rebinds `mean`, hiding the `statistics.mean` function imported at
# the top of the notebook; from here on `mean` is a number, not a function.
y = df3['popularity']
mean = y.mean()
print("the mean is: " + str(mean))

# Age of each row relative to 2022, computed from the 'year' column in a single
# vectorized step (no per-row Python loop, no dependence on the index labels
# being 0..n-1). Kept as a plain list under the name `age`, as before.
age = (2022 - df3['year']).tolist()
# assign() builds a new frame instead of writing a column into a frame that was
# sliced out of df3, which is what triggers pandas' SettingWithCopyWarning.
X3 = X3.assign(age=age)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X3, y, test_size=0.2, random_state=0)

## Classify data
# Binary target: 1 when popularity is strictly greater than `mean`, otherwise 0.
# One vectorized comparison replaces the build-bools-then-rewrite-in-a-loop
# approach; .tolist() keeps the result a plain list of ints.
y_bool_train = (y_train > mean).astype(int).tolist()
y_bool_test = (y_test > mean).astype(int).tolist()

# Positives are encoded as 1, so summing the labels counts them.
print("number of popular samples: " + str(sum(y_bool_test) + sum(y_bool_train)))

In [None]:
#classification

# Logistic regression: 5-fold cross-validated F1 on the training split, then a
# refit on the full training split and an F1 check against the test split.
print('Logistic Regression')
model1 = LogisticRegression(solver='liblinear')
lrscores = cross_val_score(estimator=model1, X=X_train, y=y_bool_train, cv=5, scoring='f1')
print(lrscores)
lr_cv_avg = sum(lrscores) / len(lrscores)
print('avg cv score: ' + str(lr_cv_avg))
# fit() returns the estimator itself, so fitting and predicting can be chained.
pred = model1.fit(X_train, y_bool_train).predict(X_test)
lr_test_f1 = f1_score(y_bool_test, pred)
print('score on test data: ' + str(lr_test_f1))

print()
print('Decision Tree Classifier')
# random_state pins the tree's internal feature shuffling, so ties between
# equally good splits are resolved the same way on every run and the scores
# printed below are repeatable.
model2 = tree.DecisionTreeClassifier(criterion='entropy', max_depth=11, min_samples_leaf=12, random_state=0)
# 5-fold cross-validated F1 on the training split.
dtscores = cross_val_score(model2, X_train, y_bool_train, scoring='f1', cv=5)
print(dtscores)
print('avg cv score: ' + str(sum(dtscores)/len(dtscores)))
# Refit on the full training split and score once on the test split.
model2.fit(X_train, y_bool_train)
pred = model2.predict(X_test)
print('score on test data: ' + str(f1_score(y_bool_test, pred)))

print()
print('K Nearest Neighbor Classifier')
# KNN with library defaults.
# NOTE(review): the features are passed in unscaled; a distance-based model may
# be dominated by large-magnitude columns (presumably duration_ms) - confirm
# whether standardizing first is wanted.
model3 = KNeighborsClassifier()
knnscores = cross_val_score(estimator=model3, X=X_train, y=y_bool_train, cv=5, scoring='f1')
print(knnscores)
knn_cv_avg = sum(knnscores) / len(knnscores)
print('avg cv score: ' + str(knn_cv_avg))
# Refit on the full training split, then score once on the test split.
pred = model3.fit(X_train, y_bool_train).predict(X_test)
knn_test_f1 = f1_score(y_bool_test, pred)
print('score on test data: ' + str(knn_test_f1))

print()
print('Voting Classifier with Decision Tree')
# Eleven voters (an odd count, so a hard majority vote on a binary target cannot
# tie). Each voter is its own tree with its own seed: the ensemble is repeatable
# from run to run, and the voters are not all clones of one unseeded instance.
v = [
    ('DTC' + str(i),
     tree.DecisionTreeClassifier(criterion='entropy', max_depth=11, min_samples_leaf=12, random_state=i))
    for i in range(11)
]
model4 = VotingClassifier(estimators=v, voting ='hard')
# 5-fold cross-validated F1 on the training split.
vcscores = cross_val_score(model4, X_train, y_bool_train, scoring='f1', cv=5)
print(vcscores)
print('avg cv score: ' + str(sum(vcscores)/len(vcscores)))
# Refit on the full training split and score once on the test split.
model4.fit(X_train, y_bool_train)
pred = model4.predict(X_test)
print('score on test data: ' + str(f1_score(y_bool_test, pred)))

print()
print('Bagging Classifier with Decision Tree')
classifier = tree.DecisionTreeClassifier(criterion='entropy', max_depth=11, min_samples_leaf=12)
# The base estimator is passed positionally: scikit-learn 1.2 renamed the
# keyword from `base_estimator` to `estimator` and 1.4 removed the old spelling,
# while the first positional argument works on both sides of the rename.
# random_state makes the bootstrap draws (and so the printed scores) repeatable.
model5 = BaggingClassifier(classifier, random_state=0)
# 5-fold cross-validated F1 on the training split.
bcscores = cross_val_score(model5, X_train, y_bool_train, scoring='f1', cv=5)
print(bcscores)
print('avg cv score: ' + str(sum(bcscores)/len(bcscores)))
# Refit on the full training split and score once on the test split.
model5.fit(X_train, y_bool_train)
pred = model5.predict(X_test)
print('score on test data: ' + str(f1_score(y_bool_test, pred)))

print()
print('Voting Classifier with Decision Tree and KNN')
# Build ONE estimator list that holds both families. The list must not be
# re-initialised between the two groups: doing so throws the trees away and the
# ensemble silently becomes KNN-only despite its title.
v = [
    ('DTC' + str(i),
     tree.DecisionTreeClassifier(criterion='entropy', max_depth=11, min_samples_leaf=12, random_state=i))
    for i in range(5)
]
v += [('KNN' + str(i), KNeighborsClassifier()) for i in range(5)]
# NOTE(review): 5 + 5 voters is an even count, so a hard vote can tie - confirm
# that VotingClassifier's tie-breaking is acceptable, or use an odd total.
model6 = VotingClassifier(estimators=v, voting ='hard')
# 5-fold cross-validated F1 on the training split.
vc2scores = cross_val_score(model6, X_train, y_bool_train, scoring='f1', cv=5)
print(vc2scores)
print('avg cv score: ' + str(sum(vc2scores)/len(vc2scores)))
# Refit on the full training split and score once on the test split.
model6.fit(X_train, y_bool_train)
pred = model6.predict(X_test)
print('score on test data: ' + str(f1_score(y_bool_test, pred)))

print()
print('Random Forest Classifier')
# Seeded so the bootstrap samples and per-split feature subsets - and therefore
# the scores printed below - are the same on every run.
model7 = RandomForestClassifier(random_state=0)
# 5-fold cross-validated F1 on the training split.
rfcscores = cross_val_score(model7, X_train, y_bool_train, scoring='f1', cv=5)
print(rfcscores)
print('avg cv score: ' + str(sum(rfcscores)/len(rfcscores)))
# Refit on the full training split and score once on the test split.
model7.fit(X_train, y_bool_train)
pred = model7.predict(X_test)
print('score on test data: ' + str(f1_score(y_bool_test, pred)))

print()
print('Extra Tree Classifier')
# Extra-trees draw their split thresholds at random, so without a seed the
# scores change on every run; random_state makes them repeatable.
model8 = ExtraTreesClassifier(random_state=0)
# 5-fold cross-validated F1 on the training split.
etcscores = cross_val_score(model8, X_train, y_bool_train, scoring='f1', cv=5)
print(etcscores)
print('avg cv score: ' + str(sum(etcscores)/len(etcscores)))
# Refit on the full training split and score once on the test split.
model8.fit(X_train, y_bool_train)
pred = model8.predict(X_test)
print('score on test data: ' + str(f1_score(y_bool_test, pred)))

print()
print('Ada Boosting Classifier')
# Seeded for run-to-run repeatability of the printed scores.
model9 = AdaBoostClassifier(random_state=0)
# 5-fold cross-validated F1 on the training split.
abcscores = cross_val_score(model9, X_train, y_bool_train, scoring='f1', cv=5)
print(abcscores)
print('avg cv score: ' + str(sum(abcscores)/len(abcscores)))
# Refit on the full training split and score once on the test split.
model9.fit(X_train, y_bool_train)
pred = model9.predict(X_test)
print('score on test data: ' + str(f1_score(y_bool_test, pred)))

print()
print('Gradient Boosting Classifier')
# Seeded for run-to-run repeatability of the printed scores.
model10 = GradientBoostingClassifier(random_state=0)
# 5-fold cross-validated F1 on the training split.
gbcscores = cross_val_score(model10, X_train, y_bool_train, scoring='f1', cv=5)
print(gbcscores)
print('avg cv score: ' + str(sum(gbcscores)/len(gbcscores)))
# Refit on the full training split and score once on the test split.
model10.fit(X_train, y_bool_train)
pred = model10.predict(X_test)
print('score on test data: ' + str(f1_score(y_bool_test, pred)))

print()
print('Histogram Gradient Boosting Classifier')
# Seeded for run-to-run repeatability of the printed scores.
model11 = HistGradientBoostingClassifier(random_state=0)
# 5-fold cross-validated F1 on the training split.
hgbcscores = cross_val_score(model11, X_train, y_bool_train, scoring='f1', cv=5)
print(hgbcscores)
print('avg cv score: ' + str(sum(hgbcscores)/len(hgbcscores)))
# Refit on the full training split and score once on the test split.
model11.fit(X_train, y_bool_train)
pred = model11.predict(X_test)
print('score on test data: ' + str(f1_score(y_bool_test, pred)))

In [None]:
print('Voting Classifier with Random Forest')
# Eleven forests, each with its own seed, so the voters differ from one another
# and the whole ensemble is repeatable from run to run.
v = [('RFC' + str(i), RandomForestClassifier(random_state=i)) for i in range(11)]
# Soft voting averages the members' predicted probabilities and gives the
# ensemble a predict_proba method. A hard-voting VotingClassifier exposes
# neither predict_proba nor decision_function, so the ROC / precision-recall
# displays that are built from model12 later in the notebook cannot score it.
model12 = VotingClassifier(estimators=v, voting ='soft')
# 5-fold cross-validated F1 on the training split.
vc3scores = cross_val_score(model12, X_train, y_bool_train, scoring='f1', cv=5)
print(vc3scores)
print('avg cv score: ' + str(sum(vc3scores)/len(vc3scores)))
# Refit on the full training split and score once on the test split.
model12.fit(X_train, y_bool_train)
pred = model12.predict(X_test)
print('score on test data: ' + str(f1_score(y_bool_test, pred)))

print()
print('Bagging Classifier with Random Forest')
classifier = RandomForestClassifier()
# The base estimator is passed positionally: scikit-learn 1.2 renamed the
# keyword from `base_estimator` to `estimator` and 1.4 removed the old spelling,
# while the first positional argument works on both sides of the rename.
# random_state makes the bootstrap draws (and so the printed scores) repeatable.
model13 = BaggingClassifier(classifier, random_state=0)
# 5-fold cross-validated F1 on the training split.
bc2scores = cross_val_score(model13, X_train, y_bool_train, scoring='f1', cv=5)
print(bc2scores)
print('avg cv score: ' + str(sum(bc2scores)/len(bc2scores)))
# Refit on the full training split and score once on the test split.
model13.fit(X_train, y_bool_train)
pred = model13.predict(X_test)
print('score on test data: ' + str(f1_score(y_bool_test, pred)))

In [None]:
# Overlay the test-split ROC curves of six fitted models on one explicitly
# created axes instead of relying on pyplot's implicit "current axes".
# Every estimator passed to from_estimator must expose predict_proba or
# decision_function; a hard-voting VotingClassifier exposes neither.
roc_fig, ax = plt.subplots()
m1_disp = RocCurveDisplay.from_estimator(model1, X_test, y_bool_test, color='red', ax=ax)
m2_disp = RocCurveDisplay.from_estimator(model2, X_test, y_bool_test, color='blue', ax=ax)
m3_disp = RocCurveDisplay.from_estimator(model3, X_test, y_bool_test, color='green', ax=ax)
m4_disp = RocCurveDisplay.from_estimator(model10, X_test, y_bool_test, color='brown', ax=ax)
m5_disp = RocCurveDisplay.from_estimator(model7, X_test, y_bool_test, color='purple', ax=ax)
m6_disp = RocCurveDisplay.from_estimator(model12, X_test, y_bool_test, color='black', ax=ax)
# Title the figure so it can be read on its own when the notebook is skimmed.
ax.set_title('ROC curves (test split)')

# Overlay the test-split precision-recall curves of the same six fitted models
# on a fresh, explicitly created axes (a separate figure from any earlier plot).
# Every estimator passed to from_estimator must expose predict_proba or
# decision_function; a hard-voting VotingClassifier exposes neither.
pr_fig, ax = plt.subplots()
m1_disp = PrecisionRecallDisplay.from_estimator(model1, X_test, y_bool_test, color='red', ax=ax)
m2_disp = PrecisionRecallDisplay.from_estimator(model2, X_test, y_bool_test, color='blue', ax=ax)
m3_disp = PrecisionRecallDisplay.from_estimator(model3, X_test, y_bool_test, color='green', ax=ax)
m4_disp = PrecisionRecallDisplay.from_estimator(model10, X_test, y_bool_test, color='brown', ax=ax)
m5_disp = PrecisionRecallDisplay.from_estimator(model7, X_test, y_bool_test, color='purple', ax=ax)
m6_disp = PrecisionRecallDisplay.from_estimator(model12, X_test, y_bool_test, color='black', ax=ax)
# Title the figure so it can be read on its own when the notebook is skimmed.
ax.set_title('Precision-recall curves (test split)')