In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Lasso, LassoCV, RidgeClassifier, RidgeClassifierCV
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn import preprocessing
import matplotlib.pyplot as plt
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Here we load our dataset

In [None]:
# Read the CSV from the working directory and preview the first rows.
data_raw = pd.read_csv("dataset-of-90s.csv")
data_raw.head()

# We drop useless features

In [None]:
data = data_raw.drop(['track', 'artist', 'uri'], axis=1) ;data


In [None]:
data.info()

# We change all values to float64 format

In [None]:
# Cast every remaining column to float64 and confirm the dtypes.
data = data.astype("float64")
data.info()

# We use the describe function's summary statistics to understand our data better

In [None]:
# One row per column, one column per summary statistic.
data.describe().transpose()

# Here, we separate the target from the predictor set

In [None]:
X = data.iloc[:,0:-1]
y = data.iloc[:,-1]

In [None]:
X.head()

In [None]:
y

# Here, we are splitting our data to train and test set to get realistic results

In [None]:
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
X_train.shape

In [None]:
X_test.shape

# We fit a logistic regression to our dataset

In [None]:
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)

In [None]:
y_pred.sum()

In [None]:
y_test.sum()

In [None]:
table=pd.DataFrame({'True':y_test, 'Predicted':y_pred}) ; table

In [None]:
score = accuracy_score(y_pred,y_test) ; score

# Here we have seen our score as 0.4990942028985507 and we manually check the result 

In [None]:
print("Sum of '1''s is equal to:", y_pred.sum(),"at the prediction set")
print("Sum of '1''s is equal to:", y_test.sum(),"at the test set")

In [None]:
print(y_test.sum()/y_pred.sum())

# We increase the max_iter parameter, but nothing changes.

In [None]:
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)

In [None]:
table=pd.DataFrame({'True':y_test, 'Predicted':y_pred}) ; table


In [None]:
score = accuracy_score(y_pred,y_test) ; score

# Here we perform MinMaxScaler also known as Normalization


In [None]:
scaler = MinMaxScaler()

data_normalized = scaler.fit_transform(data)

data_normalized = pd.DataFrame(data_normalized, columns = data.columns) ; data_normalized


In [None]:
X_normalized = data_normalized.drop('target',axis=1).astype(float)
y = data_normalized["target"]

In [None]:
from sklearn.model_selection import train_test_split
X_train_normalized, X_test_normalized, y_train, y_test =train_test_split(X_normalized, y, test_size = 0.2, random_state = 0)

In [None]:
X_train_normalized.head()

In [None]:
log_model = LogisticRegression()
log_model.fit(X_train_normalized, y_train)
y_pred = log_model.predict(X_test_normalized)

In [None]:
table=pd.DataFrame({'True':y_test, 'Predicted':y_pred}) ;table


In [None]:
score = precision_score(y_pred,y_test) ;score

In [None]:
cfmatrix = confusion_matrix(y_test, y_pred) ;cfmatrix

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=cfmatrix) 
disp.plot()

# Here, we perform standardization and logistic regression again

In [None]:
scaler = preprocessing.StandardScaler()
X_stand = scaler.fit_transform(X)

In [None]:
X_train_stand, X_test_stand, y_train, y_test =train_test_split(X_stand, y, test_size = 0.2, random_state = 0)

In [None]:
log_model = LogisticRegression()
log_model.fit(X_train_stand, y_train)
y_pred = log_model.predict(X_test_stand)

In [None]:
table=pd.DataFrame({'True':y_test, 'Predicted':y_pred}) ; table


In [None]:
score = precision_score(y_pred,y_test) ;score

In [None]:
cfmatrix = confusion_matrix(y_test, y_pred) ; cfmatrix


In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=cfmatrix) 
disp.plot()

# So far, before model selection, we reach the best score with standardization

# Model selection

# First we perform on normalized data

In [None]:
logmodel = LogisticRegression()
sbs = SFS(logmodel, 
           k_features=1,
           forward=False, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=10)
feature_names = ('danceability', 'energy', 'key', 'loudness', 'mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','duration_ms','time_signature','chorus_hit','sections')
sbs = sbs.fit(X_train_normalized, y_train, custom_feature_names=feature_names)
pd.DataFrame.from_dict(sbs.get_metric_dict()).T

In [None]:
pd.DataFrame.from_dict(sbs.get_metric_dict()).T.sort_values('avg_score',ascending=False) 

# Now we perform on raw data

In [None]:
logmodel = LogisticRegression()
sbs = SFS(logmodel, 
           k_features=1,
           forward=False, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=10)
feature_names = ('danceability', 'energy', 'key', 'loudness', 'mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','duration_ms','time_signature','chorus_hit','sections')
sbs = sbs.fit(X_train, y_train, custom_feature_names=feature_names)
pd.DataFrame.from_dict(sbs.get_metric_dict()).T

In [None]:
pd.DataFrame.from_dict(sbs.get_metric_dict()).T.sort_values('avg_score',ascending=False)

# At last, we perform on standardized data 

In [None]:
logmodel = LogisticRegression()
sbs = SFS(logmodel, 
           k_features=1,
           forward=False, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=10)
feature_names = ('danceability', 'energy', 'key', 'loudness', 'mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','duration_ms','time_signature','chorus_hit','sections')
sbs = sbs.fit(X_train_stand, y_train, custom_feature_names=feature_names)
pd.DataFrame.from_dict(sbs.get_metric_dict()).T

In [None]:
pd.DataFrame.from_dict(sbs.get_metric_dict()).T.sort_values('avg_score',ascending=False) 

# (0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11)  Predictors gave us the best results with standardized data

In [None]:
X_selected = X.iloc[:,[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11]]
scaler = preprocessing.StandardScaler()
X_selected_stand = scaler.fit_transform(X_selected)
X_selected_stand = pd.DataFrame(X_selected_stand, columns = X_selected.columns)


In [None]:
X_train_selected_stand, X_test_selected_stand, y_train, y_test =train_test_split(X_selected_stand, y, test_size = 0.2, random_state = 0)

# We perform logistic regression again with our new predictor set, working directly with the standardized data

In [None]:
log_model = LogisticRegression()
log_model.fit(X_train_selected_stand, y_train)
y_pred = log_model.predict(X_test_selected_stand)

In [None]:
table=pd.DataFrame({'True':y_test, 'Predicted':y_pred}) ; table

In [None]:
score = precision_score(y_pred,y_test) ;score

In [None]:
cfmatrix = confusion_matrix(y_test, y_pred) ;cfmatrix

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=cfmatrix) 
disp.plot()

# We find best with accuracy 0.8185117967332124 

# Ridge

In [None]:
# 100 candidate penalties, log-spaced from 0.5 * 10**5 down to 0.5 * 10**-2.
exponents = np.linspace(5, -2, 100)
alphas = 10**exponents * 0.5
alphas

# We perform for raw data, selected data, and scaled data

In [None]:
ridgecv = RidgeClassifierCV(alphas = alphas,cv = 10, scoring = 'accuracy')
ridgecv.fit(X_train, y_train)
ridgecv.alpha_

In [None]:
ridge = RidgeClassifier(alpha = ridgecv.alpha_)
ridge.fit(X_train, y_train)
print("Accuracy score for raw data is: ",accuracy_score(y_test, ridge.predict(X_test)))

In [None]:
ridgecv = RidgeClassifierCV(alphas = alphas,cv = 10, scoring = 'accuracy')
ridgecv.fit(X_train_selected, y_train)
ridgecv.alpha_

In [None]:
ridge = RidgeClassifier(alpha = ridgecv.alpha_)
ridge.fit(X_train_selected, y_train)
print("Accurarcy score for selected data is: ",accuracy_score(y_test, ridge.predict(X_test_selected)))

In [None]:
ridgecv = RidgeClassifierCV(alphas = alphas,cv = 10, scoring = 'accuracy')
ridgecv.fit(X_train_selected_stand, y_train)
ridgecv.alpha_

In [None]:
ridge = RidgeClassifier(alpha = ridgecv.alpha_)
ridge.fit(X_train_selected_stand, y_train)
print("Accurarcy score for standardized data is: ",accuracy_score(y_test, ridge.predict(X_test_selected_stand)))

In [None]:
X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0,stratify=y)

# Creating a KNN classifier with 10 neighbors

In [None]:
knn= KNeighborsClassifier(n_neighbors= 10)
knn.fit(X_train,y_train)

In [None]:
knn.predict(X_test)

In [None]:
knn.score(X_test, y_test)

# Creating a KNN classifier with 10 neighbors with standardized data

In [None]:
scaler = preprocessing.StandardScaler()
X_stand = scaler.fit_transform(X)
X_stand = pd.DataFrame(X_stand, columns = X.columns)

In [None]:
X_train_stand, X_test_stand , y_train, y_test = train_test_split(X_stand, y, test_size=0.2, random_state=0,stratify=y)

In [None]:
knn= KNeighborsClassifier(n_neighbors= 10)
knn.fit(X_train_stand,y_train)
knn.predict(X_test_stand)
knn.score(X_test_stand, y_test)

# Creating KNN classifier with 65 neighbors and perform with both standardized and raw data

In [None]:
knn= KNeighborsClassifier(n_neighbors= 65)
knn.fit(X_train, y_train)
knn.predict(X_test)
knn.score(X_test, y_test)

In [None]:
knn= KNeighborsClassifier(n_neighbors= 65)
knn.fit(X_train_stand, y_train)
knn.predict(X_test_stand)
knn.score(X_test_stand, y_test)

# Making Cross Validation for the best neighbor size with standardized data

In [None]:
# For every k from 1 to 199, record the mean 10-fold CV accuracy on the
# standardized training set, together with the k that produced it.
k_range = range(1, 200)

k_scores, k_parameter = [], []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(
        knn, X_train_stand, y_train, cv=10, scoring='accuracy'
    )
    k_parameter.append(k)
    k_scores.append(scores.mean())

In [None]:
# Column name -> list of values, in the same column order as before.
frame = dict([
    ('Validation Scores', k_scores),
    ('knn_parameter', k_parameter),
])

# Build the table of CV scores per k.
df = pd.DataFrame(data=frame)

# We find the best neighbor size as 13

In [None]:
# Keep the k value next to each score. With the scores alone, the displayed row
# index is 0-based (k - 1), which makes it easy to read off the wrong neighbour count.
df_knn = pd.DataFrame({'knn_parameter': k_parameter, 'Validation Scores': k_scores})
df_knn.sort_values(by='Validation Scores', ascending=False)

# Creating KNN classifier with the optimum neighbor size

In [None]:
knn= KNeighborsClassifier(n_neighbors=13)
knn.fit(X_train_stand, y_train)
knn.predict(X_test_stand)
knn.score(X_test_stand, y_test)

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=13)
sbs = SFS(knn_model, 
           k_features=1,
           forward=False, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=10)
feature_names = ('danceability', 'energy', 'key', 'loudness', 'mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','duration_ms','time_signature','chorus_hit','sections')
sbs = sbs.fit(X_train_stand, y_train, custom_feature_names=feature_names)
pd.DataFrame.from_dict(sbs.get_metric_dict()).T

In [None]:
pd.DataFrame.from_dict(sbs.get_metric_dict()).T.sort_values('avg_score',ascending=False) 

# The best subset is (0, 1, 6, 7, 9, 11), so we refit using only these predictors

In [None]:
X_selected = X.iloc[:,[0, 1, 6, 7, 9, 11]]
scaler = preprocessing.StandardScaler()
X_selected_stand = scaler.fit_transform(X_selected)
X_selected_stand = pd.DataFrame(X_selected_stand, columns = X_selected.columns)

In [None]:
X_train_selected_stand, X_test_selected_stand, y_train, y_test =train_test_split(X_selected_stand, y, test_size = 0.2, random_state = 0)

In [None]:
knn= KNeighborsClassifier(n_neighbors=13)
knn.fit(X_train_selected_stand, y_train)
knn.predict(X_test_selected_stand)
knn.score(X_test_selected_stand, y_test)

# Decision Trees

In [None]:
# Stratified 80/20 split of the raw predictors for the tree-based models.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# We create a decision tree without any penalty at first.

In [None]:
X_col=X.columns ; X_col

In [None]:
classifier = DecisionTreeClassifier(random_state = 0)  
classifier.fit(X_train, y_train) 
fig = plt.figure(figsize=(20,15))
Dtree = tree.plot_tree(classifier, feature_names=X_col, filled=True)

In [None]:
y_pred = classifier.predict(X_test)
accuracy_score(y_test,y_pred)

# Create a range for alpha values that we are going to use for adding some penalty to number of internal nodes

In [None]:
# Candidate ccp_alpha values computed from the training data.
classifier = DecisionTreeClassifier(random_state=0)
path = classifier.cost_complexity_pruning_path(X_train, y_train)
path

# We use cross-validation to determine the optimal level of complexity of the tree

In [None]:
# For each candidate alpha, run a shuffled 10-fold CV on the training set and
# record training and validation accuracy per fold; then average per alpha.
cv = KFold(n_splits=10, random_state=0, shuffle=True)
CVErrors = []
for i in path.ccp_alphas:
    for train_index, validation_index in cv.split(X_train):
        X_trainp = X_train.iloc[train_index]
        X_valid = X_train.iloc[validation_index]
        y_trainp = y_train.iloc[train_index]
        y_valid = y_train.iloc[validation_index]
        classifier = DecisionTreeClassifier(random_state=0, ccp_alpha=i)
        classifier.fit(X_trainp, y_trainp)
        y_pred = classifier.predict(X_valid)
        train_acc = accuracy_score(y_trainp, classifier.predict(X_trainp))
        valid_acc = accuracy_score(y_valid, y_pred)
        CVErrors.append([i, train_acc, valid_acc])
df = pd.DataFrame(
    CVErrors, columns=['alpha', 'Training Accuracy', 'Validation Accuracy']
)
kfoldCV_by_alpha = df.groupby('alpha').mean().reset_index()
kfoldCV_by_alpha

# We are sorting alpha values to find one which gives us best validation accuracy 

In [None]:
# Highest mean validation accuracy first.
kfoldCV_by_alpha = kfoldCV_by_alpha.sort_values(
    by=['Validation Accuracy'], ascending=False
)
kfoldCV_by_alpha

# we are creating a new decision tree with the best alpha value which gives us best validation accuracy

In [None]:
classifier = DecisionTreeClassifier(random_state = 0,ccp_alpha=0.000997)  
classifier.fit(X_train, y_train) 
fig = plt.figure(figsize=(20,15))
Dtree = tree.plot_tree(classifier, class_names=['Non-Hit','Hit'],feature_names=X_col, filled=True)
y_pred = classifier.predict(X_test)

In [None]:
comp=pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})
comp = pd.concat([X_test.reset_index(drop=True), comp.reset_index(drop=True)], axis= 1) ; comp

In [None]:
y_pred = classifier.predict(X_test)
accuracy_score(y_test,y_pred)

# we are going to fit a random forest / we are trying different numbers of trees 

In [None]:
Oob_Accuracy=[]
for i in np.linspace(start = 100, stop = 1000, num = 10):
    clf=RandomForestClassifier(random_state=0,n_estimators=int(i),oob_score=True)
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    Oob_Accuracy.append([i,np.array(clf.oob_score_)])
df = pd.DataFrame(Oob_Accuracy,columns=['Number_of_Trees','Oob Accuracy'])
df

In [None]:
df=df.sort_values(by=['Oob Accuracy'],ascending=False) ; df


# We are plotting a scatter graph to pick the number of trees that gives the highest accuracy 

In [None]:
fig=plt.figure(figsize=(5,4), dpi=100)
ax = fig.add_subplot(1, 1, 1)
ax.scatter (df['Number_of_Trees'].values, df['Oob Accuracy'].values, label = 'Oob Accuracy')
ax.set_xlabel('Number_of_Trees')
ax.set_ylabel('Accuracy')
ax.tick_params(axis='x', labelsize=8)
ax.legend(loc='best')

In [None]:
clf=RandomForestClassifier(random_state=0,n_estimators=600,oob_score=True)
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
accuracy_score(y_test, y_pred)

# We are trying different numbers of features at each split

In [None]:
Oob_Accuracy=[]
for i in range(1,16):
    clf=RandomForestClassifier(random_state=0,n_estimators=600,max_features=i,oob_score=True)
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    Oob_Accuracy.append([i,np.array(clf.oob_score_)])
df = pd.DataFrame(Oob_Accuracy,columns=['Number_of_Features','Oob Accuracy'])
df

In [None]:
df=df.sort_values(by=['Oob Accuracy'],ascending=False) ; df

# We are plotting a scatter graph to pick the number of features that gives the highest accuracy 

In [None]:
# Scatter of out-of-bag accuracy against the number of features tried per split.
fig = plt.figure(figsize=(5, 4), dpi=100)
ax = fig.add_subplot(1, 1, 1)
feature_counts = df['Number_of_Features'].values
oob_values = df['Oob Accuracy'].values
ax.scatter(feature_counts, oob_values, label='Oob Accuracy')
ax.set_xlabel('Number_of_Features')
ax.set_ylabel('Accuracy')
ax.tick_params(axis='x', labelsize=8)
ax.legend(loc='best')

# We are creating a new random forest with the best parameters that we choose above.

In [None]:
# Forest with 600 trees and 5 candidate features per split; report test accuracy.
clf = RandomForestClassifier(
    n_estimators=600, max_features=5, oob_score=True, random_state=0
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Test Accuracy:', test_accuracy)

# We are forming a confusion matrix  

In [None]:
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
cm

# this will visualize the confusion matrix and help us to interpret our model's prediction.

In [None]:
# Draw the confusion matrix computed above.
disp = ConfusionMatrixDisplay(cm)
disp.plot()

In [None]:
# Seeded base estimator for the grid search. It is kept under its own name so the
# per-fold fitted model below cannot replace it: previously `clf` was overwritten
# inside the loop with an unseeded forest, which then became the base estimator of
# the next fold's search and made the results non-reproducible.
rf_base = RandomForestClassifier(random_state=0)
# number of trees in random forest
n_estimators = [100,200,300,400,500,600]
# number of features at every split
max_features = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
# create grid
params = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 }

# Nested cross-validation: the outer 5 folds estimate test performance, the inner
# 5-fold grid search (run on each outer training part) selects the parameters.
# Note: X_train / X_test / y_train / y_test are reassigned to the outer fold parts.

cv = KFold(n_splits=5, random_state=1, shuffle=True)
CVAccuracy=[]
for train_index, validation_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[validation_index]
    y_train, y_test = y.iloc[train_index], y.iloc[validation_index]
    # Grid search of parameters on the outer-training part only
    clf_grid = GridSearchCV(estimator = rf_base, param_grid = params,
                                cv = 5, verbose=2, scoring='accuracy',n_jobs = -1)
    # Fit the model
    clf_grid.fit(X_train, y_train)
    # print results
    print(clf_grid.best_params_)
    # GridSearchCV refits the best configuration on the whole outer-training part
    # by default (refit=True), so the seeded, fitted model is reused instead of
    # training a second, unseeded forest.
    clf = clf_grid.best_estimator_
    y_pred=clf.predict(X_test)
    #Test the performance on the outer test fold
    CVAccuracy.append(accuracy_score(y_test, y_pred))

In [None]:
CVAccuracy

In [None]:
np.mean(CVAccuracy)

# We are plotting the impurity-based feature importances of the forest

In [None]:
# Map each predictor name to the importance reported by the most recently fitted
# forest (`clf`), print the ranking and draw it as a horizontal bar chart.
X_col = X.columns
feats = dict(zip(X_col, clf.feature_importances_))  # feature_name -> importance
importances = (
    pd.DataFrame.from_dict(feats, orient='index')
    .rename(columns={0: 'Gini-importance'})
    .sort_values(by='Gini-importance', ascending=False)
)
print(importances)
# argsort is ascending, so the largest importance ends up as the last bar drawn.
sorted_idx = clf.feature_importances_.argsort()
plt.barh(X_col[sorted_idx], clf.feature_importances_[sorted_idx])