In [1]:
# Import modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the song-level table; the path is relative to the notebook's working directory
data = pd.read_csv(filepath_or_buffer='Resources/data.csv')


In [97]:
# Hand-picked audio-feature window for the playlist; every bound is a strict
# inequality, and a row must satisfy all of them to be kept.
playlist_mask = (
    (data['speechiness'] > 0.22) & (data['speechiness'] < 0.37)
    & (data['loudness'] > -8)
    & (data['danceability'] > 0.90) & (data['danceability'] < 0.98)
    & (data['acousticness'] < 0.04)
    & (data['liveness'] < 0.1)
)
best_songs = data.loc[playlist_mask]

# Summarise the resulting playlist: size and mean popularity
print(
    "Best Songs Playlist",
    f"Playlist Length: {len(best_songs)}",
    f"Average Popularity: {best_songs['popularity'].mean()}",
    sep="\n",
)

Best Songs Playlist
Playlist Length: 34
Average Popularity: 57.23529411764706


In [21]:
# Two-way popularity split: values in (-1, 33] get label '0', values in (33, 100] get label '1'
# (pd.cut uses right-closed intervals by default).
bins = [-1, 33, 100]
labels = ['0', '1']
data['popularity_bin'] = pd.cut(data['popularity'], bins=bins, labels=labels)

# Columns removed before modelling; the raw popularity score goes too, since
# popularity_bin is derived from it.
dropped_columns = ['artists', 'name', 'id', 'release_date', 'popularity', 'year', 'key']
data_clean = data.drop(columns=dropped_columns)
data_clean

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,liveness,loudness,mode,speechiness,tempo,valence,popularity_bin
0,0.9950,0.708,158648,0.1950,0,0.563000,0.1510,-12.428,1,0.0506,118.469,0.7790,0
1,0.9940,0.379,282133,0.0135,0,0.901000,0.0763,-28.454,1,0.0462,83.972,0.0767,0
2,0.6040,0.749,104300,0.2200,0,0.000000,0.1190,-19.924,0,0.9290,107.177,0.8800,0
3,0.9950,0.781,180760,0.1300,0,0.887000,0.1110,-14.734,0,0.0926,108.003,0.7200,0
4,0.9900,0.210,687733,0.2040,0,0.908000,0.0980,-16.829,1,0.0424,62.149,0.0693,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
169904,0.1730,0.875,163800,0.4430,1,0.000032,0.0891,-7.461,1,0.1430,100.012,0.3060,1
169905,0.0167,0.719,167468,0.3850,0,0.031300,0.1110,-10.907,1,0.0403,128.000,0.2700,1
169906,0.5380,0.514,180700,0.5390,0,0.002330,0.1080,-9.332,1,0.1050,123.700,0.1530,1
169907,0.0714,0.646,167308,0.7610,0,0.000000,0.2220,-2.557,1,0.0385,129.916,0.4720,1


In [22]:
# Split the modelling frame into the target (y) and the feature matrix (X)
target_column = 'popularity_bin'
y = data_clean[target_column]
X = data_clean.drop(columns=[target_column])

# Show the first five feature rows
X.head(5)

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,liveness,loudness,mode,speechiness,tempo,valence
0,0.995,0.708,158648,0.195,0,0.563,0.151,-12.428,1,0.0506,118.469,0.779
1,0.994,0.379,282133,0.0135,0,0.901,0.0763,-28.454,1,0.0462,83.972,0.0767
2,0.604,0.749,104300,0.22,0,0.0,0.119,-19.924,0,0.929,107.177,0.88
3,0.995,0.781,180760,0.13,0,0.887,0.111,-14.734,0,0.0926,108.003,0.72
4,0.99,0.21,687733,0.204,0,0.908,0.098,-16.829,1,0.0424,62.149,0.0693


In [23]:
# Show the first five target values
y.head(5)

0    0
1    0
2    0
3    0
4    0
Name: popularity_bin, dtype: category
Categories (2, object): ['0' < '1']

In [24]:
# Hold out a test partition; random_state=1 makes the shuffle repeatable and
# the split proportion is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [25]:
# Create the scaler and learn its statistics from the training features only,
# so nothing about the test partition influences the scaling.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and testing partitions, in that order
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [26]:
# Bring in the k-nearest-neighbours classifier
from sklearn.neighbors import KNeighborsClassifier

# Build an unfitted classifier that consults the 3 nearest neighbours
neighbor_count = 3
knn = KNeighborsClassifier(n_neighbors=neighbor_count)

In [27]:
# Fit the classifier on the scaled training features and their labels
knn.fit(X=X_train_scaled, y=y_train)

In [28]:
# Predict a class label for every scaled test row
y_pred = knn.predict(X=X_test_scaled)

In [29]:
# Per-class precision / recall / F1 of the predictions against the held-out labels
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.75      0.75      0.75     21268
           1       0.75      0.75      0.75     21210

    accuracy                           0.75     42478
   macro avg       0.75      0.75      0.75     42478
weighted avg       0.75      0.75      0.75     42478



In [30]:
# Keep only column 1 of the class-probability matrix.
# NOTE(review): assumes knn.classes_ is ['0', '1'] so column 1 is the score for label '1' — confirm.
class_probabilities = knn.predict_proba(X=X_test_scaled)
prediction = class_probabilities[:, 1]

In [31]:
# Peek at the first 100 predicted scores
print(prediction[:100])

[0.33333333 0.         0.66666667 0.33333333 0.66666667 0.
 0.33333333 0.66666667 0.         0.         0.         1.
 0.66666667 0.         0.         0.33333333 0.66666667 0.66666667
 0.33333333 0.         0.66666667 0.66666667 0.         0.33333333
 0.         0.33333333 0.66666667 0.         1.         1.
 1.         0.66666667 0.         1.         0.         1.
 0.         0.         0.66666667 0.         0.         0.66666667
 0.33333333 0.66666667 0.         1.         1.         0.66666667
 1.         0.66666667 0.         1.         1.         0.33333333
 0.33333333 1.         0.66666667 0.         0.66666667 0.66666667
 0.         0.66666667 0.         0.33333333 0.         0.33333333
 0.33333333 1.         0.         0.         0.33333333 0.33333333
 1.         0.33333333 0.         0.66666667 0.         0.
 0.66666667 1.         1.         0.66666667 0.66666667 0.66666667
 0.33333333 0.33333333 0.66666667 0.33333333 1.         1.
 0.66666667 1.         0.33333333 0.       

In [32]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the column-1 probabilities against the test labels
roc_auc_score(y_true=y_test, y_score=prediction)

0.8077273380633423