In [1]:
# Import modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('Resources/data.csv')


In [25]:
# Export data for website
export_data2015 = data.loc[data['year'] == 2015]
export_data2016 = data.loc[data['year'] == 2016]
export_data2017 = data.loc[data['year'] == 2017]
export_data2018 = data.loc[data['year'] == 2018]
export_data2019 = data.loc[data['year'] == 2019]
export_data2020 = data.loc[data['year'] == 2020]
export_data = pd.concat([export_data2015, export_data2016, export_data2017, export_data2018, export_data2019, export_data2020])
export_data = export_data.sort_values(by=['name'])
export_data = export_data.drop_duplicates(subset=['name'])
export_data.to_csv('Resources/export_data.csv')

11656
10393


In [23]:
# Convert export data to JSON
import csv
import json
gvn_csvfile = 'Resources/export_data.csv'
gvn_jsonfile = 'Resources/export_data.json'
json_dictionary = {}
with open(gvn_csvfile) as gvn_csvfile:
    csvfiledata = csv.DictReader(gvn_csvfile)
    json_dictionary['data']= []
    for row_data in csvfiledata:
        print(row_data)
        json_dictionary['data'].append(row_data)
with open(gvn_jsonfile, 'w') as gvn_jsonfile:
    gvn_jsonfile.write(json.dumps(json_dictionary, indent=4))

{'': '70583', 'acousticness': '0.932', 'artists': "['Alejandro Sanz']", 'danceability': '0.466', 'duration_ms': '142436', 'energy': '0.3229999999999999', 'explicit': '0', 'id': '2O9RO8AxGmTywfJv3s454y', 'instrumentalness': '0.0', 'key': '4', 'liveness': '0.243', 'loudness': '-7.336', 'mode': '1', 'name': '#ElMundoFuera (Improvisación)', 'popularity': '0', 'release_date': '2020-06-12', 'speechiness': '0.0338', 'tempo': '110.526', 'valence': '0.55', 'year': '2020'}
{'': '144075', 'acousticness': '0.0636', 'artists': "['XXXTENTACION', 'Matt Ox']", 'danceability': '0.774', 'duration_ms': '130403', 'energy': '0.507', 'explicit': '1', 'id': '65u1dHQyQyE4y4aN2eDmMF', 'instrumentalness': '0.0', 'key': '1', 'liveness': '0.138', 'loudness': '-6.952000000000001', 'mode': '0', 'name': '$$$ - with Matt Ox', 'popularity': '68', 'release_date': '2018-03-16', 'speechiness': '0.065', 'tempo': '140.067', 'valence': '0.508', 'year': '2018'}
{'': '23735', 'acousticness': '0.325', 'artists': "['Healy']", '

In [44]:
bins = [-1,50,100]
labels = ['0','1',]

data['popularity_bin'] = pd.cut(data['popularity'], bins, labels=labels)
data_clean = data.drop(columns=['artists','name','id','release_date','popularity','year'])
data_clean

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,year,popularity_bin
0,0.9950,0.708,158648,0.1950,0,0.563000,10,0.1510,-12.428,1,0.0506,118.469,0.7790,1928,0-25
1,0.9940,0.379,282133,0.0135,0,0.901000,8,0.0763,-28.454,1,0.0462,83.972,0.0767,1928,0-25
2,0.6040,0.749,104300,0.2200,0,0.000000,5,0.1190,-19.924,0,0.9290,107.177,0.8800,1928,0-25
3,0.9950,0.781,180760,0.1300,0,0.887000,1,0.1110,-14.734,0,0.0926,108.003,0.7200,1928,0-25
4,0.9900,0.210,687733,0.2040,0,0.908000,11,0.0980,-16.829,1,0.0424,62.149,0.0693,1928,0-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169904,0.1730,0.875,163800,0.4430,1,0.000032,1,0.0891,-7.461,1,0.1430,100.012,0.3060,2020,51-75
169905,0.0167,0.719,167468,0.3850,0,0.031300,8,0.1110,-10.907,1,0.0403,128.000,0.2700,2020,51-75
169906,0.5380,0.514,180700,0.5390,0,0.002330,7,0.1080,-9.332,1,0.1050,123.700,0.1530,2020,51-75
169907,0.0714,0.646,167308,0.7610,0,0.000000,1,0.2220,-2.557,1,0.0385,129.916,0.4720,2020,51-75


In [45]:
# Seperate the features, X,  from the target variable, y
y = data_clean['popularity_bin']
X = data_clean.drop(columns='popularity_bin')

# Preview the features data
X[:5]

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,year
0,0.995,0.708,158648,0.195,0,0.563,10,0.151,-12.428,1,0.0506,118.469,0.779,1928
1,0.994,0.379,282133,0.0135,0,0.901,8,0.0763,-28.454,1,0.0462,83.972,0.0767,1928
2,0.604,0.749,104300,0.22,0,0.0,5,0.119,-19.924,0,0.929,107.177,0.88,1928
3,0.995,0.781,180760,0.13,0,0.887,1,0.111,-14.734,0,0.0926,108.003,0.72,1928
4,0.99,0.21,687733,0.204,0,0.908,11,0.098,-16.829,1,0.0424,62.149,0.0693,1928


In [46]:
# Preview the first five entries for the target variable
y[:5]

0    0-25
1    0-25
2    0-25
3    0-25
4    0-25
Name: popularity_bin, dtype: category
Categories (4, object): ['0-25' < '26-50' < '51-75' < '75-100']

In [48]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [49]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

In [50]:
# Import the KNeighborsClassifier module from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the KNeighborsClassifier model with n_neighbors = 3 
knn = KNeighborsClassifier(n_neighbors=3)

In [51]:
# Train the model using the training data
knn.fit(X_train_scaled, y_train)

In [52]:
# Create predictions using the testing data
y_pred = knn.predict(X_test_scaled)

In [53]:
# Print the classification report comparing the testing data to the model predictions
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        0-25       0.80      0.85      0.82     16107
       26-50       0.65      0.67      0.66     17470
       51-75       0.57      0.50      0.53      8526
      75-100       0.12      0.02      0.04       375

    accuracy                           0.70     42478
   macro avg       0.54      0.51      0.51     42478
weighted avg       0.69      0.70      0.69     42478



In [None]:
prediction = knn.predict_proba(X_test_scaled)[:, 1]

In [None]:
print((prediction[0:100]))

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, prediction)