In [17]:
# Import modules
from pathlib import Path
from warnings import simplefilter

# Silence FutureWarnings before the third-party libraries are imported
simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [29]:
# Location of the raw track data: a CSV hosted in the project's GitHub repository.
# Reading it requires network access.
url = "https://github.com/jossharlequin/spotify-popularity-project/raw/main/Resources/data.csv"

# Load the CSV straight from the URL into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Preview the top five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [19]:
# Build a cleaned copy without the text / identifier columns
# (artists, name, id, release_date); the original `df` is left untouched.
df_clean = df.drop(labels=['artists', 'name', 'id', 'release_date'], axis='columns')

# Preview the remaining columns
df_clean.head(n=5)

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,year
0,0.995,0.708,158648,0.195,0,0.563,10,0.151,-12.428,1,0,0.0506,118.469,0.779,1928
1,0.994,0.379,282133,0.0135,0,0.901,8,0.0763,-28.454,1,0,0.0462,83.972,0.0767,1928
2,0.604,0.749,104300,0.22,0,0.0,5,0.119,-19.924,0,0,0.929,107.177,0.88,1928
3,0.995,0.781,180760,0.13,0,0.887,1,0.111,-14.734,0,0,0.0926,108.003,0.72,1928
4,0.99,0.21,687733,0.204,0,0.908,11,0.098,-16.829,1,1,0.0424,62.149,0.0693,1928


In [20]:
# Separate the features (X) from the target variable (y):
# X is every column except 'popularity'; y is the 'popularity' column itself.
X = df_clean.drop('popularity', axis=1)
y = df_clean['popularity']

In [21]:
# Show the first five rows of the feature matrix
X.head(5)

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,year
0,0.995,0.708,158648,0.195,0,0.563,10,0.151,-12.428,1,0.0506,118.469,0.779,1928
1,0.994,0.379,282133,0.0135,0,0.901,8,0.0763,-28.454,1,0.0462,83.972,0.0767,1928
2,0.604,0.749,104300,0.22,0,0.0,5,0.119,-19.924,0,0.929,107.177,0.88,1928
3,0.995,0.781,180760,0.13,0,0.887,1,0.111,-14.734,0,0.0926,108.003,0.72,1928
4,0.99,0.21,687733,0.204,0,0.908,11,0.098,-16.829,1,0.0424,62.149,0.0693,1928


In [22]:
# Show the first five values of the target variable
y.head(5)

0    0
1    0
2    0
3    0
4    1
Name: popularity, dtype: int64

In [23]:
# Partition features and target into training and testing sets.
# random_state is pinned so the shuffle (and therefore the split) is reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1)

In [24]:
# Create the scaler and fit it on the training features only, so the
# test set has no influence on the fitted scaling.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training set first, then to the testing set
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [25]:
# Instantiate the KNeighborsClassifier model with n_neighbors = 3.
# KNeighborsClassifier is imported in the imports cell at the top of the
# notebook, so all dependencies are declared in one place.
knn = KNeighborsClassifier(n_neighbors=3)

In [26]:
# Fit the classifier on the scaled training features and their labels
knn.fit(X=X_train_scaled, y=y_train)

In [27]:
# Predict a popularity label for every row of the scaled testing features
y_pred = knn.predict(X=X_test_scaled)

In [28]:
# Print per-class precision, recall and F1 comparing the true test labels
# with the model's predictions.
# zero_division=0 scores a metric as 0 when it is undefined (for example a
# class that is never predicted) instead of emitting an UndefinedMetricWarning
# for each affected average; the reported numbers are unchanged.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.64      0.91      0.75      6887
           1       0.07      0.06      0.07       547
           2       0.04      0.03      0.03       395
           3       0.05      0.04      0.05       316
           4       0.02      0.02      0.02       285
           5       0.02      0.02      0.02       237
           6       0.04      0.05      0.04       260
           7       0.03      0.03      0.03       318
           8       0.02      0.03      0.02       298
           9       0.03      0.04      0.03       339
          10       0.03      0.04      0.04       301
          11       0.03      0.03      0.03       335
          12       0.04      0.05      0.04       321
          13       0.05      0.06      0.05       329
          14       0.04      0.05      0.04       315
          15       0.03      0.03      0.03       296
          16       0.03      0.03      0.03       292
          17       0.04    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
