In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn import neighbors, preprocessing
from sklearn.metrics import confusion_matrix, classification_report
import warnings

# Suppress every warning, of every category, for the rest of the kernel session.
# NOTE(review): this also hides deprecation and convergence warnings from
# pandas / scikit-learn that could explain surprising results; consider
# restricting the filter to the specific category that is noisy.
warnings.filterwarnings("ignore")

In [14]:
# --- Configuration ---------------------------------------------------------
# NOTE(review): still a machine-specific location; it is kept in one constant
# so that it only has to be changed in one place.
DATA_DIR = "~/Desktop/Columbia 2024"
TIER_LABELS = ['S', 'A', 'B', 'C', 'D', 'E']
# Column-name fragments that mark a column as a numeric count to be cleaned.
COUNT_KEYWORDS = ('View', 'Like', 'Count', 'Rank', 'Stations', 'Reach', 'Stream', 'Post', 'Spin')
# Track-level metrics that are averaged per artist (besides 'Spotify Streams').
FEATURE_COLUMNS = [
    'Spotify Playlist Count', 'Spotify Playlist Reach', 'Spotify Popularity',
    'YouTube Views', 'YouTube Likes', 'TikTok Posts', 'TikTok Likes',
    'TikTok Views', 'YouTube Playlist Reach', 'Apple Music Playlist Count',
    'AirPlay Spins', 'SiriusXM Spins', 'Deezer Playlist Count',
    'Deezer Playlist Reach', 'Amazon Playlist Count', 'Pandora Streams',
    'Pandora Track Stations', 'Soundcloud Streams', 'Shazam Counts',
]

# --- Load and clean the track-level dataset --------------------------------
file = pd.read_csv(f"{DATA_DIR}/Most Streamed Spotify Songs 2024.csv", encoding='latin1')

# Strip thousands separators and convert every count-like column to float64.
# 'Spotify Streams' contains 'Stream', so it is handled by this loop as well
# and needs no separate treatment.
# NOTE(review): values that fail to parse (and missing values) become 0, which
# pulls the per-artist averages down; confirm that this is intended.
candidates = [col for col in file.columns if any(key in col for key in COUNT_KEYWORDS)]
for col in candidates:
    without_commas = file[col].replace(",", "", regex=True)
    file[col] = pd.to_numeric(without_commas, errors='coerce').fillna(0).astype(np.float64)

# --- Reshape into one row per artist ---------------------------------------
# A single groupby keeps all averages aligned on the artist key instead of
# relying on separately computed results happening to share the same order.
by_artist = file.groupby('Artist')
adf = by_artist[['Spotify Streams'] + FEATURE_COLUMNS].mean().add_prefix('Average ')
# Number of tracks per artist, placed right after 'Average Spotify Streams'.
adf.insert(1, 'Track Counts', by_artist.size())
adf = adf.reset_index()

# --- Attach the community tier labels --------------------------------------
training_tiers = pd.read_csv(f"{DATA_DIR}/artist_tiers.csv", encoding='latin1')
training_tiers = training_tiers.loc[training_tiers['Tier'].isin(TIER_LABELS)]
adf['Is Training'] = adf['Artist'].isin(training_tiers['Artist'])
# NOTE(review): assumes each artist appears at most once in artist_tiers.csv;
# duplicates would duplicate rows in this join - TODO confirm.
adf = adf.join(training_tiers.set_index('Artist'), on='Artist')

# --- Split into labelled (training) and unlabelled (testing) artists -------
non_feature_columns = ['Artist', 'Is Training', 'Tier']
labelled = adf[adf['Is Training']].dropna()
y_train = labelled['Tier']
x_train = labelled.drop(columns=non_feature_columns)
x_test = adf[~adf['Is Training']].drop(columns=non_feature_columns).dropna()

# --- Model and prediction --------------------------------------------------
# NOTE(review): the features are passed to KNN unscaled, so columns with the
# largest magnitudes presumably dominate the distance; consider standardising.
knn = neighbors.KNeighborsClassifier(n_neighbors=1)
y_pred = knn.fit(x_train, y_train).predict(x_test)

# --- Round-trip check ------------------------------------------------------
# Fit a second model on the predicted tiers of the unlabelled artists and use
# it to predict the labelled artists. A separate estimator is used so that
# `knn` stays the model that was trained on the real tiers.
# NOTE(review): this is a consistency check, not a held-out evaluation;
# cross-validation on the labelled artists would measure generalisation.
round_trip_knn = neighbors.KNeighborsClassifier(n_neighbors=1)
y_pred2 = round_trip_knn.fit(x_test, y_pred).predict(x_train)
# Transposed so that rows are predicted tiers and columns are true tiers.
print(confusion_matrix(y_train, y_pred2).T)
print(classification_report(y_train, y_pred2))

[[ 7  0  0  1  0  1]
 [ 0 13  1  1  0  1]
 [ 1  0 16  1  0  0]
 [ 3  3  1 19  0  0]
 [ 0  0  0  1 14  0]
 [ 1  1  0  0  0  4]]
              precision    recall  f1-score   support

           A       0.78      0.58      0.67        12
           B       0.81      0.76      0.79        17
           C       0.89      0.89      0.89        18
           D       0.73      0.83      0.78        23
           E       0.93      1.00      0.97        14
           S       0.67      0.67      0.67         6

    accuracy                           0.81        90
   macro avg       0.80      0.79      0.79        90
weighted avg       0.81      0.81      0.81        90



In [21]:
# Looking for the best k with the same round-trip check as above: for each k,
# predict tiers for the unlabelled artists, fit a second model on those
# predictions, and score it on the labelled artists.
# Loop-local estimators and names are used so that `knn`, `y_pred` and
# `y_pred2` from the previous cell are not overwritten by the sweep.
# NOTE(review): the round-trip score is not a held-out accuracy, so the k it
# favours is not necessarily the k that generalises best.
MAX_K = 30
k_accuracy = {}
for k in range(1, MAX_K + 1):
    forward_knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    pseudo_labels = forward_knn.fit(x_train, y_train).predict(x_test)

    backward_knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    # .score() returns the mean accuracy of the predictions on x_train.
    k_accuracy[k] = float(backward_knn.fit(x_test, pseudo_labels).score(x_train, y_train))
    print(k, k_accuracy[k])

1 0.8111111111111111
2 0.5222222222222223
3 0.5333333333333333
4 0.5444444444444444
5 0.5111111111111111
6 0.45555555555555555
7 0.43333333333333335
8 0.4
9 0.43333333333333335
10 0.37777777777777777
11 0.43333333333333335
12 0.4111111111111111
13 0.36666666666666664
14 0.34444444444444444
15 0.32222222222222224
16 0.3
17 0.3111111111111111
18 0.2777777777777778
19 0.3
20 0.32222222222222224
21 0.32222222222222224
22 0.3
23 0.3
24 0.3
25 0.3
26 0.3
27 0.3111111111111111
28 0.3111111111111111
29 0.32222222222222224
30 0.3111111111111111
