In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split  
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn import metrics  
from sklearn import tree
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE 
from imblearn.pipeline import Pipeline
from spotipy.oauth2 import SpotifyClientCredentials
from pandas.io.json import json_normalize
import spotipy
import spotipy.util as util

In [6]:
df=pd.read_csv('billboard_hits_with_features.csv',index_col=0)

In [7]:
df.head()

Unnamed: 0,Year,Songs,Artists,Track_ID,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,2014,Happy,Pharrell Williams,spotify:track:6NPVjNh8Jhru9xOmyQigds,0.286,0.652,233305,0.757,0.0,1,0.0886,-6.819,1,0.153,159.911,4,0.962
1,2014,Dark Horse,Katy Perry featuring Juicy J,spotify:track:5jrdCoLpJSvHHorevXBATy,0.00314,0.645,215672,0.585,0.0,6,0.165,-6.122,1,0.0513,131.931,4,0.353
2,2014,All of Me,John Legend,spotify:track:3U4isOIWM3VvDubwSI3y7a,0.922,0.422,269560,0.264,0.0,8,0.132,-7.064,1,0.0322,119.93,4,0.331
3,2014,Fancy,Iggy Azalea featuring Charli XCX,spotify:track:3oiMJQAWVaxSubJ7b2VUtX,0.102,0.911,199938,0.707,0.0,8,0.049,-4.136,1,0.0696,94.964,4,0.375
4,2014,Counting Stars,OneRepublic,spotify:track:6sy3LkhNFjJWlaeSMNwQ62,0.0649,0.663,257840,0.714,0.0,1,0.116,-4.944,0,0.038,121.99,4,0.468


In [8]:
df['top_10'] = 0
df['top_10'].loc[0:9] = 1
df['top_10'].loc[100:109] = 1
df['top_10'].loc[200:209] = 1
df['top_10'].loc[300:309] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [9]:
df_train = df.iloc[:300, :] # 2014-2016
df_test = df.iloc[300:,:] # 2017

In [10]:
df_test.head()

Unnamed: 0,Year,Songs,Artists,Track_ID,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,top_10
300,2017,Shape of You,Ed Sheeran,spotify:track:7qiZfU4dY1lWllzX7mPBI3,0.581,0.825,233713,0.652,0.0,1,0.0931,-3.183,0,0.0802,95.977,4,0.931,1
301,2017,Despacito,Luis Fonsi and Daddy Yankee featuring Justin B...,spotify:track:6rPO02ozF3bM7NnOV4h6s2,0.222,0.63,228827,0.815,0.0,2,0.091,-4.265,1,0.152,88.357,4,0.813,1
302,2017,That's What I Like,Bruno Mars,spotify:track:0KKkJNfGyhkQ5aFogxQAPU,0.013,0.853,206693,0.56,0.0,1,0.0944,-4.961,1,0.0406,134.066,4,0.86,1
303,2017,Humble,Kendrick Lamar,spotify:track:7KXjTSCq5nL1LoYtL7XAwS,0.000259,0.904,177000,0.611,2e-05,1,0.0976,-6.842,0,0.0888,150.02,4,0.4,1
304,2017,Something Just Like This,The Chainsmokers and Coldplay,spotify:track:1dNIEtp7AY3oDAKCGg2XkH,0.0306,0.607,247627,0.649,2.5e-05,11,0.174,-6.695,0,0.0362,102.996,4,0.505,1


In [11]:
print('Number of observations in the training data:', len(df_train))
print('Number of observations in the test data:',len(df_test))

Number of observations in the training data: 300
Number of observations in the test data: 100


In [12]:
df_features = df.drop(['Year', 'Songs','Artists','Track_ID','top_10'], axis=1)
features = df_features.columns[:]
features

Index(['acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')

In [13]:
df_means = np.mean(df_train[features])
df_std = np.std(df_train[features])
df_train_features = (df_train[features] - df_means)/df_std
df_test_features = (df_test[features] - df_means)/df_std

In [14]:
y = df_train['top_10']
y2 = df_test['top_10']

In [15]:
# Create a random forest Classifier. 
clf = RandomForestClassifier(n_jobs=2, random_state=0, n_estimators=1001, class_weight='balanced')

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(df_train_features, y)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1001, n_jobs=2, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [16]:
preds = clf.predict(df_test_features)
preds

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [17]:
clf.predict_proba(df_test_features)[0:10]

array([[ 0.8415738 ,  0.1584262 ],
       [ 0.89735528,  0.10264472],
       [ 0.912978  ,  0.087022  ],
       [ 0.87299454,  0.12700546],
       [ 0.95564436,  0.04435564],
       [ 0.90729271,  0.09270729],
       [ 0.31383879,  0.68616121],
       [ 0.78865685,  0.21134315],
       [ 0.94497515,  0.05502485],
       [ 0.93506494,  0.06493506]])

In [18]:
pd.crosstab(df_test['top_10'], preds, rownames=['Actual Hit'], colnames=['Predicted Hit'])

Predicted Hit,0,1
Actual Hit,Unnamed: 1_level_1,Unnamed: 2_level_1
0,89,1
1,9,1


In [19]:
list(zip(df_train[features], clf.feature_importances_))

[('acousticness', 0.087207032792433276),
 ('danceability', 0.098921327097575493),
 ('duration_ms', 0.096781889837850563),
 ('energy', 0.090343528195510836),
 ('instrumentalness', 0.051386637300595117),
 ('key', 0.073958175239767596),
 ('liveness', 0.085725469617879427),
 ('loudness', 0.10469079172206663),
 ('mode', 0.023683742577832505),
 ('speechiness', 0.087471801537929356),
 ('tempo', 0.091638094754767571),
 ('time_signature', 0.0026525647082364426),
 ('valence', 0.1055389446175555)]

In [20]:
# Logistic Regression without any fine tuning
logreg = LogisticRegression()

In [21]:
logit_model=sm.Logit(y,df_train[features])
result=logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.302326
         Iterations 11
                           Logit Regression Results                           
Dep. Variable:                 top_10   No. Observations:                  300
Model:                          Logit   Df Residuals:                      287
Method:                           MLE   Df Model:                           12
Date:                Mon, 13 Aug 2018   Pseudo R-squ.:                 0.07000
Time:                        17:28:39   Log-Likelihood:                -90.698
converged:                       True   LL-Null:                       -97.525
                                        LLR p-value:                    0.3233
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
acousticness         0.4689      1.086      0.432      0.666      -1.660       2.598
danceabili

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV 

clf = Pipeline([('sampling', SMOTE(random_state=42)), ('rf', RandomForestClassifier(random_state=42))])

#Grid search with cross validation
params = {
    'rf__n_estimators': [11, 51, 101, 201],
    'rf__max_depth': [10, 20, None],
    'rf__min_samples_split': [2, 10, 20, 50],
    'rf__min_samples_leaf': [1, 5, 10, 20]
}

model = GridSearchCV(clf, params, scoring='average_precision', n_jobs=-1)

model.fit(df_train_features, y)

print('Best parameters were', model.best_params_)
print('Best average precision was', model.best_score_)

Best parameters were {'rf__max_depth': 20, 'rf__min_samples_leaf': 5, 'rf__min_samples_split': 2, 'rf__n_estimators': 51}
Best average precision was 0.182476836836


In [23]:
from scipy.stats import multivariate_normal
from sklearn.metrics import f1_score

In [24]:
def read_dataset(filePath,delimiter=','):
    return genfromtxt(filePath, delimiter=delimiter)

def feature_normalize(dataset):
    mu = np.mean(dataset,axis=0)
    sigma = np.std(dataset,axis=0)
    return (dataset - mu)/sigma

def estimateGaussian(dataset):
    mu = np.mean(dataset, axis=0)
    sigma = np.cov(dataset.T)
    return mu, sigma
    
def multivariateGaussian(dataset,mu,sigma):
    p = multivariate_normal(mean=mu, cov=sigma)
    return p.pdf(dataset)

In [25]:
def selectThresholdByCV(probs,gt):
    best_epsilon = 0
    best_f1 = 0
    f = 0
    stepsize = (max(probs) - min(probs)) / 1000;
    epsilons = np.arange(min(probs),max(probs),stepsize)
    for epsilon in np.nditer(epsilons):
        predictions = (probs < epsilon)
        f = f1_score(gt, predictions, average = "binary")
        if f > best_f1:
            best_f1 = f
            best_epsilon = epsilon
    return best_f1, best_epsilon

In [26]:
tr_data = df_train_features
cv_data = df_test_features

In [27]:
mu, sigma = estimateGaussian(tr_data)
p = multivariateGaussian(tr_data,mu,sigma)

p_cv = multivariateGaussian(cv_data,mu,sigma)
fscore, ep = selectThresholdByCV(p_cv,y2)
outliers = np.asarray(np.where(p < ep))

print(fscore)

  'precision', 'predicted', average, warn_for)


0.192771084337


In [28]:
# since neither SMOTE nor Anomaly detection gave good results (< 20% f=score), will build a recommendation system
from sklearn.cluster import KMeans

In [29]:
model = KMeans(n_clusters=20)

In [30]:
model.fit(df_train_features)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=20, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [31]:
labels = model.predict(df_train_features)
print(labels)

[13  1 12  1  3  4 13 13  2 12 15  4 13  1  3  1  8  4 12 16 12 10 10 10 10
  2  2 10  2 17 15  1  2 15  0  4  3  1  5  5  1 18  1 12 10 14 15  5 13  2
  0  4 16 16 16  5  9  5  1  3 10  9  1 15 16 10 15 14 16 14  6 17 13 14 15
 13 17  5 14  1  5  3  1  1  5 10 16  5 16 17 13  4  9 15  7  8 19 12 15 17
 17 17 16 13 17 10  1  4  8  3  1  5  0 18 14  3  2 13  0 14  9 13  1  5  4
 12  3 13 15 17  4 13 18 12  0 10  3  5  9 10  4 12 14  1 14  9 17  5  4 16
 19 15  3 14 16  3 14 12 14  9  5  5  8 18  2  5 15  3  5 16 10  4 15  4  2
  8  1 11  4  2 12 17  7 17  1  1 13  7 16 10 15  4 15 10 16 13  9 12  7  5
 19  2  4  1  2  7  0 14 10  1  5  4 16  5  5  1  5  2  3  1 16 15 10 17  5
 14  4  5  3  6 18  3 14  1 16  8  4 11 10  5 16  8 10 12  1 14 15 14  5 18
 15 19 16  9  1  4  7  7 16 12  8  4 15 12 16  3  5  9  2  7 11  5  3 16 17
 11  7 13 14  3  2 16  0  5  3  5 19  4  5 16  1 17  2  1 15  8 14 15 16 10]


In [32]:
new_labels = model.predict(df_test_features) 
print(new_labels)

[18 10  1  2  5 17  1  1  5 15 12  1  7  7  1 17  5 15 17  7 16  2 13  5 17
 17 10  4 16 10 18  1  8 18 16  1  7  1  5 10 17 10  7  5  2 15  7 16 10  5
 14  5  7 14  7  3 12 16  3  7 13  7 13 16 16 16 12  1  5  1  2 15 10  5 10
 15 12 17  7 17  2  1 16  0 10 17  0  3 15 11  1  5  1  4  1  1  3  8  2  5]


In [33]:
print(model.inertia_)

1567.488577


In [34]:
from sklearn.preprocessing import StandardScaler  
from sklearn.pipeline import make_pipeline

In [35]:
scaler = StandardScaler()

kmeans = KMeans(n_clusters=20)

pipeline = make_pipeline(scaler, kmeans)

In [36]:
pipeline.fit(df_train_features)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kmeans', KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=20, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0))])

In [37]:
labels_p = pipeline.predict(df_train_features)
print(labels_p)

[ 7  4 16 12  0 11 12 12  0  3 12 11  7 12  0 12 13 11  3 17  3  4  4  4 10
  0 14  4  0 15 12 12  0 12  1 15  0 12  5  5 11 15 17  3  4 14  4  5 12 12
  1  9  5 17 17  5  9  5 12  0  3 14 12  7 17  4 12  7 17 14 19 15  7  7 12
  7  4  5  7 12  5  0 15 12  5  4 17  5  1 15  7 11 14 12  1 18  6  3 12  4
 15 15 17 12 15  4 12 11 18  0 12  5  1  3  7  0  0  7  0  7  2 12 12  5 11
  3  0 12  4 15 11 12  3 16  1  4  0  5  9 10 17  3 14 12  7 14 15  5 11 17
  6  4  0 14 15  0  7  3 14  9  5  5 13  3  0  5 17  0  5 17  3 11 12 11 11
 18 12 16 17 14 16 15 17 15 12 12  7  6 17 10  7  9 12  4 17  7 14  3  6  5
  6  0 11 17  0  6  1  7  4  3  5 11 17  5  5 12  5  5  0  4 17 12  4 15  5
 14  4  5  0  8  3  0  7 17 17 18 11 16  4  5 17 18  3 16 12 14  4  7  5  3
  4  6 17  2 12 11  6  7 17  3 18 11 12  3  1  0  5  2  0 14 16  5  0  1 15
 16 14  7  7  0  0 17  1  5  0  5  6 11  5  5 11  4  0  3  4 13 14  4 17  4]


In [39]:
new_labels_p = pipeline.predict(df_test_features) 
print(new_labels_p)

[ 3  4 12  0  5 15  3  3  5 12  3  4  5 17 12 12  5 12 15  7  6  5 12  5 15
 15  4  6 17  4  3 11 18  0 17 17  6 17  5  4  4  4 14  5  0  4  5 17  4  5
  4  5 15  5  1  0  3 17  0  6 12 17 11 17  1 17  3 12  5 17  0 12  4  5  4
  4  3 17 14 15  5  3 17  4 12 15  1  0  7 16 17  5 17 11  7  4  0 18 17  5]


In [40]:
print(kmeans.inertia_)
#lower inertia with standard scaler

1590.30266571


In [41]:
all_labels = np.concatenate((labels_p, new_labels_p))
all_labels

array([ 7,  4, 16, 12,  0, 11, 12, 12,  0,  3, 12, 11,  7, 12,  0, 12, 13,
       11,  3, 17,  3,  4,  4,  4, 10,  0, 14,  4,  0, 15, 12, 12,  0, 12,
        1, 15,  0, 12,  5,  5, 11, 15, 17,  3,  4, 14,  4,  5, 12, 12,  1,
        9,  5, 17, 17,  5,  9,  5, 12,  0,  3, 14, 12,  7, 17,  4, 12,  7,
       17, 14, 19, 15,  7,  7, 12,  7,  4,  5,  7, 12,  5,  0, 15, 12,  5,
        4, 17,  5,  1, 15,  7, 11, 14, 12,  1, 18,  6,  3, 12,  4, 15, 15,
       17, 12, 15,  4, 12, 11, 18,  0, 12,  5,  1,  3,  7,  0,  0,  7,  0,
        7,  2, 12, 12,  5, 11,  3,  0, 12,  4, 15, 11, 12,  3, 16,  1,  4,
        0,  5,  9, 10, 17,  3, 14, 12,  7, 14, 15,  5, 11, 17,  6,  4,  0,
       14, 15,  0,  7,  3, 14,  9,  5,  5, 13,  3,  0,  5, 17,  0,  5, 17,
        3, 11, 12, 11, 11, 18, 12, 16, 17, 14, 16, 15, 17, 15, 12, 12,  7,
        6, 17, 10,  7,  9, 12,  4, 17,  7, 14,  3,  6,  5,  6,  0, 11, 17,
        0,  6,  1,  7,  4,  3,  5, 11, 17,  5,  5, 12,  5,  5,  0,  4, 17,
       12,  4, 15,  5, 14

In [42]:
Songs = df['Songs']
Artists = df['Artists']
df_cluster = pd.DataFrame({'labels': all_labels, 'songs': Songs, 'artists': Artists})
df_cluster.transpose()
df_cluster.head()

Unnamed: 0,artists,labels,songs
0,Pharrell Williams,7,Happy
1,Katy Perry featuring Juicy J,4,Dark Horse
2,John Legend,16,All of Me
3,Iggy Azalea featuring Charli XCX,12,Fancy
4,OneRepublic,0,Counting Stars


In [43]:
df_cluster.groupby('labels').count()

Unnamed: 0_level_0,artists,songs
labels,Unnamed: 1_level_1,Unnamed: 2_level_1
0,39,39
1,13,13
2,3,3
3,30,30
4,42,42
5,51,51
6,13,13
7,26,26
8,1,1
9,5,5


In [44]:
df_cluster.pivot_table(index=['labels','artists','songs'],fill_value=0, aggfunc='size')

labels  artists                                        songs                  
0       Adele                                          Water Under the Bridge     1
        Ariana Grande and The Weeknd                   Love Me Harder             1
        Ariana Grande featuring Iggy Azalea            Problem                    1
        Ariana Grande featuring Zedd                   Break Free                 1
        Avicii                                         Hey Brother                1
        Calvin Harris                                  Summer                     1
        Calvin Harris featuring Frank Ocean and Migos  Slide                      1
        Coldplay                                       Hymn for the Weekend       1
        DJ Snake and Lil Jon                           Turn Down for What         1
        DJ Snake featuring Bipolar Sunshine            Middle                     1
        Demi Lovato                                    Cool for the Summer       

In [45]:
username='username'

my_client_id='my_client_id'
my_client_secret='my_client_secret'
 
scope = 'user-library-read playlist-modify-public playlist-modify-private'

In [53]:
token = util.prompt_for_user_token(username, scope,client_id=my_client_id,client_secret=my_client_secret,redirect_uri='http://localhost:8888/')
myAuth="Bearer " + token

sp = spotipy.Spotify(auth=token)

In [88]:
def song_recommendations(track, artist):
    if 'featuring' in artist:
        artist = artist.split('featuring')[0]
    elif ' and ' in artist:
        artist = artist.split(' and ')[0]

    query = 'track:' + track + ' artist:' + artist
    track_id = sp.search(q=query , type='track')
    tid = track_id['tracks']['items'][0]['uri']
    feature = sp.audio_features(tid)
    df_feature = pd.DataFrame(feature)
    df_feature.drop(['analysis_url', 'id', 'track_href','uri','type'], inplace = True, axis=1)
    df_topredict = (df_feature - df_means)/df_std
    song_label = pipeline.predict(df_topredict)
    recommendations = df_cluster.loc[df_cluster['labels'] == song_label[0]]
    
    return recommendations

In [93]:
f = song_recommendations('Sing', 'My Chemical Romance')
f
# limited to certain genres

Unnamed: 0,artists,labels,songs
1,Katy Perry featuring Juicy J,4,Dark Horse
21,Avicii,4,Wake Me Up
22,Imagine Dragons,4,Demons
23,One Direction,4,Story of My Life
27,Disclosure featuring Sam Smith,4,Latch
44,Justin Timberlake,4,Not a Bad Thing
46,Paramore,4,Ain't It Fun
65,Florida Georgia Line,4,Dirt
76,Chris Brown featuring Usher and Rick Ross,4,New Flame
85,5 Seconds of Summer,4,Amnesia


Doesn't seem entirely accurate from the glance, but if more songs are added, perhaps the clustering will be better as there are some artists with similar styles to My Chemical Romance at least listed here.