In [44]:
from __future__ import division, print_function, unicode_literals

import numpy as np
import os
import pandas as pd
np.random.seed(42)

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import seaborn as sns
sns.set_style('white')


import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [45]:
songs = pd.read_csv("/Users/kashishsharma/Desktop/project_ds/swap-Spotify.csv")

In [46]:
songs.head()

Unnamed: 0.1,Unnamed: 0,artist,track,peak_position,peak_position_grouped,weeks_in_charts,spotify_id,explicit,spotify_popularity
0,0.0,The Weeknd,Blinding Lights,1,1,31,0VjIjW4GlUZAMYd2vXMi3b,0.0,99
1,1.0,SAINt JHN,Roses,4,1,15,7fPuWrlpwDcHm5aHCH5D9t,1.0,85
2,2.0,Harry Styles,Watermelon Sugar,8,2,15,6UelLqGlWMcVH1E5c4H7lY,0.0,94
3,3.0,Lil Mosey,Blueberry Faygo,9,2,21,6wJYhPfqk3KGhHRG76WzOh,1.0,91
4,4.0,Harry Styles,Adore You,6,2,30,3jjujdWJ72nww5eGnfs2E7,0.0,88


In [47]:
from sklearn import preprocessing
peak_position = songs[['peak_position']].values
min_max_scaler = preprocessing.MinMaxScaler()
peak_position_scaled = min_max_scaler.fit_transform(peak_position)
songs['peak_position'] = pd.DataFrame(peak_position_scaled)

ValueError: could not convert string to float: 'peak_position'

In [None]:
songs_features = songs.copy()
songs_features = songs_features.drop(['track','artist','spotify_id', 'spotify_popularity','spotify_popularity', 'peak_position_grouped','explicit', 'weeks_in_charts'],axis=1)

In [None]:
from sklearn.cluster import KMeans

Sum_of_squared_distances = []
K = range(1,20)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(songs_features)
    Sum_of_squared_distances.append(km.inertia_)

In [None]:
from sklearn.metrics import silhouette_score
for n_clusters in range(2,10):
    clusterer = KMeans (n_clusters=n_clusters)
    preds = clusterer.fit_predict(songs_features)
    centers = clusterer.cluster_centers_

    score = silhouette_score (songs_features, preds, metric='euclidean')
    print ("For n_clusters = {}, silhouette score is {})".format(n_clusters, score))

In [None]:
plt.plot(K, Sum_of_squared_distances, 'gx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
kmeans = KMeans(n_clusters=2)
kmeans.fit(songs_features)

In [None]:
# PCA

from sklearn.decomposition import PCA
y_kmeans = kmeans.predict(songs_features)
pca = PCA(n_components=2)
principal_components = pca.fit_transform(songs_features)


In [None]:
pc = pd.DataFrame(principal_components)
pc['label'] = y_kmeans
pc.columns = ['x', 'y','label']

#plot data with seaborn
cluster = sns.lmplot(data=pc, x='x', y='y', hue='label', 
                   fit_reg=False, legend=True, legend_out=True)


In [None]:
pca.explained_variance_ratio_

The explained variance tells you how much information (variance) can be attributed to each of the principal components. By looking at the explained_variance_ratio_ we can see the 1st component contains 87% and the 2nd 5% which implies we've retained 92% of the original variance in the data which is awesome! 

In [None]:
print (pd.DataFrame(pca.components_,columns=songs_features.columns,index = ['PC-1','PC-2']))


In [None]:
songs['label'] = y_kmeans
songs = songs.sample(frac=1)
songs['label'].value_counts()

In [None]:
songs[songs['label'] == 0].tail(50)

In [None]:
songs[songs['label']==1].head(15)

In [None]:
songs[songs['label']==2].head(30)

In [None]:
songs[songs['label']==3].head(40)

In [None]:
songs[songs['label']==4].head(40)

In [None]:
#songs[songs['label']==2].hist()

In [None]:
#songs[songs['label']==3].mean()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels

X = songs_features
y = y_kmeans

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

rfc = RandomForestClassifier(n_estimators=100,criterion='gini')
rfc.fit(X_train,y_train)

In [None]:

y_pred = rfc.predict(X_test)    

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix'

    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


In [None]:
# GENRE definntions

definitions = ['Chill','Energetic','Cheerful','Romantic','Opera']

In [None]:
features = songs_features.columns
list(zip(songs_features[features], rfc.feature_importances_))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)

knn.fit(X_train,y_train)


In [None]:
knn_pred =knn.predict(X_test)

In [None]:
plot_confusion_matrix(y_test, knn_pred, classes=definitions,
                      title='Confusion matrix for KNN')