In [322]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, cross_validation, neighbors, metrics, svm

songs_dataset = pd.read_json('MasterSongList.json')

In [323]:
songs_dataset.loc[:,'genres'] = songs_dataset['genres'].apply(''.join)
def consolidateGenre(genre):
    if len(genre)>0:
        return genre.split(':')[0]
    else: return genre

songs_dataset.loc[:, 'genres'] = songs_dataset['genres'].apply(consolidateGenre)

In [324]:
audio_feature_list = [audio_feature for audio_feature in songs_dataset['audio_features']]
audio_features_headers = ['key','energy','liveliness','tempo','speechiness','acousticness','instrumentalness','time_signature'
                         ,'duration','loudness','valence','danceability','mode','time_signature_confidence','tempo_confidence'
                         ,'key_confidence','mode_confidence']
audio_features = pd.DataFrame(audio_feature_list, columns=audio_features_headers)
audio_features.loc[:,].dropna(axis=0,how='all',inplace=True)
audio_features['genres'] = songs_dataset['genres']

## Unbalanced data

In [325]:
rock_rap = audio_features.loc[(audio_features['genres'] == 'rock') | (audio_features['genres'] == 'rap')]
rock_rap.reset_index(drop=True)

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
0,11.0,0.398953,0.170640,77.821,0.033343,0.007855,0.457147,1.0,4.0,548.03156,-19.753,0.243535,0.198917,0.901,0.676,0.362,0.996,rock
1,7.0,0.906388,0.130576,127.438,0.122818,0.000353,0.001200,0.0,4.0,210.42667,-5.856,0.318454,0.418888,0.407,0.602,0.603,0.889,rock
2,11.0,0.682443,0.163735,146.216,0.183496,0.030170,0.000033,0.0,4.0,222.58667,-6.162,0.531741,0.766924,0.587,0.342,0.064,0.982,rap
3,5.0,0.699017,0.041737,140.624,0.038933,0.158435,0.000080,1.0,4.0,471.20000,-7.392,0.725072,0.439839,0.594,0.542,0.133,0.691,rock
4,6.0,0.722723,0.074246,89.999,0.340797,0.260922,0.000000,0.0,4.0,193.56000,-2.801,0.807143,0.907577,0.217,0.231,0.603,1.000,rap
5,6.0,0.870259,0.484105,110.075,0.042160,0.003531,0.000000,0.0,4.0,187.52000,-4.185,0.221896,0.488180,0.181,0.475,0.782,0.793,rock
6,2.0,0.738028,0.091445,119.004,0.030686,0.157794,0.000000,1.0,4.0,235.28000,-3.918,0.317212,0.610500,0.376,0.579,0.784,0.950,rock
7,4.0,0.830646,0.290238,102.967,0.049160,0.000291,0.000000,0.0,4.0,211.38667,-3.957,0.714192,0.676519,0.377,0.342,0.930,1.000,rock
8,2.0,0.832268,0.273200,164.070,0.059476,0.061904,0.000000,0.0,4.0,217.66667,-4.548,0.563510,0.685227,0.375,0.293,0.357,0.869,rap
9,5.0,0.782854,0.090648,117.977,0.035536,0.000227,0.000000,0.0,4.0,269.37574,-4.256,0.377586,0.490146,0.649,0.628,0.724,0.724,rock


In [327]:
label_genres = np.array(rock_rap['genres'])
final_features = rock_rap.drop('genres',axis = 1).astype(float)
final_features.isnull().any()

key                          False
energy                       False
liveliness                   False
tempo                        False
speechiness                   True
acousticness                 False
instrumentalness             False
time_signature               False
duration                     False
loudness                     False
valence                      False
danceability                 False
mode                         False
time_signature_confidence    False
tempo_confidence             False
key_confidence               False
mode_confidence              False
dtype: bool

In [328]:
final_features = final_features.fillna(final_features.median())
final_features.isnull().any()

key                          False
energy                       False
liveliness                   False
tempo                        False
speechiness                  False
acousticness                 False
instrumentalness             False
time_signature               False
duration                     False
loudness                     False
valence                      False
danceability                 False
mode                         False
time_signature_confidence    False
tempo_confidence             False
key_confidence               False
mode_confidence              False
dtype: bool

In [329]:
knn = neighbors.KNeighborsClassifier(n_neighbors = 3)
standard_scaler = preprocessing.StandardScaler()
final_features = standard_scaler.fit_transform(final_features)

In [330]:
X_train,X_test,y_train,y_test = cross_validation.train_test_split(final_features,label_genres,test_size = 0.2,random_state = 101)

In [331]:
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [332]:
pred = knn.predict(X_test)

In [333]:
from nltk import ConfusionMatrix
print(ConfusionMatrix(list(y_test), list(pred)))

     |         r |
     |    r    o |
     |    a    c |
     |    p    k |
-----+-----------+
 rap | <392> 102 |
rock |   45<1239>|
-----+-----------+
(row = reference; col = test)



In [334]:
print(metrics.classification_report(y_test,pred))

             precision    recall  f1-score   support

        rap       0.90      0.79      0.84       494
       rock       0.92      0.96      0.94      1284

avg / total       0.92      0.92      0.92      1778



In [394]:
scores = cross_validation.cross_val_score(knn,final_features,label_genres,cv=10,scoring = 'accuracy')
print(scores.mean())

0.919334113186


In [398]:
print(metrics.accuracy_score(y_test,pred))

0.922947131609


## here I want to find the optimal K for for the highest mean scores:


In [404]:
def optimal_k(X,y,cv=10,scoring='accuracy'):
    k_list = list(range(1,11))

    # subsetting just the odd ones
    n = [k for k in k_list]

    # empty list that will hold cv scores
    cv_scores = []

    # perform 10-fold cross validation
    for k in n:
        knn = neighbors.KNeighborsClassifier(n_neighbors = k)
        scores = cross_validation.cross_val_score(knn, X, y, cv=10, scoring='accuracy')
        cv_scores.append(scores.mean())
        MSE = [1 - x for x in cv_scores]

    # determining best k
    optimal_k = n[MSE.index(min(MSE))]
    return cv_scores[optimal_k]

In [405]:
print(optimal_k(final_features,label_genres,cv=10,scoring='accuracy')) ##this yields with k =8

0.91944482826


# With balanced data

In [387]:
from collections import Counter
Counter(rock_rap['genres'])

Counter({'rock': 6437, 'rap': 2452})

In [392]:
scores = []
standard_scaler = preprocessing.StandardScaler()
for i in range(10): #this yields 10 random samples
    rock_2000 = rock_rap.loc[rock_rap['genres'] == 'rock'].sample(n=2000)
    rap_2000 = rock_rap.loc[rock_rap['genres'] == 'rap'].sample(n=2000)
    df_2000 = pd.concat([rock_2000,rap_2000], ignore_index=True)
    df_2000 = df_2000.fillna(df_2000.median(),inplace=True)
    knn = neighbors.KNeighborsClassifier(n_neighbors = 6)
    
    label_2000 = np.array(df_2000['genres'])
    final_2000 = df_2000.drop('genres',axis = 1).astype(float)
    final_2000 = standard_scaler.fit_transform(final_2000)
    X_train,X_test,y_train,y_test = cross_validation.train_test_split(final_features,label_genres,test_size = 0.2,random_state = 101)
    knn.fit(X_train,y_train)
    pred = knn.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, pred)
    scores.append(accuracy)

In [393]:
print(scores)

[0.92294713160854891, 0.92294713160854891, 0.92294713160854891, 0.92294713160854891, 0.92294713160854891, 0.92294713160854891, 0.92294713160854891, 0.92294713160854891, 0.92294713160854891, 0.92294713160854891]


## --> so bascially with unbalanced or balanced data, the accuracy scores aren't much different

## I try to with cross_val_score by using the function optimal_k but encounter this error....

In [406]:
scores = []
standard_scaler = preprocessing.StandardScaler()
for i in range(10):
    rock_2000 = rock_rap.loc[rock_rap['genres'] == 'rock'].sample(n=2000)
    rap_2000 = rock_rap.loc[rock_rap['genres'] == 'rap'].sample(n=2000)
    df_2000 = pd.concat([rock_2000,rap_2000], ignore_index=True)
    df_2000 = df_2000.fillna(df_2000.median(),inplace=True)
    
    label_2000 = np.array(df_2000['genres'])
    final_2000 = df_2000.drop('genres',axis = 1).astype(float)
    final_2000 = standard_scaler.fit_transform(final_2000)
    scores.append(optimal_k(final_2000,label_2000,cv=10,scoring='accuracy'))

IndexError: list index out of range