In [453]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pickle



## Load datasets

In [278]:
def get_albums_data(path):
    df_albums = pd.read_csv(path)

    return df_albums

df_albums = get_albums_data(path = 'SpotGenTrack/Data Sources/spotify_albums.csv')

In [279]:
df_albums.columns

Index(['Unnamed: 0', 'album_type', 'artist_id', 'available_markets',
       'external_urls', 'href', 'id', 'images', 'name', 'release_date',
       'release_date_precision', 'total_tracks', 'track_id', 'track_name_prev',
       'uri', 'type'],
      dtype='object')

In [438]:
def get_artist_data(path, n):
    """
    :param path: string to file
    :param n: int, top level of genres
    :return: dataframe, dictionary

    """
    df_artists = pd.read_csv(path)

    df_artists['genres'] = df_artists['genres'].astype('string')
    df_artists = df_artists[df_artists['genres'] != '[]']
    df_artists['genres'] = df_artists['genres'].str.strip('[]').str.replace(' ','' ).str.replace("'", '')
    split_df = pd.DataFrame(df_artists.genres.str.split(",").tolist())
    labels = {}
    for i, name in enumerate(list(split_df[0].value_counts()[:n].index)):
        labels[name] = i

    df_artists['genre_1'] = split_df[0]
    df_artists['target'] = df_artists['genre_1'].apply(lambda x: labels.get(x)).fillna(n)
    return df_artists, labels

df_artists, labels = get_artist_data(path='SpotGenTrack/Data Sources/spotify_artists.csv' , n = 40)

df_artists.head(100)

Unnamed: 0.1,Unnamed: 0,artist_popularity,followers,genres,id,name,track_id,track_name_prev,type,genre_1,target
0,0,44,23230,"sertanejo,sertanejopop,sertanejotradicional,se...",4mGnpjhqgx4RUdsIJiURdo,Juliano Cezar,0wmDmAILuW9e2aRttkl4aC,track_9,artist,sertanejo,40.0
2,2,26,1596,danishpoprock,6YVY310fjfUzKi8hiqR7iK,Gangway,1bFqWDbvHmZe2f4Nf9qaD8,track_38,artist,ukalternativepop,40.0
3,3,31,149,ukalternativepop,2VElyouiCfoYPDJluzwJwK,FADES,3MFSUBAidPzRBbIS7BDj1S,track_34,artist,frenchbaroque,40.0
4,4,21,11,frenchbaroque,4agVy03qW8juSysCTUOuDI,Jean-Pierre Guignon,2r3q57FhxdsCyYr0kuDq4b,track_26,artist,classicfinnishpop,40.0
9,9,25,992,classicfinnishpop,5ijoPCUhV0dW8EJ7aPGvXK,Pepe Willberg & The Paradise,6WVlVPoesX2zVvJzZJUj6a,track_8,artist,sertanejo,40.0
...,...,...,...,...,...,...,...,...,...,...,...
252,252,40,88632,"newjackswing,quietstorm,urbancontemporary",4D0WfOUqTzqKysXt33VL3j,Michel'le,1CrWoWuxupYY1vi1XRdpBX,track_32,artist,indier&b,40.0
253,253,36,128,jazzsaxophone,7p1C6cbN2rgBErzfB1u3Z7,Julian Dash,1axzRYUUASTfN4OOtrjfQd,track_98,artist,hebrewpop,19.0
254,254,24,2510,colombianrock,2saVhSlCKjNZnW4QZcFHkE,Velandia y La Tigra,2Cyn2Hzt5EBuB4bN2Vp9Tl,track_73,artist,atlhiphop,40.0
256,256,55,80107,"anthemworship,ccm,christianmusic,deepccm,world...",1vaOLxWPpsv5LVsSHBy9tF,Jason Upton,6tZ1vMQc957RRrNTjuQ4RJ,track_32,artist,brutaldeathmetal,40.0


In [439]:
def get_tracks_data(path):
    df_tracks = pd.read_csv(path)

    return df_tracks
df_tracks = get_tracks_data(path = 'SpotGenTrack/Data Sources/spotify_tracks.csv')

In [440]:
df_tracks.columns

Index(['Unnamed: 0', 'acousticness', 'album_id', 'analysis_url', 'artists_id',
       'available_markets', 'country', 'danceability', 'disc_number',
       'duration_ms', 'energy', 'href', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'lyrics', 'mode', 'name', 'playlist',
       'popularity', 'preview_url', 'speechiness', 'tempo', 'time_signature',
       'track_href', 'track_name_prev', 'track_number', 'uri', 'valence',
       'type'],
      dtype='object')

In [441]:
df_feature = pd.read_csv('SpotGenTrack/Features Extracted/lyrics_features.csv')

df_feature.head()



Unnamed: 0.1,Unnamed: 0,mean_syllables_word,mean_words_sentence,n_sentences,n_words,sentence_similarity,track_id,vocabulary_wealth
0,0,-1.0,-1.0,-1,-1,-1.0,5KIfHjHI5NIsPHNt58qua0,-1.0
1,1,1.1,5.65,31,326,0.043011,13keyz9ikBe6ZpRasw7l4X,0.45
2,2,1.37,4.77,74,532,0.050352,1WugzepXsLjnsM0K4UaWYc,0.59
3,3,1.95,3.38,72,430,0.02856,2MO6oEAlMKcsfI8xP3yoy8,0.49
4,4,1.16,2.99,68,368,0.047849,1i4St7fmSUE9nB3R9n8fol,0.47


### Join Tracks and Artists

In [442]:
def get_joined_dataframes(df_tracks, df_artists, df_albums):

    df_join = df_tracks.set_index('id').join(df_artists.set_index('track_id'), on='id', lsuffix='_left', rsuffix='_right', how='inner')
    df_join = df_join.join(df_albums.set_index('track_id'), on=df_join.index, lsuffix='_left', rsuffix='_right', how = 'inner')

    df_test = df_join[['acousticness', 'danceability', 'energy', 'instrumentalness','liveness','popularity','speechiness','tempo','valence','target']]

    return df_join, df_test

#df_test = df_join[['acousticness', 'danceability', 'energy', 'instrumentalness','liveness','popularity','speechiness','tempo','valence', 'mean_syllables_word', 'mean_words_sentence', 'n_sentences', 'n_words', 'sentence_similarity', 'vocabulary_wealth','target',]]
#df_test = df_join[['acousticness', 'danceability', 'energy', 'instrumentalness','liveness','popularity','speechiness','tempo','valence','target']]
#df_test.head()

In [443]:
df_join, df_test = get_joined_dataframes(df_tracks, df_artists, df_albums)

In [444]:
#maybe delete audiobooks. we will see
non_v = list(df_join[df_join.speechiness >= 0.93]['playlist'].unique())

In [459]:
def save_df_to_csv(df_join, df_test):
    df_join.to_csv('Joined.csv',index=False)
    df_test.to_csv('test.csv',index=False)

In [460]:
save_df_to_csv(df_join, df_test)

## Normalize dataset

In [445]:
def get_normalized_X_y(df_test):
    df_test.dropna(inplace=True)
    scaler = StandardScaler()
    scaler.fit(df_test.drop('target', axis=1))
    scaled_features = scaler.transform(df_test.drop('target',axis=1))
    df_test_feat = pd.DataFrame(scaled_features, columns = df_test.columns[:-1])
    y = df_test.iloc[:,-1].squeeze()
    X = df_test_feat
    return X, y

In [446]:
X,y = get_normalized_X_y(df_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.dropna(inplace=True)


In [447]:
def train_test_data (X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    return  X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_data(X,y)

X_train.shape,X_test.shape, y_train.shape, y_test.shape

((23478, 9), (7827, 9), (23478,), (7827,))

In [448]:
def knn_trainer(X_train, y_train, n):
    knn = KNeighborsClassifier(n_neighbors=n)

    knn.fit(X_train, y_train)

    return knn
knn = knn_trainer(X_train, y_train, n= 7)

### Create Pickle File

In [456]:
def create_pickle(knn):
    knnPickle = open('knn_pickle_file', 'wb')
    pickle.dump(knn, knnPickle)
    knnPickle.close()

In [458]:
create_pickle(knn)

### Stuff for Testing

In [454]:
knnPickle = open('knn_pickle_file', 'wb')

In [455]:
# source, destination
pickle.dump(knn, knnPickle)

# close the file
knnPickle.close()

In [449]:
knn.predict(X_test)

array([40., 40., 40., ..., 40., 40., 40.])

In [450]:
knn.score(X_test, y_test)

0.9063498147438355

In [451]:
a = []

for n in range(1,50):
    knn = knn_trainer(X_train, y_train,n)
    i = knn.score(X_test, y_test)
    a.append(i)

In [452]:
a

[0.8259869681870449,
 0.7499680592819726,
 0.878880797240322,
 0.9007282483710234,
 0.904433371662195,
 0.905838763255398,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438355,
 0.9063498147438