In [27]:
import pandas as pd
import numpy as np
import datetime
from math import pi

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE



datapath = '../Data/'

In [5]:
def load_playlist(playlist='saved_tracks.h5'):
    '''
    Takes in the path of the data.
    Reads the data, convertes added_at to a datetime object and extract year and month to new column.
    Also standardise the tempo to between 0 and 1.
    Returns a dataframe.
    '''
    
#     read data
    df_playlist = pd.read_hdf(datapath + playlist, key='df')
    df_playlist.drop(['analysis_url', 'track_href', 'uri', 'type'], axis=1, inplace=True)
    
#     convert to datetime, extrated year and month from added_at
    df_playlist['added_at'] = df_playlist['added_at'].apply(lambda x : datetime.datetime.strptime(x,"%Y-%m-%dT%H:%M:%SZ"))
    df_playlist['added_year'] = df_playlist['added_at'].apply(lambda x : x.year)
    df_playlist['added_month'] = df_playlist['added_at'].apply(lambda x : x.month)
    
#     standardise tempo to be between 0 and 1
    df_playlist['tempo_01'] = df_playlist['tempo'] / df_playlist['tempo'].max()
    return df_playlist

In [11]:
df_saved = load_playlist()
df_recent = df_saved[df_saved['added_year'] >= 2017]
print(df_recent.shape)

(239, 20)


# Create Train Test

In [88]:
df_train_full , df_test_full = train_test_split(df_recent, test_size=0.2, random_state=123)

In [89]:
features = ['acousticness', 'danceability','energy','instrumentalness','speechiness']
df_train = df_train_full[features]
df_test = df_test_full[features]

In [90]:
X_train = np.array(df_train)
X_test = np.array(df_test)

In [91]:
clf = OneClassSVM(nu=0.01, kernel="rbf", gamma=0.01)
clf.fit(X_train)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=0.01, kernel='rbf',
      max_iter=-1, nu=0.01, random_state=None, shrinking=True, tol=0.001,
      verbose=False)

In [92]:
df_test_full['normal'] = clf.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [93]:
df_test_full[df_test_full['normal'] == -1]

Unnamed: 0,acousticness,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,...,tempo,time_signature,valence,added_at,song_title,artists,added_year,added_month,tempo_01,normal
19,0.0243,0.694,194880,0.957,6DRGwsUFQrNerxRexK7KMB,3e-06,9,0.108,-3.115,0,...,106.002,4,0.505,2017-03-18 06:08:01,Hey Ma (with Pitbull & J Balvin feat. Camila C...,"Pitbull,J Balvin,Camila Cabello",2017,3,0.524495,-1


In [95]:
df_test_full.shape

(48, 21)