In [3]:
import pandas as pd
import numpy as np
import json

# For modeling
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn import linear_model
from sklearn import ensemble
from sklearn import metrics

In [4]:
audio_feature_names = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'type',
    'id', 'uri', 'track_href', 'analysis_url', 'duration_ms', 'time_signature', 'playlist'
]

In [5]:
with open('../../data/dinner_features.json') as f:
    dinner = json.load(f)
    
with open('../../data/party_features.json') as f:
    party = json.load(f)
    
with open('../../data/sleep_features.json') as f:
    sleep = json.load(f)
    
with open('../../data/workout_features.json') as f:
    workout = json.load(f)

In [6]:
def json_to_df(testjson, newcol):
    df = pd.DataFrame(testjson)
    df['playlist'] = newcol
    return df

In [7]:
# datalist = [dinner, party, sleep, workout]
datalist = [dinner, party, sleep]
# datalist = [party, sleep]
i = 0
data = pd.DataFrame(columns=audio_feature_names)
while (i<len(datalist)):
    df = pd.DataFrame(datalist[i])
    if i == 0:
        df['playlist'] = 'dinner'
    elif i == 1:
        df['playlist'] = 'party'
    elif i == 2:
        df['playlist'] = 'sleep'
    else:
        df['playlist'] = 'workout'
    data = data.append(df).reset_index().drop('index', axis=1)
    i = i+1

In [12]:
index = []
columns = []
for i in data.playlist.unique():
    actual = str('Actual ' + i)
    predicted = str('Predicted + i')
    index.append(actual)
    columns.append(predicted)

In [13]:
index

['Actual dinner', 'Actual party', 'Actual sleep']

In [520]:
import boto3
AWS_ACCESS_KEY_ID = os.getenv("aws_access_key_id")
AWS_SECRET_ACCESS_KEY = os.getenv("aws_secret_access_key")
S3_BUCKET = "nw-kristian-nikolov-s3"
s3 = boto3.resource('s3', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

In [522]:
df = pd.DataFrame(json.loads(s3.Object(S3_BUCKET, "workout_features.json").get()['Body'].read().decode('utf-8')))

In [523]:
data = pd.DataFrame(columns=audio_feature_names)

# Train Model 

In [1]:
initial_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'
]

In [382]:
scaler = StandardScaler()
scaler.fit(data[initial_features])
scaled_features = pd.DataFrame(scaler.fit_transform(data[initial_features]), columns = initial_features)

In [551]:
def scale_data(array,means=scaler.mean_,stds=scaler.var_ **0.5):
    return (array-means)/stds

In [384]:
scaled_features = pd.DataFrame(scale_data(data[initial_features]), columns = initial_features)

In [385]:
features = scaled_features
target = data['playlist']

In [386]:
#split into train and test set
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    features, target, test_size=0.2, random_state=1337)

In [536]:
features

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.672369,-1.228514,-0.0616407,-0.491205,0.793999,-0.589868,1.346013,-0.340139,-0.772375,0.465786,-0.964508
1,0.972342,-0.248367,-0.902196,0.585370,0.793999,-0.569201,0.162940,-0.340185,0.356038,0.361178,-1.310086
2,-1.586613,-1.502320,-1.46257,-0.967694,0.793999,-0.532000,1.406757,-0.340179,0.098060,-1.359043,-1.258528
3,1.260071,-0.061861,-1.46257,-0.108544,0.793999,-0.595379,-1.021868,-0.322252,-0.179027,1.128303,0.239778
4,-0.086747,0.065121,0.778914,-0.206894,0.793999,-0.549911,-0.146568,-0.329543,-0.303238,1.159298,-1.038700
...,...,...,...,...,...,...,...,...,...,...,...
295,-0.392842,1.410343,-0.341826,1.062236,0.793999,1.476847,-0.716409,-0.340185,1.359284,-0.425319,2.264321
296,-0.839741,1.271456,1.61947,0.878536,-1.25945,0.567492,-1.181337,-0.340185,1.703254,-0.855374,-0.755527
297,-0.025528,-0.256303,-0.902196,-0.096674,0.793999,-0.250927,0.148477,-0.340185,-0.609945,-1.281555,0.232170
298,0.041813,-0.688838,-0.902196,-0.115515,0.793999,4.866258,1.244772,0.258970,-0.389231,0.461911,-1.198797


## Logistic Regression

In [387]:
lr = linear_model.LogisticRegression(fit_intercept=False, penalty='l2')

In [388]:
lr.fit(X_train[initial_features], y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [389]:
ypred_proba_test = lr.predict_proba(X_test[initial_features])[:,1]
ypred_bin_test = lr.predict(X_test[initial_features])

In [390]:
# auc = sklearn.metrics.roc_auc_score(y_test, ypred_proba_test)
confusion = sklearn.metrics.confusion_matrix(y_test, ypred_bin_test)
accuracy = sklearn.metrics.accuracy_score(y_test, ypred_bin_test)
classification_report = sklearn.metrics.classification_report(y_test, ypred_bin_test)

In [542]:
classification_report

'              precision    recall  f1-score   support\n\n      dinner       0.62      0.56      0.59        18\n       party       0.86      0.95      0.90        20\n       sleep       0.68      0.68      0.68        22\n\n    accuracy                           0.73        60\n   macro avg       0.72      0.73      0.72        60\nweighted avg       0.73      0.73      0.73        60\n'

In [394]:
# print('AUC on test: %0.3f' % auc)
print('Accuracy on test: %0.3f' % accuracy)
print()
print(pd.DataFrame(confusion,
                  index=['Actual dinner','Actual party', 'Actual sleep'],
                  columns=['Predicted dinner', 'Predicted party', 'Predicted sleep']))

# print(pd.DataFrame(confusion,
#                   index=['Actual party', 'Actual sleep'],
#                   columns=['Predicted party', 'Predicted sleep']))

Accuracy on test: 0.750

               Predicted dinner  Predicted party  Predicted sleep
Actual dinner                 8                2                8
Actual party                  1               19                0
Actual sleep                  2                2               18


In [395]:
fitted = pd.DataFrame(index=initial_features)

fitted['coefs'] = lr.coef_[0]

fitted['odds_ratio'] = fitted.coefs.apply(np.exp)

fitted = fitted.sort_values(by='odds_ratio', ascending=False)

fitted.head()

Unnamed: 0,coefs,odds_ratio
valence,1.132645,3.103856
acousticness,0.244193,1.27659
mode,0.121859,1.129595
tempo,0.060827,1.062715
key,0.021568,1.021802


## Random Forest

In [528]:
rf = ensemble.RandomForestClassifier()

In [529]:
rf.fit(X_train[initial_features], y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [530]:
ypred_proba_test = rf.predict_proba(X_test[initial_features])[:,1]
ypred_bin_test = rf.predict(X_test[initial_features])

In [531]:
# auc = sklearn.metrics.roc_auc_score(y_test, ypred_proba_test)
confusion = sklearn.metrics.confusion_matrix(y_test, ypred_bin_test)
accuracy = sklearn.metrics.accuracy_score(y_test, ypred_bin_test)
classification_report = sklearn.metrics.classification_report(y_test, ypred_bin_test)

In [532]:
# print('AUC on test: %0.3f' % auc)
print('Accuracy on test: %0.3f' % accuracy)
print()
print(pd.DataFrame(confusion,
                  index=['Actual dinner','Actual party', 'Actual sleep'],
                  columns=['Predicted dinner', 'Predicted party', 'Predicted sleep']))

# print(pd.DataFrame(confusion,
#                   index=['Actual party', 'Actual sleep'],
#                   columns=['Predicted party', 'Predicted sleep']))

Accuracy on test: 0.733

               Predicted dinner  Predicted party  Predicted sleep
Actual dinner                10                1                7
Actual party                  1               19                0
Actual sleep                  5                2               15


In [395]:
fitted = pd.DataFrame(index=initial_features)

fitted['coefs'] = lr.coef_[0]

fitted['odds_ratio'] = fitted.coefs.apply(np.exp)

fitted = fitted.sort_values(by='odds_ratio', ascending=False)

fitted.head()

Unnamed: 0,coefs,odds_ratio
valence,1.132645,3.103856
acousticness,0.244193,1.27659
mode,0.121859,1.129595
tempo,0.060827,1.062715
key,0.021568,1.021802


# Fit a new song

In [393]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os

SP_USER = "1295675405"  # this is my public user id, not a key and needs to be passed as an argument to use my playlists
CID = os.getenv("spotipy_cid")
SECRET = os.getenv("spotipy_secret")

client_credentials_manager = SpotifyClientCredentials(client_id=CID, client_secret=SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [601]:
teststring = 'cant hold us macklemore'

In [609]:
sp.search(teststring)['tracks']['items'][0]['artists'][0]['name']

'Macklemore & Ryan Lewis'

In [610]:
sp.search(teststring)['tracks']['items'][0]['name']

"Can't Hold Us - feat. Ray Dalton"

In [592]:
testid = sp.search(teststring)['tracks']['items'][0]['id']
testfeatures = sp.audio_features(testid)
testdf = pd.DataFrame(testfeatures)
testdf = pd.DataFrame(scale_data(testdf[initial_features]), columns = initial_features)
ypred_proba_new = lr.predict_proba(testdf[initial_features])[:,1]
ypred_bin_new = lr.predict(testdf[initial_features])
ypred_bin_new

array(['party'], dtype=object)

In [562]:
testid

'6VoIBz0VhCyz7OdEoRYDiA'

In [593]:
DB_HOST = os.environ.get('MYSQL_HOST')
DB_PORT = os.environ.get('MYSQL_PORT')
DB_USER = os.environ.get('MYSQL_USER')
DB_PW = os.environ.get('MYSQL_PASSWORD')

In [600]:
os.environ.get('DATABASE_NAME')

'msia423_spotify_features'

In [611]:
def scale_data(dataframe, array):
    """
    Fits a scaler on a dataframe and applies it on an array of the same shape
    Args:
        dataframe: dataframe which the scaler would calculate column means and variances on
        array: array which will be standardized - needs to be same number of ordered vectors as the dataframe

    Returns: standardized array

    """
    scaler = fit_scaler(dataframe)
    return (array - scaler.mean_) / (scaler.var_ ** 0.5)

In [613]:
song_details = sp.search('ed sheeran i se fire')


In [614]:
artist = song_details['tracks']['items'][0]['artists'][0]['name']
song = song_details['tracks']['items'][0]['name']
song_id = song_details['tracks']['items'][0]['id']
song_features = sp.audio_features(song_id)


In [615]:
df = pd.DataFrame(song_features)


In [633]:
for index, row in df.iterrows():
    print(row['mode'])

0


In [None]:
full_dataset = pd.read_csv(c.FEATURES_RAW_LOCATION)
df = pd.DataFrame(scale_data(full_dataset, df))
ypred_bin_new = lr.predict(df[c.training_features])
df['playlist'] = ypred_bin_new[0]