In [1]:
import os

%matplotlib inline
import string
import sys
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

sys.path.append("code/.")

from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from utils import *

In [2]:
data = pd.read_csv('data/spotify.csv', index_col=False)
data.drop(columns=["Unnamed: 0", "song_title", "artist"], axis=1, inplace=True)
data.tail()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target
2012,0.00106,0.584,274404,0.932,0.00269,1,0.129,-3.501,1,0.333,74.976,4.0,0.211,0
2013,0.0877,0.894,182182,0.892,0.00167,1,0.0528,-2.663,1,0.131,110.041,4.0,0.867,0
2014,0.00857,0.637,207200,0.935,0.00399,0,0.214,-2.467,1,0.107,150.082,4.0,0.47,0
2015,0.00164,0.557,185600,0.992,0.677,1,0.0913,-2.735,1,0.133,150.011,4.0,0.623,0
2016,0.00281,0.446,204520,0.915,3.9e-05,9,0.218,-6.221,1,0.141,190.013,4.0,0.402,0


In [3]:
X_s, y_s= data.iloc[:, :-1], data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X_s, y_s, test_size=.99, random_state=42
)

In [4]:
X_train = X_train.head(60)
y_train = y_train.head(60)

In [5]:
X_s.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,0.0102,0.833,204600,0.434,0.0219,2,0.165,-8.795,1,0.431,150.062,4.0,0.286
1,0.199,0.743,326933,0.359,0.00611,1,0.137,-10.401,1,0.0794,160.083,4.0,0.588
2,0.0344,0.838,185707,0.412,0.000234,2,0.159,-7.148,1,0.289,75.044,4.0,0.173
3,0.604,0.494,199413,0.338,0.51,5,0.0922,-15.236,1,0.0261,86.468,4.0,0.23
4,0.18,0.678,392893,0.561,0.512,5,0.439,-11.648,0,0.0694,174.004,4.0,0.904


In [6]:
y_s.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [7]:
pipe = make_pipeline(StandardScaler(), SVC())

In [16]:
params = pipe.get_params()
params

{'memory': None,
 'steps': [('standardscaler', StandardScaler()), ('svc', SVC())],
 'verbose': False,
 'standardscaler': StandardScaler(),
 'svc': SVC(),
 'standardscaler__copy': True,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True,
 'svc__C': 1.0,
 'svc__break_ties': False,
 'svc__cache_size': 200,
 'svc__class_weight': None,
 'svc__coef0': 0.0,
 'svc__decision_function_shape': 'ovr',
 'svc__degree': 3,
 'svc__gamma': 'scale',
 'svc__kernel': 'rbf',
 'svc__max_iter': -1,
 'svc__probability': False,
 'svc__random_state': None,
 'svc__shrinking': True,
 'svc__tol': 0.001,
 'svc__verbose': False}

In [17]:
print(params)

{'memory': None, 'steps': [('standardscaler', StandardScaler()), ('svc', SVC())], 'verbose': False, 'standardscaler': StandardScaler(), 'svc': SVC(), 'standardscaler__copy': True, 'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'svc__C': 1.0, 'svc__break_ties': False, 'svc__cache_size': 200, 'svc__class_weight': None, 'svc__coef0': 0.0, 'svc__decision_function_shape': 'ovr', 'svc__degree': 3, 'svc__gamma': 'scale', 'svc__kernel': 'rbf', 'svc__max_iter': -1, 'svc__probability': False, 'svc__random_state': None, 'svc__shrinking': True, 'svc__tol': 0.001, 'svc__verbose': False}


In [8]:
from sklearn.model_selection import RandomizedSearchCV

In [9]:
param_grid = {
    "svc__gamma" : 10.0 ** np.arange(-20, 10),
    "svc__C": 10.0 ** np.arange(-20, 10)
}

print("Grid size %d" %(np.prod(list(map(len, param_grid.values())))))
param_grid

Grid size 900


{'svc__gamma': array([1.e-20, 1.e-19, 1.e-18, 1.e-17, 1.e-16, 1.e-15, 1.e-14, 1.e-13,
        1.e-12, 1.e-11, 1.e-10, 1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05,
        1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03,
        1.e+04, 1.e+05, 1.e+06, 1.e+07, 1.e+08, 1.e+09]),
 'svc__C': array([1.e-20, 1.e-19, 1.e-18, 1.e-17, 1.e-16, 1.e-15, 1.e-14, 1.e-13,
        1.e-12, 1.e-11, 1.e-10, 1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05,
        1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03,
        1.e+04, 1.e+05, 1.e+06, 1.e+07, 1.e+08, 1.e+09])}

In [10]:
list(map(len, param_grid.values()))

[30, 30]

In [11]:
random_search = RandomizedSearchCV(
    pipe, param_distributions=param_grid, n_jobs=-1, n_iter=5, cv =5 , random_state=142
)


random_search.fit(X_s, y_s)

In [12]:
random_search.cv_results_

{'mean_fit_time': array([0.51900072, 0.43640141, 0.37559886, 0.25139661, 0.557201  ]),
 'std_fit_time': array([0.05447495, 0.04924414, 0.05742209, 0.03448214, 0.06564559]),
 'mean_score_time': array([0.18199925, 0.19560003, 0.18240042, 0.17299991, 0.18659954]),
 'std_score_time': array([0.02360652, 0.03038142, 0.03829255, 0.00933817, 0.02633311]),
 'param_svc__gamma': masked_array(data=[1e-13, 0.001, 10.0, 1e-19, 100000000.0],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_svc__C': masked_array(data=[1e-11, 1e-14, 1e-07, 0.001, 1e-12],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'svc__gamma': 1e-13, 'svc__C': 1e-11},
  {'svc__gamma': 0.001, 'svc__C': 1e-14},
  {'svc__gamma': 10.0, 'svc__C': 1e-07},
  {'svc__gamma': 1e-19, 'svc__C': 0.001},
  {'svc__gamma': 100000000.0, 'svc__C': 1e-12}],
 'split0_test_score': array([0.5049505, 0.5049505, 0.50

In [18]:
random_search.score(X_test, y_test)

0.5072608913370055