In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [32]:
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, GridSearchCV

from sklearn import linear_model
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [33]:
data = pd.read_csv('../data/dfLongterm.csv')

In [34]:
data.head()

Unnamed: 0,fea1,fea2,fea3,fea4,fea5,fea6,fea7,fea8,fea9,fea10,...,fea24,fea25,fea26,fea27,fea28,fea29,fea30,fea31,fea32,fea33
0,137.14,-210.3,70.38,-1.8,13.52,7.67,9.21,-4.23,3.85,-2.07,...,0.33,0.34,0.41,0.43,0.2,0.17,0.26,0.54,0.42,0.45
1,96.0,-321.79,105.26,16.3,33.23,11.2,28.95,12.95,16.3,2.92,...,0.55,0.59,0.41,0.38,0.27,0.22,0.09,0.05,0.12,0.25
2,174.55,-216.06,88.31,-8.95,41.89,18.97,7.79,-2.24,5.94,0.0,...,0.25,0.21,0.44,0.21,0.09,0.18,0.31,0.67,0.4,0.53
3,49.23,-185.45,85.01,-8.47,3.75,-17.62,6.1,-14.41,-8.74,-7.15,...,0.24,0.5,0.53,0.2,0.26,0.09,0.14,0.17,0.31,0.33
4,120.0,-209.14,69.0,30.2,31.84,14.17,22.39,5.16,12.23,0.63,...,0.03,0.06,0.12,0.22,0.23,0.18,0.17,0.17,0.43,0.82


In [35]:
y = pd.read_csv('../data/dfY.csv')

In [36]:
y.head()

Unnamed: 0,index,song_id,valence_mean,arousal_mean
0,0,2,3.1,3.0
1,1,3,3.5,3.3
2,2,4,5.7,5.5
3,3,5,4.4,5.3
4,4,7,5.8,6.4


In [18]:
df = pd.concat([data,y['valence_mean'], y['arousal_mean']], axis=1)

In [None]:
k = 15 #number of variables for heatmap
corrmat = df.corr()
cols = corrmat.nlargest(k, 'valence_mean')['valence_mean'].index
cm = np.corrcoef(df[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

In [37]:
scaler = StandardScaler()

In [38]:
scaler.fit(data)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [39]:
data_scaled = scaler.transform(data)

In [22]:
data_scaled = pd.DataFrame(data_scaled, columns=data.columns)

In [40]:
data_scaled[0]

array([ 0.65519038, -0.07628927, -0.77544214, -0.03422748, -0.76500356,
        0.62507754,  0.2090739 , -0.23296932,  0.16704597,  0.30380077,
       -0.12093146,  0.84937125,  0.6236305 , -0.73196571,  1.23342518,
        0.64840883, -0.29515861,  0.66941854,  0.67054655,  0.07763418,
        0.35497932,  0.65683281, -0.80359708, -0.09453569,  0.11972583,
        0.34251675,  0.6077862 , -0.7165535 , -0.99956607, -0.43536672,
        0.91738949,  0.36843219,  0.42423349])

In [13]:
def modelTest(clf, train, labels):
    cv = KFold(n_splits=5,shuffle=True,random_state=45).split(train)
    mse = make_scorer(mean_squared_error)
    mse_val_score = cross_val_score(clf, train, labels, cv=cv, scoring=mse)
    scores=[mse_val_score.mean()]
    return scores

In [14]:
def modelPlot(result_dict):
    result = pd.DataFrame.from_dict(result_dict, orient='index')
    result.columns = ["Mean Squared Error"] 
    result = result.sort(columns=["Mean Squared Error"],ascending=False)
    #print(result)
    result.plot(kind="bar",title="Model Scores")
    axes = plt.gca()
    axes.set_ylim([0.5,1])
    return result

In [17]:
def modelFit(train, labels):
    result_dict = {}
    
    clf = linear_model.LinearRegression()
    result_dict["Linear"] = modelTest(clf, train, labels)
    
    clf = linear_model.Lasso(alpha=1e-4)
    result_dict["Lasso"] = modelTest(clf, train, labels)
    
    clf = linear_model.Ridge()
    result_dict["Ridge"] = modelTest(clf, train, labels)
    
    clf = linear_model.BayesianRidge()
    result_dict["Bayesian Ridge"] = modelTest(clf, train, labels)
    
    clf = linear_model.HuberRegressor()
    result_dict["Huber"] = modelTest(clf, train, labels)
    
    clf = svm.SVR()
    result_dict["SVM RBF"] = modelTest(clf, train, labels)
    
    clf = svm.SVR(kernel="linear")
    result_dict["SVM Linear"] = modelTest(clf, train, labels)
    
    clf = BaggingRegressor()
    result_dict["Bagging"] = modelTest(clf, train, labels)
    
    clf = RandomForestRegressor()
    result_dict["RandomForest"] = modelTest(clf, train, labels)
    
    clf = AdaBoostRegressor()
    result_dict["AdaBoost"] = modelTest(clf, train, labels)
    
    clf = XGBRegressor()
    result_dict["XGBoost"] = modelTest(clf, train, labels)
    
    model_summary = modelPlot(result_dict)
    return model_summary

In [None]:
modelFit(data_scaled, y['valence_mean'])

In [41]:
from sklearn.model_selection import train_test_split

In [42]:
def create_model(model, data, y):
    X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.33, random_state=7)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(mean_squared_error(y_test, y_pred))

In [58]:
clf_valence = XGBRegressor()

In [59]:
create_model(clf_valence, data.as_matrix(), np.array(y['valence_mean']))

0.828713435051


In [65]:
clf_arousal = XGBRegressor()
create_model(clf_arousal, data.as_matrix(), np.array(y['arousal_mean']))

1.0859522334


In [None]:
clf_arousal_ridge = linear_model.Ridge()
create_model(clf_arousal_ridge, data_scaled, np.array(y['arousal_mean']))

In [40]:
df_compare_arousal = pd.DataFrame({"arousal_target": y_test['arousal_mean'], "arousal_pred": y_pred_arousal})

In [42]:
df_compare_arousal['ErrorRate']= df_compare_arousal.apply(lambda row: abs(row['arousal_pred']-row['arousal_target'])/row['arousal_target'], axis=1)
df_compare_arousal.describe()

Unnamed: 0,arousal_pred,arousal_target,ErrorRate
count,595.0,595.0,595.0
mean,4.829609,4.834353,0.19864
std,0.818259,1.291042,0.202932
min,2.746579,2.0,0.000112
25%,4.226549,3.8,0.063272
50%,4.920983,4.9,0.150523
75%,5.480211,5.8,0.249885
max,6.869541,7.7,1.595567


In [26]:
df_compare = pd.DataFrame({"valence_target": y_test['valence_mean'], "valence_pred": y_pred})

In [27]:
df_compare.head()

Unnamed: 0,valence_pred,valence_target
1554,2.455784,4.6
1666,2.33657,6.4
649,1.844455,4.5
37,2.247237,5.8
1286,2.380033,6.1


In [28]:
df_compare['ErrorRate']= df_compare.apply(lambda row: abs(row['valence_pred']-row['valence_target'])/row['valence_target'], axis=1)

In [29]:
df_compare['ErrorRate'].describe()

count    595.000000
mean       0.527808
std        0.103227
min        0.016969
25%        0.476123
50%        0.545702
75%        0.598228
max        0.729439
Name: ErrorRate, dtype: float64

In [43]:
clfRandom = RandomForestRegressor()
clfRandom.fit(X_train, y_train['valence_mean'])
y_pred_valence_RandomF = clfRandom.predict(X_test)
mean_squared_error(y_test['valence_mean'], y_pred_valence_RandomF)

0.92713670924369751

In [27]:
import pickle

In [47]:
def save_model(model, fileName):
    f = open('./'+fileName, 'wb')
    pickle.dump(model, f)

In [66]:
save_model(clf_valence, "valence_model.sav")
save_model(clf_arousal, "arousal_model.sav")

# Fine Tune

In [49]:
feature = [ 0.65519038, -0.07628927, -0.77544214, -0.03422748, -0.76500356,
        0.62507754,  0.2090739 , -0.23296932,  0.16704597,  0.30380077,
       -0.12093146,  0.84937125,  0.6236305 , -0.73196571,  1.23342518,
        0.64840883, -0.29515861,  0.66941854,  0.67054655,  0.07763418,
        0.35497932,  0.65683281, -0.80359708, -0.09453569,  0.11972583,
        0.34251675,  0.6077862 , -0.7165535 , -0.99956607, -0.43536672,
        0.91738949,  0.36843219,  0.42423349]

In [62]:
feature2= data.loc[2,:]

In [70]:
np.array(feature2)

array([  1.74550000e+02,  -2.16060000e+02,   8.83100000e+01,
        -8.95000000e+00,   4.18900000e+01,   1.89700000e+01,
         7.79000000e+00,  -2.24000000e+00,   5.94000000e+00,
         0.00000000e+00,   5.36000000e+00,   2.07000000e+00,
         2.31000000e+00,  -6.30000000e+00,   4.57000000e+00,
         8.40000000e-01,   4.87000000e+00,  -9.80000000e+00,
        -3.50000000e+00,  -7.39000000e+00,   9.04000000e+00,
         3.60000000e-01,   1.50000000e-01,   2.50000000e-01,
         2.10000000e-01,   4.40000000e-01,   2.10000000e-01,
         9.00000000e-02,   1.80000000e-01,   3.10000000e-01,
         6.70000000e-01,   4.00000000e-01,   5.30000000e-01])

In [63]:
y.loc[2,:]

index           2.0
song_id         4.0
valence_mean    5.7
arousal_mean    5.5
Name: 2, dtype: float64

In [67]:
model = pickle.load(open('./valence_model.sav', 'rb'))

In [68]:
model.predict(feature2)

array([ 5.01894236], dtype=float32)

In [16]:
# Create k-fold
kf = KFold(n_splits=5, shuffle=True, random_state=7).split(X_train)
scorer = make_scorer(mean_squared_error, greater_is_better=True)

In [17]:
def paraSearch(model, x_train, y_train, parameters):
    gs = GridSearchCV(model, parameters, cv=5, scoring=scorer)
    gs.fit(x_train, y_train)
    return gs.best_params_, gs.best_score_, gs.best_estimator_

In [18]:
#XGBRegressor
xgb_params = {
    'colsample_bytree': [0.5, 0.7, 0.9],
    'subsample': [0.5, 0.6, 0.7],
    'learning_rate': [0.075, 0.05, 0.03, 0.01],
    'max_depth': [1, 3, 5, 7, 9, 11, 13],
    'min_child_weight': [1, 3, 5],
    'n_estimators': [50, 100, 150, 200],
    'reg_alpha': [0, 0.1, 0.3, 0.5],
    'reg_lambda': [0.1, 0.5, 0.7, 1],
}

In [19]:
paraSearch(clf, X_train, y_train['valence_mean'], xgb_params)

({'colsample_bytree': 0.7,
  'learning_rate': 0.01,
  'max_depth': 13,
  'min_child_weight': 1,
  'n_estimators': 50,
  'reg_alpha': 0,
  'reg_lambda': 0.5,
  'subsample': 0.5},
 8.4837638608433714,
 XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
        gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=13,
        min_child_weight=1, missing=None, n_estimators=50, nthread=-1,
        objective='reg:linear', reg_alpha=0, reg_lambda=0.5,
        scale_pos_weight=1, seed=0, silent=True, subsample=0.5))

In [None]:
clf_finetune =  XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
        gamma=0, learning_rate=0.075, max_delta_step=0, max_depth=7,
        min_child_weight=3, missing=None, n_estimators=100, nthread=-1,
        objective='reg:linear', reg_alpha=0, reg_lambda=1,
        scale_pos_weight=1, seed=0, silent=True, subsample=0.5)

In [None]:
clf_finetune.fit(X_train, y_train['valence_mean'])

In [None]:
y_pred_finetune = clf_finetune.predict(X_test)

In [None]:
mean_squared_error(y_test['valence_mean'], y_pred_finetune)

In [None]:
from hpsklearn import HyperoptEstimator

In [None]:
from hyperopt import hp

In [None]:
from hpsklearn import any_regressor

In [None]:
def run_experiments(
        experimental_run,
        dataset,
        model_class=XGBR,
        loss=LOG_LOSS,
        test_metric=accuracy_score,
        random_state=None,
        dataset_name=None):
    """
    Basic experimental framework.
    Parameters
    ----------
    experimental_run : list of tuples
        These tuples should have exactly three members: the first one
        of `grid_search`, `randomized_search`, `hyperopt_search`,
        `skopt_gp_minimize`, `skopt_forest_minimize`, or
        `skopt_forest_gbrt`, the second an appropriate `param_grid`
        dict for that function, and the third a dict specifying
        keyword arguments to the search function.
    dataset : (np.array, iterable)
        A dataset (X, y) where `X` has dimension
        `(n_samples, n_features)` and `y` has
         dimension `n_samples`.
    
    model_class : classifier
        A classifier model in the mode of `sklearn`, with at least
        `fit` and `predict` methods operating on things like
        `X` and `y`.
    loss : function or string
        An appropriate loss function or string recognizable by
        `sklearn.cross_validation.cross_val_score`. In `sklearn`, scores
        are positive and losses are negative because they maximize,
        but here we are minimizing so we always want smaller to mean
        better.
    test_metric : function
        An `sklearn.metrics` function.
    random_state : int
    dataset_name : str or None
        Informal name to give the dataset. Purely for
        book-keeping.
    Returns
    -------
    list of dict
       Each dict is a results dictionary of the sort returned
       by `assess`.
    """                    
    X, y = dataset    
    skf = get_cross_validation_indices(
        X, y, random_state=random_state)        
    all_results = []
    # This loop can easily be parallelized, but doing so can
    # be tricky on some systems, since `cross_val_score`
    # calls `joblib` even if `n_jobs=1`, resulting in
    # nested parallel jobs even if there is no actual
    # parallelization elsewhere in the experimental run.
    for search_func, param_grid, kwargs in experimental_run:
        print(search_func.__name__)
        all_results.append(
            assess(
                X, y,                
                search_func=search_func, 
                model_class=XGBClassifier, 
                param_grid=param_grid,
                xval_indices=skf,
                loss=loss,
                test_metric=test_metric,                
                dataset_name=dataset_name,
                search_func_args=kwargs))
    return all_results


In [9]:
a = np.array([1,2,3])

In [5]:
a.var

<function ndarray.var>

In [12]:
a.itemset()

ValueError: itemset must have at least one argument