In [94]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
# https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMModel
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, train_test_split

from hyperopt import fmin, tpe, hp
from sklearn.cross_validation import cross_val_score
from bayes_opt import BayesianOptimization

# SVR optimization methods

In [247]:
def bayesian_optimization_svr(X, y, cv=6, max_iter_svr=100, max_iter_opt=15):
    svr_opt = BayesianOptimization(
        lambda C: cross_val_score(
            LinearSVR(C=float(C), max_iter=max_iter_svr),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='neg_mean_squared_error'
        ).mean(),
        {'C': (0.01, 50)},
        verbose=0
    )
    
    svr_opt.init(10)
    svr_opt.maximize(n_iter=max_iter_opt)
    
    return svr_opt.res['max']['max_params']['C']

def hyperopt_optimization_svr(X, y, cv=6, max_iter_svr=100, max_iter_opt=15):
    space = hp.choice('regressor_type', [
        {
            'type': 'svr',
            'C': hp.uniform('svr_C', 0.01, 50),
            'kernel': 'linear',
        }
    ])

    best = fmin(
        fn=lambda args: cross_val_score(
            LinearSVR(C=args['C'], max_iter=max_iter_svr),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='neg_mean_squared_error'
        ).mean(),
        space=space,
        algo=tpe.suggest,
        max_evals=max_iter_opt
    )

    return best['svr_C']

def evaulate_best_svr_argument(X, y, C_vals: list, cv=6):
    scores = []
    for C in C_vals:
        scores.append(cross_val_score(
            LinearSVR(C=C, max_iter=200),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='neg_mean_squared_error'
        ).mean())
        
    return C_vals[np.argmax(scores)]

# RF regressor optimization code

In [275]:
# hyperopt for lightgbm shows terrible results
def hyperopt_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    space = hp.choice('regressor_type', [
        {
            'type': 'lightgbm',
            'feature_fraction': hp.uniform('feature_fraction', 0.05, 0.95),
            'bagging_fraction': hp.uniform('bagging_fraction', 0.05, 0.95),
            'bagging_freq': hp.uniform('bagging_freq', 1, 50),
            'n_estimators': hp.uniform('n_estimators', 5, 50),
            #'max_bin': hp.uniform('max_bin', )
        }
    ])

    best = fmin(
        fn=lambda args: cross_val_score(
            LGBMRegressor(
                boosting_type='rf', 
                feature_fraction=args['feature_fraction'], 
                bagging_freq=int(args['bagging_freq']), 
                bagging_fraction=args['bagging_fraction'],
                n_estimators=int(args['n_estimators'])
            ),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='neg_mean_squared_error'
        ).mean(),
        space=space,
        algo=tpe.suggest,
        max_evals=max_iter_opt
    )
    
    return best

def bayesian_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    svr_opt = BayesianOptimization(
        lambda feature_fraction, bagging_freq, bagging_fraction, n_estimators: cross_val_score(
            LGBMRegressor(
                boosting_type='rf', 
                feature_fraction=feature_fraction, 
                bagging_freq=int(bagging_freq), 
                bagging_fraction=bagging_fraction,
                n_estimators=int(n_estimators)
            ),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='neg_mean_squared_error'
        ).mean(),
        {'feature_fraction': (0.05, 0.95),
         'bagging_fraction': (0.05, 0.95),
         'bagging_freq': (1, 50),
         'n_estimators': (5, 50) },
        verbose=0
    )
    
    svr_opt.init(10)
    svr_opt.maximize(n_iter=max_iter_opt)
    
    return svr_opt.res['max']['max_params']#['C']

# Facebook Comment Volume Dataset

In [407]:
df = pd.read_csv('datasets/facebook_comments.csv', index_col=0)

In [408]:
print('dataset size: {}'.format(df.shape))

dataset size: (40949, 54)


In [413]:
# encoding categorical variables
# H Local - category
# Post Promotion Status - category
# Base Time - time variable
# Page Category - category

H_Local = OneHotEncoder().fit_transform(df['H Local'].values.reshape(-1, 1)).todense()
Post_Promotion_Status = OneHotEncoder().fit_transform(df['Post Promotion Status'].values.reshape(-1, 1)).todense()
Base_Time = OneHotEncoder().fit_transform(df['Base Time'].values.reshape(-1, 1)).todense()
Page_Category = OneHotEncoder().fit_transform(df['Page Category'].values.reshape(-1, 1)).todense()

In [414]:
y = df.Target.values.reshape(-1, 1)
X = df.drop(['H Local', 'Post Promotion Status', 'Base Time', 'Page Category', 'Target'], axis=1).values
X = np.hstack([X, H_Local, Post_Promotion_Status, Base_Time, Page_Category])

In [416]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [417]:
print('dataset size after preprocessing: {}'.format(X.shape))

dataset size after preprocessing: (40949, 228)


## Training plain model

### * Estimating SVR penalty

In [418]:
X_train, y_train = shuffle(X_train, y_train)

In [419]:
C_hyperopt = hyperopt_optimization_svr(X_train, y_train, cv=6, max_iter_svr=1000, max_iter_opt=15)
C_bayesian = bayesian_optimization_svr(X_train, y_train, cv=6, max_iter_svr=1000, max_iter_opt=15)

In [420]:
C_opt = evaulate_best_svr_argument(X_train, y_train, [C_hyperopt, C_bayesian], cv=6)

### * Estimating LightGBM params

In [421]:
params_opt = bayesian_optimization_lightgbm(X_train, y_train, cv=6, max_iter_opt=50)

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


In [422]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [423]:
models = {
    'linear': LinearRegression(),
    'svr': LinearSVR(C=C_opt, max_iter=1000),
    'forest': LGBMRegressor(boosting_type='rf', **params_opt)
}

In [424]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(mean_squared_error(model.predict(X_crossval_test), y_crossval_test))

In [425]:
print('for pure data (averate scores):')
for model, model_stats in stats.items():
    print('{}, MSE: {}'.format(model, np.mean(model_stats)))

for pure data (averate scores):
linear, MSE: 833.9187652926395
svr, MSE: 16797.00158755496
forest, MSE: 809.8926892998372


In [426]:
print('for pure data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, MSE: {}'.format(name, mean_squared_error(model.predict(X_val), y_val.squeeze())))

for pure data (validation scores):
linear, MSE: 581.8866573212343
svr, MSE: 918.5697861435972
forest, MSE: 681.0062429409885


### Training model with PCA-processed data

In [427]:
# number of components is fixed to 10
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

In [428]:
X_pca, y_train = shuffle(X_pca, y_train)

### * Estimating SVR penalty

In [429]:
C_hyperopt = hyperopt_optimization_svr(X_pca, y_train, cv=6, max_iter_svr=1000, max_iter_opt=15)
C_bayesian = bayesian_optimization_svr(X_pca, y_train, cv=6, max_iter_svr=1000, max_iter_opt=15)

In [430]:
C_opt = evaulate_best_svr_argument(X_pca, y_train, [C_hyperopt, C_bayesian], cv=6)

### * Estimating LightGBM params

In [431]:
params_opt = bayesian_optimization_lightgbm(X_pca, y_train, cv=6, max_iter_opt=50)

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)
  " state: %s" % con

In [432]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

In [433]:
models = {
    'linear': LinearRegression(),
    'svr': LinearSVR(C=C_opt),
    'forest': LGBMRegressor(boosting_type='rf', **params_opt)
}

### * Evaluate with CV

In [435]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_pca, y_train):
        X_crossval_train, X_crossval_test = X_pca[train_ix], X_pca[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(mean_squared_error(model.predict(X_crossval_test), y_crossval_test))
        

In [436]:
print('for decorrelated data using PCA (averate scores)')
for model, model_stats in stats.items():
    print('{}, MSE: {}'.format(model, np.mean(model_stats)))

for decorrelated data using PCA (averate scores)
linear, MSE: 951.8163250196471
svr, MSE: 19583.045959025876
forest, MSE: 948.3350585676535


In [437]:
print('for decorrelated data using PCA (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, MSE: {}'.format(name, mean_squared_error(model.predict(X_val), y_val.squeeze())))

for decorrelated data using PCA (validation scores):
linear, MSE: 1047.3367376483732
svr, MSE: 108636895.59545851
forest, MSE: 1051.8889012558677


## Parkinsons Telemonitoring Data Set

In [357]:
df = pd.read_csv('datasets/parkinsons_updrs.data')

In [358]:
print('dataset size: {}'.format(df.shape))

dataset size: (5875, 22)


In [359]:
subject = df['subject#'].unique().tolist()
subject_binary = np.zeros((df.shape[0], len(subject)))
for k, i in df.iterrows():
    subject_binary[k, int(i['subject#']) - 1] = 1

In [360]:
# replacing categorical features with binary values
y = df.total_UPDRS.values
X = df.drop(['motor_UPDRS', 'total_UPDRS', 'subject#'], axis=1).values
X = np.concatenate([subject_binary, X], axis=1)

In [361]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [362]:
print('dataset size after preprocessing: {}'.format(X.shape))

dataset size after preprocessing: (5875, 61)


## Training plain model

### * Estimating SVR penalty

In [363]:
X_train, y_train = shuffle(X_train, y_train)

In [364]:
C_hyperopt = hyperopt_optimization_svr(X_train, y_train, cv=6, max_iter_svr=1000, max_iter_opt=15)
C_bayesian = bayesian_optimization_svr(X_train, y_train, cv=6, max_iter_svr=1000, max_iter_opt=15)

In [365]:
C_opt = evaulate_best_svr_argument(X_train, y_train, [C_hyperopt, C_bayesian], cv=6)

### * Estimating LightGBM params

In [366]:
params_opt = bayesian_optimization_lightgbm(X_train, y_train, cv=6, max_iter_opt=50)

In [367]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [368]:
models = {
    'linear': LinearRegression(),
    'svr': LinearSVR(C=C_opt, max_iter=1000),
    'forest': LGBMRegressor(boosting_type='rf', **params_opt)
}

In [369]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(mean_squared_error(model.predict(X_crossval_test), y_crossval_test))

In [370]:
print('for pure data (averate scores):')
for model, model_stats in stats.items():
    print('{}, MSE: {}'.format(model, np.mean(model_stats)))

for pure data (averate scores):
linear, MSE: 6.749107616366471
svr, MSE: 13.981509078246749
forest, MSE: 7.709238651848022


In [371]:
print('for pure data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, MSE: {}'.format(name, mean_squared_error(model.predict(X_val), y_val.squeeze())))

for pure data (validation scores):
linear, MSE: 6.717286051699213
svr, MSE: 9.22396102296615
forest, MSE: 6.81742984876102


### Training model with PCA-processed data

In [372]:
# number of components is fixed to 10
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

In [373]:
X_pca, y_train = shuffle(X_pca, y_train)

### * Estimating SVR penalty

In [374]:
C_hyperopt = hyperopt_optimization_svr(X_pca, y_train, cv=6, max_iter_svr=1000, max_iter_opt=15)
C_bayesian = bayesian_optimization_svr(X_pca, y_train, cv=6, max_iter_svr=1000, max_iter_opt=15)

In [375]:
C_opt = evaulate_best_svr_argument(X_pca, y_train, [C_hyperopt, C_bayesian], cv=6)

### * Estimating LightGBM params

In [376]:
params_opt = bayesian_optimization_lightgbm(X_pca, y_train, cv=6, max_iter_opt=50)

In [377]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

In [378]:
models = {
    'linear': LinearRegression(),
    'svr': LinearSVR(C=C_opt),
    'forest': LGBMRegressor(boosting_type='rf', **params_opt)
}

### * Evaluate with CV

In [379]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_pca, y_train):
        X_crossval_train, X_crossval_test = X_pca[train_ix], X_pca[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(mean_squared_error(model.predict(X_crossval_test), y_crossval_test))
        

In [380]:
print('for decorrelated data using PCA (averate scores)')
for model, model_stats in stats.items():
    print('{}, MSE: {}'.format(model, np.mean(model_stats)))

for decorrelated data using PCA (averate scores)
linear, MSE: 90.62399381710603
svr, MSE: 130.3804838348584
forest, MSE: 7.038721579129563


In [381]:
print('for decorrelated data using PCA (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, MSE: {}'.format(name, mean_squared_error(model.predict(X_val), y_val.squeeze())))

for decorrelated data using PCA (validation scores):
linear, MSE: 107.93276349773285
svr, MSE: 199.71396217757305
forest, MSE: 114.61438161337999


## Energy efficiency Data Set

In [382]:
df = pd.read_excel('./datasets/ENB2012_data.xlsx')

In [383]:
print('dataset size: {}'.format(df.shape))

dataset size: (768, 10)


In [384]:
subject = df['X6'].unique().tolist()
subject_map = dict(zip(subject, range(len(subject))))
subject_binary = np.zeros((df.shape[0], len(subject)))
for k, i in df.iterrows():
    subject_binary[k, subject_map[i['X6']]] = 1

In [385]:
# replacing categorical features with binary values
y = df.Y1.values
X = df.drop(['Y1', 'Y2', 'X6'], axis=1).values
X = np.concatenate([subject_binary, X], axis=1)

In [386]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [387]:
print('dataset size after preprocessing: {}'.format(X.shape))

dataset size after preprocessing: (768, 11)


## Training plain model

### * Estimating SVR penalty

In [388]:
X_train, y_train = shuffle(X_train, y_train)

In [389]:
C_hyperopt = hyperopt_optimization_svr(X_train, y_train, cv=6, max_iter_svr=1000, max_iter_opt=15)
C_bayesian = bayesian_optimization_svr(X_train, y_train, cv=6, max_iter_svr=1000, max_iter_opt=15)

In [390]:
C_opt = evaulate_best_svr_argument(X_train, y_train, [C_hyperopt, C_bayesian], cv=6)

### * Estimating LightGBM params

In [391]:
params_opt = bayesian_optimization_lightgbm(X_train, y_train, cv=6, max_iter_opt=50)

  " state: %s" % convergence_dict)


In [392]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [393]:
models = {
    'linear': LinearRegression(),
    'svr': LinearSVR(C=C_opt, max_iter=1000),
    'forest': LGBMRegressor(boosting_type='rf', **params_opt)
}

In [394]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(mean_squared_error(model.predict(X_crossval_test), y_crossval_test))

In [395]:
print('for pure data (averate scores):')
for model, model_stats in stats.items():
    print('{}, MSE: {}'.format(model, np.mean(model_stats)))

for pure data (averate scores):
linear, MSE: 8.668724774180173
svr, MSE: 24.33405747205793
forest, MSE: 1.172100281816479


In [396]:
print('for pure data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, MSE: {}'.format(name, mean_squared_error(model.predict(X_val), y_val.squeeze())))

for pure data (validation scores):
linear, MSE: 10.08179054280034
svr, MSE: 16.21476784384871
forest, MSE: 1.5204760853069244


### Training model with PCA-processed data

In [397]:
# number of components is fixed to 4
pca = PCA(n_components=4)
X_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

### * Estimating SVR penalty

In [398]:
X_pca, y_train = shuffle(X_pca, y_train)

In [399]:
C_hyperopt = hyperopt_optimization_svr(X_pca, y_train, cv=6, max_iter_svr=1000, max_iter_opt=15)
C_bayesian = bayesian_optimization_svr(X_pca, y_train, cv=6, max_iter_svr=1000, max_iter_opt=15)

In [400]:
C_opt = evaulate_best_svr_argument(X_pca, y_train, [C_hyperopt, C_bayesian], cv=6)

### * Estimating LightGBM params

In [401]:
params_opt = bayesian_optimization_lightgbm(X_pca, y_train, cv=6, max_iter_opt=50)

In [402]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

In [403]:
models = {
    'linear': LinearRegression(),
    'svr': LinearSVR(C=C_opt),
    'forest': LGBMRegressor(boosting_type='rf', **params_opt)
}

### * Evaluate with CV

In [404]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_pca, y_train):
        X_crossval_train, X_crossval_test = X_pca[train_ix], X_pca[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(mean_squared_error(model.predict(X_crossval_test), y_crossval_test))
        

In [405]:
print('for decorrelated data using PCA (averate scores)')
for model, model_stats in stats.items():
    print('{}, MSE: {}'.format(model, np.mean(model_stats)))

for decorrelated data using PCA (averate scores)
linear, MSE: 21.34162669831759
svr, MSE: 33.71467589634579
forest, MSE: 7.584906645118648


In [406]:
print('for decorrelated data using PCA (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, MSE: {}'.format(name, mean_squared_error(model.predict(X_val), y_val.squeeze())))

for decorrelated data using PCA (validation scores):
linear, MSE: 113.02315052557574
svr, MSE: 175.97334396754545
forest, MSE: 122.73967175546818
