In [1]:
%matplotlib inline

In [2]:
import sklearn
import xgboost
import vecstack
import dill

  from . import _csparsetools
  from ._shortest_path import shortest_path, floyd_warshall, dijkstra,\
  from ._tools import csgraph_to_dense, csgraph_from_dense,\
  from ._traversal import breadth_first_order, depth_first_order, \
  from ._min_spanning_tree import minimum_spanning_tree
  from ._reordering import reverse_cuthill_mckee, maximum_bipartite_matching, \
  from .murmurhash import murmurhash3_32
  from ._solve_toeplitz import levinson
  from ._decomp_update import *
  from ._ufuncs import *
  from ._ellip_harm_2 import _ellipsoid, _ellipsoid_norm
  from ._group_columns import group_dense, group_sparse
  from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime
  from pandas._libs import (hashtable as _hashtable,
  from pandas._libs import algos, lib
  from pandas._libs import hashing, tslib
  from pandas._libs import (lib, index as libindex, tslib as libts,
  import pandas._libs.tslibs.offsets as liboffsets
  from pandas._libs import algos as libalgos, ops as lib

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [4]:
def read_df(filename, valtype):
    df = pd.read_csv(filename, low_memory=False, dtype=valtype)
    return df

In [5]:
from collections import OrderedDict, defaultdict

def create_combined_df(input_dict):
    fdf = pd.DataFrame()
    cols = OrderedDict()
    for k, v in input_dict.items():
        df = read_df('./data/'+k, v)
        colnames = [c for c in df.columns if c not in ['None', 'Unnamed: 0']]
        cols[k] = colnames
        fdf = pd.concat([fdf, df], axis=1)
    
    # fdf = fdf.DataFrame(fdf, columns=cols)
    fdf = fdf.drop(['None', 'Unnamed: 0'], axis=1)
    return fdf, cols

# Read Dependent feature data

In [6]:
base = './data/'

In [7]:
y_train = pd.read_csv(base+'fin_train_indeps.csv', encoding='utf8')
y_train = y_train.drop(['Unnamed: 0'], axis=1)
y_train.sample()

Unnamed: 0,review_scores_rating
78999,97.0


In [8]:
np.unique(y_train)

array([ 80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,  89.,  90.,
        91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99., 100.])

In [9]:
y_test = pd.read_csv(base+'fin_test_indeps.csv', encoding='utf8')
y_test = y_test.drop(['Unnamed: 0'], axis=1)
y_test.sample()

Unnamed: 0,review_scores_rating
27445,96.0


In [10]:
np.unique(y_test)

array([ 80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,  89.,  90.,
        91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99., 100.])

# Read independent feature data

In [11]:
comb_train = pd.read_csv(base+'fin_cat_train_deps.csv')
comb_train = comb_train.drop(['Unnamed: 0'], axis=1)
comb_test = pd.read_csv(base+'fin_cat_test_deps.csv')
comb_test = comb_test.drop(['Unnamed: 0'], axis=1)

# Split Test data into Holdout and Validation sets

In [12]:
np.random.seed(1234)
idxs = np.random.randint(0, y_test.shape[0], y_test.shape[0]//2)

In [13]:
y_validation = y_test.iloc[~y_test.index.isin(idxs)]
y_holdout = y_test.iloc[idxs]

In [14]:
comb_validation = comb_test.iloc[~comb_test.index.isin(idxs)]
comb_holdout = comb_test.iloc[idxs]

In [15]:
# Objective: Implement a wrapper for vecstack
# Input scikit model instances and training data
# Output: Predictions

import xgboost
import dill as pickle
from copy import deepcopy 
from vecstack import StackingTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib

class VecstackRunner():

    def __init__(self, X, y, Xt, yt, l1_estimators, l2_estimator, prfx='', metric='rmse', regression=True, nfolds=10, verbose=2):
        self.X = X
        self.y = y
        self.Xt = Xt
        self.yt = yt
        self.yp = None
        self.ytp = None
        self.l1 = l1_estimators
        self.l2 = l2_estimator
        self.prfx = prfx
        self.metname = metric
        self.nf = nfolds
        self.v = verbose
        self.is_reg = regression
        self.stack = None
    
    def mse(self, actual, pred):
        return mean_squared_error(actual, pred)
    
    def rsq(self, actual, pred):
        return r2_score(actual, pred)
    
    def get_metric_calc(self, actual, pred):
        if 'rmse' == self.metname:
            return self.mse(actual, pred)
        elif 'r2' == self.metname:
            return self.rsq(actual, pred)
    
    def build_stack(self):
        self.stack = StackingTransformer(self.l1, 
                                        regression=self.is_reg, 
                                        shuffle=True, 
                                        n_folds=self.nf, 
                                        metric=self.get_metric_calc, 
                                        verbose=self.v)
    
    def fit_stack(self):
        return self.stack.fit(self.X, self.y)
    
    def transform(self):
        self.X = self.stack.transform(self.X)
        self.Xt = self.stack.transform(self.Xt)
    
    def fit_l2(self):
        self.l2.fit(self.X, self.y)
    
    def predict(self):
        self.yp = self.l2.predict(self.X)
        self.ytp = self.l2.predict(self.Xt)
    
    def calculate_error(self, calc=None):
        if calc:
            y_err = calc(self.y, self.yp)
            yp_err = calc(self.y, self.ytp)
        else:
            y_err = self.get_metric_calc(self.y, self.yp)
            yt_err = self.get_metric_calc(self.yt, self.ytp)
            
        return [('Train err', y_err), ('Test err', yt_err)]
    
    def save_to_disk(self):
        sname = self.prfx+'_vecstack_stack.pkl'
        cname = self.prfx+'_vecstack_clf.pkl'
        
        with open('./data/'+sname, 'wb') as f:
            pickle.dump(self.stack,f)
        
        with open('./data/'+cname, 'wb') as f:
            pickle.dump(self.l2, f)
    
    def run(self):
        self.build_stack()
        self.fit_stack()
        self.transform()
        self.fit_l2()
        self.predict()
        self.save_to_disk()
        return self.calculate_error()

In [16]:
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR

en = ElasticNet(alpha=0.01)
# lass = Lasso(alpha=0.001, fit_intercept=False, max_iter=1000)
ridge = Ridge(normalize=False, alpha=0.1, max_iter=1000)
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.01, 
                               subsample=0.75, max_depth=15)
ab = AdaBoostRegressor(n_estimators=100, learning_rate=0.01)
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.05, 
                           gamma=0, subsample=0.5, 
                           max_depth=15, objective='reg:linear')
randf = RandomForestRegressor(n_estimators=100, min_samples_split=10)
svr_rbf = SVR(kernel='rbf')

  from ..utils.seq_dataset import ArrayDataset, CSRDataset
  from ..utils import arrayfuncs, as_float_array, check_X_y, deprecated
  from . import cd_fast
  from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber
  from .sag_fast import sag
  from . import libsvm, liblinear
  from . import libsvm_sparse
  from .ball_tree import BallTree
  from .kd_tree import KDTree
  from ._criterion import Criterion
  from numpy.core.umath_tests import inner1d
  from ._gradient_boosting import predict_stages


In [18]:
from copy import deepcopy

In [19]:
estimators = [
                ('en', en), 
                # ('lass', lass), 
                ('ridge', ridge),
                ('gb', gb), 
                ('ab', ab), 
                ('randf', randf), 
             ]
base = './data/'
prf = 'cat'
vr = VecstackRunner(comb_train, y_train, 
                    comb_validation, y_validation, 
                    estimators, xgb, prfx=prf, nfolds=10)
vr.run()

# print mean_squared_error(y_validation, tc.predict(ts.transform(comb_validation)))

  y = column_or_1d(y, warn=True)


task:         [regression]
metric:       [get_metric_calc]
variant:      [A]
n_estimators: [5]

estimator  0: [en: ElasticNet]
    fold  0:  [24.43272087]
    fold  1:  [23.66970580]
    fold  2:  [24.29031372]
    fold  3:  [24.06383952]
    fold  4:  [24.51103422]
    fold  5:  [24.22888244]
    fold  6:  [23.55362957]
    fold  7:  [24.59675096]
    fold  8:  [24.26004300]
    fold  9:  [23.71747209]
    ----
    MEAN:     [24.13243922] + [0.35009191]

estimator  1: [ridge: Ridge]
    fold  0:  [23.91862928]
    fold  1:  [23.23719485]
    fold  2:  [23.81388740]
    fold  3:  [23.48697014]
    fold  4:  [24.06513239]
    fold  5:  [23.69875777]
    fold  6:  [23.09342931]
    fold  7:  [24.13301594]
    fold  8:  [23.77186778]
    fold  9:  [23.28556422]
    ----
    MEAN:     [23.65044491] + [0.34080176]

estimator  2: [gb: GradientBoostingRegressor]
    fold  0:  [23.42958411]
    fold  1:  [22.57304576]
    fold  2:  [23.04550936]
    fold  3:  [22.74823486]
    fold  4:  [23.23

[('Train err', 19.71447780732107), ('Test err', 21.3257782626964)]

In [20]:
with open(base+prf+'_vecstack_stack.pkl', 'rb') as f:
    ts = pickle.load(f)

with open(base+prf+'_vecstack_clf.pkl', 'rb') as f:
    tc = pickle.load(f)


Xt = ts.transform(comb_train)
Xp = tc.predict(Xt)
y = y_train

Xvt = ts.transform(comb_test)
Xvp = tc.predict(Xvt)
yv = y_test

print '####################'
print 'Column: %s' % prf
print 'Train: '
print 'MSE: %s' % mean_squared_error(y, Xp)
print 'R2: %s' % r2_score(y, Xp) 
print 'Test: '
print 'MSE: %s' % mean_squared_error(yv, Xvp)
print 'R2: %s' % r2_score(yv, Xvp)
print '####################\n'

Train set was detected.
Transforming...

estimator  0: [en: ElasticNet]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    model from fold  5: done
    model from fold  6: done
    model from fold  7: done
    model from fold  8: done
    model from fold  9: done
    ----
    DONE

estimator  1: [ridge: Ridge]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    model from fold  5: done
    model from fold  6: done
    model from fold  7: done
    model from fold  8: done
    model from fold  9: done
    ----
    DONE

estimator  2: [gb: GradientBoostingRegressor]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    model from fold  5: done
    model from fold  6: done
    model from fold  7: done
   

In [21]:
tc.feature_importances_

array([0.33665705, 0.19491045, 0.18456122, 0.10439015, 0.17948112],
      dtype=float32)