In [0]:
%matplotlib inline

In [0]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [0]:
def read_df(filename, valtype):
    """Load a CSV into a DataFrame.

    ``filename`` is forwarded to ``pandas.read_csv`` and ``valtype`` is used
    as its ``dtype`` argument; ``low_memory`` is switched off.
    """
    return pd.read_csv(filename, dtype=valtype, low_memory=False)

In [0]:
from collections import OrderedDict, defaultdict

def create_combined_df(input_dict, data_dir='./data/'):
    """Read several CSV files and join them column-wise into one DataFrame.

    Parameters
    ----------
    input_dict : dict
        Maps a CSV file name (relative to ``data_dir``) to the ``dtype``
        value handed to ``read_df`` for that file.
    data_dir : str, optional
        Directory that contains the CSV files. Defaults to ``'./data/'``.

    Returns
    -------
    fdf : pandas.DataFrame
        All files concatenated along the column axis, with the helper
        columns ``'None'`` and ``'Unnamed: 0'`` removed when present.
    cols : OrderedDict
        For every file name, the list of its columns excluding the helper
        columns, in the order the files were read.
    """
    import os

    helper_cols = ['None', 'Unnamed: 0']
    frames = []
    cols = OrderedDict()
    for k, v in input_dict.items():
        df = read_df(os.path.join(data_dir, k), v)
        cols[k] = [c for c in df.columns if c not in helper_cols]
        frames.append(df)

    # pd.concat rejects an empty list, so short-circuit when nothing was read.
    if not frames:
        return pd.DataFrame(), cols

    # Concatenate once instead of growing the frame inside the loop.
    fdf = pd.concat(frames, axis=1)
    # errors='ignore': not every file necessarily carries the helper columns.
    fdf = fdf.drop(helper_cols, axis=1, errors='ignore')
    return fdf, cols

# Read dependent variable (target) data

In [1]:
# Directory prefix for every CSV read below. The trailing slash matters:
# paths are built with plain string concatenation (base + 'file.csv').
base = './data/'

In [0]:
y_train = pd.read_csv(base+'fin_train_indeps.csv', encoding='utf8')
y_train = y_train.drop(['Unnamed: 0'], axis=1)
y_train.sample()

In [0]:
np.unique(y_train)

In [0]:
y_test = pd.read_csv(base+'fin_test_indeps.csv', encoding='utf8')
y_test = y_test.drop(['Unnamed: 0'], axis=1)
y_test.sample()

In [0]:
np.unique(y_test)

# Read independent variable (feature) data

In [0]:
num_train = pd.read_csv(base+'fin_num_train_deps.csv')
num_train = num_train.drop(['Unnamed: 0'], axis=1)
num_test = pd.read_csv(base+'fin_num_test_deps.csv')
num_test = num_test.drop(['Unnamed: 0'], axis=1)

cat_train = pd.read_csv(base+'fin_cat_train_deps.csv')
cat_train = cat_train.drop(['Unnamed: 0'], axis=1)
cat_test = pd.read_csv(base+'fin_cat_test_deps.csv')
cat_test = cat_test.drop(['Unnamed: 0'], axis=1)

amen_train = pd.read_csv(base+'fin_amen_train_deps.csv')
amen_train = amen_train.drop(['Unnamed: 0'], axis=1)
amen_test = pd.read_csv(base+'fin_amen_test_deps.csv')
amen_test = amen_test.drop(['Unnamed: 0'], axis=1)

In [0]:
comb_train = pd.read_csv(base+'fin_comb_train_deps.csv')
comb_train = comb_train.drop(['Unnamed: 0'], axis=1)
comb_test = pd.read_csv(base+'fin_comb_test_deps.csv')
comb_test = comb_test.drop(['Unnamed: 0'], axis=1)

# Split Test data into Holdout and Validation sets

In [0]:
# Pick half of the test-row positions for the holdout set.
# Sampling WITHOUT replacement keeps the holdout free of duplicated rows and
# makes holdout + validation an exact partition of the test set
# (np.random.randint would draw positions with replacement).
np.random.seed(1234)
idxs = np.random.choice(y_test.shape[0], y_test.shape[0]//2, replace=False)

In [0]:
# Validation = rows whose index label is not in idxs; holdout = rows at positions idxs.
# NOTE(review): index.isin(idxs) compares index *labels* against the positional
# values in idxs, so this assumes a default 0..n-1 index — confirm.
y_validation = y_test.iloc[~y_test.index.isin(idxs)]
y_holdout = y_test.iloc[idxs]

In [0]:
# Apply the same idxs-based split to every feature table so rows stay
# aligned with y_validation / y_holdout.
num_validation = num_test.iloc[~num_test.index.isin(idxs)]
num_holdout = num_test.iloc[idxs]

cat_validation = cat_test.iloc[~cat_test.index.isin(idxs)]
cat_holdout = cat_test.iloc[idxs]

amen_validation = amen_test.iloc[~amen_test.index.isin(idxs)]
amen_holdout = amen_test.iloc[idxs]

comb_validation = comb_test.iloc[~comb_test.index.isin(idxs)]
comb_holdout = comb_test.iloc[idxs]

In [0]:
def run_model(X, y, Xt, yt, model):
    """Fit ``model`` on (X, y) and score it on both the train and test data.

    Parameters
    ----------
    X, Xt : train / test feature tables passed to ``model.fit`` / ``model.predict``.
    y, yt : train / test targets. 1-D inputs are turned into an (n, 1)
        column; inputs with more dimensions are passed through unchanged.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    model : the same estimator instance, now fitted.
    res : dict
        ``res['train']`` and ``res['test']`` each hold ``'pred'`` (raw
        predictions), ``'mse'`` and ``'r2'``.

    Notes
    -----
    ``mean_squared_error`` and ``r2_score`` are looked up as module-level
    names when the function is called.
    """
    def _as_column(target):
        # np.asarray makes the reshape work for pandas Series as well as
        # ndarrays (Series has no usable .reshape).
        if len(target.shape) == 1:
            return np.asarray(target).reshape(-1, 1)
        return target

    y = _as_column(y)
    yt = _as_column(yt)

    res = {'train': {}, 'test': {}}

    model.fit(X, y)

    train_pred = model.predict(X)
    res['train']['pred'] = train_pred

    test_pred = model.predict(Xt)
    res['test']['pred'] = test_pred

    for name, actual, pred in (('train', y, train_pred), ('test', yt, test_pred)):
        # np.asarray accepts DataFrames and ndarrays alike; the original
        # .values access failed once a 1-D target had become an ndarray.
        act = np.asarray(actual)
        prd = np.asarray(pred).ravel()

        res[name]['mse'] = mean_squared_error(act, prd)
        res[name]['r2'] = r2_score(act, prd)

    return model, res

In [0]:
def run_model_resgen(X, y, Xt, yt, model, sample_sizes=[]):
    """Yield ``run_model`` results for training subsets of increasing size.

    For every entry of ``sample_sizes`` the global NumPy seed is reset to
    1234, that many row positions are drawn with ``np.random.randint``, and
    ``run_model`` is evaluated on the selected rows of ``X`` / ``y`` against
    the full ``Xt`` / ``yt``. The same ``model`` instance is used each time.
    """
    for n_rows in sample_sizes:
        # Same seed on every pass, so each draw starts from the same state.
        np.random.seed(1234)
        picked = np.random.randint(0, X.shape[0], size=n_rows)
        yield run_model(X.iloc[picked], y.iloc[picked], Xt, yt, model)

In [0]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Training-set sizes for the learning curves: 1000, 3500, ... and finally
# the full number of training rows.
sample_sizes = list(range(1000, num_train.shape[0], 2500))
sample_sizes.append(num_train.shape[0])

# Lasso regressor shared by the learning-curve cells.
regr = linear_model.Lasso(alpha=0.01, fit_intercept=True, max_iter=10000)


def plot_model_results(X, y, Xt, yt, model, sample_sizes):
    """Plot train/test MSE and R2 against training sample size.

    Runs ``run_model_resgen`` for every entry of ``sample_sizes``, draws MSE
    on the left axis and R2 on a twin right axis, and shows the figure.

    Parameters
    ----------
    X, y : training features / target to subsample from.
    Xt, yt : evaluation features / target.
    model : estimator passed through to ``run_model_resgen``.
    sample_sizes : sequence of int, used as the x-axis values.

    Returns
    -------
    The model yielded for the last sample size, or ``None`` when
    ``sample_sizes`` is empty.
    """
    mse_train, mse_test = [], []
    r2_train, r2_test = [], []

    # Track the last fitted model in an explicit, pre-initialised name rather
    # than relying on a leaked loop variable.
    fitted = None
    for fitted, r in run_model_resgen(X, y, Xt, yt, model, sample_sizes):
        mse_train.append(r['train']['mse'])
        mse_test.append(r['test']['mse'])
        r2_train.append(r['train']['r2'])
        r2_test.append(r['test']['r2'])

    fig = plt.figure(figsize=(10, 7))
    ax1 = fig.add_subplot(111)
    alph = 0.75
    # Colour and line style are given once each (no format string that
    # repeats the colour).
    ln1 = ax1.plot(sample_sizes, mse_train, linestyle='-', color='b', alpha=alph, label='Train MSE')
    ln2 = ax1.plot(sample_sizes, mse_test, linestyle='--', color='b', alpha=alph, label='Test MSE')
    ax1.set_xlabel('sample size')
    ax1.set_ylabel('MSE', color='b')
    ax1.tick_params('y', colors='b')

    # Second y-axis sharing the same x-axis for the R2 curves.
    ax2 = ax1.twinx()
    ln3 = ax2.plot(sample_sizes, r2_train, linestyle='-', color='g', alpha=alph, label='Train R2')
    ln4 = ax2.plot(sample_sizes, r2_test, linestyle='--', color='g', alpha=alph, label='Test R2')
    ax2.set_ylabel('R2', color='g')
    ax2.tick_params('y', colors='g')

    # One combined legend for the lines of both axes.
    lns = ln1 + ln2 + ln3 + ln4
    labs = [l.get_label() for l in lns]
    ax1.legend(lns, labs)

    plt.title('Train and Test MSE and R2 vs. sample size')
    plt.show()

    return fitted

In [0]:
# Learning curve on the numeric features. The same `regr` instance is passed
# to every call below, and `model` is rebound by each cell.
model = plot_model_results(num_train, y_train, num_validation, y_validation, regr, sample_sizes)

In [0]:
# Learning curve on the categorical features.
model = plot_model_results(cat_train, y_train, cat_validation, y_validation, regr, sample_sizes)

In [0]:
# Learning curve on the amenity features.
model = plot_model_results(amen_train, y_train, amen_validation, y_validation, regr, sample_sizes)

In [0]:
# Learning curve on the combined feature table.
model = plot_model_results(comb_train, y_train, comb_validation, y_validation, regr, sample_sizes)

In [2]:
import vecstack

In [0]:
import xgboost
from vecstack import StackingTransformer
from vecstack import stacking
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR

def rmse(train, pred):
    """Return the root-mean-squared error of ``pred`` against ``train``.

    The square root is applied to ``mean_squared_error`` so the value matches
    the function's name; previously the plain MSE was returned.
    """
    return np.sqrt(mean_squared_error(train, pred))


def rsq(train, pred):
    """Return ``r2_score(train, pred)``, the R2 of predictions against actual values."""
    score = r2_score(train, pred)
    return score

# Candidate regressors. Only `ridge` and `randf` go into the level-1
# estimator list in this cell; `lass` and `svr_rbf` are not used here.
lass = Lasso(alpha=0.001, fit_intercept=False, max_iter=10000)

ridge = Ridge(normalize=False, alpha=0.1, max_iter=10000)

gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.01, 
                               subsample=0.75, max_depth=15)

xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.05, 
                           gamma=0, subsample=0.5, colsample_bytree=0.5, 
                           max_depth=15, objective='reg:linear')

randf = RandomForestRegressor(n_estimators=100, min_samples_split=10)

svr_rbf = SVR(kernel='rbf')


# Level-1 estimators as (name, estimator) pairs.
# NOTE(review): 'gbf' is a fresh GradientBoostingRegressor, not the `gb`
# configured above. In scikit-learn its `alpha` is presumably the quantile
# for the huber/quantile losses rather than a regularisation strength —
# confirm this is intended.
estimators = [
              ('ridge', ridge),
              ('gbf', GradientBoostingRegressor(alpha=0.001)), 
              ('randf', randf), 
             ]

# 10-fold stacking transformer for regression; `rmse` is passed as its metric.
stack = StackingTransformer(estimators, regression=True, 
                            shuffle=True, n_folds=10, 
                            metric=rmse, verbose=2, 
                           )

stack = stack.fit(comb_train, y_train)

# Get stacked features
S_train = stack.transform(comb_train)




In [0]:
# Stacked features for the validation rows, from the fitted `stack`.
S_test = stack.transform(comb_validation)

# Use 2nd level estimator on Stacked data
# model = RandomForestRegressor(n_estimators=100, min_impurity_decrease=0.05)
model = xgb

# Fit the level-2 model on the stacked training features.
model.fit(S_train, y_train)

# In-sample predictions, used for the train metrics printed in the next cell.
y_train_pred = model.predict(S_train)

# Make predictions using the testing set
y_pred = model.predict(S_test)

In [0]:
# The mean squared error
# Every print is written as a call with a single pre-formatted string, which
# produces the same output under Python 2 and Python 3.

# The mean squared error
print("Train Mean squared error: %.2f"
      % mean_squared_error(y_train, y_train_pred))
# R2 score (r2_score): 1 is perfect prediction
print('Train Variance score: %.2f' % r2_score(y_train, y_train_pred))

# The mean squared error
print("Validation Mean squared error: %.2f"
      % mean_squared_error(y_validation, y_pred))
# R2 score (r2_score): 1 is perfect prediction
print('Validation Variance score: %.2f' % r2_score(y_validation, y_pred))


In [0]:
# Stacked features for the validation rows, from the fitted `stack`.
S_test = stack.transform(comb_validation)

# Use 2nd level estimator on Stacked data
# model = RandomForestRegressor(n_estimators=100, min_impurity_decrease=0.05)
model = gb

# Fit the level-2 model on the stacked training features; this rebinds
# `model`, `y_train_pred` and `y_pred` from the previous level-2 run.
model.fit(S_train, y_train)

y_train_pred = model.predict(S_train)

# Make predictions using the testing set
y_pred = model.predict(S_test)


In [0]:
# The mean squared error
print("Train Mean squared error: %.2f"
      % mean_squared_error(y_train, y_train_pred))
# Explained variance score: 1 is perfect prediction
print 'Train Variance score: %.2f' % r2_score(y_train, y_train_pred) 


In [0]:
# The mean squared error
print("Validation Mean squared error: %.2f"
      % mean_squared_error(y_validation, y_pred))
# Explained variance score: 1 is perfect prediction
print 'Validation Variance score: %.2f' % r2_score(y_validation, y_pred)

In [0]:
# Sizes of the two halves produced by the holdout/validation split.
print(y_holdout.shape)
print(y_validation.shape)

In [0]:
# Element-wise train residuals. Both operands are flattened to 1-D first:
# y_train is a one-column table and must not be broadcast against the
# 1-D prediction vector.
np.subtract(y_train.values.ravel(), np.ravel(y_train_pred))

In [0]:
# Plot the residuals
print 'train residuals:'
plt.scatter(y_train.values, , c='b', marker='o', alpha=0.5)
plt.show()


In [0]:
def wrap_vec_stack(X, y, Xt, yt, sample_sizes, sstack, mmodel):
    """Fit a stacking transformer on subsamples and plot results per sample size.

    For each entry of ``sample_sizes`` the NumPy seed is reset to 1234, that
    many row positions are drawn with ``np.random.randint``, ``sstack`` is
    fitted on the selected rows, and ``plot_model_results`` is called with
    ``mmodel`` and the single sample size. Nothing is returned.

    NOTE(review): ``S_train`` and ``S_test`` are computed but never used —
    ``plot_model_results`` receives the raw ``Xs`` / ``Xt``, so the level-2
    model is not trained on the stacked features. Presumably the stacked
    arrays were meant to be passed on; confirm the intent before using this.
    """

    for ss in sample_sizes:
        # Same seed on every pass, so each draw starts from the same state.
        np.random.seed(1234)
        rand_idxs = np.random.randint(0, X.shape[0], size=ss)
        Xs = X.iloc[rand_idxs]
        ys = y.iloc[rand_idxs]

        # `sstack` itself is fitted; `stack` is whatever its fit() returns.
        stack = sstack.fit(Xs, ys)
        
        # Get stacked features
        S_train = stack.transform(Xs)
        S_test = stack.transform(Xt)

        # Use 2nd level estimator on Stacked data
        # model = RandomForestRegressor(n_estimators=100, min_impurity_decrease=0.05)
        model = mmodel
        # NOTE(review): raw features are passed here, not S_train / S_test.
        plot_model_results(Xs, ys, Xt, yt, model, [ss])


# wrap_vec_stack(comb_train, y_train, comb_validation, y_validation, sample_sizes, stack, xgb)

In [0]:
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, make_scorer, roc_auc_score

# Baseline: predict the training-set mean for every row.
m_train = np.mean(y_train)
y_train_baseline_pred = np.array([m_train] * y_train.shape[0])
y_validation_baseline_pred = np.array([m_train] * y_validation.shape[0])


In [0]:

print y_train.shape[0]

In [0]:
print y_train.shape
print y_train_baseline_pred.shape

In [0]:
# The mean squared error
print("Train Mean squared error: %.2f"
      % mean_squared_error(y_train, y_train_baseline_pred))
# Explained variance score: 1 is perfect prediction
print('Train Validation Variance score: %.2f' % r2_score(y_train, y_train_baseline_pred))

In [0]:


# The mean squared error
print("Validation Mean squared error: %.2f"
      % mean_squared_error(y_validation, y_validation_baseline_pred))
# Explained variance score: 1 is perfect prediction
print('Validation Variance score: %.2f' % r2_score(y_validation, y_validation_baseline_pred))


In [0]:
# Objective: Implement a wrapper for vecstack
# Input: scikit-learn model instances and training data
# Output: Predictions

import vecstack
import dill

import xgboost
import dill as pickle
from copy import deepcopy 
from vecstack import StackingTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib

class VecstackRunner():
    """Run a two-level stacking pipeline and persist the fitted pieces.

    Level 1 is a ``StackingTransformer`` built from ``l1_estimators``; level 2
    is ``l2_estimator`` fitted on the stacked features. ``run()`` executes all
    steps in order and returns the train/test error.

    Parameters
    ----------
    X, y : training features / target.
    Xt, yt : evaluation features / target.
    l1_estimators : estimator list handed to ``StackingTransformer``.
    l2_estimator : estimator exposing ``fit`` and ``predict``.
    prfx : str
        Prefix for the pickle file names written by ``save_to_disk``.
    metric : {'rmse', 'mse', 'r2'}
        Metric used by ``get_metric_calc``.
    regression : bool
        Forwarded to ``StackingTransformer(regression=...)``.
    nfolds : int
        Forwarded to ``StackingTransformer(n_folds=...)``.
    verbose : int
        Forwarded to ``StackingTransformer(verbose=...)``.
    out_dir : str, optional
        Directory the pickles are written to. Defaults to ``'./data/'``.
    """

    def __init__(self, X, y, Xt, yt, l1_estimators, l2_estimator, prfx='', metric='rmse', regression=True, nfolds=10, verbose=2, out_dir='./data/'):
        self.X = X
        self.y = y
        self.Xt = Xt
        self.yt = yt
        # Stacked (level-1 output) features; filled in by transform().
        self.S_train = None
        self.S_test = None
        # Level-2 predictions; filled in by predict().
        self.yp = None
        self.ytp = None
        self.l1 = l1_estimators
        self.l2 = l2_estimator
        self.prfx = prfx
        self.metname = metric
        self.nf = nfolds
        self.v = verbose
        self.is_reg = regression
        self.out_dir = out_dir
        self.stack = None

    def mse(self, actual, pred):
        """Mean squared error of ``pred`` against ``actual``."""
        return mean_squared_error(actual, pred)

    def rmse(self, actual, pred):
        """Root-mean-squared error of ``pred`` against ``actual``."""
        return np.sqrt(self.mse(actual, pred))

    def rsq(self, actual, pred):
        """R2 score of ``pred`` against ``actual``."""
        return r2_score(actual, pred)

    def get_metric_calc(self, actual, pred):
        """Evaluate the configured metric.

        Raises
        ------
        ValueError
            If the configured metric name is not 'rmse', 'mse' or 'r2'.
        """
        if 'rmse' == self.metname:
            # A true RMSE; previously 'rmse' silently reported the MSE.
            return self.rmse(actual, pred)
        elif 'mse' == self.metname:
            return self.mse(actual, pred)
        elif 'r2' == self.metname:
            return self.rsq(actual, pred)
        # Fail loudly instead of returning None for an unknown metric name.
        raise ValueError("Unknown metric %r; expected 'rmse', 'mse' or 'r2'" % (self.metname,))

    def build_stack(self):
        """Create the (unfitted) level-1 ``StackingTransformer``."""
        # NOTE(review): metric is a bound method, so the transformer keeps a
        # reference to this runner; pickling the stack presumably serialises
        # the runner's data as well — confirm if pickle size matters.
        self.stack = StackingTransformer(self.l1,
                                         regression=self.is_reg,
                                         shuffle=True,
                                         n_folds=self.nf,
                                         metric=self.get_metric_calc,
                                         verbose=self.v)

    def fit_stack(self):
        """Fit the level-1 stack on the raw training data."""
        return self.stack.fit(self.X, self.y)

    def transform(self):
        """Compute stacked features for the train and test data.

        The results go to ``S_train`` / ``S_test``; the raw ``X`` / ``Xt`` are
        left untouched so the pipeline can be run again on the same instance.
        """
        self.S_train = self.stack.transform(self.X)
        self.S_test = self.stack.transform(self.Xt)

    def fit_l2(self):
        """Fit the level-2 estimator on the stacked training features."""
        self.l2.fit(self.S_train, self.y)

    def predict(self):
        """Predict with the level-2 estimator on stacked train and test features."""
        self.yp = self.l2.predict(self.S_train)
        self.ytp = self.l2.predict(self.S_test)

    def calculate_error(self, calc=None):
        """Return ``[('Train err', ...), ('Test err', ...)]``.

        ``calc`` is an optional ``calc(actual, pred)`` callable; when it is
        omitted the configured metric is used.
        """
        metric = calc if calc else self.get_metric_calc
        y_err = metric(self.y, self.yp)
        # Test predictions are scored against the TEST target.
        yt_err = metric(self.yt, self.ytp)
        return [('Train err', y_err), ('Test err', yt_err)]

    def save_to_disk(self):
        """Pickle the fitted stack and the level-2 estimator into ``out_dir``."""
        import os

        sname = self.prfx+'_vecstack_stack.pkl'
        cname = self.prfx+'_vecstack_clf.pkl'

        with open(os.path.join(self.out_dir, sname), 'wb') as f:
            pickle.dump(self.stack, f)

        with open(os.path.join(self.out_dir, cname), 'wb') as f:
            pickle.dump(self.l2, f)

    def run(self):
        """Build, fit, transform, fit level 2, predict, save, and return the errors."""
        self.build_stack()
        self.fit_stack()
        self.transform()
        self.fit_l2()
        self.predict()
        self.save_to_disk()
        return self.calculate_error()

In [0]:
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR


# Estimator instances for the VecstackRunner cells below. This rebinds
# `lass`, `ridge`, `gb`, `xgb`, `randf` and `svr_rbf` from the earlier cell;
# here `xgb` is created without `colsample_bytree`.
en = ElasticNet(alpha=0.001)
lass = Lasso(alpha=0.001, fit_intercept=False, max_iter=10000)
ridge = Ridge(normalize=False, alpha=0.1, max_iter=10000)
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.01, 
                               subsample=0.75, max_depth=15)
ab = AdaBoostRegressor(n_estimators=100, learning_rate=0.01)
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.05, 
                           gamma=0, subsample=0.5, 
                           max_depth=15, objective='reg:linear')
randf = RandomForestRegressor(n_estimators=100, min_samples_split=10)
svr_rbf = SVR(kernel='rbf')

In [0]:
from copy import deepcopy

In [0]:
# Level-1 estimators as (name, estimator) pairs.
estimators = [
                ('en', en),
                ('lass', lass),
                ('ridge', ridge),
                ('gb', gb),
                ('ab', ab),
                ('randf', randf),
             ]

# Directory the runner pickles into; the reload below reads from the same place.
base = './data/'
prf = 'num'
vr = VecstackRunner(num_train, y_train, num_test, y_test, l1_estimators=estimators, l2_estimator=xgb, prfx=prf)
vr.run()

# Round-trip check: reload the pickled stack and level-2 model and score them.
# NOTE: only unpickle files this notebook wrote itself — unpickling untrusted
# files can execute arbitrary code.
with open(base+prf+'_vecstack_stack.pkl', 'rb') as f:
    ts = pickle.load(f)

with open(base+prf+'_vecstack_clf.pkl', 'rb') as f:
    tc = pickle.load(f)

print(mean_squared_error(y_test, tc.predict(ts.transform(num_test))))

In [0]:
# Level-1 estimators as (name, estimator) pairs.
estimators = [
                ('en', en),
                ('lass', lass),
                ('ridge', ridge),
                ('gb', gb),
                ('ab', ab),
                ('randf', randf),
             ]

# VecstackRunner writes its pickles under './data/', so the reload below must
# read from that directory (it previously pointed at '/tmp/').
base = './data/'
prf = 'cat'
vr = VecstackRunner(cat_train, y_train, cat_test, y_test, estimators, xgb, prfx=prf)
vr.run()

# Round-trip check: reload the pickled stack and level-2 model and score them.
# NOTE: only unpickle files this notebook wrote itself — unpickling untrusted
# files can execute arbitrary code.
with open(base+prf+'_vecstack_stack.pkl', 'rb') as f:
    ts = pickle.load(f)

with open(base+prf+'_vecstack_clf.pkl', 'rb') as f:
    tc = pickle.load(f)

print(mean_squared_error(y_test, tc.predict(ts.transform(cat_test))))

In [0]:
# Level-1 estimators as (name, estimator) pairs.
estimators = [
                ('en', en),
                ('lass', lass),
                ('ridge', ridge),
                ('gb', gb),
                ('ab', ab),
                ('randf', randf),
             ]

# VecstackRunner writes its pickles under './data/', so the reload below must
# read from that directory (it previously pointed at '/tmp/').
base = './data/'
prf = 'amen'
vr = VecstackRunner(amen_train, y_train, amen_test, y_test, estimators, xgb, prfx=prf)
vr.run()

# Round-trip check: reload the pickled stack and level-2 model and score them.
# NOTE: only unpickle files this notebook wrote itself — unpickling untrusted
# files can execute arbitrary code.
with open(base+prf+'_vecstack_stack.pkl', 'rb') as f:
    ts = pickle.load(f)

with open(base+prf+'_vecstack_clf.pkl', 'rb') as f:
    tc = pickle.load(f)

print(mean_squared_error(y_test, tc.predict(ts.transform(amen_test))))

In [0]:
# Level-1 estimators as (name, estimator) pairs.
estimators = [
                ('en', en),
                ('lass', lass),
                ('ridge', ridge),
                ('gb', gb),
                ('ab', ab),
                ('randf', randf),
             ]
# Stray pasted URL, kept as a comment (as bare text it was a syntax error):
# https://github.com/jansonclui/w210_capstone_project/raw/master/tfidf_sentiment.csv

# VecstackRunner writes its pickles under './data/', so the reload below must
# read from that directory (it previously pointed at '/tmp/').
base = './data/'
prf = 'comb'
vr = VecstackRunner(comb_train, y_train, comb_validation, y_validation, estimators, xgb, prfx=prf)
vr.run()

# Round-trip check: reload the pickled stack and level-2 model and score them.
# NOTE: only unpickle files this notebook wrote itself — unpickling untrusted
# files can execute arbitrary code.
with open(base+prf+'_vecstack_stack.pkl', 'rb') as f:
    ts = pickle.load(f)

with open(base+prf+'_vecstack_clf.pkl', 'rb') as f:
    tc = pickle.load(f)

print(mean_squared_error(y_validation, tc.predict(ts.transform(comb_validation))))

In [0]:
# Helper functions to use model predictions as meta features

def get_meta_features(X, y, Xt, model):
    """Fit ``model`` on (X, y) and return its predictions as meta features.

    Returns a dict with the predictions for ``X`` under ``'train'`` and the
    predictions for ``Xt`` under ``'test'``.
    """
    model.fit(X, y)
    train_pred = model.predict(X)
    test_pred = model.predict(Xt)
    return dict(train=train_pred, test=test_pred)

def build_meta_features(X, y, Xt, estimators):
    """Build one meta-feature column per estimator from its predictions.

    Parameters
    ----------
    X, y : training features / target each estimator is fitted on.
    Xt : features to produce test-side meta features for.
    estimators : sequence of (name, estimator) pairs.

    Returns
    -------
    dict
        ``{'train': DataFrame, 'test': DataFrame}`` with one column per
        estimator, named after it. Both frames are empty when ``estimators``
        is empty.
    """
    train_cols = []
    test_cols = []

    # Reuse the row index of X / Xt when they have one, so the meta features
    # stay aligned with their source rows.
    train_index = getattr(X, 'index', None)
    test_index = getattr(Xt, 'index', None)

    for est in estimators:
        name, model = est[0], est[1]
        # The model must be passed through to the helper.
        r = get_meta_features(X, y, Xt, model)
        # pd.concat only accepts pandas objects, so wrap the flattened
        # prediction arrays in named Series.
        train_cols.append(pd.Series(np.asarray(r['train']).ravel(), index=train_index, name=name))
        test_cols.append(pd.Series(np.asarray(r['test']).ravel(), index=test_index, name=name))

    # Concatenate once at the end; pd.concat rejects an empty list.
    train_meta = pd.concat(train_cols, axis=1) if train_cols else pd.DataFrame()
    test_meta = pd.concat(test_cols, axis=1) if test_cols else pd.DataFrame()

    return {'train': train_meta, 'test': test_meta}