Train all models and save predictions

In [1]:
import pandas as pd
import sklearn
import thon.models.prediction as tmp
from thon.churn_functions import out_eval
# Feature table; passed as `data=` to the model-fitting calls in the cells below.
data = pd.read_csv("data/out/features.csv")

In [2]:
# global params
# NOTE(review): the cells visible below repeat these values as literals rather than
# referencing `split`, `scoring` or `raw_scoring`, so changes here must be mirrored by hand.
split = (117, 119)
scoring = 'neg_root_mean_squared_error'
raw_scoring = sklearn.metrics.mean_squared_error

In [3]:
# Feature selection via lasso; the cell ends by displaying the selected names.
from thon.models.selection.lasso import lasso_vars

lass_f = lasso_vars(
    split=(119, 122),
    scoring='neg_root_mean_squared_error',
)
lass_f

['F_v_lag3',
 'g_best_new_cars_lag12',
 'GT_lag3',
 'GT_lag9',
 'laborpart_lag3',
 'month',
 'newhouses_lag6',
 'overmanuf_lag6']

In [6]:
# Feature selection via trim_tree; the cell ends by displaying the selected names.
from thon.models.selection.tree_imp import trim_tree

tree_f = trim_tree(
    scoring='neg_root_mean_squared_error',
    split=(119, 122),
)
tree_f

['GT_lag3', 'laborpart_lag3', 'oilimport_lag12']

In [13]:
# Linear regression restricted to the lasso-selected features.
lm = tmp.linear_regression(
    data=data,
    targetvar='n',
    feature_selection=lass_f,
    split=(117, 119),
)

In [14]:
# Display the linear model's score pair under MSE (element 0 is reported as the
# train score and element 1 as the eval score elsewhere in this notebook).
out_eval(lm, sklearn.metrics.mean_squared_error)

(1682.641022922891, 2738.587970471226)

2

In [15]:
# Decision tree restricted to the tree-selected features.
dtree = tmp.decision_tree(
    data=data,
    targetvar='n',
    feature_selection=tree_f,
    split=(117, 119),
)

In [16]:
# Display the decision tree's score pair under MSE.
out_eval(dtree, sklearn.metrics.mean_squared_error)

(867.5463625263627, 3205.035599886007)

In [19]:
# Random forest, fitted on the same tree-selected features as the decision tree.
rf = tmp.random_forest(
    data=data,
    targetvar='n',
    feature_selection=tree_f,
    split=(117, 119),
)

In [20]:
# Display the random forest's score pair under MSE.
out_eval(rf, sklearn.metrics.mean_squared_error)

(856.1441653693739, 3145.8981544676353)

In [11]:
# ARIMA baseline: slow to fit. It takes a single integer split and no feature list.
arim = tmp.run_arima(
    data=data,
    targetvar='n',
    split=119,
)
out_eval(arim, sklearn.metrics.mean_squared_error)

(1785.3623136346714, 4830.894333320808)

In [14]:
# LSTM on the lasso-selected features (200 epochs, hidden size 32).
n = tmp.run_lstm(
    data=data,
    targetvar='n',
    feature_selection=lass_f,
    split=(117, 119),
    hidden_size=32,
    learning_rate=0.01,
    num_epochs=200,
)

In [13]:
# Display the score pair for `n`, the LSTM result assigned in the run_lstm cell above.
out_eval(n, sklearn.metrics.mean_squared_error)

(1680.3704819489103, 4872.537168439585)

In [15]:
# GRU on the lasso-selected features (200 epochs, hidden size 32).
gru = tmp.run_gru(
    data=data,
    targetvar='n',
    feature_selection=lass_f,
    split=(117, 119),
    hidden_size=32,
    learning_rate=0.01,
    num_epochs=200,
)

In [8]:
# Display the GRU's score pair under MSE.
out_eval(gru, sklearn.metrics.mean_squared_error)

(1665.4112149173875, 5421.597218527431)

In [2]:
from sklearn.metrics import r2_score
from thon.churn_functions import out_eval, bake, modernize, simple_split

def run_selection(split = (119, 122),
                  scoring = 'neg_root_mean_squared_error'):
    """Run the three feature-selection routines with one shared configuration.

    Parameters
    ----------
    split
        Forwarded unchanged to ``lasso_vars``, ``trim_tree`` and ``random_perm``.
    scoring
        Forwarded unchanged to ``lasso_vars`` and ``trim_tree``
        (``random_perm`` is only given ``split``).

    Returns
    -------
    tuple
        ``(lass_f, tree_f, rperm_f)``: the return values of ``lasso_vars``,
        ``trim_tree`` and ``random_perm``, in that order.
    """
    from thon.models.selection.lasso import lasso_vars
    from thon.models.selection.tree_imp import trim_tree
    from thon.models.selection.random_imp import random_perm

    lass_f = lasso_vars(split = split,
                        scoring = scoring)

    # trim_tree used to receive hard-coded literals here, so callers overriding
    # `split` or `scoring` got a tree selection computed with different settings.
    # The literals equalled this function's defaults, so `run_selection()` is unchanged.
    tree_f = trim_tree(split = split,
                       scoring = scoring)

    rperm_f = random_perm(split = split)

    return (lass_f, tree_f, rperm_f)

In [3]:
# Run all three selectors with run_selection's default split and scoring.
lass_f, tree_f, rperm_f = run_selection()

In [74]:
import sklearn
import pandas as pd
import numpy as np

def run_prediction(split = 117,
                   feature_selection:dict = {'lasso': lass_f, 'tree': tree_f, 'forest': rperm_f},
                   eval_scoring = sklearn.metrics.mean_squared_error):
    """Fit every model on every feature set and tabulate the scores.

    Parameters
    ----------
    split
        Passed unchanged as ``split=`` to each ``tmp`` model-fitting call and
        recorded verbatim in the ``split`` column.
    feature_selection : dict
        Maps a reporting name to the feature list handed to the models as
        ``feature_selection=``.
    eval_scoring
        A metric callable, or an iterable of metric callables. Each one is
        passed to ``out_eval`` and its ``__name__`` is recorded in the
        ``scoring_method`` column.

    Returns
    -------
    pandas.DataFrame
        One row per (scorer, feature set, model) plus a fixed ARIMA row, with
        columns ``model, split, feature_selection, scoring_method,
        train_score, eval_score``, sorted by model, scoring method and eval score.

    Notes
    -----
    The previous implementation built the parameter grid with ``np.meshgrid``
    over a ragged list of feature lists and recovered each feature-set name by
    comparing values. That relied on ``np.warnings`` and on ragged-array
    creation, both of which fail on current NumPy, and it mis-flattened the grid
    whenever all feature lists had the same length. Iterating the dict directly
    avoids all of that and visits the same combinations in the same order.
    """
    # Accept a single metric or several.
    scorers = [eval_scoring] if callable(eval_scoring) else list(eval_scoring)

    # (row label, fitting function, model-specific keyword arguments)
    model_grid = [
        ('lm', tmp.linear_regression, {}),
        ('dtree', tmp.decision_tree, {}),
        ('rf', tmp.random_forest, {}),
        ('lstm200.32', tmp.run_lstm, dict(num_epochs = 200, learning_rate = 0.01, hidden_size = 32)),
        ('lstm1k.64', tmp.run_lstm, dict(num_epochs = 1000, learning_rate = 0.01, hidden_size = 64)),
        ('lstm1k.16', tmp.run_lstm, dict(num_epochs = 1000, learning_rate = 0.01, hidden_size = 16)),
        ('gru200.32', tmp.run_gru, dict(num_epochs = 200, learning_rate = 0.01, hidden_size = 32)),
        ('gru1k.64', tmp.run_gru, dict(num_epochs = 1000, learning_rate = 0.01, hidden_size = 64)),
        ('gru1k.16', tmp.run_gru, dict(num_epochs = 1000, learning_rate = 0.01, hidden_size = 16)),
    ]

    # ARIMA is not refit here (the standalone cell is marked as slow); this row
    # carries hard-coded scores matching that cell's earlier output, truncated.
    records = [{'model':'arima',
                'split':119,
                'feature_selection':'none',
                'scoring_method':'mse',
                'train_score':1785,
                'eval_score':4830}]

    # Scorer outermost, feature set next, model innermost: the same row order
    # the meshgrid-based version produced.
    for es in scorers:
        for fs_name, f in feature_selection.items():
            for label, fit_model, extra in model_grid:
                fitted = fit_model(data = data,
                                   split = split,
                                   feature_selection = f,
                                   targetvar = 'n',
                                   **extra)

                # Evaluate once and reuse both scores (previously called twice per model).
                scores = out_eval(fitted, es)
                records.append({'model': label,
                                'split': split,
                                'feature_selection': fs_name,
                                'scoring_method': es.__name__,
                                'train_score': scores[0],
                                'eval_score': scores[1]})

    # Build the frame in one go instead of growing it row by row.
    out = pd.DataFrame(records, columns = ['model',
                                           'split',
                                           'feature_selection',
                                           'scoring_method',
                                           'train_score',
                                           'eval_score'])

    return out.sort_values(['model', 'scoring_method', 'eval_score'])
        

In [65]:
# One full pass over all models at the (117, 119) split; the result table is displayed.
run_prediction(
    split=(117, 119),
    eval_scoring=sklearn.metrics.mean_squared_error,
    feature_selection={'lasso': lass_f, 'tree': tree_f, 'forest': rperm_f},
)

Unnamed: 0,model,split,feature_selection,scoring_method,train_score,eval_score
0,arima,119,none,mean_squared_error,1785.362314,4830.894333
2,dtree,"(117, 119)",lasso,mean_squared_error,109.760521,2178.344828
11,dtree,"(117, 119)",tree,mean_squared_error,867.546363,3205.0356
20,dtree,"(117, 119)",forest,mean_squared_error,0.0,6769.034483
9,gru1k.16,"(117, 119)",lasso,mean_squared_error,812.77323,5770.570806
18,gru1k.16,"(117, 119)",tree,mean_squared_error,1279.466153,7911.560719
27,gru1k.16,"(117, 119)",forest,mean_squared_error,309.486193,10025.640798
8,gru1k.64,"(117, 119)",lasso,mean_squared_error,770.144328,5782.358422
26,gru1k.64,"(117, 119)",forest,mean_squared_error,286.581504,10193.793615
17,gru1k.64,"(117, 119)",tree,mean_squared_error,1075.71537,11963.565557


In [75]:
# Sweep the second element of the split tuple from 17 to 112 in steps of 5 and
# collect one result table per setting.
frames = []
for i in range(17, 117, 5):

    pdf = run_prediction(split = (0, i, 119, 149),
                   feature_selection = {'lasso': lass_f, 'tree': tree_f, 'rf':rperm_f},
                   eval_scoring = sklearn.metrics.mean_squared_error)

    frames.append(pdf)

# Concatenate once rather than inside the loop. The old loop re-copied every
# accumulated row on each pass and seeded the concat with an empty frame, which
# newer pandas warns about. It also prepended each new table, so the list is
# reversed to keep the same row order (latest split first).
full = pd.concat(frames[::-1], ignore_index = True)

In [76]:
# Persist the combined results. to_csv writes the DataFrame index as the first
# column by default, which shows up as an unnamed column when the file is read back.
full.to_csv("data/predictions.csv")

In [None]:
import sklearn
import pandas as pd
import numpy as np
def set_prediction(split = (25, 100, 122, 130),
                   train_scoring = 'neg_root_mean_squared_error',
                   eval_scoring = sklearn.metrics.r2_score):
    """Fit one fixed configuration of each model and tabulate the scores.

    This cell was an unexecuted draft that could not run: it contained a
    syntax error (``feature_selection=)``), referenced the undefined names
    ``f``, ``fs_name``, ``es`` and ``out``, and imported ``run_lstm1`` while
    calling ``run_lstm``. It has been made runnable using the ``tmp`` calls
    that the executed cells of this notebook demonstrate.

    NOTE(review): the feature set per model is an assumption to confirm —
    lasso features for lm/lstm/gru and tree features for dtree (as the draft's
    labels and the early cells suggest), permutation features for rf.

    Parameters
    ----------
    split
        Passed as ``split=`` to every model-fitting call and recorded in the
        ``split`` column (the draft hard-coded ``(117, 119)`` for the linear
        model while reporting ``split``).
    train_scoring
        Accepted for interface compatibility; the draft never used it and
        neither does this version.
    eval_scoring
        Metric callable passed to ``out_eval``; its ``__name__`` is recorded in
        the ``scoring_method`` column.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``model, split, feature_selection,
        scoring_method, train_score, eval_score``, sorted by model, scoring
        method and eval score.
    """
    # (row label, feature-set label, fitting function, features, model-specific kwargs)
    # NOTE(review): the draft also passed n_layers = 1 to the LSTM and GRU. It is
    # omitted because no executed cell shows tmp.run_lstm / tmp.run_gru taking it.
    model_grid = [
        ('lm', 'lasso', tmp.linear_regression, lass_f, {}),
        ('dtree', 'tree', tmp.decision_tree, tree_f, {}),
        ('rf', 'forest', tmp.random_forest, rperm_f, {}),
        ('lstm', 'lasso', tmp.run_lstm, lass_f,
         dict(num_epochs = 500, learning_rate = 0.01, hidden_size = 12)),
        ('gru', 'lasso', tmp.run_gru, lass_f,
         dict(num_epochs = 500, learning_rate = 0.001, hidden_size = 24)),
    ]

    records = []
    for label, fs_name, fit_model, features, extra in model_grid:
        fitted = fit_model(data = data,
                           split = split,
                           feature_selection = features,
                           targetvar = 'n',
                           **extra)

        # Evaluate once and reuse both scores.
        scores = out_eval(fitted, eval_scoring)
        records.append({'model': label,
                        'split': split,
                        'feature_selection': fs_name,
                        'scoring_method': eval_scoring.__name__,
                        'train_score': scores[0],
                        'eval_score': scores[1]})

    df = pd.DataFrame(records, columns = ['model',
                                          'split',
                                          'feature_selection',
                                          'scoring_method',
                                          'train_score',
                                          'eval_score'])

    # The draft concatenated onto an undefined `out`; the table is returned on its own.
    return df.sort_values(['model', 'scoring_method', 'eval_score'])

In [10]:
        # Scratch: empty results frame, used to inspect the column layout.
        df = pd.DataFrame(
            columns=['split', 'feature_selection', 'train_score', 'eval_score'],
        )

In [11]:
# Display the empty scratch frame to check its columns.
df

Unnamed: 0,split,feature_selection,train_score,eval_score


In [47]:
# Scratch: an empty results frame plus a one-row frame with the same columns,
# used in the next cell to try out pd.concat.
out = pd.DataFrame(
    columns=['model', 'split', 'feature_selection',
             'scoring_method', 'train_score', 'eval_score'],
)

two = pd.DataFrame(
    dict(model=1,
         split=2,
         feature_selection='feed',
         scoring_method=4,
         train_score=5,
         eval_score=6),
    index=[0],
)

In [48]:
# Append the one-row frame to `out`. `out` is reassigned from itself, so
# re-running this cell appends the row again.
out = pd.concat([out, two], axis = 0, ignore_index = True)

In [49]:
# Display the concatenated scratch frame.
out

Unnamed: 0,model,split,feature_selection,scoring_method,train_score,eval_score
0,1,2,feed,4,5,6
