In [1]:
# load packages 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, cross_val_score

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline

import warnings
import time

warnings.filterwarnings("ignore")

### Prepare data for training

In [2]:
# load the preprocessed dataset; the first CSV column becomes the index
df = pd.read_csv(
    'processed-data-02-07.csv',
    index_col=0,
)

In [3]:
# remove every column whose name contains wind_10m, then the date column
df = (
    df
    .loc[:, df.columns[~df.columns.astype('str').str.contains('wind_10m')]]
    .drop(columns="date")
)

In [4]:
# target is the is_heat column; every remaining column is a feature
y = df["is_heat"]
X = df.drop(columns=["is_heat"])

In [5]:
# stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=512,
)

### Select subsets

In [None]:
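# select small subsets of features

# 36 subsets = 3 metrics x 3 lag windows x 4 statistic groups

# metric (one per subset):
# hi, air temp, wbgt

# lag window:
# only day of 
# 1-day, 3-day 
# 3 day, 5 day 

# statistic group:
# just mean 
# mean, max, var
# mean, max
# mean, var 

# stored as a list of lists 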

In [6]:
# Build the list of candidate feature subsets (one pandas Index of column
# names per subset). The position in `subsets` is the subset id used later,
# so the construction order below must stay stable.
subsets = []
cols = X_train.columns
# "min" statistics are never used by any subset
cols = cols[~cols.str.contains('min')]

# first subset by mean / max / var
mean_c = cols[cols.str.contains('mean')]
mean_max_c = cols[cols.str.contains('mean|max')]
mean_var_c = cols[cols.str.contains('mean|var')]

# for each of cols, mean_c, mean_max_c, mean_var_c
for cols_i in [cols, mean_c, mean_max_c, mean_var_c]:
    lag_variants = [
        # take out 3 and 5 lag
        cols_i[~cols_i.str.contains('lag')],
        # take out 3 lag
        cols_i[~cols_i.str.contains('lag3')],
        # take out 5 lag (this pattern used to be misspelled 'la5', which
        # presumably matched no column and left this variant unfiltered)
        cols_i[~cols_i.str.contains('lag5')],
    ]
    for cols_j in lag_variants:
        # keep one metric per subset by removing the other two:
        #   drop air temp + wbgt, drop hi + wbgt, drop hi + air temp
        # NOTE(review): 'hi' is a plain substring match - confirm that no
        # unrelated column name contains "hi"
        for drop_pattern in ['air_temp|wbgt', 'hi|wbgt', 'hi|air_temp']:
            subsets.append(cols_j[~cols_j.str.contains(drop_pattern)])

# 4 statistic groups x 3 lag variants x 3 metrics
assert len(subsets) == 36

### Cross-validation Training

In [None]:
# for each of these subsets:
# run grid search CV on the models 
# for each model / resampling method option, calculate the cross-validation score using the best params 

# for the subset, store the average F1 for each combination of resampling method and model 

#### Function for storing results

In [7]:
def store_best_res(res_list, res, X_refit, y_refit, model, resampling_type, subset_ind, neighbors=None, cv=None):
    '''Cross-validate the best estimator of a fitted grid search and record it.

    Appends a one-row DataFrame to ``res_list`` with the columns
    subset, model, resampling_method, neighbors, params, mean_cv_F1_score
    and std_cv_F1_score.

    Parameters
    ----------
    res_list : list
        List that collects the one-row result frames; mutated in place.
    res : fitted grid search
        Must expose ``best_estimator_`` and ``best_params_`` (for a
        GridSearchCV this requires refit to be enabled).
    X_refit, y_refit
        Data the best estimator is cross-validated on.
    model, resampling_type, subset_ind
        Labels copied into the result row unchanged.
    neighbors : optional
        Resampling parameter copied into the result row (None if not applicable).
    cv : optional
        Forwarded to ``cross_val_score``; None keeps its default splitter.

    Returns
    -------
    None
    '''
    # Score the refit best estimator rather than the search object: handing
    # `res` itself to cross_val_score repeats the whole grid search inside
    # every fold, and the per-fold winners need not match `res.best_params_`.
    scores = cross_val_score(res.best_estimator_, X_refit, y_refit,
                             scoring='f1_weighted', cv=cv)

    # Build the row from a record so each column keeps its natural dtype
    # (a mixed np.array would turn every column into object dtype).
    res_df = pd.DataFrame([{
        "subset": subset_ind,
        "model": model,
        "resampling_method": resampling_type,
        "neighbors": neighbors,
        "params": res.best_params_,
        "mean_cv_F1_score": scores.mean(),
        "std_cv_F1_score": scores.std(),
    }])

    res_list.append(res_df)

#### Initialize objects that are reused across loop iterations

In [8]:
# initialize stratified k fold (shuffled, fixed seed) shared by every grid search
folds = StratifiedKFold(n_splits=5, random_state=512, shuffle=True)

# no resampling: single-step pipelines. The 'classifier' step is only a
# placeholder; each parameter grid below replaces it with its own estimator.
none_en_pipe = imbPipeline([
    ('classifier', LogisticRegression())
])

none_xg_pipe = imbPipeline([
    ('classifier', XGBClassifier())
])

# set params for steps of pipeline
# Grid values are rounded to one decimal so np.arange's floating-point drift
# (e.g. 0.30000000000000004) does not leak into the searched / stored params.
xg_param_grid = [
    {'classifier': [XGBClassifier()],
     'classifier__n_estimators': list(range(25, 250, 50)),
     'classifier__max_depth': list(range(1, 4)),
     'classifier__eta': np.arange(0.1, 0.5, 0.2).round(1).tolist()
    }]

en_param_grid = [
    # saga shuffles the data, so it gets the same fixed seed as the
    # train/test split and the folds to keep reruns reproducible
    # NOTE(review): saga is sensitive to feature scale and no scaler is in the
    # pipeline - confirm the input features are already standardized
    {'classifier': [LogisticRegression(penalty='elasticnet', solver='saga', tol=0.01, random_state=512)],
     'classifier__l1_ratio': np.arange(0.1, 1., 0.2).round(1).tolist(),
     'classifier__C': np.arange(0.1, 1., 0.2).round(1).tolist()}
]

#### Training loop over feature subsets

In [9]:
# Fixed seed for the resamplers: SMOTE and RandomUnderSampler are stochastic,
# so without it every rerun trains on different synthetic / retained rows.
resample_seed = 512

# (label stored in the results, unresampled pipeline, estimator class, param grid)
model_specs = [
    ('elastic_net', none_en_pipe, LogisticRegression, en_param_grid),
    ('xgboost', none_xg_pipe, XGBClassifier, xg_param_grid),
]

for i, features_j in enumerate(subsets):
    # initialize time
    start_time = time.time()

    # subset X for features j
    X_train_j = X_train[features_j]
    # initialize list to store results for each subset
    cv_res_i = []

    ## NO RESAMPLING
    for model_name, plain_pipe, _, param_grid in model_specs:
        gscv = GridSearchCV(plain_pipe, param_grid=param_grid,
                            cv=folds, verbose=0, scoring="f1_weighted")
        gscv.fit(X_train_j, y_train)
        store_best_res(cv_res_i, gscv, X_train_j, y_train, model_name, 'none', i)

    print('done with no resampling of subset ', i)

    ## RESAMPLING
    for smote_n in [3, 5, 7]:
        # 'smote' oversamples only; 'both' follows SMOTE with random undersampling
        for resampling_type in ['smote', 'both']:
            for model_name, _, estimator_cls, param_grid in model_specs:
                steps = [('over', SMOTE(sampling_strategy=0.55, k_neighbors=smote_n,
                                        random_state=resample_seed))]
                if resampling_type == 'both':
                    steps.append(('under', RandomUnderSampler(random_state=resample_seed)))
                # placeholder estimator; the grid's 'classifier' entry replaces it
                steps.append(('classifier', estimator_cls()))

                gscv = GridSearchCV(imbPipeline(steps), param_grid=param_grid,
                                    cv=folds, verbose=0, scoring="f1_weighted")
                gscv.fit(X_train_j, y_train)
                store_best_res(cv_res_i, gscv, X_train_j, y_train,
                               model_name, resampling_type, i, smote_n)

        print('done with resampling of subset ', i, ' for neighbor ', smote_n)

    # for each subset, have list of one-row dataframes; ignore_index gives the
    # combined rows a unique 0..n-1 index instead of every row being labelled 0
    cv_res_i_df = pd.concat(cv_res_i, ignore_index=True)

    # write subset to pickle
    file_name = str(i) + '_subset_cv_res.pickle'
    cv_res_i_df.to_pickle(file_name)

    print("Done with subset", str(i + 1), "/", len(subsets), 'in', np.round(time.time() - start_time), 'seconds')
    

done with no resampling of subset  0
Done with subset  0 / 36 in  149.0  seconds


KeyboardInterrupt: 