# Import Packages

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
import numpy as np
%matplotlib inline
%config IPCompleter.greedy=True

# Load the modelling dataset (path is relative to the notebook's working directory).
df = pd.read_csv('../data/df_model_trimmed.csv')

# Apply seaborn's 'darkgrid' style to all subsequent plots.
sns.set(style='darkgrid')
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.feature_selection import RFECV
import itertools

In [2]:
# Preview the first two rows to sanity-check the loaded columns.
df.head(2)

Unnamed: 0,year,company,auwgr,lkpp,hlr_lag1,hlr_lag2,hlr_lag3,hlr_lag4,hlr_lag5,mer,...,class_fire,class_health,class_mac,class_mahl,class_motor,class_others,class_pa,class_prof_indm,class_pub_lia,class_wic
0,2005,c166,1.403847,9.746642,-0.653108,-0.67628,-0.702346,-0.71422,-0.734678,-0.295015,...,0,0,0,0,0,0,1,0,0,0
1,2006,c166,1.897721,4.531747,-0.950801,-0.67628,-0.702346,-0.71422,-0.734678,-0.334269,...,0,0,0,0,0,0,1,0,0,0


In [3]:
# (rows, columns) of the raw frame, before the dummy columns are dropped.
df.shape

(4347, 33)

# Drop Dummies

In [4]:
# Drop the 'class_*' dummy columns.
droplist = ['class_bonds','class_cnstr_engr','class_cpr','class_fire','class_health',
            'class_mac','class_mahl','class_motor','class_others','class_pa',
            'class_prof_indm','class_pub_lia','class_wic']
# Reassign instead of mutating in place, and tolerate already-missing columns, so that
# re-running this cell does not raise KeyError once the dummies are gone.
df = df.drop(columns=droplist, errors='ignore')
df.shape

(4347, 20)

# Get features from dataframe

In [5]:
print(df.shape)
# Candidate predictors: every numeric column except the target ('auwgr') and 'year'.
# select_dtypes() is the public replacement for the private DataFrame._get_numeric_data();
# 'bool' is listed because the private helper also kept boolean columns.
non_feature_cols = ('auwgr', 'year')
features = [col for col in df.select_dtypes(include=['number', 'bool']).columns
            if col not in non_feature_cols]
print(features)

(4347, 20)
['lkpp', 'hlr_lag1', 'hlr_lag2', 'hlr_lag3', 'hlr_lag4', 'hlr_lag5', 'mer', 'der', 'oer', 'prem_write_net_lag1', 'claim_set_net_lag1', 'exp_management_lag1', 'exp_comm_incur_net_lag1', 'exp_other_lag1', 'prem_liab_diff_lag1', 'claim_liab_diff_lag1', 'uw_gain_lag1']


In [6]:
# True if any cell anywhere in the frame is missing (column-wise any, then overall any).
df.isna().any().any()

False

In [7]:
# Target vector and design matrix used by all models below.
y = df['auwgr']
X = df[features]
print(X.shape, y.shape, sep='\n')

(4347, 17)
(4347,)


## Regression Model 1

### Train/Test Split

In [23]:
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

(3260, 17) (1087, 17)
(3260,) (1087,)


### Standard Scaler

In [24]:
# Fit the scaler on the training split only, then apply it to both splits.
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = (ss.transform(split) for split in (X_train, X_test))

### Linear, Ridge, Lasso (with standard scaled data)

In [25]:
# Instantiate the three linear models, then report the mean 3-fold
# cross-validation score of each on the standard-scaled training data.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for banner, estimator in (('\nLINEAR REG cross-val mean score:', linreg),
                          ('\nRIDGE cross-val mean score:', ridge),
                          ('\nLASSO cross-val mean score:', lasso)):
    print(banner)
    fold_scores = cross_val_score(estimator, X_train_sc, y_train, cv=3)
    print('X-Val score MEAN using X_train\t\t', fold_scores.mean())


LINEAR REG cross-val mean score:
X-Val score MEAN using X_train		 0.11119262833850345

RIDGE cross-val mean score:
X-Val score MEAN using X_train		 0.11351653946765368

LASSO cross-val mean score:
X-Val score MEAN using X_train		 0.1131184168029213


## Regression Model 2

### Power Transformer

In [11]:
# Power-transform the predictors: fit on the training split, apply to both splits.
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = (pt_x.transform(split) for split in (X_train, X_test))

# Power-transform the target as well. PowerTransformer needs 2-D input, hence
# .to_frame(); .ravel() flattens the result back to 1-D for cross_val_score later.
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt, y_test_pt = (pt_y.transform(target.to_frame()).ravel()
                         for target in (y_train, y_test))

### Linear, Ridge, Lasso (with power-transformed data)

In [12]:
# Instantiate the three linear models, then report the mean 3-fold
# cross-validation score of each on the power-transformed training data.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for banner, estimator in (('\nLINEAR REG cross-val mean score:', linreg),
                          ('\nRIDGE cross-val mean score:', ridge),
                          ('\nLASSO cross-val mean score:', lasso)):
    print(banner)
    fold_scores = cross_val_score(estimator, X_train_pt, y_train_pt, cv=3)
    print('X-Val score MEAN using X_train\t\t', fold_scores.mean())


LINEAR REG cross-val mean score:
X-Val score MEAN using X_train		 0.08187312635603868

RIDGE cross-val mean score:
X-Val score MEAN using X_train		 0.08303968253782978

LASSO cross-val mean score:
X-Val score MEAN using X_train		 0.08247678166047641


## Decision Tree Regressor

### Initial Hyperparameters

In [13]:
# Baseline decision tree with default hyperparameters.
# random_state is fixed so the reported scores are reproducible across re-runs.
dtreg = DecisionTreeRegressor(random_state=42)
# NOTE: this fits the standard-scaled data (X_train_sc), not the un-scaled frame.
dtreg.fit(X_train_sc,y_train)
# Evaluate model: R^2 on the training split, then on the held-out test split.
print(dtreg.score(X_train_sc,y_train))
print(dtreg.score(X_test_sc,y_test))

0.999999999490874
-0.7812532255355314


### GridSearchCV

In [None]:
# param_grid = [{'max_depth':range(2,1000),
#                'min_samples_split':range(2,21)
#               }]
# reg = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5)
# reg.fit(X_train, y_train)
# reg.best_params_

### Results

In [None]:
# dtreg = DecisionTreeRegressor(max_depth=2, min_samples_split=15)
# dtreg.fit(X_train,y_train) # Use un-scaled data
# # Evaluate model.
# print(dtreg.score(X_train,y_train))
# print(dtreg.score(X_test,y_test))

## Random Forest Regressor

### Initial Hyperparameters

In [14]:
# Baseline random forest with 10 trees. (10 was scikit-learn's default n_estimators
# before version 0.22; the current default is 100, so it is set explicitly here.)
# random_state is fixed so the reported scores are reproducible across re-runs.
rfreg = RandomForestRegressor(n_estimators=10, random_state=42)
# NOTE: this fits the standard-scaled data (X_train_sc), not the un-scaled frame.
rfreg.fit(X_train_sc,y_train)
# Evaluate model: R^2 on the training split, then on the held-out test split.
print(rfreg.score(X_train_sc,y_train))
print(rfreg.score(X_test_sc,y_test))

0.81783645478352
-0.016826547420873972


### GridSearchCV

In [None]:
# param_grid = [{'n_estimators':[50,100,200],
#                'max_depth':range(2,50),
#                'min_samples_split':range(2,20),
#                'oob_score':[True]
#               }]
# reg = GridSearchCV(RandomForestRegressor(), param_grid, cv=5)
# reg.fit(X_train, y_train)
# reg.best_params_

### Results

In [None]:
# rfreg = RandomForestRegressor(n_estimators=100,max_depth=2,min_samples_split=10,oob_score=True)
# rfreg.fit(X_train,y_train) # Use un-scaled data
# # Evaluate model
# print(rfreg.score(X_train,y_train))
# print(rfreg.score(X_test,y_test))

## Extra Trees Regressor

### Initial Hyperparameters

In [15]:
# Baseline extra-trees ensemble (100 bootstrapped trees, out-of-bag scoring enabled).
# random_state is fixed so the reported scores are reproducible across re-runs.
etreg = ExtraTreesRegressor(bootstrap=True,oob_score=True,warm_start=False,
                            n_estimators=100,random_state=42)
# NOTE: this fits the standard-scaled data (X_train_sc), not the un-scaled frame.
etreg.fit(X_train_sc,y_train)
# Evaluate model: R^2 on the training split, then on the held-out test split.
print(etreg.score(X_train_sc,y_train))
print(etreg.score(X_test_sc,y_test))

0.8785319032217249
0.09059870568796191


### GridSearchCV

In [None]:
# param_grid = [{'n_estimators':[100,200,300],
#                'max_depth':range(2,50),
#                'min_samples_split':range(2,20),
#                'oob_score':[True],
#                'bootstrap':[True]
#               }]

# reg = GridSearchCV(ExtraTreesRegressor(), param_grid, cv=5)
# reg.fit(X_train, y_train)
# reg.best_params_

### Results

In [None]:
# etreg = ExtraTreesRegressor(bootstrap=True,max_depth=23,min_samples_split=14,n_estimators=100,oob_score=True)
# etreg.fit(X_train,y_train) # Use un-scaled data
# # Evaluate model
# print(etreg.score(X_train,y_train))
# print(etreg.score(X_test,y_test))

# Feature Selection

## RFECV (on Colab)

# IGNORE BELOW (Superseded)

# Additive Feature Search (Sequential)

In [None]:
# Additive (sequential) feature search: grow the feature set one column at a time, in
# the order given by `features`, and score six models on both the standard-scaled and
# the power-transformed data for every prefix of that list.
#
# FURTHER TO DO:
# select best scoring set of features ==>
# send to grid search (dt, rf, et), also send to adaBoost, GradientBoost regressors

SEARCH_RESULT_COLUMNS = ['num_features','list_features','sc_score_linreg','sc_score_ridge',
                         'sc_score_lasso','sc_score_dt','sc_score_rf','sc_score_et',
                         'pt_score_linreg','pt_score_ridge','pt_score_lasso',
                         'pt_score_dt','pt_score_rf','pt_score_et']


def _score_feature_subset(feature_subset):
    """Score all six models on one feature subset and return a single result record.

    Everything is kept in local variables so the notebook-level X / X_train / ...
    names defined by earlier cells are not overwritten as a side effect.

    Parameters
    ----------
    feature_subset : list of str
        Column names of ``df`` to use as predictors. The target is ``df['auwgr']``.

    Returns
    -------
    dict
        One value for each name in ``SEARCH_RESULT_COLUMNS``.

    Notes
    -----
    The linear models are scored by the mean of 3-fold cross-validation on the
    training split, while the tree models are scored on the held-out test split
    (same as the original cell), so the two groups of columns are not directly
    comparable. Tree models are seeded so that differences between subsets come
    from the features rather than from random tie-breaking or bootstrapping.
    """
    X_sub = df[feature_subset]
    y_all = df['auwgr']
    X_tr, X_te, y_tr, y_te = train_test_split(X_sub, y_all, random_state=42)

    # Standard-scaled predictors, original target.
    scaler = StandardScaler().fit(X_tr)
    X_tr_sc, X_te_sc = scaler.transform(X_tr), scaler.transform(X_te)

    # Power-transformed predictors and target (PowerTransformer needs 2-D input).
    x_power = PowerTransformer().fit(X_tr)
    X_tr_pt, X_te_pt = x_power.transform(X_tr), x_power.transform(X_te)
    y_power = PowerTransformer().fit(y_tr.to_frame())
    y_tr_pt = y_power.transform(y_tr.to_frame()).ravel()
    y_te_pt = y_power.transform(y_te.to_frame()).ravel()

    record = {'num_features': len(feature_subset),
              'list_features': str(feature_subset)}

    variants = (('sc', X_tr_sc, X_te_sc, y_tr, y_te),
                ('pt', X_tr_pt, X_te_pt, y_tr_pt, y_te_pt))
    for prefix, X_fit, X_eval, y_fit, y_eval in variants:
        cv_models = (('linreg', LinearRegression()),
                     ('ridge', RidgeCV(alphas=np.linspace(.1, 10, 100))),
                     ('lasso', LassoCV(n_alphas=200, cv=3)))
        for model_name, model in cv_models:
            record[prefix + '_score_' + model_name] = cross_val_score(
                model, X_fit, y_fit, cv=3).mean()

        holdout_models = (('dt', DecisionTreeRegressor(random_state=42)),
                          ('rf', RandomForestRegressor(n_estimators=10, random_state=42)),
                          ('et', ExtraTreesRegressor(bootstrap=True, oob_score=True,
                                                     warm_start=False, n_estimators=100,
                                                     random_state=42)))
        for model_name, model in holdout_models:
            model.fit(X_fit, y_fit)
            record[prefix + '_score_' + model_name] = model.score(X_eval, y_eval)

    return record


print('total no. of features to search:',len(features))

# Collect one dict per step and build the frame once at the end. DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every call; building from
# records also avoids the empty-string placeholder row, which forced every column to
# object dtype.
search_records = []
feature_searchlist = []

for i in features:
    feature_searchlist.append(i)
    print('\n#########',len(feature_searchlist),'features\n',feature_searchlist)
    # Pass a copy so the stored record cannot be affected by later appends.
    search_records.append(_score_feature_subset(list(feature_searchlist)))

## End of FOR loop
print('END of FOR loop')
df_search_results = pd.DataFrame(search_records, columns=SEARCH_RESULT_COLUMNS)
print(df_search_results.shape)
df_search_results.head()

In [None]:
# Summary statistics of the sequential-search results.
df_search_results.describe()

In [None]:
# Print the best (maximum) score found for each model/transform combination:
# linear models first (scaled, then power-transformed), then the tree models.
score_columns = ('sc_score_linreg', 'sc_score_ridge', 'sc_score_lasso',
                 'pt_score_linreg', 'pt_score_ridge', 'pt_score_lasso',
                 'sc_score_dt', 'sc_score_rf', 'sc_score_et',
                 'pt_score_dt', 'pt_score_rf', 'pt_score_et')
for score_col in score_columns:
    print(df_search_results[score_col].max())

**===> Best score is 0.13626810317016735 (R², scored on the held-out test split) ===> Scaled X, Extra Trees**

# Additive Feature Search (Combinations)

In [None]:
#debug
# Preview the subsets that itertools.combinations generates for the smallest sizes.
#
# Printing the complete lists floods the notebook output: looping over the full range
# 1..len(features) threw an IO error unless the notebook was launched using
# "jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000".
# Only the count and a short sample are therefore shown for each size.
#
# range() excludes its upper bound, so with the 17 features used in this notebook
# (17 // 5 == 3) this loop covers subset sizes 1 and 2 only.
for num_of_features in range(1, len(features) // 5):
    feature_combo_list = list(itertools.combinations(features,r=num_of_features))
    print(num_of_features, 'feature(s):', len(feature_combo_list),
          'combinations; first 3:', feature_combo_list[:3])


In [None]:
# Number of combinations generated for the last subset size of the preview loop.
len(feature_combo_list)

In [None]:
# Inspect a single combination (index 110), converted from a tuple to a list.
list(feature_combo_list[110])

In [None]:
# Exhaustive feature search: score six models on every combination of `features`
# up to MAX_COMBO_SIZE columns, on both standard-scaled and power-transformed data.
#
# WARNING: with n features there are 2**n - 1 non-empty subsets, and each one trains
# several models. Lower MAX_COMBO_SIZE to keep the run time manageable.
MAX_COMBO_SIZE = len(features)

COMBO_RESULT_COLUMNS = ['num_features','list_features','sc_score_linreg','sc_score_ridge',
                        'sc_score_lasso','sc_score_dt','sc_score_rf','sc_score_et',
                        'pt_score_linreg','pt_score_ridge','pt_score_lasso',
                        'pt_score_dt','pt_score_rf','pt_score_et']


def _score_feature_combo(feature_combo):
    """Score all six models on one feature combination and return a result record.

    Everything is kept in local variables so the notebook-level X / X_train / ...
    names defined by earlier cells are not overwritten as a side effect.

    Parameters
    ----------
    feature_combo : list of str
        Column names of ``df`` to use as predictors. The target is ``df['auwgr']``.

    Returns
    -------
    dict
        One value for each name in ``COMBO_RESULT_COLUMNS``.

    Notes
    -----
    The linear models are scored by the mean of 3-fold cross-validation on the
    training split, while the tree models are scored on the held-out test split
    (same as the original cell), so the two groups of columns are not directly
    comparable. Tree models are seeded so that differences between combinations
    come from the features rather than from random tie-breaking or bootstrapping.
    """
    X_sub = df[feature_combo]
    y_all = df['auwgr']
    X_tr, X_te, y_tr, y_te = train_test_split(X_sub, y_all, random_state=42)

    # Standard-scaled predictors, original target.
    scaler = StandardScaler().fit(X_tr)
    X_tr_sc, X_te_sc = scaler.transform(X_tr), scaler.transform(X_te)

    # Power-transformed predictors and target (PowerTransformer needs 2-D input).
    x_power = PowerTransformer().fit(X_tr)
    X_tr_pt, X_te_pt = x_power.transform(X_tr), x_power.transform(X_te)
    y_power = PowerTransformer().fit(y_tr.to_frame())
    y_tr_pt = y_power.transform(y_tr.to_frame()).ravel()
    y_te_pt = y_power.transform(y_te.to_frame()).ravel()

    record = {'num_features': len(feature_combo),
              'list_features': str(feature_combo)}

    variants = (('sc', X_tr_sc, X_te_sc, y_tr, y_te),
                ('pt', X_tr_pt, X_te_pt, y_tr_pt, y_te_pt))
    for prefix, X_fit, X_eval, y_fit, y_eval in variants:
        cv_models = (('linreg', LinearRegression()),
                     ('ridge', RidgeCV(alphas=np.linspace(.1, 10, 100))),
                     ('lasso', LassoCV(n_alphas=200, cv=3)))
        for model_name, model in cv_models:
            record[prefix + '_score_' + model_name] = cross_val_score(
                model, X_fit, y_fit, cv=3).mean()

        holdout_models = (('dt', DecisionTreeRegressor(random_state=42)),
                          ('rf', RandomForestRegressor(n_estimators=10, random_state=42)),
                          ('et', ExtraTreesRegressor(bootstrap=True, oob_score=True,
                                                     warm_start=False, n_estimators=100,
                                                     random_state=42)))
        for model_name, model in holdout_models:
            model.fit(X_fit, y_fit)
            record[prefix + '_score_' + model_name] = model.score(X_eval, y_eval)

    return record


# =====

# Collect one dict per combination and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every
# call; building from records also avoids the empty-string placeholder row, which
# forced every column to object dtype.
combo_records = []

for num_of_features in range(1, MAX_COMBO_SIZE + 1):

    # generate a list of tuples
    feature_combo_list = list(itertools.combinations(features,r=num_of_features))

    # score each specific tuple of features (converted to a list for column selection)
    for specific_list_of_features in feature_combo_list:
        combo_records.append(_score_feature_combo(list(specific_list_of_features)))

    ## End of INNER FOR loop
    # One progress line per subset size; a line per combination would flood the output.
    print('end of INNER FOR loop:', num_of_features, 'feature(s),',
          len(combo_records), 'combinations scored so far')

# End of OUTER FOR Loop
df_search_results = pd.DataFrame(combo_records, columns=COMBO_RESULT_COLUMNS)
print(df_search_results.shape)
df_search_results.head()

In [None]:
# First rows of the combination-search results.
df_search_results.head()

In [None]:
# Summary statistics of the combination-search results.
df_search_results.describe()

In [None]:
# Tidy the index for display. The search cell above already leaves df_search_results
# free of any placeholder row, so label 0 must NOT be dropped again here: doing so
# would silently discard the first real result (and one more on every re-run).
df_search_results = df_search_results.reset_index(drop=True)
print(df_search_results.shape)
df_search_results.head()

In [None]:
# Print the best (maximum) score found for each model/transform combination:
# linear models first (scaled, then power-transformed), then the tree models.
score_columns = ('sc_score_linreg', 'sc_score_ridge', 'sc_score_lasso',
                 'pt_score_linreg', 'pt_score_ridge', 'pt_score_lasso',
                 'sc_score_dt', 'sc_score_rf', 'sc_score_et',
                 'pt_score_dt', 'pt_score_rf', 'pt_score_et')
for score_col in score_columns:
    print(df_search_results[score_col].max())

In [None]:
# FURTHER TO DO:
# select best scoring set of features ==> 
# send to grid search (dt, rf, et), also send to adaBoost, GradientBoost regressors
