# Import Packages

In [14]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
%config IPCompleter.greedy=True

# Dataset selection: the two commented-out reads are alternative inputs kept for debugging.
# NOTE(review): the preview of this frame printed further down lists no 'class' column, yet the
# per-insurance-type split later filters on df['class'] - confirm which CSV carries that column.
# df = pd.read_csv('../data/df_model_with_dummies.csv') # modeling dataset
# df = pd.read_csv('../data/df_model_with_dummies_debug.csv') # modeling dataset
df = pd.read_csv('../data/df_model_with_dummies_trimmed.csv') # modeling dataset

sns.set(style='darkgrid')
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [15]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,year,company,auwgr,lkpp,hlr_lag1,hlr_lag2,hlr_lag3,hlr_lag4,hlr_lag5,mer,der,oer,prem_write_net_lag1,claim_set_net_lag1,exp_management_lag1,exp_comm_incur_net_lag1,exp_other_lag1,prem_liab_diff_lag1,claim_liab_diff_lag1
0,2005,c166,0.212078,1.546689,4.260059,4.228211,4.225676,4.281284,4.375305,0.287291,0.069188,0.007507,2910275.0,1839364.0,796281.692308,191766.538462,20807.769231,31225.615385,11025.846154
1,2006,c166,-0.238092,1.986037,6.355533,4.228211,4.225676,4.281284,4.375305,0.333851,-0.048705,0.013481,1205469.0,1659653.0,383283.0,-55917.0,15477.0,-642365.0,-410316.0


In [16]:
# (rows, columns) of the loaded frame.
df.shape

(4424, 19)

# Get features from dataframe

In [17]:
# Candidate predictors: every numeric column except 'auwgr' (used as the target below) and 'year'.
print(df.shape)
numeric_columns = df._get_numeric_data().columns
print(numeric_columns)
features = [name for name in numeric_columns if name not in ('auwgr', 'year')]
print(features)

(4424, 19)
Index(['year', 'auwgr', 'lkpp', 'hlr_lag1', 'hlr_lag2', 'hlr_lag3', 'hlr_lag4',
       'hlr_lag5', 'mer', 'der', 'oer', 'prem_write_net_lag1',
       'claim_set_net_lag1', 'exp_management_lag1', 'exp_comm_incur_net_lag1',
       'exp_other_lag1', 'prem_liab_diff_lag1', 'claim_liab_diff_lag1'],
      dtype='object')
['lkpp', 'hlr_lag1', 'hlr_lag2', 'hlr_lag3', 'hlr_lag4', 'hlr_lag5', 'mer', 'der', 'oer', 'prem_write_net_lag1', 'claim_set_net_lag1', 'exp_management_lag1', 'exp_comm_incur_net_lag1', 'exp_other_lag1', 'prem_liab_diff_lag1', 'claim_liab_diff_lag1']


In [18]:
# True if at least one cell anywhere in the frame is missing.
df.isna().any().any()

False

In [19]:
# Feature matrix and target vector for the full dataset.
X, y = df[features], df['auwgr']
print(X.shape, y.shape, sep='\n')

(4424, 16)
(4424,)


## Regression Model 1

### Train/Test Split

In [20]:
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

(3318, 16) (1106, 16)
(3318,) (1106,)


### Standard Scaler

In [21]:
# Learn the scaling statistics from the training partition only, then apply them to both partitions.
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [22]:
# LINEAR REG - Instantiate and score using cross validation (3 folds)
linreg = LinearRegression()
print('\nLINEAR REG cross-val mean score:')
print('X-Val score MEAN using X_train\t\t',cross_val_score(linreg, X_train_sc, y_train,cv=3).mean())

# RIDGE - Instantiate and score using cross validation (3 folds)
ridge=RidgeCV(alphas=np.linspace(.1, 10, 100))
print('\nRIDGE cross-val mean score:')
print('X-Val score MEAN using X_train\t\t',cross_val_score(ridge, X_train_sc, y_train,cv=3).mean())

# LASSO - Instantiate and score using cross validation (3 folds)
lasso = LassoCV(n_alphas=200,cv=3)
print('\nLASSO cross-val mean score:')
print('X-Val score MEAN using X_train\t\t',cross_val_score(lasso, X_train_sc, y_train,cv=3).mean())


LINEAR REG cross-val mean score:
X-Val score MEAN using X_train		 0.08383396358260631

RIDGE cross-val mean score:
X-Val score MEAN using X_train		 0.08464331133319362

LASSO cross-val mean score:
X-Val score MEAN using X_train		 0.07684363003171261


## Regression Model 2

### Power Transformer

In [23]:
# Features: fit the power transform on the training partition, then apply it to both partitions.
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is lifted to a one-column
# frame; ravel() flattens the result back to the 1-D array used for cross-validation later.
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [24]:
# LINEAR REG - Instantiate and score using cross validation (3 folds)
linreg = LinearRegression()
print('\nLINEAR REG cross-val mean score:')
print('X-Val score MEAN using X_train\t\t',cross_val_score(linreg, X_train_pt, y_train_pt,cv=3).mean())

# RIDGE - Instantiate and score using cross validation (3 folds)
ridge=RidgeCV(alphas=np.linspace(.1, 10, 100))
print('\nRIDGE cross-val mean score:')
print('X-Val score MEAN using X_train\t\t',cross_val_score(ridge, X_train_pt, y_train_pt,cv=3).mean())

# LASSO - Instantiate and score using cross validation (3 folds)
lasso = LassoCV(n_alphas=200,cv=3)
print('\nLASSO cross-val mean score:')
print('X-Val score MEAN using X_train\t\t',cross_val_score(lasso, X_train_pt, y_train_pt,cv=3).mean())


LINEAR REG cross-val mean score:
X-Val score MEAN using X_train		 0.053859700066143414

RIDGE cross-val mean score:
X-Val score MEAN using X_train		 0.05487273795830964

LASSO cross-val mean score:
X-Val score MEAN using X_train		 0.05346565519765253


# Checkpoint 1: Comments
The cross-validation scores above are all below 0.1, which shows that the combined data, as it stands, is not suitable for regression.<br>
The next step is to divide the data into the 13 individual insurance types (fire, motor, wic, cargo, etc.) and, for each type, check whether regression is feasible.

# Separate data into individual insurance types

In [None]:
# insurance class dictionary
ins_class_dict = {'mac':'Marine and Aviation Cargo',
                  'mahl':'Marine and Aviation Hull and Liability',
                  'fire':'Fire',
                  'motor':'Motor',
                  'wic':'Work Injury Compensation',
                  'pa':'Personal Accident',
                  'health':'Health',
                  'pub_lia':'Public Liability',
                  'bonds':'Bonds',
                  'cnstr_engr':'Engineering/CAR/EAR',
                  'prof_indm':'Professional Indemnity',
                  'cpr':'Credit/Political Risk',
                  'others':'Others'}

In [None]:
# split insurance types into their corresponding dataframes
df_mac = df[df['class']=='mac'].copy()
df_mahl = df[df['class']=='mahl'].copy()
df_fire = df[df['class']=='fire'].copy()
df_motor = df[df['class']=='motor'].copy()
df_wic = df[df['class']=='wic'].copy()
df_pa = df[df['class']=='pa'].copy()
df_health = df[df['class']=='health'].copy()
df_pub_lia = df[df['class']=='pub_lia'].copy()
df_bonds = df[df['class']=='bonds'].copy()
df_cnstr_engr = df[df['class']=='cnstr_engr'].copy()
df_prof_indm = df[df['class']=='prof_indm'].copy()
df_cpr = df[df['class']=='cpr'].copy()
df_others = df[df['class']=='others'].copy()
print('df_mac:\t\t',df_mac.shape[0],'rows')
print('df_mahl:\t',df_mahl.shape[0],'rows')
print('df_fire:\t',df_fire.shape[0],'rows')
print('df_motor:\t',df_motor.shape[0],'rows')
print('df_wic:\t\t',df_wic.shape[0],'rows')
print('df_pa:\t\t',df_pa.shape[0],'rows')
print('df_health:\t',df_health.shape[0],'rows')
print('df_pub_lia:\t',df_pub_lia.shape[0],'rows')
print('df_bonds:\t',df_bonds.shape[0],'rows')
print('df_cnstr_engr:\t',df_cnstr_engr.shape[0],'rows')
print('df_prof_indm:\t',df_prof_indm.shape[0],'rows')
print('df_cpr:\t\t',df_cpr.shape[0],'rows')
print('df_others:\t',df_others.shape[0],'rows')

# Explore Insurance Types

## Class 'mac'

### Prepare Data

In [None]:
# Class 'mac': features are all numeric columns except the target 'auwgr' and 'year'.
features = [
    name
    for name in df_mac._get_numeric_data().columns
    if name not in ('auwgr', 'year')
]
X, y = df_mac[features], df_mac['auwgr']
print(X.shape, y.shape)

### Train/Test Split

In [None]:
# Hold out a test partition of the current X/y; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Standard Scaler

In [None]:
# Learn the scaling statistics from the training partition only, then apply them to both partitions.
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [None]:
# Mean 3-fold cross-validation score on the standardised training features for
# plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_sc, y_train, cv=3).mean())

### Power Transformer

In [None]:
# Features: fit the power transform on the training partition, then apply it to both partitions.
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is lifted to a one-column
# frame; ravel() flattens the result back to the 1-D array used for cross-validation later.
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [None]:
# Mean 3-fold cross-validation score on the power-transformed training features
# and target for plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_pt, y_train_pt, cv=3).mean())

## Class 'mahl'

### Prepare Data

In [None]:
# Class 'mahl': features are all numeric columns except the target 'auwgr' and 'year'.
features = [
    name
    for name in df_mahl._get_numeric_data().columns
    if name not in ('auwgr', 'year')
]
X, y = df_mahl[features], df_mahl['auwgr']
print(X.shape, y.shape)

### Train/Test Split

In [None]:
# Hold out a test partition of the current X/y; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Standard Scaler

In [None]:
# Learn the scaling statistics from the training partition only, then apply them to both partitions.
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [None]:
# Mean 3-fold cross-validation score on the standardised training features for
# plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_sc, y_train, cv=3).mean())

### Power Transformer

In [None]:
# Features: fit the power transform on the training partition, then apply it to both partitions.
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is lifted to a one-column
# frame; ravel() flattens the result back to the 1-D array used for cross-validation later.
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [None]:
# Mean 3-fold cross-validation score on the power-transformed training features
# and target for plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_pt, y_train_pt, cv=3).mean())

## Class 'fire'

### Prepare Data

In [None]:
# Class 'fire': features are all numeric columns except the target 'auwgr' and 'year'.
features = [
    name
    for name in df_fire._get_numeric_data().columns
    if name not in ('auwgr', 'year')
]
X, y = df_fire[features], df_fire['auwgr']
print(X.shape, y.shape)

### Train/Test Split

In [None]:
# Hold out a test partition of the current X/y; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Standard Scaler

In [None]:
# Learn the scaling statistics from the training partition only, then apply them to both partitions.
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [None]:
# Mean 3-fold cross-validation score on the standardised training features for
# plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_sc, y_train, cv=3).mean())

### Power Transformer

In [None]:
# Features: fit the power transform on the training partition, then apply it to both partitions.
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is lifted to a one-column
# frame; ravel() flattens the result back to the 1-D array used for cross-validation later.
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [None]:
# Mean 3-fold cross-validation score on the power-transformed training features
# and target for plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_pt, y_train_pt, cv=3).mean())

## Class 'motor'

### Prepare Data

In [None]:
# Class 'motor': features are all numeric columns except the target 'auwgr' and 'year'.
features = [
    name
    for name in df_motor._get_numeric_data().columns
    if name not in ('auwgr', 'year')
]
X, y = df_motor[features], df_motor['auwgr']
print(X.shape, y.shape)

### Train/Test Split

In [None]:
# Hold out a test partition of the current X/y; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Standard Scaler

In [None]:
# Learn the scaling statistics from the training partition only, then apply them to both partitions.
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [None]:
# Mean 3-fold cross-validation score on the standardised training features for
# plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_sc, y_train, cv=3).mean())

### Power Transformer

In [None]:
# Features: fit the power transform on the training partition, then apply it to both partitions.
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is lifted to a one-column
# frame; ravel() flattens the result back to the 1-D array used for cross-validation later.
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [None]:
# Mean 3-fold cross-validation score on the power-transformed training features
# and target for plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_pt, y_train_pt, cv=3).mean())

## Class 'wic'

### Prepare Data

In [None]:
# Class 'wic': features are all numeric columns except the target 'auwgr' and 'year'.
features = [
    name
    for name in df_wic._get_numeric_data().columns
    if name not in ('auwgr', 'year')
]
X, y = df_wic[features], df_wic['auwgr']
print(X.shape, y.shape)

### Train/Test Split

In [None]:
# Hold out a test partition of the current X/y; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Standard Scaler

In [None]:
# Learn the scaling statistics from the training partition only, then apply them to both partitions.
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [None]:
# Mean 3-fold cross-validation score on the standardised training features for
# plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_sc, y_train, cv=3).mean())

### Power Transformer

In [None]:
# Features: fit the power transform on the training partition, then apply it to both partitions.
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is lifted to a one-column
# frame; ravel() flattens the result back to the 1-D array used for cross-validation later.
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [None]:
# Mean 3-fold cross-validation score on the power-transformed training features
# and target for plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_pt, y_train_pt, cv=3).mean())

## Class 'pa'

### Prepare Data

In [None]:
# Class 'pa': features are all numeric columns except the target 'auwgr' and 'year'.
features = [
    name
    for name in df_pa._get_numeric_data().columns
    if name not in ('auwgr', 'year')
]
X, y = df_pa[features], df_pa['auwgr']
print(X.shape, y.shape)

### Train/Test Split

In [None]:
# Hold out a test partition of the current X/y; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Standard Scaler

In [None]:
# Learn the scaling statistics from the training partition only, then apply them to both partitions.
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [None]:
# Mean 3-fold cross-validation score on the standardised training features for
# plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_sc, y_train, cv=3).mean())

### Power Transformer

In [None]:
# Features: fit the power transform on the training partition, then apply it to both partitions.
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is lifted to a one-column
# frame; ravel() flattens the result back to the 1-D array used for cross-validation later.
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [None]:
# Mean 3-fold cross-validation score on the power-transformed training features
# and target for plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_pt, y_train_pt, cv=3).mean())

## Class 'health'

### Prepare Data

In [None]:
# Class 'health': features are all numeric columns except the target 'auwgr' and 'year'.
features = [
    name
    for name in df_health._get_numeric_data().columns
    if name not in ('auwgr', 'year')
]
X, y = df_health[features], df_health['auwgr']
print(X.shape, y.shape)

### Train/Test Split

In [None]:
# Hold out a test partition of the current X/y; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Standard Scaler

In [None]:
# Learn the scaling statistics from the training partition only, then apply them to both partitions.
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [None]:
# Mean 3-fold cross-validation score on the standardised training features for
# plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_sc, y_train, cv=3).mean())

### Power Transformer

In [None]:
# Features: fit the power transform on the training partition, then apply it to both partitions.
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is lifted to a one-column
# frame; ravel() flattens the result back to the 1-D array used for cross-validation later.
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [None]:
# Mean 3-fold cross-validation score on the power-transformed training features
# and target for plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_pt, y_train_pt, cv=3).mean())

## Class 'pub_lia'

### Prepare Data

In [None]:
# Class 'pub_lia': features are all numeric columns except the target 'auwgr' and 'year'.
features = [
    name
    for name in df_pub_lia._get_numeric_data().columns
    if name not in ('auwgr', 'year')
]
X, y = df_pub_lia[features], df_pub_lia['auwgr']
print(X.shape, y.shape)

### Train/Test Split

In [None]:
# Hold out a test partition of the current X/y; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Standard Scaler

In [None]:
# Learn the scaling statistics from the training partition only, then apply them to both partitions.
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [None]:
# Mean 3-fold cross-validation score on the standardised training features for
# plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_sc, y_train, cv=3).mean())

### Power Transformer

In [None]:
# Features: fit the power transform on the training partition, then apply it to both partitions.
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is lifted to a one-column
# frame; ravel() flattens the result back to the 1-D array used for cross-validation later.
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [None]:
# Mean 3-fold cross-validation score on the power-transformed training features
# and target for plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_pt, y_train_pt, cv=3).mean())

## Class 'bonds'

### Prepare Data

In [None]:
# Class 'bonds': features are all numeric columns except the target 'auwgr' and 'year'.
features = [
    name
    for name in df_bonds._get_numeric_data().columns
    if name not in ('auwgr', 'year')
]
X, y = df_bonds[features], df_bonds['auwgr']
print(X.shape, y.shape)

### Train/Test Split

In [None]:
# Hold out a test partition of the current X/y; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Standard Scaler

In [None]:
# Learn the scaling statistics from the training partition only, then apply them to both partitions.
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [None]:
# Mean 3-fold cross-validation score on the standardised training features for
# plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_sc, y_train, cv=3).mean())

### Power Transformer

In [None]:
# Features: fit the power transform on the training partition, then apply it to both partitions.
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is lifted to a one-column
# frame; ravel() flattens the result back to the 1-D array used for cross-validation later.
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [None]:
# Mean 3-fold cross-validation score on the power-transformed training features
# and target for plain least squares, ridge and lasso.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

for header, estimator in (
    ('\nLINEAR REG cross-val mean score:', linreg),
    ('\nRIDGE cross-val mean score:', ridge),
    ('\nLASSO cross-val mean score:', lasso),
):
    print(header)
    print('X-Val score MEAN using X_train\t\t',
          cross_val_score(estimator, X_train_pt, y_train_pt, cv=3).mean())

## Class 'cnstr_engr'

### Prepare Data

In [None]:
# Model inputs for the 'cnstr_engr' subset: every numeric column except the
# target ('auwgr') and 'year' is used as a feature.
# NOTE(review): _get_numeric_data() is a private pandas method; consider the
# public select_dtypes() instead.
features = [col for col in df_cnstr_engr._get_numeric_data().columns if col not in ('auwgr', 'year')]
X, y = df_cnstr_engr[features], df_cnstr_engr['auwgr']
print(X.shape, y.shape)

### Train/Test Split

In [None]:
# Hold out a test split (train_test_split's default size) with a fixed seed
# so the partition is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Standard Scaler

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to both the training and the test features
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [None]:
# Candidate linear models; RidgeCV searches 100 alphas between 0.1 and 10,
# LassoCV searches 200 automatically chosen alphas with 3 internal folds
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

# Print the mean 3-fold cross-validation score of each model on the
# standard-scaled training split
for _label, _estimator in (('LINEAR REG', linreg), ('RIDGE', ridge), ('LASSO', lasso)):
    print('\n' + _label + ' cross-val mean score:')
    print('X-Val score MEAN using X_train\t\t', cross_val_score(_estimator, X_train_sc, y_train, cv=3).mean())

### Power Transformer

In [None]:
# Features: fit the power transform on the training split, apply it to both splits
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is wrapped with
# .to_frame(); .ravel() flattens the result back to the 1-D array that
# cross_val_score expects later
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [None]:
# Candidate linear models; RidgeCV searches 100 alphas between 0.1 and 10,
# LassoCV searches 200 automatically chosen alphas with 3 internal folds
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

# Print the mean 3-fold cross-validation score of each model on the
# power-transformed training features and target
for _label, _estimator in (('LINEAR REG', linreg), ('RIDGE', ridge), ('LASSO', lasso)):
    print('\n' + _label + ' cross-val mean score:')
    print('X-Val score MEAN using X_train\t\t', cross_val_score(_estimator, X_train_pt, y_train_pt, cv=3).mean())

## Class 'prof_indm'

### Prepare Data

In [None]:
# Model inputs for the 'prof_indm' subset: every numeric column except the
# target ('auwgr') and 'year' is used as a feature.
# NOTE(review): _get_numeric_data() is a private pandas method; consider the
# public select_dtypes() instead.
features = [col for col in df_prof_indm._get_numeric_data().columns if col not in ('auwgr', 'year')]
X, y = df_prof_indm[features], df_prof_indm['auwgr']
print(X.shape, y.shape)

### Train/Test Split

In [None]:
# Hold out a test split (train_test_split's default size) with a fixed seed
# so the partition is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Standard Scaler

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to both the training and the test features
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [None]:
# Candidate linear models; RidgeCV searches 100 alphas between 0.1 and 10,
# LassoCV searches 200 automatically chosen alphas with 3 internal folds
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

# Print the mean 3-fold cross-validation score of each model on the
# standard-scaled training split
for _label, _estimator in (('LINEAR REG', linreg), ('RIDGE', ridge), ('LASSO', lasso)):
    print('\n' + _label + ' cross-val mean score:')
    print('X-Val score MEAN using X_train\t\t', cross_val_score(_estimator, X_train_sc, y_train, cv=3).mean())

### Power Transformer

In [None]:
# Features: fit the power transform on the training split, apply it to both splits
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is wrapped with
# .to_frame(); .ravel() flattens the result back to the 1-D array that
# cross_val_score expects later
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [None]:
# Candidate linear models; RidgeCV searches 100 alphas between 0.1 and 10,
# LassoCV searches 200 automatically chosen alphas with 3 internal folds
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

# Print the mean 3-fold cross-validation score of each model on the
# power-transformed training features and target
for _label, _estimator in (('LINEAR REG', linreg), ('RIDGE', ridge), ('LASSO', lasso)):
    print('\n' + _label + ' cross-val mean score:')
    print('X-Val score MEAN using X_train\t\t', cross_val_score(_estimator, X_train_pt, y_train_pt, cv=3).mean())

## Class 'cpr'

### Prepare Data

In [None]:
# Model inputs for the 'cpr' subset: every numeric column except the
# target ('auwgr') and 'year' is used as a feature.
# NOTE(review): _get_numeric_data() is a private pandas method; consider the
# public select_dtypes() instead.
features = [col for col in df_cpr._get_numeric_data().columns if col not in ('auwgr', 'year')]
X, y = df_cpr[features], df_cpr['auwgr']
print(X.shape, y.shape)

### Train/Test Split

In [None]:
# Hold out a test split (train_test_split's default size) with a fixed seed
# so the partition is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Standard Scaler

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to both the training and the test features
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [None]:
# Candidate linear models; RidgeCV searches 100 alphas between 0.1 and 10,
# LassoCV searches 200 automatically chosen alphas with 3 internal folds
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

# Print the mean 3-fold cross-validation score of each model on the
# standard-scaled training split
for _label, _estimator in (('LINEAR REG', linreg), ('RIDGE', ridge), ('LASSO', lasso)):
    print('\n' + _label + ' cross-val mean score:')
    print('X-Val score MEAN using X_train\t\t', cross_val_score(_estimator, X_train_sc, y_train, cv=3).mean())

### Power Transformer

In [None]:
# Features: fit the power transform on the training split, apply it to both splits
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is wrapped with
# .to_frame(); .ravel() flattens the result back to the 1-D array that
# cross_val_score expects later
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [None]:
# Candidate linear models; RidgeCV searches 100 alphas between 0.1 and 10,
# LassoCV searches 200 automatically chosen alphas with 3 internal folds
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

# Print the mean 3-fold cross-validation score of each model on the
# power-transformed training features and target
for _label, _estimator in (('LINEAR REG', linreg), ('RIDGE', ridge), ('LASSO', lasso)):
    print('\n' + _label + ' cross-val mean score:')
    print('X-Val score MEAN using X_train\t\t', cross_val_score(_estimator, X_train_pt, y_train_pt, cv=3).mean())

## Class 'others'

### Prepare Data

In [None]:
# Model inputs for the 'others' subset: every numeric column except the
# target ('auwgr') and 'year' is used as a feature.
# NOTE(review): _get_numeric_data() is a private pandas method; consider the
# public select_dtypes() instead.
features = [col for col in df_others._get_numeric_data().columns if col not in ('auwgr', 'year')]
X, y = df_others[features], df_others['auwgr']
print(X.shape, y.shape)

### Train/Test Split

In [None]:
# Hold out a test split (train_test_split's default size) with a fixed seed
# so the partition is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Standard Scaler

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to both the training and the test features
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [None]:
# Candidate linear models; RidgeCV searches 100 alphas between 0.1 and 10,
# LassoCV searches 200 automatically chosen alphas with 3 internal folds
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

# Print the mean 3-fold cross-validation score of each model on the
# standard-scaled training split
for _label, _estimator in (('LINEAR REG', linreg), ('RIDGE', ridge), ('LASSO', lasso)):
    print('\n' + _label + ' cross-val mean score:')
    print('X-Val score MEAN using X_train\t\t', cross_val_score(_estimator, X_train_sc, y_train, cv=3).mean())

### Power Transformer

In [None]:
# Features: fit the power transform on the training split, apply it to both splits
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is wrapped with
# .to_frame(); .ravel() flattens the result back to the 1-D array that
# cross_val_score expects later
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [None]:
# Candidate linear models; RidgeCV searches 100 alphas between 0.1 and 10,
# LassoCV searches 200 automatically chosen alphas with 3 internal folds
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

# Print the mean 3-fold cross-validation score of each model on the
# power-transformed training features and target
for _label, _estimator in (('LINEAR REG', linreg), ('RIDGE', ridge), ('LASSO', lasso)):
    print('\n' + _label + ' cross-val mean score:')
    print('X-Val score MEAN using X_train\t\t', cross_val_score(_estimator, X_train_pt, y_train_pt, cv=3).mean())

# Checkpoint 2: Comments
The cross-validation scores above show that the data, even when separated into individual insurance types, cannot be used for regression as it stands.<br>
Further analysis of the data is required.<br><br>
Next, we deep dive into the 'fire' class (it has the largest number of rows: 765) to see what we can find.

# Define Functions

In [None]:
def find_threshold_count(df_arg, col_name, min_thresh, max_thresh):
    """Print how many rows of ``df_arg`` lie outside ``[min_thresh, max_thresh]``.

    Rows whose ``col_name`` value is strictly below ``min_thresh`` or strictly
    above ``max_thresh`` are counted. Each count is also reported as a
    percentage of the rows in ``df_arg`` itself, rounded to 2 decimals.

    Parameters
    ----------
    df_arg : pandas.DataFrame
        Frame to inspect; it is not modified.
    col_name : str
        Column whose values are compared against the thresholds.
    min_thresh, max_thresh : number
        Lower and upper bounds; values equal to a bound are not counted.

    Returns
    -------
    None
        The summary is printed, nothing is returned.

    Raises
    ------
    KeyError
        If ``col_name`` is not a column of ``df_arg``.
    """
    values = df_arg[col_name]
    # Percentages are relative to the frame that was passed in, not to a
    # notebook-level global, so they stay correct after rows have been dropped.
    total_rows = len(df_arg)

    # Summing the boolean masks counts matching rows directly and does not
    # depend on whether some other column contains nulls.
    counter_min = int((values < min_thresh).sum())
    counter_max = int((values > max_thresh).sum())

    # An empty frame has nothing outside the thresholds; avoid dividing by zero.
    counter_pct_min = round(100 * counter_min / total_rows, 2) if total_rows else 0.0
    counter_pct_max = round(100 * counter_max / total_rows, 2) if total_rows else 0.0

    print('column name:', col_name)
    print('no. of rows with values less than',min_thresh,'=>\t',counter_min,'(',counter_pct_min,'% )')
    print('no. of rows with values more than',max_thresh,'=>\t',counter_max,'(',counter_pct_max,'% )')
    print('total percentage of rows beyond min/max thresholds:',round(counter_pct_min+counter_pct_max,2),'%')
    return None

In [None]:
def drop_outside_threshold(df_arg, col_name, min_thresh, max_thresh):
    """Drop, in place, the rows of ``df_arg`` lying outside ``[min_thresh, max_thresh]``.

    A row is dropped when its ``col_name`` value is strictly below
    ``min_thresh`` or strictly above ``max_thresh``. The number of dropped rows
    and the frame's shape before and after are printed.

    Parameters
    ----------
    df_arg : pandas.DataFrame
        Frame to trim. It is MODIFIED IN PLACE, so pass a copy if the original
        rows are still needed.
    col_name : str
        Column whose values are compared against the thresholds.
    min_thresh, max_thresh : number
        Lower and upper bounds; values equal to a bound are kept.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``col_name`` is not a column of ``df_arg``. The error is raised
        before anything is dropped or printed.
    """
    old_shape = df_arg.shape

    # Index labels of the rows outside the thresholds. A comparison that
    # matches nothing yields an empty selection rather than an exception, so
    # no try/except is needed; genuine errors (e.g. an unknown column)
    # propagate to the caller instead of being swallowed.
    values = df_arg[col_name]
    outside = (values < min_thresh) | (values > max_thresh)
    idx_list = list(df_arg.index[outside])

    if len(idx_list) > 0:
        df_arg.drop(index=idx_list, inplace=True)

    print('Dropped',len(idx_list),'rows...')
    print('Old shape =>',old_shape)
    print('New shape =>',df_arg.shape)

    return None

# Deep Dive into 'fire' class

## Reset Dataframe

In [None]:
# Working copy for the deep dive, so the in-place threshold drops below do not
# modify `df`.
# NOTE(review): the per-class filter on the next line is commented out, so
# df_fire currently holds every row of df rather than only the 'fire' class —
# confirm this is intended.
# df_fire = df[df['class']=='fire'].copy()
df_fire = df.copy()

## Explore data

In [None]:
# Scatter each candidate predictor against the target 'auwgr'.
# NOTE(review): 'amlr', 'amer' and 'aoer' are not among the columns listed by
# df.head() at the top of the notebook (it shows 'mer', 'der', 'oer', ...) —
# confirm the column names for the dataset currently being loaded.
# The trailing ';' hides the PairGrid repr, as the boxplot cells already do.
sns.pairplot(df_fire, x_vars=['lkpp', 'amlr', 'amer', 'der', 'aoer'], y_vars=['auwgr']);

In [None]:
df_fire.describe()

### auwgr

In [None]:
# x= keyword: newer seaborn reads the first positional argument as `data`, not `x`
sns.boxplot(x=df_fire['auwgr']);

In [None]:
# find_threshold_count(df_fire,'auwgr',-3,7)
find_threshold_count(df_fire,'auwgr',-2,2)

In [None]:
drop_outside_threshold(df_fire,'auwgr',-2,2)

In [None]:
# x= keyword: newer seaborn reads the first positional argument as `data`, not `x`
sns.boxplot(x=df_fire['auwgr']);

In [None]:
df_fire.describe()

### lkpp

In [None]:
# x= keyword: newer seaborn reads the first positional argument as `data`, not `x`
sns.boxplot(x=df_fire['lkpp']);

In [None]:
find_threshold_count(df_fire,'lkpp',-5,13)

In [None]:
drop_outside_threshold(df_fire,'lkpp',-5,13)

In [None]:
# x= keyword: newer seaborn reads the first positional argument as `data`, not `x`
sns.boxplot(x=df_fire['lkpp']);

In [None]:
df_fire.describe()

### amlr

In [None]:
# x= keyword: newer seaborn reads the first positional argument as `data`, not `x`
sns.boxplot(x=df_fire['amlr']);

In [None]:
find_threshold_count(df_fire,'amlr',-1,2.2)

In [None]:
drop_outside_threshold(df_fire,'amlr',-1,2.2)

In [None]:
# x= keyword: newer seaborn reads the first positional argument as `data`, not `x`
sns.boxplot(x=df_fire['amlr']);

In [None]:
df_fire.describe()

### amer

In [None]:
# x= keyword: newer seaborn reads the first positional argument as `data`, not `x`
sns.boxplot(x=df_fire['amer']);

In [None]:
find_threshold_count(df_fire,'amer',0,5)

In [None]:
drop_outside_threshold(df_fire,'amer',0,5)

In [None]:
# x= keyword: newer seaborn reads the first positional argument as `data`, not `x`
sns.boxplot(x=df_fire['amer']);

In [None]:
df_fire.describe()

### der

In [None]:
# x= keyword: newer seaborn reads the first positional argument as `data`, not `x`
sns.boxplot(x=df_fire['der']);

In [None]:
find_threshold_count(df_fire,'der',-1,2)

In [None]:
drop_outside_threshold(df_fire,'der',-1,2)

In [None]:
# x= keyword: newer seaborn reads the first positional argument as `data`, not `x`
sns.boxplot(x=df_fire['der']);

In [None]:
df_fire.describe()

### aoer

In [None]:
# x= keyword: newer seaborn reads the first positional argument as `data`, not `x`
sns.boxplot(x=df_fire['aoer']);

In [None]:
find_threshold_count(df_fire,'aoer',0,0.2)

In [None]:
drop_outside_threshold(df_fire,'aoer',0,0.2)

In [None]:
# x= keyword: newer seaborn reads the first positional argument as `data`, not `x`
sns.boxplot(x=df_fire['aoer']);

In [None]:
df_fire.describe()

## Re-run Regression Model

### Prepare Data

In [None]:
df_fire.shape

In [None]:
# Model inputs from the trimmed df_fire frame: every numeric column except the
# target ('auwgr') and 'year' is used as a feature.
# NOTE(review): _get_numeric_data() is a private pandas method; consider the
# public select_dtypes() instead.
features = [col for col in df_fire._get_numeric_data().columns if col not in ('auwgr', 'year')]
X, y = df_fire[features], df_fire['auwgr']
print(X.shape, y.shape)

### Train/Test Split

In [None]:
# Hold out a test split (train_test_split's default size) with a fixed seed
# so the partition is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Standard Scaler

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to both the training and the test features
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

### Linear, Ridge, Lasso (with standard scaled data)

In [None]:
# Candidate linear models; RidgeCV searches 100 alphas between 0.1 and 10,
# LassoCV searches 200 automatically chosen alphas with 3 internal folds
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

# Print the mean 3-fold cross-validation score of each model on the
# standard-scaled training split
for _label, _estimator in (('LINEAR REG', linreg), ('RIDGE', ridge), ('LASSO', lasso)):
    print('\n' + _label + ' cross-val mean score:')
    print('X-Val score MEAN using X_train\t\t', cross_val_score(_estimator, X_train_sc, y_train, cv=3).mean())

### Power Transformer

In [None]:
# Features: fit the power transform on the training split, apply it to both splits
pt_x = PowerTransformer().fit(X_train)
X_train_pt, X_test_pt = pt_x.transform(X_train), pt_x.transform(X_test)

# Target: PowerTransformer needs 2-D input, so the Series is wrapped with
# .to_frame(); .ravel() flattens the result back to the 1-D array that
# cross_val_score expects later
pt_y = PowerTransformer().fit(y_train.to_frame())
y_train_pt = pt_y.transform(y_train.to_frame()).ravel()
y_test_pt = pt_y.transform(y_test.to_frame()).ravel()

### Linear, Ridge, Lasso (with power transformed data)

In [None]:
# Candidate linear models; RidgeCV searches 100 alphas between 0.1 and 10,
# LassoCV searches 200 automatically chosen alphas with 3 internal folds
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))
lasso = LassoCV(n_alphas=200, cv=3)

# Print the mean 3-fold cross-validation score of each model on the
# power-transformed training features and target
for _label, _estimator in (('LINEAR REG', linreg), ('RIDGE', ridge), ('LASSO', lasso)):
    print('\n' + _label + ' cross-val mean score:')
    print('X-Val score MEAN using X_train\t\t', cross_val_score(_estimator, X_train_pt, y_train_pt, cv=3).mean())

In [None]:
# Same predictor-vs-target scatter plots, now on the trimmed df_fire frame.
# NOTE(review): 'amlr', 'amer' and 'aoer' are not among the columns listed by
# df.head() at the top of the notebook (it shows 'mer', 'der', 'oer', ...) —
# confirm the column names for the dataset currently being loaded.
# The trailing ';' hides the PairGrid repr, as the boxplot cells already do.
sns.pairplot(df_fire, x_vars=['lkpp', 'amlr', 'amer', 'der', 'aoer'], y_vars=['auwgr']);

## Decision Tree Regressor

### Initial Hyperparameters

In [None]:
# Baseline decision tree with default hyperparameters, fitted on the un-scaled
# features. random_state is pinned so the scores below are reproducible.
dtreg = DecisionTreeRegressor(random_state=42)
dtreg.fit(X_train, y_train)
# score() is R^2: training split first, then the held-out test split
print(dtreg.score(X_train, y_train))
print(dtreg.score(X_test, y_test))

### GridSearchCV

In [None]:
# Search space for the decision tree: depths 2..999 combined with minimum
# split sizes 2..20 (998 x 19 candidate settings)
param_grid = [dict(max_depth=range(2, 1000),
                   min_samples_split=range(2, 21))]

In [None]:
# Exhaustive 5-fold grid search over param_grid.
# random_state makes every candidate tree deterministic, so best_params_ is
# reproducible; n_jobs=-1 runs the candidate fits on all cores, which matters
# for a grid of this size.
reg = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid, cv=5, n_jobs=-1)
reg.fit(X_train, y_train)
reg.best_params_

### Results

In [None]:
# Refit a tree on the un-scaled features with tuned hyperparameters.
# NOTE(review): max_depth / min_samples_split are hard-coded, presumably copied
# from an earlier reg.best_params_ — confirm they still match the search above.
# random_state is pinned so the scores below are reproducible.
dtreg = DecisionTreeRegressor(max_depth=2, min_samples_split=15, random_state=42)
dtreg.fit(X_train, y_train)
# score() is R^2: training split first, then the held-out test split
print(dtreg.score(X_train, y_train))
print(dtreg.score(X_test, y_test))

**Poor score — this model can't be used.**

## Random Forest Regressor

### Initial Hyperparameters

In [None]:
# Baseline random forest with 10 trees, fitted on the un-scaled features.
# (10 was scikit-learn's default n_estimators before version 0.22; newer
# releases default to 100, so the value is set explicitly.)
# random_state is pinned so the scores below are reproducible.
rfreg = RandomForestRegressor(n_estimators=10, random_state=42)
rfreg.fit(X_train, y_train)
# score() is R^2: training split first, then the held-out test split
print(rfreg.score(X_train, y_train))
print(rfreg.score(X_test, y_test))

### GridSearchCV

In [None]:
# Search space for the random forest: 3 forest sizes x depths 2..49 x minimum
# split sizes 2..19, always with the out-of-bag score enabled
param_grid = [{'n_estimators':[50,100,200],
               'max_depth':range(2,50),
               'min_samples_split':range(2,20),
               'oob_score':[True]
              }]

# Exhaustive 5-fold grid search. random_state makes every candidate forest
# deterministic, so best_params_ is reproducible; n_jobs=-1 runs the candidate
# fits on all cores, which matters for a grid of this size.
reg = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, n_jobs=-1)
reg.fit(X_train, y_train)
reg.best_params_

### Results

In [None]:
# Refit a forest on the un-scaled features with tuned hyperparameters.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from an
# earlier reg.best_params_ — confirm they still match the search above.
# random_state is pinned so the scores below are reproducible.
rfreg = RandomForestRegressor(n_estimators=100, max_depth=2, min_samples_split=10,
                              oob_score=True, random_state=42)
rfreg.fit(X_train, y_train)
# score() is R^2: training split first, then the held-out test split
print(rfreg.score(X_train, y_train))
print(rfreg.score(X_test, y_test))

**Poor score — this model can't be used.**

## Extra Trees Regressor

### Initial Hyperparameters

In [None]:
# Baseline extra-trees ensemble (100 bootstrapped trees with out-of-bag
# scoring), fitted on the un-scaled features.
# random_state is pinned so the scores below are reproducible.
etreg = ExtraTreesRegressor(bootstrap=True, oob_score=True, warm_start=False,
                            n_estimators=100, random_state=42)
etreg.fit(X_train, y_train)
# score() is R^2: training split first, then the held-out test split
print(etreg.score(X_train, y_train))
print(etreg.score(X_test, y_test))

### GridSearchCV

In [None]:
# Search space for extra trees: 3 ensemble sizes x depths 2..49 x minimum
# split sizes 2..19, always bootstrapped with the out-of-bag score enabled
param_grid = [{'n_estimators':[100,200,300],
               'max_depth':range(2,50),
               'min_samples_split':range(2,20),
               'oob_score':[True],
               'bootstrap':[True]
              }]

# Exhaustive 5-fold grid search. random_state makes every candidate ensemble
# deterministic, so best_params_ is reproducible; n_jobs=-1 runs the candidate
# fits on all cores, which matters for a grid of this size.
reg = GridSearchCV(ExtraTreesRegressor(random_state=42), param_grid, cv=5, n_jobs=-1)
reg.fit(X_train, y_train)
reg.best_params_

### Results

In [None]:
# Refit an extra-trees ensemble on the un-scaled features with tuned hyperparameters.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from an
# earlier reg.best_params_ — confirm they still match the search above.
# random_state is pinned so the scores below are reproducible.
etreg = ExtraTreesRegressor(bootstrap=True, max_depth=23, min_samples_split=14,
                            n_estimators=100, oob_score=True, random_state=42)
etreg.fit(X_train, y_train)
# score() is R^2: training split first, then the held-out test split
print(etreg.score(X_train, y_train))
print(etreg.score(X_test, y_test))

## Ada Boost Regressor

### Initial Hyperparameters

### GridSearchCV

### Results

## Gradient Boost Regressor

### Initial Hyperparameters

Consider using the following loss functions (more robust to outliers) instead of MSE<br>
- Absolute Loss
- Huber Loss

### GridSearchCV

### Results

## Model with PCA