# Loading libraries

In [51]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Load data

In [52]:
data = pd.read_csv("Data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,...,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size,Vehicle Type
0,0,DK49336,Arizona,4809.21696,No,Basic,College,2/18/11,Employed,M,...,0.0,9,Corporate Auto,Corporate L3,Offer3,Agent,292.8,Four-Door Car,Medsize,
1,1,KX64629,California,2228.525238,No,Basic,College,1/18/11,Unemployed,F,...,0.0,1,Personal Auto,Personal L3,Offer4,Call Center,744.924331,Four-Door Car,Medsize,
2,2,LZ68649,Washington,14947.9173,No,Basic,Bachelor,2/10/11,Employed,M,...,0.0,2,Personal Auto,Personal L3,Offer3,Call Center,480.0,SUV,Medsize,A
3,3,XL78013,Oregon,22332.43946,Yes,Extended,College,1/11/11,Employed,M,...,0.0,2,Corporate Auto,Corporate L3,Offer2,Branch,484.013411,Four-Door Car,Medsize,A
4,4,QA50777,Oregon,9025.067525,No,Premium,Bachelor,1/17/11,Medical Leave,F,...,,7,Personal Auto,Personal L2,Offer1,Branch,707.925645,Four-Door Car,Medsize,


# Defining X, y

In [53]:
X = data.drop(columns=['Unnamed: 0', 'Customer','Total Claim Amount'], axis = 1)
y = np.log(data['Total Claim Amount'])

# Data splitting

In [54]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

In [55]:
X_train.describe()

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies
count,8728.0,8728.0,8728.0,8201.0,8728.0,8201.0,8728.0
mean,8025.739678,37593.503093,93.291247,15.105231,48.059808,0.379954,2.970784
std,6973.335781,30343.602668,34.710942,10.04365,27.969144,0.901494,2.387027
min,1898.007675,0.0,61.0,0.0,0.0,0.0,1.0
25%,4016.439689,0.0,68.0,6.0,24.0,0.0,1.0
50%,5764.823237,33889.5,83.0,14.0,48.0,0.0,2.0
75%,8956.200142,62198.75,109.0,23.0,71.0,0.0,4.0
max,83325.38119,99981.0,298.0,35.0,99.0,5.0,9.0


# Variance threshold method

Univariate method

In [56]:
from sklearn.feature_selection import VarianceThreshold # It only works with numerical features


X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

#display(X_train)
print("Initial number of numerical columns: ",X_train.shape)
print()


selector = VarianceThreshold(threshold=100) # Default threshold value is 0
# Features with a training-set variance lower than this threshold will be removed.
selector.fit(X_train)

kept_features_indexes = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features_indexes].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final number of numerical columns: ",X_train.shape)
print()
X_train

Initial number of numerical columns:  (8728, 7)

Final number of numerical columns:  (8728, 5)



Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception
0,4665.129599,0.0,62.0,26.0,62.0
1,10288.924950,96337.0,127.0,19.0,12.0
2,4873.436612,18866.0,126.0,4.0,62.0
3,6944.739992,0.0,68.0,24.0,31.0
4,2472.469209,63860.0,62.0,26.0,81.0
...,...,...,...,...,...
8723,3810.238281,0.0,108.0,7.0,57.0
8724,3815.851163,38651.0,98.0,12.0,83.0
8725,7850.590399,0.0,69.0,5.0,78.0
8726,4974.235309,0.0,70.0,18.0,74.0


# Correlation matrix

Univariate method

In [66]:
import seaborn as sns
import matplotlib.pyplot as plt

c = abs(data.corr())
#c

#fig, ax = plt.subplots(figsize=(14,14))
#sns.heatmap(c, annot=True);

#c['Total Claim Amount']
c_last = c['Total Claim Amount'].sort_values(ascending=False)
#c_last
c_thr = .3
cols_to_keep = list(c_last[c_last > c_thr].index)[1:] + [list(c_last[c_last > c_thr].index)[0]]
print(cols_to_keep)

data[cols_to_keep]

['Monthly Premium Auto', 'Income', 'Total Claim Amount']


Unnamed: 0,Monthly Premium Auto,Income,Total Claim Amount
0,61,48029,292.800000
1,64,0,744.924331
2,100,22139,480.000000
3,97,49078,484.013411
4,117,23675,707.925645
...,...,...,...
10905,253,0,1214.400000
10906,65,61146,273.018929
10907,201,39837,381.306996
10908,158,64195,618.288849


# Recursive feature elimination

In [58]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

#X_train.isna().sum()
nulls = pd.DataFrame(X_train.isna().sum()).reset_index()
#nulls.head()
nulls.columns = ['Column','nas']
#nulls.head()
#nulls[nulls['nas'] > 0].head()
cols_to_drop = nulls[nulls['nas'] > 0]['Column'] # Too drastic, but made on pourpose for quick filtering (don't do this in production!!)

X_train.drop(columns=cols_to_drop, axis=1, inplace = True)
X_test.drop(columns=cols_to_drop, axis=1, inplace = True)

#display(X_train)

lm = LinearRegression()

selector = RFE(lm, n_features_to_select= 8, step = 1, verbose = 1) # Step is how many features to add or drop everytime
selector.fit(X_train, y_train)

kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final selected features: ")
display(X_train)


Final selected features: 


Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Policy Inception,Number of Policies
0,4665.129599,0.0,62.0,62.0,3.0
1,10288.924950,96337.0,127.0,12.0,3.0
2,4873.436612,18866.0,126.0,62.0,1.0
3,6944.739992,0.0,68.0,31.0,2.0
4,2472.469209,63860.0,62.0,81.0,1.0
...,...,...,...,...,...
8723,3810.238281,0.0,108.0,57.0,1.0
8724,3815.851163,38651.0,98.0,83.0,1.0
8725,7850.590399,0.0,69.0,78.0,2.0
8726,4974.235309,0.0,70.0,74.0,3.0


## Embedded Methods

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)


In [106]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
model=Lasso()

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")


Lasso: Train -> 0.6899591642958296, Test -> 0.655906082915434


In [107]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
model=Ridge()

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")



Ridge: Train -> 0.7415671063241829, Test -> 0.7041586727559436


In [108]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
model=ElasticNet()

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")



ElasticNet: Train -> 0.6894875873950914, Test -> 0.6590505847238239


In [109]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
model=LinearRegression()

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")



LinearRegression: Train -> 0.7434997532004697, Test -> 0.711226005748496


# Feature Selection P-value

In [110]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
from sklearn.datasets import load_boston

In [111]:
data = pd.read_csv("Data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,Location Code,Marital Status,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size,Vehicle Type
0,0,DK49336,Arizona,4809.21696,No,Basic,College,2/18/11,Employed,M,48029,Suburban,Married,61,7.0,52,0.0,9,Corporate Auto,Corporate L3,Offer3,Agent,292.8,Four-Door Car,Medsize,
1,1,KX64629,California,2228.525238,No,Basic,College,1/18/11,Unemployed,F,0,Suburban,Single,64,3.0,26,0.0,1,Personal Auto,Personal L3,Offer4,Call Center,744.924331,Four-Door Car,Medsize,
2,2,LZ68649,Washington,14947.9173,No,Basic,Bachelor,2/10/11,Employed,M,22139,Suburban,Single,100,34.0,31,0.0,2,Personal Auto,Personal L3,Offer3,Call Center,480.0,SUV,Medsize,A
3,3,XL78013,Oregon,22332.43946,Yes,Extended,College,1/11/11,Employed,M,49078,Suburban,Single,97,10.0,3,0.0,2,Corporate Auto,Corporate L3,Offer2,Branch,484.013411,Four-Door Car,Medsize,A
4,4,QA50777,Oregon,9025.067525,No,Premium,Bachelor,1/17/11,Medical Leave,F,23675,Suburban,Married,117,,31,,7,Personal Auto,Personal L2,Offer1,Branch,707.925645,Four-Door Car,Medsize,


In [112]:
X = data.drop(columns=['Unnamed: 0', 'Customer','Total Claim Amount'], axis = 1)
y = np.log(data['Total Claim Amount'])

In [113]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

## Fitting OLS Model

In [114]:
x = load_boston()
y = x.target
X = pd.DataFrame(x.data, columns = x.feature_names)

X_added_constant = sm.add_constant(X)
X_added_constant

Unnamed: 0,const,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,1.0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,1.0,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,1.0,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,1.0,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,1.0,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,1.0,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,1.0,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,1.0,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,1.0,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [115]:
# we need to add this constant value of 1 for the intercepts
model = sm.OLS(y,X_added_constant).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.741
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,108.1
Date:,"Mon, 21 Feb 2022",Prob (F-statistic):,6.72e-135
Time:,16:58:13,Log-Likelihood:,-1498.8
No. Observations:,506,AIC:,3026.0
Df Residuals:,492,BIC:,3085.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,36.4595,5.103,7.144,0.000,26.432,46.487
CRIM,-0.1080,0.033,-3.287,0.001,-0.173,-0.043
ZN,0.0464,0.014,3.382,0.001,0.019,0.073
INDUS,0.0206,0.061,0.334,0.738,-0.100,0.141
CHAS,2.6867,0.862,3.118,0.002,0.994,4.380
NOX,-17.7666,3.820,-4.651,0.000,-25.272,-10.262
RM,3.8099,0.418,9.116,0.000,2.989,4.631
AGE,0.0007,0.013,0.052,0.958,-0.025,0.027
DIS,-1.4756,0.199,-7.398,0.000,-1.867,-1.084

0,1,2,3
Omnibus:,178.041,Durbin-Watson:,1.078
Prob(Omnibus):,0.0,Jarque-Bera (JB):,783.126
Skew:,1.521,Prob(JB):,8.84e-171
Kurtosis:,8.281,Cond. No.,15100.0


## Dropping Insignificant Features

In [116]:
X_added_constant = X_added_constant.drop(['RM'], axis=1)
model = sm.OLS(y,X_added_constant).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.697
Model:,OLS,Adj. R-squared:,0.689
Method:,Least Squares,F-statistic:,94.43
Date:,"Mon, 21 Feb 2022",Prob (F-statistic):,2.29e-119
Time:,16:58:14,Log-Likelihood:,-1538.3
No. Observations:,506,AIC:,3103.0
Df Residuals:,493,BIC:,3158.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,69.4672,3.885,17.883,0.000,61.835,77.099
CRIM,-0.1159,0.035,-3.266,0.001,-0.186,-0.046
ZN,0.0659,0.015,4.501,0.000,0.037,0.095
INDUS,-0.0310,0.066,-0.468,0.640,-0.161,0.099
CHAS,2.9316,0.930,3.152,0.002,1.104,4.759
NOX,-21.0212,4.108,-5.118,0.000,-29.092,-12.951
AGE,0.0256,0.014,1.833,0.067,-0.002,0.053
DIS,-1.7184,0.213,-8.049,0.000,-2.138,-1.299
RAD,0.4017,0.071,5.677,0.000,0.263,0.541

0,1,2,3
Omnibus:,126.485,Durbin-Watson:,1.244
Prob(Omnibus):,0.0,Jarque-Bera (JB):,297.03
Skew:,1.281,Prob(JB):,3.17e-65
Kurtosis:,5.743,Cond. No.,12700.0
