# Loading libraries

In [166]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Load data

In [167]:
data = pd.read_csv("Data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,Location Code,Marital Status,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size,Vehicle Type
0,0,DK49336,Arizona,4809.21696,No,Basic,College,2/18/11,Employed,M,48029,Suburban,Married,61,7.0,52,0.0,9,Corporate Auto,Corporate L3,Offer3,Agent,292.8,Four-Door Car,Medsize,
1,1,KX64629,California,2228.525238,No,Basic,College,1/18/11,Unemployed,F,0,Suburban,Single,64,3.0,26,0.0,1,Personal Auto,Personal L3,Offer4,Call Center,744.924331,Four-Door Car,Medsize,
2,2,LZ68649,Washington,14947.9173,No,Basic,Bachelor,2/10/11,Employed,M,22139,Suburban,Single,100,34.0,31,0.0,2,Personal Auto,Personal L3,Offer3,Call Center,480.0,SUV,Medsize,A
3,3,XL78013,Oregon,22332.43946,Yes,Extended,College,1/11/11,Employed,M,49078,Suburban,Single,97,10.0,3,0.0,2,Corporate Auto,Corporate L3,Offer2,Branch,484.013411,Four-Door Car,Medsize,A
4,4,QA50777,Oregon,9025.067525,No,Premium,Bachelor,1/17/11,Medical Leave,F,23675,Suburban,Married,117,,31,,7,Personal Auto,Personal L2,Offer1,Branch,707.925645,Four-Door Car,Medsize,


# Defining X, y

In [168]:
X = data.drop(columns=['Unnamed: 0', 'Customer','Total Claim Amount'], axis = 1)
y = np.log(data['Total Claim Amount'])

# Data splitting

In [169]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

In [170]:
X_train.describe()

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies
count,8728.0,8728.0,8728.0,8201.0,8728.0,8201.0,8728.0
mean,8025.739678,37593.503093,93.291247,15.105231,48.059808,0.379954,2.970784
std,6973.335781,30343.602668,34.710942,10.04365,27.969144,0.901494,2.387027
min,1898.007675,0.0,61.0,0.0,0.0,0.0,1.0
25%,4016.439689,0.0,68.0,6.0,24.0,0.0,1.0
50%,5764.823237,33889.5,83.0,14.0,48.0,0.0,2.0
75%,8956.200142,62198.75,109.0,23.0,71.0,0.0,4.0
max,83325.38119,99981.0,298.0,35.0,99.0,5.0,9.0


# Variance threshold method

Univariate method

In [171]:
from sklearn.feature_selection import VarianceThreshold # It only works with numerical features


X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

#display(X_train)
print("Initial number of numerical columns: ",X_train.shape)
print()


selector = VarianceThreshold(threshold=100) # Default threshold value is 0
# Features with a training-set variance lower than this threshold will be removed.
selector.fit(X_train)

kept_features_indexes = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features_indexes].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final number of numerical columns: ",X_train.shape)
print()
X_train

Initial number of numerical columns:  (8728, 7)

Final number of numerical columns:  (8728, 5)



Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception
0,4665.129599,0.0,62.0,26.0,62.0
1,10288.924950,96337.0,127.0,19.0,12.0
2,4873.436612,18866.0,126.0,4.0,62.0
3,6944.739992,0.0,68.0,24.0,31.0
4,2472.469209,63860.0,62.0,26.0,81.0
...,...,...,...,...,...
8723,3810.238281,0.0,108.0,7.0,57.0
8724,3815.851163,38651.0,98.0,12.0,83.0
8725,7850.590399,0.0,69.0,5.0,78.0
8726,4974.235309,0.0,70.0,18.0,74.0


# Correlation matrix

Univariate method

In [172]:
import seaborn as sns
import matplotlib.pyplot as plt

c = abs(data.corr())
#c

#fig, ax = plt.subplots(figsize=(14,14))
#sns.heatmap(c, annot=True);

#c['Total Claim Amount']
c_last = c['Total Claim Amount'].sort_values(ascending=False)
#c_last
c_thr = .3
cols_to_keep = list(c_last[c_last > c_thr].index)[1:] + [list(c_last[c_last > c_thr].index)[0]]
print(cols_to_keep)

data[cols_to_keep]

['Monthly Premium Auto', 'Income', 'Total Claim Amount']


Unnamed: 0,Monthly Premium Auto,Income,Total Claim Amount
0,61,48029,292.800000
1,64,0,744.924331
2,100,22139,480.000000
3,97,49078,484.013411
4,117,23675,707.925645
...,...,...,...
10905,253,0,1214.400000
10906,65,61146,273.018929
10907,201,39837,381.306996
10908,158,64195,618.288849


# Recursive feature elimination

In [173]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

#X_train.isna().sum()
nulls = pd.DataFrame(X_train.isna().sum()).reset_index()
#nulls.head()
nulls.columns = ['Column','nas']
#nulls.head()
#nulls[nulls['nas'] > 0].head()
cols_to_drop = nulls[nulls['nas'] > 0]['Column'] # Too drastic, but made on pourpose for quick filtering (don't do this in production!!)

X_train.drop(columns=cols_to_drop, axis=1, inplace = True)
X_test.drop(columns=cols_to_drop, axis=1, inplace = True)

#display(X_train)

lm = LinearRegression()

selector = RFE(lm, n_features_to_select= 8, step = 1, verbose = 1) # Step is how many features to add or drop everytime
selector.fit(X_train, y_train)

kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final selected features: ")
display(X_train)


Final selected features: 


Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Policy Inception,Number of Policies
0,4665.129599,0.0,62.0,62.0,3.0
1,10288.924950,96337.0,127.0,12.0,3.0
2,4873.436612,18866.0,126.0,62.0,1.0
3,6944.739992,0.0,68.0,31.0,2.0
4,2472.469209,63860.0,62.0,81.0,1.0
...,...,...,...,...,...
8723,3810.238281,0.0,108.0,57.0,1.0
8724,3815.851163,38651.0,98.0,83.0,1.0
8725,7850.590399,0.0,69.0,78.0,2.0
8726,4974.235309,0.0,70.0,74.0,3.0


## Embedded Methods

In [174]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)


In [175]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
model=Lasso()

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
model=Ridge()

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")



In [None]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
model=ElasticNet()

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")



In [None]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
model=LinearRegression()

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")



# Feature Selection P-value

In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
from sklearn.datasets import load_boston

In [None]:
data = pd.read_csv("Data.csv")
data.head()

In [None]:
X = data.drop(columns=['Unnamed: 0', 'Customer','Total Claim Amount'], axis = 1)
y = np.log(data['Total Claim Amount'])

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

## Fitting OLS Model

In [None]:
x = load_boston()
y = x.target
X = pd.DataFrame(x.data, columns = x.feature_names)

X_added_constant = sm.add_constant(X)
X_added_constant

In [None]:
# we need to add this constant value of 1 for the intercepts
model = sm.OLS(y,X_added_constant).fit()
model.summary()

## Dropping Insignificant Features

In [None]:
X_added_constant = X_added_constant.drop(['RM'], axis=1)
model = sm.OLS(y,X_added_constant).fit()
model.summary()