# Loading libraries

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import statsmodels.api as sm
from sklearn.datasets import load_boston
import statsmodels.api as sm

In [3]:
# Load Data from csv file
df=pd.read_csv('Data_Marketing_Customer_Analysis_Round3.csv')

In [4]:
df

Unnamed: 0,region,customer_lifetime_value,response,coverage,education,effective_to_date,month,employment_status,gender,income,...,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size
0,central,4809,no,basic,college,2/18/11,feb,employed,m,48029,...,52,0,9,corporate auto,corporate l3,offer3,agent,292,four-door car,medsize
1,west region,2228,no,basic,college,1/18/11,jan,unemployed,f,92260,...,26,0,1,personal auto,personal l3,offer4,call center,744,four-door car,medsize
2,east,14947,no,basic,bachelor,2/10/11,feb,employed,m,22139,...,31,0,2,personal auto,personal l3,offer3,call center,480,suv,medsize
3,north west,22332,yes,extended,college,1/11/11,jan,employed,m,49078,...,3,0,2,corporate auto,corporate l3,offer2,branch,484,four-door car,medsize
4,north west,9025,no,premium,bachelor,1/17/11,jan,medical leave,f,23675,...,31,0,7,personal auto,personal l2,offer1,branch,707,four-door car,medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10684,central,15563,no,premium,bachelor,1/19/11,jan,unemployed,f,61541,...,40,0,7,personal auto,personal l1,offer3,web,1214,luxury car,medsize
10685,north west,5259,no,basic,college,1/6/11,jan,employed,f,61146,...,68,0,6,personal auto,personal l3,offer2,branch,273,four-door car,medsize
10686,central,23893,no,extended,bachelor,2/6/11,feb,employed,f,39837,...,63,0,2,corporate auto,corporate l3,offer1,web,381,luxury suv,medsize
10687,west region,11971,no,premium,college,2/13/11,feb,employed,f,64195,...,27,4,6,personal auto,personal l1,offer1,branch,618,suv,medsize


# Defining X, Y

In [6]:
X = df.drop(columns=['total_claim_amount'], axis = 1)
Y = df['total_claim_amount']

# Data splitting

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

In [8]:
X_train.describe()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
count,8551.0,8551.0,8551.0,8551.0,8551.0,8551.0,8551.0
mean,7994.902701,51817.509063,93.295287,15.13554,48.19296,0.375395,2.983511
std,6848.846659,24717.379264,34.575537,10.13316,27.849503,0.899706,2.398456
min,1898.0,10074.0,61.0,0.0,0.0,0.0,1.0
25%,4020.5,29435.0,68.0,6.0,25.0,0.0,1.0
50%,5764.0,50446.0,83.0,14.0,48.0,0.0,2.0
75%,8964.0,72194.5,109.0,23.0,71.0,0.0,4.0
max,74228.0,99981.0,298.0,35.0,99.0,5.0,9.0


In [11]:
#Include only numerical variables
X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

In [10]:
X_train.head()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
9877,21423,22379,65,9,31,0,2
10069,8391,40211,106,5,98,2,6
10317,3969,49544,101,3,29,0,1
9796,14914,45963,63,3,73,2,2
8995,18060,57882,115,1,61,0,2


In [14]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression

## Linear Regression

In [16]:
model=LinearRegression()
model.fit(X_train, Y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, Y_train)}, Test -> {model.score(X_test, Y_test)}")

LinearRegression: Train -> 0.411514336844104, Test -> 0.4012730915352387


## Scale varibles

In [18]:
from sklearn.preprocessing import StandardScaler

std_scaler=StandardScaler().fit(X_train)   ##. finding the parameters ( mean, variance from the training set )

X_train_scaled=std_scaler.transform(X_train)

X_test_scaled=std_scaler.transform(X_test)

In [19]:
model=LinearRegression()
model.fit(X_train_scaled, Y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train_scaled, Y_train)}, Test -> {model.score(X_test_scaled, Y_test)}")

LinearRegression: Train -> 0.4115143368441039, Test -> 0.40127309153524016


## Lasso 

In [29]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
model=Lasso(alpha=.9)

model.fit(X_train, Y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, Y_train)}, Test -> {model.score(X_test, Y_test)}")

Lasso: Train -> 0.41151165184815774, Test -> 0.401369651895908


## Ridge

In [31]:
model=Ridge(alpha=10000)
model.fit(X_train, Y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, Y_train)}, Test -> {model.score(X_test, Y_test)}")

Ridge: Train -> 0.4115115768551342, Test -> 0.40137520889808476


In [32]:
model=Ridge(alpha=10000)
model.fit(X_train_scaled, Y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train_scaled, Y_train)}, Test -> {model.score(X_test_scaled, Y_test)}")

Ridge: Train -> 0.29485608961928356, Test -> 0.29102294049900856


## KNeighbors

In [37]:
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(X_train, Y_train)
KNeighborsRegressor(...)

KNeighborsRegressor(n_neighbors=Ellipsis)

# Recursive feature elimination

In [40]:
from sklearn.feature_selection import RFE 

In [44]:
lm = LinearRegression()
selector = RFE(lm, n_features_to_select= 4, step = 1, verbose = 1) # Step is how many features to add or drop everytime
selector.fit(X_train, Y_train)

kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final selected features: ")
display(X_train)

Final selected features: 


Unnamed: 0,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_policies
0,65,9,31,2
1,106,5,98,6
2,101,3,29,1
3,63,3,73,2
4,115,1,61,2
...,...,...,...,...
8546,94,22,66,3
8547,98,17,78,2
8548,64,26,8,8
8549,106,23,90,2


In [45]:
model=LinearRegression()
model.fit(X_train, Y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, Y_train)}, Test -> {model.score(X_test, Y_test)}")

LinearRegression: Train -> 0.3979714422152414, Test -> 0.393261916514401


## P-value

In [46]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
from sklearn.datasets import load_boston

In [55]:
x = df.drop(columns=['total_claim_amount'], axis = 1)
x = x.select_dtypes(include=np.number)
y = Y = df['total_claim_amount']


x_added_constant = sm.add_constant(x)
x_added_constant

Unnamed: 0,const,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,1.0,4809,48029,61,7,52,0,9
1,1.0,2228,92260,64,3,26,0,1
2,1.0,14947,22139,100,34,31,0,2
3,1.0,22332,49078,97,10,3,0,2
4,1.0,9025,23675,117,33,31,0,7
...,...,...,...,...,...,...,...,...
10684,1.0,15563,61541,253,12,40,0,7
10685,1.0,5259,61146,65,7,68,0,6
10686,1.0,23893,39837,201,11,63,0,2
10687,1.0,11971,64195,158,0,27,4,6


In [56]:
# we need to add this constant value of 1 for the intercepts
model = sm.OLS(y,x_added_constant).fit()
model.summary()

0,1,2,3
Dep. Variable:,total_claim_amount,R-squared:,0.41
Model:,OLS,Adj. R-squared:,0.409
Method:,Least Squares,F-statistic:,1059.0
Date:,"Mon, 28 Nov 2022",Prob (F-statistic):,0.0
Time:,18:29:06,Log-Likelihood:,-73048.0
No. Observations:,10689,AIC:,146100.0
Df Residuals:,10681,BIC:,146200.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.1816,9.701,0.328,0.743,-15.835,22.198
customer_lifetime_value,-0.0011,0.000,-3.164,0.002,-0.002,-0.000
income,-0.0013,8.78e-05,-14.699,0.000,-0.001,-0.001
monthly_premium_auto,5.4444,0.069,78.758,0.000,5.309,5.580
months_since_last_claim,0.1334,0.216,0.618,0.537,-0.290,0.556
months_since_policy_inception,-0.0771,0.078,-0.990,0.322,-0.230,0.076
number_of_open_complaints,-1.2140,2.391,-0.508,0.612,-5.901,3.473
number_of_policies,0.4745,0.907,0.523,0.601,-1.303,2.252

0,1,2,3
Omnibus:,1222.098,Durbin-Watson:,1.987
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6048.456
Skew:,0.451,Prob(JB):,0.0
Kurtosis:,6.573,Cond. No.,258000.0


In [54]:
model.params[list(np.where(model.pvalues < 0.05)[0])].iloc[0:].index.tolist()
significant_features=x_added_constant[model.params[list(np.where(model.pvalues < 0.05)[0])].iloc[0:].index.tolist()]
significant_features

Unnamed: 0,customer_lifetime_value,income,total_claim_amount
0,4809,48029,292
1,2228,92260,744
2,14947,22139,480
3,22332,49078,484
4,9025,23675,707
...,...,...,...
10684,15563,61541,1214
10685,5259,61146,273
10686,23893,39837,381
10687,11971,64195,618


In [57]:
#Run new model
x_added_constant = x_added_constant.drop(['months_since_last_claim','months_since_policy_inception','number_of_open_complaints','number_of_policies'], axis=1)
model = sm.OLS(y,x_added_constant).fit()
model.summary()

0,1,2,3
Dep. Variable:,total_claim_amount,R-squared:,0.41
Model:,OLS,Adj. R-squared:,0.409
Method:,Least Squares,F-statistic:,2472.0
Date:,"Mon, 28 Nov 2022",Prob (F-statistic):,0.0
Time:,18:30:36,Log-Likelihood:,-73049.0
No. Observations:,10689,AIC:,146100.0
Df Residuals:,10685,BIC:,146100.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.6924,7.758,0.347,0.729,-12.514,17.899
customer_lifetime_value,-0.0011,0.000,-3.142,0.002,-0.002,-0.000
income,-0.0013,8.78e-05,-14.741,0.000,-0.001,-0.001
monthly_premium_auto,5.4425,0.069,78.779,0.000,5.307,5.578

0,1,2,3
Omnibus:,1219.932,Durbin-Watson:,1.986
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6025.546
Skew:,0.451,Prob(JB):,0.0
Kurtosis:,6.566,Cond. No.,206000.0
