<b/> Instructions

- fit the models LinearRegression,Lasso and Ridge and compare the model performances.
- (Optional) Define a function that takes a list of models and trains (and tests) them so we can try a lot of them without repeating code.
- Use feature selection techniques (P-Value, RFE) to select subset of features to train the model with(if necessary).
(optional) Refit the models with the selected features.

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
df = pd.read_csv('Data_Marketing_Customer_Analysis_Round3.csv')

In [39]:
# Dividing numerical and categorical values
numerical = df.select_dtypes(include=np.number)
numerical

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount
0,4809,48029,61,7,52,0,9,292
1,2228,92260,64,3,26,0,1,744
2,14947,22139,100,34,31,0,2,480
3,22332,49078,97,10,3,0,2,484
4,9025,23675,117,33,31,0,7,707
...,...,...,...,...,...,...,...,...
10684,15563,61541,253,12,40,0,7,1214
10685,5259,61146,65,7,68,0,6,273
10686,23893,39837,201,11,63,0,2,381
10687,11971,64195,158,0,27,4,6,618


In [40]:
# It seems there are some NaNs
numerical = numerical.dropna(axis=1)
numerical = numerical.reset_index(col_fill='')

In [41]:
# check for NaN values
print(numerical.isna().sum())

index                            0
customer_lifetime_value          0
income                           0
monthly_premium_auto             0
months_since_last_claim          0
months_since_policy_inception    0
number_of_open_complaints        0
number_of_policies               0
total_claim_amount               0
dtype: int64


In [43]:
# check the data type of every column
numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10689 entries, 0 to 10688
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   index                          10689 non-null  int64
 1   customer_lifetime_value        10689 non-null  int64
 2   income                         10689 non-null  int64
 3   monthly_premium_auto           10689 non-null  int64
 4   months_since_last_claim        10689 non-null  int64
 5   months_since_policy_inception  10689 non-null  int64
 6   number_of_open_complaints      10689 non-null  int64
 7   number_of_policies             10689 non-null  int64
 8   total_claim_amount             10689 non-null  int64
dtypes: int64(9)
memory usage: 751.7 KB


In [23]:
# change data type of columns
numerical = numerical.astype('float32')

In [63]:
# Defining X & Y
X = numerical.drop(columns=["total_claim_amount"])
y = df['total_claim_amount']

In [64]:
# Data Splitting
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

In [65]:
X_train.describe()

Unnamed: 0,index,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
count,8551.0,8551.0,8551.0,8551.0,8551.0,8551.0,8551.0,8551.0
mean,5341.802128,7994.902701,51817.509063,93.295287,15.13554,48.19296,0.375395,2.983511
std,3082.460303,6848.846659,24717.379264,34.575537,10.13316,27.849503,0.899706,2.398456
min,1.0,1898.0,10074.0,61.0,0.0,0.0,0.0,1.0
25%,2665.5,4020.5,29435.0,68.0,6.0,25.0,0.0,1.0
50%,5361.0,5764.0,50446.0,83.0,14.0,48.0,0.0,2.0
75%,8001.0,8964.0,72194.5,109.0,23.0,71.0,0.0,4.0
max,10688.0,74228.0,99981.0,298.0,35.0,99.0,5.0,9.0


### Variance threshold method

Unvariate Method

In [66]:
from sklearn.feature_selection import VarianceThreshold # It only works with numerical features


X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

#display(X_train)
print("Initial number of numerical columns: ",X_train.shape)
print()


selector = VarianceThreshold(100) # Default threshold value is 0
# Features with a training-set variance lower than this threshold will be removed.
selector.fit(X_train)

kept_features_indexes = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features_indexes].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final number of numerical columns: ",X_train.shape)
print()
X_train

Initial number of numerical columns:  (8551, 8)

Final number of numerical columns:  (8551, 6)



Unnamed: 0,index,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception
0,9877,21423,22379,65,9,31
1,10069,8391,40211,106,5,98
2,10317,3969,49544,101,3,29
3,9796,14914,45963,63,3,73
4,8995,18060,57882,115,1,61
...,...,...,...,...,...,...
8546,5734,7610,98701,94,22,66
8547,5191,35186,86134,98,17,78
8548,5390,4241,19834,64,26,8
8549,860,12941,77060,106,23,90


### Recursive feature elimination


we need to elimiante NaNs for that

In [67]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE  ## recursive feature elemination technique

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

#display(X_train)
X_train

Unnamed: 0,index,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
9877,9877,21423,22379,65,9,31,0,2
10069,10069,8391,40211,106,5,98,2,6
10317,10317,3969,49544,101,3,29,0,1
9796,9796,14914,45963,63,3,73,2,2
8995,8995,18060,57882,115,1,61,0,2
...,...,...,...,...,...,...,...,...
5734,5734,7610,98701,94,22,66,0,3
5191,5191,35186,86134,98,17,78,0,2
5390,5390,4241,19834,64,26,8,4,8
860,860,12941,77060,106,23,90,0,2


In [68]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8551 entries, 9877 to 7270
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   index                          8551 non-null   int64
 1   customer_lifetime_value        8551 non-null   int64
 2   income                         8551 non-null   int64
 3   monthly_premium_auto           8551 non-null   int64
 4   months_since_last_claim        8551 non-null   int64
 5   months_since_policy_inception  8551 non-null   int64
 6   number_of_open_complaints      8551 non-null   int64
 7   number_of_policies             8551 non-null   int64
dtypes: int64(8)
memory usage: 601.2 KB


In [69]:
X_train

Unnamed: 0,index,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
9877,9877,21423,22379,65,9,31,0,2
10069,10069,8391,40211,106,5,98,2,6
10317,10317,3969,49544,101,3,29,0,1
9796,9796,14914,45963,63,3,73,2,2
8995,8995,18060,57882,115,1,61,0,2
...,...,...,...,...,...,...,...,...
5734,5734,7610,98701,94,22,66,0,3
5191,5191,35186,86134,98,17,78,0,2
5390,5390,4241,19834,64,26,8,4,8
860,860,12941,77060,106,23,90,0,2


In [70]:
lm = LinearRegression()

selector = RFE(lm, n_features_to_select= 5, step = 1, verbose = 1) # Step is how many features to add or drop everytime
selector.fit(X_train, y_train)

kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final selected features: ")
display(X_train)

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Final selected features: 


Unnamed: 0,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,65,9,31,0,2
1,106,5,98,2,6
2,101,3,29,0,1
3,63,3,73,2,2
4,115,1,61,0,2
...,...,...,...,...,...
8546,94,22,66,0,3
8547,98,17,78,0,2
8548,64,26,8,4,8
8549,106,23,90,0,2
