# Feature selection

Loading libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

Loading data

In [26]:
data = pd.read_csv("customer_analysis_case_study_all_data.csv").drop(['Unnamed: 0'], axis =1)
data.head()

Unnamed: 0,total claim amount,customer lifetime value,income,monthly premium auto,months since last claim,months since policy inception,number of open complaints,number of policies,state_California,state_Nevada,...,renew offer type_Offer3.1,renew offer type_Offer4.1,sales channel_Branch.1,sales channel_Call Center.1,sales channel_Web.1,vehicle class_Luxury Car.1,vehicle class_Luxury SUV.1,vehicle class_SUV.1,vehicle class_Sports Car.1,vehicle class_Two-Door Car.1
0,292.8,4809.21696,48029.0,61,7.0,52,0.0,9,0,0,...,1,0,0,0,0,0,0,0,0,0
1,744.924331,2228.525238,50414.978,64,3.0,26,0.0,1,1,0,...,0,1,0,1,0,0,0,0,0,0
2,480.0,14947.9173,22139.0,100,34.0,31,0.0,2,0,0,...,1,0,0,1,0,0,0,1,0,0
3,484.013411,22332.43946,49078.0,97,10.0,3,0.0,2,0,0,...,0,0,1,0,0,0,0,0,0,0
4,707.925645,9025.067525,23675.0,117,15.146716,31,0.383734,7,0,0,...,0,0,1,0,0,0,0,0,0,0


Defining X/y

In [27]:
X = data.drop(columns=['total claim amount'], axis = 1)
y = np.log(data['total claim amount'])

Data splitting

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

In [29]:
X_train.describe()

Unnamed: 0,customer lifetime value,income,monthly premium auto,months since last claim,months since policy inception,number of open complaints,number of policies,state_California,state_Nevada,state_Oregon,...,renew offer type_Offer3.1,renew offer type_Offer4.1,sales channel_Branch.1,sales channel_Call Center.1,sales channel_Web.1,vehicle class_Luxury Car.1,vehicle class_Luxury SUV.1,vehicle class_SUV.1,vehicle class_Sports Car.1,vehicle class_Two-Door Car.1
count,8516.0,8516.0,8516.0,8516.0,8516.0,8516.0,8516.0,8516.0,8516.0,8516.0,...,8516.0,8516.0,8516.0,8516.0,8516.0,8516.0,8516.0,8516.0,8516.0,8516.0
mean,8029.325042,50575.558636,93.100517,15.197895,48.155942,0.374919,2.998708,0.382457,0.092062,0.266674,...,0.162283,0.11179,0.280883,0.195632,0.149248,0.017496,0.01914,0.18295,0.05108,0.191052
std,6977.395155,20983.873582,34.105718,9.786156,27.914249,0.86875,2.415817,0.486016,0.28913,0.442247,...,0.368732,0.315126,0.449457,0.39671,0.356354,0.13112,0.137027,0.386648,0.220175,0.393153
min,1898.007675,10074.0,61.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3996.418672,34614.0,68.0,7.0,24.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5757.489397,50414.978,83.0,15.0,48.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8929.743271,62488.0,109.0,23.0,71.0,0.0,4.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,83325.38119,99981.0,298.0,35.0,99.0,5.0,9.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Variance threshold method

Univariate method

In [30]:
from sklearn.feature_selection import VarianceThreshold # It only works with numerical features


X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

#display(X_train)
print("Initial number of numerical columns: ",X_train.shape)
print()


selector = VarianceThreshold(threshold=100) # Default threshold value is 0
# Features with a training-set variance lower than this threshold will be removed.
selector.fit(X_train)

kept_features_indexes = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features_indexes].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final number of numerical columns: ",X_train.shape)
print()
X_train

Initial number of numerical columns:  (8516, 197)

Final number of numerical columns:  (8516, 4)



Unnamed: 0,customer lifetime value,income,monthly premium auto,months since policy inception
0,5040.775434,22350.000,66.0,3.0
1,11624.821490,50414.978,113.0,66.0
2,5398.098108,73775.000,67.0,51.0
3,2514.591960,43860.000,65.0,58.0
4,4762.156975,36529.000,118.0,41.0
...,...,...,...,...
8511,5012.184756,20976.000,129.0,86.0
8512,4034.926461,50414.978,112.0,28.0
8513,6829.413399,50414.978,66.0,73.0
8514,9212.613275,70958.000,115.0,57.0


#### Correlation Matrix

Univariate method

In [31]:
import seaborn as sns
import matplotlib.pyplot as plt

c = abs(data.corr())
#c

#fig, ax = plt.subplots(figsize=(14,14))
#sns.heatmap(c, annot=True);

#c['SalePrice']
c_last = c['total claim amount'].sort_values(ascending=False)
#c_last
c_thr = .3
cols_to_keep = list(c_last[c_last > c_thr].index)[1:] + [list(c_last[c_last > c_thr].index)[0]]
print(cols_to_keep)

data[cols_to_keep]

['monthly premium auto', 'location code_Suburban.1', 'location code_Suburban', 'employmentstatus_Employed', 'employmentstatus_Employed.1', 'employmentstatus_Unemployed.1', 'employmentstatus_Unemployed', 'vehicle class_Luxury Car.1', 'vehicle class_Luxury Car', 'vehicle class_Luxury SUV.1', 'vehicle class_Luxury SUV', 'total claim amount']


Unnamed: 0,monthly premium auto,location code_Suburban.1,location code_Suburban,employmentstatus_Employed,employmentstatus_Employed.1,employmentstatus_Unemployed.1,employmentstatus_Unemployed,vehicle class_Luxury Car.1,vehicle class_Luxury Car,vehicle class_Luxury SUV.1,vehicle class_Luxury SUV,total claim amount
0,61,1,1,1,1,0,0,0,0,0,0,292.800000
1,64,1,1,0,0,1,1,0,0,0,0,744.924331
2,100,1,1,1,1,0,0,0,0,0,0,480.000000
3,97,1,1,1,1,0,0,0,0,0,0,484.013411
4,117,1,1,0,0,0,0,0,0,0,0,707.925645
...,...,...,...,...,...,...,...,...,...,...,...,...
10641,253,1,1,0,0,1,1,1,1,0,0,1214.400000
10642,65,0,0,1,1,0,0,0,0,0,0,273.018929
10643,201,0,0,1,1,0,0,0,0,1,1,381.306996
10644,158,0,0,1,1,0,0,0,0,0,0,618.288849


#### Recursive feature elimination

In [33]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE  ## recursive feature elemination technique

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

#X_train.isna().sum()
nulls = pd.DataFrame(X_train.isna().sum()).reset_index()
#nulls.head()
nulls.columns = ['Column','nas']
#nulls.head()
#nulls[nulls['nas'] > 0].head()
cols_to_drop = nulls[nulls['nas'] > 0]['Column'] # Too drastic, but made on pourpose for quick filtering (don't do this in production!!)

X_train.drop(columns=cols_to_drop, axis=1, inplace = True)
X_test.drop(columns=cols_to_drop, axis=1, inplace = True)

#display(X_train)

lm = LinearRegression()

selector = RFE(lm, n_features_to_select= 50, step = 1, verbose = 1) # Step is how many features to add or drop everytime
selector.fit(X_train, y_train)

kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final selected features: ")
display(X_train)

Fitting estimator with 197 features.
Fitting estimator with 196 features.
Fitting estimator with 195 features.
Fitting estimator with 194 features.
Fitting estimator with 193 features.
Fitting estimator with 192 features.
Fitting estimator with 191 features.
Fitting estimator with 190 features.
Fitting estimator with 189 features.
Fitting estimator with 188 features.
Fitting estimator with 187 features.
Fitting estimator with 186 features.
Fitting estimator with 185 features.
Fitting estimator with 184 features.
Fitting estimator with 183 features.
Fitting estimator with 182 features.
Fitting estimator with 181 features.
Fitting estimator with 180 features.
Fitting estimator with 179 features.
Fitting estimator with 178 features.
Fitting estimator with 177 features.
Fitting estimator with 176 features.
Fitting estimator with 175 features.
Fitting estimator with 174 features.
Fitting estimator with 173 features.
Fitting estimator with 172 features.
Fitting estimator with 171 features.
F

Unnamed: 0,state_California,state_Nevada,state_Washington,coverage_Extended,coverage_Premium,effective to date_1/10/11,effective to date_1/11/11,effective to date_1/12/11,effective to date_1/13/11,effective to date_1/2/11,...,location code_Suburban.1,location code_Urban.1,marital status_Single.1,policy type_Special Auto.1,policy_Corporate L2.1,policy_Personal L1.1,policy_Personal L2.1,policy_Personal L3.1,policy_Special L2.1,policy_Special L3.1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8511,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
8513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8514,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


#### Embedded Methods

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

In [35]:
import numpy as np
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train=imp_mean.fit_transform(X_train)

In [36]:
X_train

array([[1.02135634e+04, 3.97190000e+04, 1.31000000e+02, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [2.61302313e+03, 5.70990000e+04, 6.70000000e+01, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [1.32110109e+04, 7.46560000e+04, 1.11000000e+02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [6.82941340e+03, 5.04149780e+04, 6.60000000e+01, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [9.21261327e+03, 7.09580000e+04, 1.15000000e+02, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [1.64682208e+04, 5.04149780e+04, 6.30000000e+01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

#### OLS

In [37]:
model=LinearRegression()
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

LinearRegression: Train -> 0.73307795290425, Test -> 0.7522676038440762


lasso model

In [38]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
model=Lasso(alpha=0.05)

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Lasso: Train -> 0.6412161691573145, Test -> 0.6662201281754521


Ridge

In [39]:
model=Ridge(alpha=10000)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Ridge: Train -> 0.3576831806358597, Test -> 0.38517736436236316


ElasticNet

In [40]:
model=ElasticNet(alpha=0.1)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

ElasticNet: Train -> 0.5813837370785205, Test -> 0.607302720006617


#### Fitting OLS model

NameError: name 'sm' is not defined

In [None]:
# we need to add this constant value of 1 for the intercepts
model = sm.OLS(y,X_added_constant).fit()
model.summary()