In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import boxcox
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Location of the raw customer-analysis CSV.
# NOTE(review): this is an absolute file URL into one user's Desktop folder, so the
# notebook will not load the data on another machine without editing this value.
DATA_URL = r'file:///Users/kaan/Desktop/Data_Marketing_Customer_Analysis_Round3.csv'
file = pd.read_csv(DATA_URL)

# Keep DataFrame displays short: at most 10 rows are shown before truncation.
pd.set_option("display.max_rows", 10)



In [3]:
# Print a structural summary of the loaded frame: column names, non-null counts and dtypes.
file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10689 entries, 0 to 10688
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   region                         10689 non-null  object
 1   customer_lifetime_value        10689 non-null  int64 
 2   response                       10689 non-null  object
 3   coverage                       10689 non-null  object
 4   education                      10689 non-null  object
 5   effective_to_date              10689 non-null  object
 6   month                          10689 non-null  object
 7   employment_status              10689 non-null  object
 8   gender                         10689 non-null  object
 9   income                         10689 non-null  int64 
 10  location_code                  10689 non-null  object
 11  marital_status                 10689 non-null  object
 12  monthly_premium_auto           10689 non-null  int64 
 13  m

In [5]:
# Keep only the integer-typed columns of the raw frame; the model features and the
# target are picked from this subset in the next cell.
numerics = ['int16', 'int32', 'int64']
numerical = file.select_dtypes(include=numerics)

In [6]:
# Predictor columns used for the regression.
feature_columns = [
    'customer_lifetime_value',
    'income',
    'monthly_premium_auto',
    'months_since_last_claim',
    'months_since_policy_inception',
    'number_of_open_complaints',
    'number_of_policies',
]
X = numerical[feature_columns]

# Regression target.
y = numerical['total_claim_amount']

In [7]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print("X_train", X_train.shape)

X_train (7482, 7)


In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training features.
X_train.describe()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
count,7482.0,7482.0,7482.0,7482.0,7482.0,7482.0,7482.0
mean,8006.80981,51721.031542,93.269981,15.139134,48.277065,0.381182,2.989174
std,6880.901078,24737.594443,34.517788,10.154311,27.875889,0.907636,2.399152
min,1898.0,10074.0,61.0,0.0,0.0,0.0,1.0
25%,4014.0,29194.25,68.0,6.0,25.0,0.0,1.0
50%,5780.5,50397.5,83.0,14.0,48.0,0.0,2.0
75%,8969.0,72020.75,109.0,23.0,71.0,0.0,4.0
max,74228.0,99971.0,297.0,35.0,99.0,5.0,9.0


In [9]:
from sklearn.feature_selection import VarianceThreshold # It only works with numerical features


# VarianceThreshold only works with numerical features, so restrict both splits first.
X_train = X_train.select_dtypes(include=np.number)
X_test = X_test.select_dtypes(include=np.number)

print("Initial number of numerical columns: ", X_train.shape)
print()

# Fit on the training split only: features whose training-set variance is lower than
# the threshold (100 here; the library default is 0) are removed.
selector = VarianceThreshold(100)
selector.fit(X_train)

# Integer positions of the surviving features, then the matching column names.
kept_features_indexes = selector.get_support(indices=True)
kept_features = [X_train.columns[position] for position in kept_features_indexes]

# Apply the fitted selector to both splits and re-wrap the results as DataFrames
# labelled with the surviving column names.
X_train = pd.DataFrame(selector.transform(X_train), columns=kept_features)
X_test = pd.DataFrame(selector.transform(X_test), columns=kept_features)

print("Final number of numerical columns: ", X_train.shape)
print()
X_train

Initial number of numerical columns:  (7482, 7)

Final number of numerical columns:  (7482, 5)



Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception
0,8663,42169,83,18,90
1,4213,12160,109,5,34
2,2359,19864,63,22,96
3,19511,40625,70,28,26
4,3576,24959,89,19,13
...,...,...,...,...,...
7477,7610,98701,94,22,66
7478,35186,86134,98,17,78
7479,4241,19834,64,26,8
7480,12941,77060,106,23,90


In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE  ## recursive feature elemination technique

# Re-split the full feature matrix 80/20; this replaces the splits made in earlier cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# RFE with a linear model needs numeric input.
X_train = X_train.select_dtypes(include=np.number)
X_test = X_test.select_dtypes(include=np.number)

# Quick filtering: drop every column with at least one missing value in the training
# split. Too drastic, but done on purpose for speed (don't do this in production!).
# The same columns are removed from the test split so both keep identical columns.
cols_to_drop = X_train.columns[X_train.isna().any()]
X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)

lm = LinearRegression()

# Recursive feature elimination. The requested count (8) is larger than the number of
# candidate columns, in which case RFE eliminates nothing (recent scikit-learn versions
# warn about this). Cap the request at the available column count so the call is
# always valid; lower N_FEATURES_TO_SELECT to make the selection actually prune.
N_FEATURES_TO_SELECT = 8
selector = RFE(
    lm,
    n_features_to_select=min(N_FEATURES_TO_SELECT, X_train.shape[1]),
    step=1,  # number of features removed at each iteration
    verbose=1,
)
selector.fit(X_train, y_train)

# Names of the columns RFE kept (boolean support mask applied to the column index).
kept_features = list(X_train.columns[selector.get_support()])

# Apply the fitted selector to both splits and re-wrap the results as DataFrames.
X_train = pd.DataFrame(selector.transform(X_train), columns=kept_features)
X_test = pd.DataFrame(selector.transform(X_test), columns=kept_features)

print("Final selected features: ")
display(X_train)

Final selected features: 


Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,21423,22379,65,9,31,0,2
1,8391,40211,106,5,98,2,6
2,3969,49544,101,3,29,0,1
3,14914,45963,63,3,73,2,2
4,18060,57882,115,1,61,0,2
...,...,...,...,...,...,...,...
8546,7610,98701,94,22,66,0,3
8547,35186,86134,98,17,78,0,2
8548,4241,19834,64,26,8,4,8
8549,12941,77060,106,23,90,0,2


In [31]:
import numpy as np
from sklearn.impute import SimpleImputer
# Mean-impute missing values. The imputer learns the column means from the training
# split only and re-uses them for the test split; re-fitting on the test data would
# impute it with its own statistics and leak test information into the evaluation.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train = imp_mean.fit_transform(X_train)
X_test = imp_mean.transform(X_test)

In [32]:
# Baseline: plain linear regression, scored on the training and the test split.
model = LinearRegression()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"{type(model).__name__}: Train -> {train_score}, Test -> {test_score}")

LinearRegression: Train -> 0.411514336844104, Test -> 0.40127309153524193


In [36]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
# Lasso regression with alpha=1, scored on the training and the test split.
model = Lasso(alpha=1)
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"{type(model).__name__}: Train -> {train_score}, Test -> {test_score}")

Lasso: Train -> 0.41151124402439143, Test -> 0.40137486012323476


In [37]:
# Ridge regression with alpha=25, scored on the training and the test split.
model = Ridge(alpha=25)
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"{type(model).__name__}: Train -> {train_score}, Test -> {test_score}")

Ridge: Train -> 0.4115143368110773, Test -> 0.4012734914045598


In [16]:
# ElasticNet regression with alpha=0.9, scored on the training and the test split.
model = ElasticNet(alpha=0.9)
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"{type(model).__name__}: Train -> {train_score}, Test -> {test_score}")

ElasticNet: Train -> 0.41151184551775066, Test -> 0.4013745425889189


In [29]:
def fit_regularized_model(model_type, X, y, alpha=1.0, **kwargs):
    """Build and fit a regularized linear regression model.

    Parameters
    ----------
    model_type : str
        Which estimator to build: ``'lasso'``, ``'ridge'`` or ``'elasticnet'``.
    X : array-like
        Training features, passed straight to the estimator's ``fit``.
    y : array-like
        Training target, passed straight to the estimator's ``fit``.
    alpha : float, default 1.0
        Regularization strength forwarded to the estimator.
    **kwargs
        Any further keyword arguments forwarded to the estimator constructor.

    Returns
    -------
    The fitted estimator.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names.
    """
    if model_type == 'lasso':
        model = Lasso(alpha=alpha, **kwargs)
    elif model_type == 'ridge':
        model = Ridge(alpha=alpha, **kwargs)
    elif model_type == 'elasticnet':
        model = ElasticNet(alpha=alpha, **kwargs)
    else:
        # Name the offending value so a typo in the caller is easy to spot.
        raise ValueError(
            f"Unknown model_type {model_type!r}; "
            "expected 'lasso', 'ridge' or 'elasticnet'."
        )

    model.fit(X, y)
    return model