In [201]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest,f_regression
from scipy.stats import pointbiserialr
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, median_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [202]:
# Load the training CSV and discard the row identifier column in one step.
data = pd.read_csv('data/train_BRCpofr.csv').drop(columns='id')
data.head()

Unnamed: 0,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy,cltv
0,Male,Urban,Bachelor,5L-10L,1,5,5790,More than 1,A,Platinum,64308
1,Male,Rural,High School,5L-10L,0,8,5080,More than 1,A,Platinum,515400
2,Male,Urban,Bachelor,5L-10L,1,8,2599,More than 1,A,Platinum,64212
3,Female,Rural,High School,5L-10L,0,7,0,More than 1,A,Platinum,97920
4,Male,Urban,High School,More than 10L,1,6,3508,More than 1,A,Gold,59736


In [203]:
def preprocess(data):
    """Encode the categorical policy-holder columns as numeric features.

    ``gender``, ``area`` and ``num_policies`` are mapped to 1/0, and
    ``qualification``, ``income``, ``policy`` and ``type_of_policy`` are
    one-hot encoded with the prefixes ``qualified``, ``range``, ``policy``
    and ``premium``. The four one-hot source columns are then dropped.

    The input frame is never modified. The previous implementation called
    ``data[col].replace(..., inplace=True)``, which altered the caller's
    frame as a side effect and relied on a chained in-place update that
    newer pandas versions warn about.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the seven columns named above. All other
        columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining input columns in their original
        order, followed by the dummy columns for qualification, income,
        policy and type_of_policy (in that order).

    Notes
    -----
    The dummy columns are derived from the categories present in ``data``,
    so frames with different category sets produce different columns.
    """
    binary_codes = {
        'gender': {'Male': 1, 'Female': 0},
        'area': {'Urban': 1, 'Rural': 0},
        'num_policies': {'More than 1': 1, '1': 0},
    }
    dummy_prefixes = {
        'qualification': 'qualified',
        'income': 'range',
        'policy': 'policy',
        'type_of_policy': 'premium',
    }

    # Work on a copy so the caller's frame keeps its raw values.
    encoded = data.copy()
    for column, codes in binary_codes.items():
        # Values that are not in the mapping pass through unchanged, which is
        # what the earlier replace() call did as well.
        encoded[column] = encoded[column].map(
            lambda value, codes=codes: codes.get(value, value)
        )

    dummies = [
        pd.get_dummies(encoded[column], prefix=prefix)
        for column, prefix in dummy_prefixes.items()
    ]
    encoded = pd.concat([encoded, *dummies], axis=1)
    return encoded.drop(columns=list(dummy_prefixes))

In [204]:
# Encode the categorical columns of the training frame; the cells below split and fit on the result.
train_data = preprocess(data)

In [205]:
# Target is `cltv`; every other column of the encoded frame is a feature.
y = train_data['cltv']
X = train_data.drop(columns='cltv')

# Hold out 10% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=11,
)

In [206]:
# Fit the scaler on the training rows only, then apply the same scaling to the hold-out rows.
scaler = RobustScaler()

# The raw claim_amount values are read from X (which is never scaled) rather than from
# X_train / X_test themselves, so re-running this cell does not scale already-scaled values.
# assign() builds new frames instead of writing into the frames returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
# Assumes X has a unique index, so that X.loc[...] returns exactly the split's rows in order.
X_train = X_train.assign(
    claim_amount=scaler.fit_transform(X.loc[X_train.index, ['claim_amount']]).ravel()
)
X_test = X_test.assign(
    claim_amount=scaler.transform(X.loc[X_test.index, ['claim_amount']]).ravel()
)

In [207]:
# Seeded (same seed as the train/test split) so that Restart & Run All reproduces the
# same fitted model; the estimator was previously created without a random_state.
model_gdb = GradientBoostingRegressor(random_state=11)
model_gdb.fit(X_train, y_train)

# Hold-out predictions; the cell displays (R^2, MAE, RMSE) on the 10% test split.
yhat = model_gdb.predict(X_test)
r2_score(y_test, yhat), mean_absolute_error(y_test, yhat), np.sqrt(mean_squared_error(y_test, yhat))

(0.16787358451122303, 50684.93292645225, 82606.35354655766)

In [208]:
# Load the test CSV. NOTE: this rebinds `data`, which held the training frame until this cell.
data = pd.read_csv('data/test_koRSKBP.csv')

In [209]:
# Feature frame for the test rows: everything except the identifier, which stays in `data`.
test_data = data.drop(columns='id')

In [210]:
# Apply the same encoding as for the training frame.
# NOTE: this rebinds test_data to its own encoded version, so re-running this cell without
# first re-running the previous one fails: preprocess() reads the 'qualification' column,
# which the encoded frame no longer has.
test_data = preprocess(test_data)

In [211]:
# Scale with the scaler already fitted on the training split (transform only, no refit).
# The raw values are read from `data` (the test frame as loaded from CSV) rather than from
# test_data itself, so re-running this cell does not scale already-scaled values.
# Assumes test_data still has the same rows, in the same order, as `data`.
test_data['claim_amount'] = scaler.transform(data[['claim_amount']]).ravel()
test_data.head()

Unnamed: 0,gender,area,marital_status,vintage,claim_amount,num_policies,qualified_Bachelor,qualified_High School,qualified_Others,range_2L-5L,range_5L-10L,range_<=2L,range_More than 10L,policy_A,policy_B,policy_C,premium_Gold,premium_Platinum,premium_Silver
0,0,0,0,6,-0.529268,1,0,1,0,0,1,0,0,0,1,0,0,0,1
1,0,1,0,4,0.004065,1,0,1,0,1,0,0,0,1,0,0,0,1,0
2,1,0,1,7,-0.314905,1,0,1,0,0,1,0,0,0,1,0,1,0,0
3,0,0,1,2,-1.107588,1,1,0,0,0,0,0,1,0,1,0,0,0,1
4,0,1,0,5,2.702439,1,0,1,0,1,0,0,0,0,1,0,0,0,1


In [212]:
# Predict cltv for the encoded, scaled test rows.
# NOTE: this rebinds `yhat`, which previously held the hold-out predictions.
yhat = model_gdb.predict(test_data)

In [213]:
# Identifiers of the test rows, taken from the frame loaded from the test CSV.
ids = data.loc[:, 'id']
ids

0         89393
1         89394
2         89395
3         89396
4         89397
          ...  
59590    148983
59591    148984
59592    148985
59593    148986
59594    148987
Name: id, Length: 59595, dtype: int64

In [214]:
# Pair each test id with its predicted cltv and write the submission file without the index.
subm = dict(id=ids, cltv=yhat)
result = pd.DataFrame(data=subm)
result.to_csv('submission_a.csv', index=False)