In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import  roc_auc_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from sklearn import linear_model

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission.csv')
train.shape , test.shape

((381109, 12), (127037, 11))

In [3]:
train = train[train.Annual_Premium < 200000]
train['Annual_Premium']=np.log(train['Annual_Premium'])
train = train.sample(frac=1).reset_index(drop=True)
raw_data = train.append(test)
raw_data.index = range(raw_data.shape[0])

In [4]:
def getInfo(dataset):
    info = pd.DataFrame({'Nunuique':dataset.nunique(),'DataType':dataset.dtypes,'NullValues':dataset.isnull().sum()})
    return info

In [5]:
def savePredictions(model,fileName,test_values):
    y_pred = np.array(model.predict_proba(test_values)[:,1])
    sub['Response'] = y_pred
    sub.to_csv(fileName+'.csv',index=False)
    return sub.shape

In [6]:
dataset = raw_data.copy()

In [149]:
#for col in columns_to_be_label_encoded:
#    le = LabelEncoder()
#    dataset[col] = le.fit_transform(dataset[col])
gender_map = {'Male':1,'Female':0}
vehicle_age_gap = {
    '1-2 Year':1,
    '< 1 Year':0,
    '> 2 Years':2}
vehicle_damage_gap ={
    'Yes':1,
    'No':0 
}
dataset['Gender'] = dataset['Gender'].map(gender_map)
dataset['Vehicle_Age'] = dataset['Vehicle_Age'].map(vehicle_age_gap)
dataset['Vehicle_Damage'] = dataset['Vehicle_Damage'].map(vehicle_damage_gap)

In [150]:
##1
#dataset['age_bin'] = pd.cut(dataset["Age"], bins=4, labels=False)
#dataset['vintage_bin'] = pd.cut(dataset["Vintage"], bins=5, labels=False)
#dataset['Annual_Premium_bins'] = pd.cut(dataset["Annual_Premium"], bins=3, labels=False)

Bins_Age = np.linspace(dataset['Age'].min(),dataset['Age'].max(),6)
Bins_Annual_Premium = np.linspace(dataset['Annual_Premium'].min(),dataset['Annual_Premium'].max(),4)
Bins_Age , Bins_Annual_Premium

transform_series = dataset.groupby('Vehicle_Age').size()/len(dataset)
dataset['Vehicle_Age_ratio']= dataset['Vehicle_Age'].apply(lambda x: transform_series[x])

dataset['Experience_level'] =  pd.cut(dataset['Age'] , Bins_Age , labels=['one','two','three','four','five'],include_lowest=True)
dataset['Category_of_Annual_Premium'] =  pd.cut(dataset['Annual_Premium'] , Bins_Annual_Premium , labels=['low','medium','high'],include_lowest=True)


In [151]:
##0.5
dataset['Damage_sums_per_region'] = dataset.groupby(['Region_Code'])['Vehicle_Damage'].transform('sum')
dataset['Mean_premium_per_region'] = dataset.groupby(['Region_Code'])['Annual_Premium'].transform('mean')
dataset['Count_unique_policy_sales_per_region'] = dataset.groupby(['Region_Code'])['Policy_Sales_Channel'].transform('nunique')
dataset['Count_policy_sales_per_region'] = dataset.groupby(['Region_Code'])['Policy_Sales_Channel'].transform('count')
dataset['Mean_vehicle_age_per_region'] = dataset.groupby(['Region_Code'])['Vehicle_Age'].transform('mean')
dataset['Mean_age_per_region'] = dataset.groupby(['Region_Code'])['Age'].transform('mean')
#dataset['Mean_salary_per_region'] = dataset.groupby(['Region_Code'])['Salary'].transform('mean')
dataset['Count_previously_insured_per_region'] = dataset.groupby(['Region_Code'])['Previously_Insured'].transform('sum')
dataset['Mean_vintage_per_region'] = dataset.groupby(['Region_Code'])['Vintage'].transform('mean')
dataset['Max_premimum_per_region'] = dataset.groupby(['Region_Code'])['Annual_Premium'].transform('max')
dataset['Max_premimum_per_region'] = dataset.groupby(['Region_Code'])['Annual_Premium'].transform('min')
dataset["Rank_premium_per_rank"] = dataset.groupby("Region_Code")['Annual_Premium'].rank(method="dense", ascending=True)


In [26]:
dataset['Policy_Sales_Channel'] = dataset.loc[dataset["Policy_Sales_Channel"].value_counts()[dataset["Policy_Sales_Channel"]].values < 200,
                                              "Policy_Sales_Channel"] = 500

In [152]:
##2
dataset['no_of_people_from_regionwise'] = dataset.groupby(['Region_Code'])['id'].transform('nunique')
dataset['no_of_people_in_policy_channel'] = dataset.groupby(['Policy_Sales_Channel'])['id'].transform('count')
dataset['no_of_people_with_same_associatedDays'] = dataset.groupby(['Vintage'])['id'].transform('count')
dataset['no_of_people_with_DL_and_their_VehicleAge'] =dataset.groupby(['Driving_License','Vehicle_Age'])['id'].transform('count')
dataset['no_of_people_with_DL_and_their_VehicleDamage'] =dataset.groupby(['Driving_License','Vehicle_Damage'])['id'].transform('count')
dataset['no_of_people_with_Dl_regionwise'] = dataset.groupby(['Region_Code','Driving_License'])['id'].transform('count')


In [153]:
dataset.drop(['id'],axis=1,inplace=True)

In [111]:
##3
dataset['sum_Lincense_region'] = dataset.groupby(['Region_Code'])['Driving_License'].transform('sum')
dataset['sum_Insurance_region'] = dataset.groupby(['Region_Code'])['Previously_Insured'].transform('sum')
dataset['count_Gender_region'] = dataset.groupby(['Region_Code'])['Gender'].transform('count')
dataset['count_policy_per_region'] = dataset.groupby(['Region_Code'])['Policy_Sales_Channel'].transform('count')
dataset['count_Vechicle_age_region'] = dataset.groupby(['Region_Code'])['Vehicle_Age'].transform('count')
dataset['Count_unique_policy_sales_per_region'] = dataset.groupby(['Region_Code'])['Policy_Sales_Channel'].transform('nunique')
dataset["Rank_premium_per_rank"] = dataset.groupby("Region_Code")['Annual_Premium'].rank(method="dense", ascending=True)


In [None]:
dataset['mean_age_per_region'] = dataset.groupby(['Region_Code'])['Age'].transform('mean')
#dataset['max_age_per_region'] = dataset.groupby(['Region_Code'])['Age'].transform('max')
dataset['std_age_per_region'] = dataset.groupby(['Region_Code'])['Age'].transform('std')

dataset['mean_vehicle_age'] = dataset.groupby(['Vehicle_Age'])['Annual_Premium'].transform('mean')
dataset['std_vehicle_age'] = dataset.groupby(['Vehicle_Age'])['Annual_Premium'].transform('std')
dataset['max_vehicle_age'] = dataset.groupby(['Vehicle_Age'])['Annual_Premium'].transform('max')
dataset['sum_vehicle_age'] = dataset.groupby(['Vehicle_Age'])['Annual_Premium'].transform('sum')

dataset['mean_AP_per_policy'] = dataset.groupby(['Policy_Sales_Channel'])['Annual_Premium'].transform('mean')
#dataset['min_AP_per_policy'] = dataset.groupby(['Policy_Sales_Channel'])['Annual_Premium'].transform('min')
#dataset['max_AP_per_policy'] = dataset.groupby(['Policy_Sales_Channel'])['Annual_Premium'].transform('max')
dataset['sum_AP_per_policy'] = dataset.groupby(['Policy_Sales_Channel'])['Annual_Premium'].transform('sum')

dataset['sum_AP_region'] = dataset.groupby(['Region_Code'])['Annual_Premium'].transform('sum')
dataset['mean_AP_region'] = dataset.groupby(['Region_Code'])['Annual_Premium'].transform('mean')
dataset['std_AP_region'] = dataset.groupby(['Region_Code'])['Annual_Premium'].transform('std')
dataset['max_AP_region'] = dataset.groupby(['Region_Code'])['Annual_Premium'].transform('max')

In [15]:
#dataset.Age.var() , dataset.age_bin.var()
#dataset.Age.apply(lambda x: np.log(1 + x)).var() , dataset.age_bin.apply(lambda x: np.log(1 + x)).var()

(0.14452786602542578, 0.2282684263174849)

In [154]:
X = dataset.iloc[ : train.shape[0], :]
X.drop(['Response'],axis=1,inplace=True)
y = train.Response.values
X_pred = dataset.iloc[train.shape[0]: , :]
X_pred.drop(['Response'],axis=1,inplace=True)
X.shape , y.shape , X_pred.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


((380997, 29), (380997,), (127037, 29))

In [66]:
## 1) UnderSampling  and Oversampling
#from imblearn.under_sampling import NearMiss
#from imblearn.over_sampling import SMOTE 
#nm = NearMiss()
#X , y = nm.fit_sample(X,y)

#sm = SMOTE(random_state = 2) 
#X, y = sm.fit_sample(X,y) 

In [155]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state =207,stratify=y)
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((304797, 29), (76200, 29), (304797,), (76200,))

In [156]:
lg=LGBMClassifier(boosting_type='gbdt',n_estimators=500,depth=10,learning_rate=0.03,objective='binary',metric='auc',is_unbalance=True,
                 colsample_bytree=0.5,reg_lambda=2,reg_alpha=2,random_state=294,n_jobs=-1)

lg.fit(X_train,y_train)
print(roc_auc_score(y_test,lg.predict_proba(X_test)[:,1]))

0.8590621586269899


In [136]:
savePredictions(lg,'LightGBMClassifier',X_pred)

(127037, 2)

In [157]:
df = pd.DataFrame()
df['col'] = X_test.columns
df['imp'] = lg.feature_importances_
df.sort_values('imp',ascending=False).head(30)

Unnamed: 0,col,imp
1,Age,1526
7,Annual_Premium,1498
9,Vintage,1388
25,no_of_people_with_same_associatedDays,1293
8,Policy_Sales_Channel,1253
22,Rank_premium_per_rank,1098
24,no_of_people_in_policy_channel,1094
3,Region_Code,708
14,Mean_premium_per_region,553
17,Mean_vehicle_age_per_region,489


In [101]:
from catboost import CatBoostClassifier
cb = CatBoostClassifier()
cb.fit(X_train,y_train)

In [None]:
#cb.score(X_train,y_train) , cb.score(X_test,y_test)
roc_auc_score(cb.predict_proba(X_test)[:,1],y_test)  , roc_auc_score(cb.predict_proba(X_train)[:,1],y_train)

In [66]:
savePredictions(cb,'CatboostClassifier',X_pred)

(127037, 2)

In [73]:
qw = pd.DataFrame()
qw['columns'] = X_train.columns
qw['importance'] = cb.feature_importances_
qw.sort_values(['importance'],ascending=False).head(30)

Unnamed: 0,columns,importance
4,Previously_Insured,46.607576
1,Age,8.145757
7,Annual_Premium,6.480244
6,Vehicle_Damage,5.348307
9,Vintage,4.385244
20,sum_AP_per_policy,3.84258
10,mean_age_per_region,3.084634
8,Policy_Sales_Channel,2.830498
24,max_AP_region,2.625204
19,max_AP_per_policy,2.589696


## 5) Xgboost

In [27]:
from xgboost import XGBClassifier

In [28]:
xgb = XGBClassifier()

In [29]:
xgb.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [30]:
roc_auc_score(y_test,xgb.predc)

(0.896302692519036, 0.8925239234449761)

In [32]:
savePredictions(xgb,'Xgboost',X_pred)

(127037, 2)