In [1]:
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder , OneHotEncoder
from sklearn.metrics import accuracy_score
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from scipy import stats
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

  import pandas.util.testing as tm


In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission.csv')
train.shape , test.shape

((381109, 12), (127037, 11))

## Important custom functions

In [3]:
def savePredictions(model,fileName,test_values):
    y_pred = model.predict_proba(test_values)[:,1]
    sub['Response'] = y_pred
    sub.to_csv(fileName+'.csv',index=False)
    return sub
scores = [0]

def getInfo(dataset):
    info = pd.DataFrame({'Nunuique':dataset.nunique(),'DataType':dataset.dtypes,'NullValues':dataset.isnull().sum()})
    return info

def preprocess(train):
    sc = StandardScaler()
    train = sc.fit_transform(train)
    return train


## Append train and test

In [4]:
raw_data = train.append(test)
raw_data.index = range(len(raw_data))

In [5]:
dataset = raw_data.copy()

## Mapping 

In [6]:
vehicle_age_map = {"> 2 Years":3,"1-2 Year":2,"< 1 Year":1}
dataset["Vehicle_Age"] = dataset["Vehicle_Age"].map(vehicle_age_map)

## Adding new features and binning

In [7]:
# adding new features and binning

l = []
for i in raw_data['Age']:
    if(i<25):l.append(1)
    elif(i>=25 and i<35):l.append(3)
    elif(i>=35 and i<50):l.append(4)
    elif(i>=50 and i<60):l.append(5)
    else:l.append(2)
dataset['Salary'] = l

Bins_Age = np.linspace(dataset['Age'].min(),dataset['Age'].max(),6)
Bins_Annual_Premium = np.linspace(dataset['Annual_Premium'].min(),dataset['Annual_Premium'].max(),4)
Bins_Vintage = np.linspace(dataset['Vintage'].min(),dataset['Vintage'].max(),4)

dataset['Experience_level'] =  pd.cut(dataset['Age'] , Bins_Age , labels=['one','two','three','four','five'],include_lowest=True)
dataset['Category_of_Annual_Premium'] =  pd.cut(dataset['Annual_Premium'] , Bins_Annual_Premium , labels=['low','medium','high'],include_lowest=True)
dataset['Time_of_Trust'] = pd.cut(dataset['Vintage'] , Bins_Vintage , labels=['short','average','long'],include_lowest=True)

transform_series = dataset.groupby('Vehicle_Age').size()/len(dataset)
dataset['Vehicle_Age_ratio']= dataset['Vehicle_Age'].apply(lambda x: transform_series[x])

transform_series = dataset.groupby('Policy_Sales_Channel').size()/len(dataset)
dataset['Policy_Sales_Channel_ratio']= dataset['Policy_Sales_Channel'].apply(lambda x: transform_series[x])


dataset['Experience_level'] =  pd.cut(dataset['Age'] , Bins_Age , labels=['one','two','three','four','five'],include_lowest=True)
dataset['Category_of_Annual_Premium'] =  pd.cut(dataset['Annual_Premium'] , Bins_Annual_Premium , labels=['low','medium','high'],include_lowest=True)

dataset['Annual_Premium']=np.log(dataset['Annual_Premium'])

In [8]:
#db = dataset.groupby(['Gender','Vehicle_Damage']).size() 
#db2 = dataset.groupby(['Gender']).size()
#dataset['Probab_previously_damage_per_gender'] = dataset['Gender'].apply(lambda x:db[x][1]/db2[x])

## Label Encode the columns

In [8]:
columns_to_be_label_encoded = ['Gender','Vehicle_Damage','Category_of_Annual_Premium','Experience_level','Time_of_Trust']
for col in columns_to_be_label_encoded:
    le = LabelEncoder()
    dataset[col] = le.fit_transform(dataset[col])

## Feature Engineering 

In [24]:
df = dataset.copy()

In [25]:
df['Damage_sums_per_region'] = df.groupby(['Region_Code'])['Vehicle_Damage'].transform('sum')
df['Mean_premium_per_region'] = df.groupby(['Region_Code'])['Annual_Premium'].transform('mean')
df['Count_unique_policy_sales_per_region'] = df.groupby(['Region_Code'])['Policy_Sales_Channel'].transform('nunique')
df['Count_policy_sales_per_region'] = df.groupby(['Region_Code'])['Policy_Sales_Channel'].transform('count')
df['Mean_vehicle_age_per_region'] = df.groupby(['Region_Code'])['Vehicle_Age'].transform('mean')
df['Mean_age_per_region'] = df.groupby(['Region_Code'])['Age'].transform('mean')
df['Mean_salary_per_region'] = df.groupby(['Region_Code'])['Salary'].transform('mean')
df['Count_previously_insured_per_region'] = df.groupby(['Region_Code'])['Previously_Insured'].transform('sum')
df['Mean_vintage_per_region'] = df.groupby(['Region_Code'])['Vintage'].transform('mean')
df['Max_premimum_per_region'] = df.groupby(['Region_Code'])['Annual_Premium'].transform('max')
df['Max_premimum_per_region'] = df.groupby(['Region_Code'])['Annual_Premium'].transform('min')
df["Rank_premium_per_region"] = df.groupby("Region_Code")['Annual_Premium'].rank(method="dense", ascending=True)


In [26]:
df['Mean_cat_premium_per_experience'] = df.groupby(["Experience_level"])['Category_of_Annual_Premium'].transform('mean')
df['Mean_premium_per_experience'] = df.groupby(["Experience_level"])['Annual_Premium'].transform('mean')
df['Mean_salary_per_experience'] = df.groupby(["Experience_level"])['Salary'].transform('mean')
#df['Mean_vehicle_damage_per_experience'] = df.groupby(["Experience_level"])['Vehicle_Damage'].transform('mean')
#df['Mean_vehicle_age_per_experience'] = df.groupby(["Experience_level"])['Vehicle_Age'].transform('mean')

#df['mean_premium_per_vintage'] = df.groupby(['Vintage','Region_Code'])['Annual_Premium'].transform('max')
#df['mean_AP_per_policy'] = df.groupby(['Policy_Sales_Channel'])['Annual_Premium'].transform('mean')
#df['mean_vintage_per_policy'] = df.groupby(['Policy_Sales_Channel'])['Vintage'].transform('mean')

In [12]:
#from sklearn import model_selection
#test_df= df[df['Response'].isnull()==True]
#df = df[df['Response'].isnull()==False]


#df["kfold"] =-1
#df = df.sample(frac=1).reset_index(drop=True)

#kf = model_selection.StratifiedShuffleSplit(n_splits=3,random_state=289)

#for fold , (trn_,val_) in enumerate(kf.split(X=df,y=df.Response)):
    #df.loc[val_,'kfold'] = fold
    

## Dividing into train and test 

In [29]:
test_df.shape

(127037, 32)

In [28]:
train_df = df[df['Response'].isnull()==False]
test_df= df[df['Response'].isnull()==True]
X = train_df.drop(['Response','id'],axis=1)
y = train_df['Response'] 
X_pred =test_df.drop(['Response','id'],axis=1)
X.shape , y.shape , X_pred.shape

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state =2020,stratify=y)
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((323942, 30), (57167, 30), (323942,), (57167,))

## 1) Lightgbm

In [180]:
scores=[]
c = df.drop(['Response','kfold'],axis=1)
cols = list(c.columns)
for i in range(10):
    tra = df[df['kfold']!=i]
    val = df[df['kfold']==i]
    lgb = LGBMClassifier(boosting_type='gbdt',
                     n_estimators=450,depth=10,learning_rate=0.03,
                     objective='binary',metric='auc',is_unbalance=True,
                     colsample_bytree=0.5,reg_lambda=10,
                     reg_alpha=2,random_state=294,n_jobs=-1,
                    )
    
    lgb.fit(tra[cols],tra.Response)
    score = roc_auc_score(val.Response,lgb.predict_proba(val[cols])[:,1])
    scores.append(score)
    print(score)
    

0.8606356951658474
0.8604924274191481
0.8535283869901382
0.8588697660934378
0.8614197619497759
0.857465223430767
0.8563920434701153
0.8624780409044641
0.8552135653542067
0.861906180889106


In [30]:
lgb = LGBMClassifier(boosting_type='gbdt',
                 n_estimators=450,depth=10,learning_rate=0.03,
                 objective='binary',metric='auc',is_unbalance=True,
                 colsample_bytree=0.5,reg_lambda=10,
                 reg_alpha=2,random_state=294,n_jobs=-1,
                )
lgb.fit(X_train,y_train)

LGBMClassifier(colsample_bytree=0.5, depth=10, is_unbalance=True,
               learning_rate=0.03, metric='auc', n_estimators=450,
               objective='binary', random_state=294, reg_alpha=2,
               reg_lambda=10)

In [31]:
score = roc_auc_score(y_test,lgb.predict_proba(X_test)[:,1])
scores.append(score)
print(scores)
score

[0, 0.8611971020549286, 0.8586932775586228]


0.8586932775586228

In [193]:
sub = savePredictions(lgb,'XGBClassifier',test_df[cols])
sub

Unnamed: 0,id,Response
0,381110,0.001286
1,381111,0.325032
2,381112,0.252215
3,381113,0.010388
4,381114,0.001131
...,...,...
127032,508142,0.000861
127033,508143,0.330292
127034,508144,0.000809
127035,508145,0.001156


In [104]:
d = pd.DataFrame()
d['col'] = X_test.columns
d['imp'] = lgb.feature_importances_
d.sort_values('imp',ascending=False).head(45)

Unnamed: 0,col,imp
30,mean_premium_per_vintage,1136
9,Vintage,1119
1,Age,1115
7,Annual_Premium,999
26,Rank_premium_per_region,894
8,Policy_Sales_Channel,829
31,mean_AP_per_policy,772
15,Policy_Sales_Channel_ratio,698
32,mean_vintage_per_policy,587
3,Region_Code,550


In [32]:
## for saving the model

import pickle
pickle_out = open("Classifier.pkl","wb")
pickle.dump(lgb , pickle_out)
pickle_out.close()

In [33]:
X_pred.shape

(127037, 30)

In [34]:
l = pickle.load(open('Classifier.pkl','rb'))

In [35]:
l.predict(X_pred.iloc[1,:].values.reshape(1,30))

array([1.])