In [166]:
import pickle as pk
import pandas as pd
import numpy as np
import datetime

In [167]:
def load_model(filename):
    """Deserialize and return the pickled object stored at ``filename``.

    The file is opened in binary mode and read with ``pickle``. Only use this
    on files you trust: unpickling can execute arbitrary code.
    """
    with open(filename, 'rb') as handle:
        return pk.load(handle)

In [619]:
def get_sub_file(model, df):
    '''
    Create Submission File

    Scores ``df`` with ``model`` and writes a two-column CSV (``ID``,
    ``Response``) under ``Submissions/``. ``Response`` is the second column
    of ``model.predict_proba``.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    df : pandas.DataFrame with an ``ID`` column plus the feature columns;
        every column other than ``ID`` is passed to the model.

    Returns
    -------
    str
        Relative path of the CSV that was written.
    '''
    import os  # local import so the function does not depend on the notebook's import cell

    # Best-effort integer cast: on failure the message is printed and the
    # frame is scored as-is.
    # NOTE(review): this cast truncates fractional feature values (e.g. log10-
    # or scaler-transformed columns) before scoring; confirm the models were
    # trained on identically cast inputs.
    try:
        df = df.astype(int)
    except Exception as err:
        print(err)

    # 7-character suffix built from minutes, seconds and milliseconds.
    # strftime always emits the microsecond field, so the suffix keeps its
    # length even when the clock lands on a whole second.
    stamp = datetime.datetime.now().strftime('%M%S%f')[:7]
    filename = f'Submissions/Submission_{stamp}.csv'
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Score once; only the probabilities are written out.
    features = df.drop('ID', axis=1)
    pred = model.predict_proba(features)[:, 1]

    pd.DataFrame(data={'ID': df.ID, 'Response': pred}).to_csv(filename, index=False)
    return filename

In [169]:
filename = 'model_6xgb.pkl'
model = load_model(f'Models/{filename}')
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=441,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [170]:
seed = 441

In [367]:
# original dataframe
df = pd.read_csv('test.csv')
df.head()

Unnamed: 0,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium
0,50883,C1,156,Owned,Individual,30,30,No,,6.0,3.0,5,11934.0
1,50884,C4,7,Owned,Joint,69,68,Yes,X1,3.0,3.0,18,32204.8
2,50885,C1,564,Rented,Individual,28,28,No,X3,2.0,4.0,17,9240.0
3,50886,C3,1177,Rented,Individual,23,23,No,X3,3.0,3.0,18,9086.0
4,50887,C1,951,Owned,Individual,75,75,No,X3,,,5,22534.0


# <center> VERSION 2 </center>

In [368]:
filename = 'ss_ua.pkl'
ss_ua = load_model(f'Resources/{filename}')


filename = 'ss_la.pkl'
ss_la = load_model(f'Resources/{filename}')


filename = 'ss_pr.pkl'
ss_pr = load_model(f'Resources/{filename}')


In [369]:
# test dataframe
df2 = df.copy()

In [370]:
hi = pd.get_dummies(df2['Health Indicator'].fillna('X0'))
hi

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9
0,1,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
21800,0,1,0,0,0,0,0,0,0,0
21801,0,0,0,0,0,1,0,0,0,0
21802,0,0,0,1,0,0,0,0,0,0
21803,0,0,0,0,1,0,0,0,0,0


In [371]:
df2.drop('Health Indicator', inplace = True, axis=1)

In [372]:
cc = pd.get_dummies(df2.City_Code)
cc.head()

Unnamed: 0,C1,C10,C11,C12,C13,C14,C15,C16,C17,C18,...,C33,C34,C35,C36,C4,C5,C6,C7,C8,C9
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [373]:
df2.drop('City_Code', inplace = True, axis=1)

In [374]:
# Recode the string value '14+' as '15'. The match is literal (regex=False),
# so '+' is never read as a regex quantifier and the result does not depend on
# the pandas version's default for `regex`.
df2.Holding_Policy_Duration = df2.Holding_Policy_Duration.str.replace('14+', '15', regex=False)

# Missing durations become 0.
df2.Holding_Policy_Duration = df2.Holding_Policy_Duration.fillna(0)

# Remaining values are strings such as '6.0', so go through float before int.
df2.Holding_Policy_Duration = df2.Holding_Policy_Duration.astype(float).astype('int64')

In [375]:
df2.Holding_Policy_Type = df2.Holding_Policy_Type.fillna(0).astype('category')

In [376]:
df2.drop('Region_Code', axis=1, inplace=True)

In [377]:
# 0 for 'Rented', 1 for every other value (including missing), stored as a category.
df2.Accomodation_Type = (
    (df2.Accomodation_Type != 'Rented').astype('int64').astype('category')
)

In [378]:
# 0 for 'Individual', 1 for every other value (including missing), stored as a category.
df2.Reco_Insurance_Type = (
    (df2.Reco_Insurance_Type != 'Individual').astype('int64').astype('category')
)

In [379]:
# 0 for 'No', 1 for every other value (including missing), stored as a category.
df2.Is_Spouse = (df2.Is_Spouse != 'No').astype('int64').astype('category')

In [380]:
df2.Reco_Policy_Cat = df2.Reco_Policy_Cat.astype('category')

In [381]:
df2 = pd.concat([df2, hi, cc], axis=1)
df2.head()

Unnamed: 0,ID,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,...,C33,C34,C35,C36,C4,C5,C6,C7,C8,C9
0,50883,1,0,30,30,0,6,3.0,5,11934.0,...,0,0,0,0,0,0,0,0,0,0
1,50884,1,1,69,68,1,3,3.0,18,32204.8,...,0,0,0,0,1,0,0,0,0,0
2,50885,0,0,28,28,0,2,4.0,17,9240.0,...,0,0,0,0,0,0,0,0,0,0
3,50886,0,0,23,23,0,3,3.0,18,9086.0,...,0,0,0,0,0,0,0,0,0,0
4,50887,1,0,75,75,0,0,0.0,5,22534.0,...,0,0,0,0,0,0,0,0,0,0


In [382]:
# Cast every column to integer once (the statement was duplicated).
# Fractional values such as the premium are truncated by this cast.
df2 = df2.astype('int')

In [383]:
df2_cleaned = df2.copy()

In [165]:
# Rescale the age and premium columns with the transformers loaded from
# Resources/ (presumably fitted on the training data — verify).
df2.Upper_Age = ss_ua.transform(df2.Upper_Age.values.reshape(-1, 1))
df2.Lower_Age = ss_la.transform(df2.Lower_Age.values.reshape(-1, 1))
df2.Reco_Policy_Premium = ss_pr.transform(df2.Reco_Policy_Premium.values.reshape(-1, 1))

# get_sub_file takes exactly (model, frame) and reads the IDs from the frame
# itself; df2 still carries its ID column, so the raw `df` is not passed.
# NOTE(review): no model is loaded in this section; confirm `model` is the one
# trained on these scaled features.
get_sub_file(model, df2)

# <center> VERSION 3 </center>

In [797]:
df3 = df2_cleaned.copy()

In [798]:
filename = 'model_31.pkl'
model = load_model(f'Models/{filename}')
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=441,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [799]:
df3.Upper_Age = np.log10(df3.Upper_Age)
df3.Lower_Age = np.log10(df3.Lower_Age)
df3.Reco_Policy_Premium = np.log10(df3.Reco_Policy_Premium)

In [800]:
get_sub_file(model, df3)

'Submissions/Submission_0658694.csv'

# <center> VERSION 4 </center>

In [92]:
df4 = df2_cleaned.copy()

In [93]:
def bin_age(x):
    """Map an age to an ordinal band 1-4.

    Bands: 0-30 -> 1, (30, 48] -> 2, (48, 65] -> 3, above 65 -> 4.
    Anything that matches no band (negative values, NaN) also yields 4.
    """
    if x > 65:
        return 4
    if x > 48:
        return 3
    if x > 30:
        return 2
    if x >= 0:
        return 1
    # Negative or non-comparable input falls through to the top band.
    return 4

In [94]:
df4.Upper_Age = df4.Upper_Age.apply(bin_age)

In [95]:
df4.Lower_Age = df4.Lower_Age.apply(bin_age)

In [96]:
def bin_premium(x):
    """Map a premium amount to an ordinal band 1-4.

    A value gets the 1-based position of the first cut point it is strictly
    below; values at or above the last cut point (and NaN) get 4.
    """
    upper_bounds = (8022.816674, 12689.950125, 17681.921997)
    for band, bound in enumerate(upper_bounds, start=1):
        if x < bound:
            return band
    return len(upper_bounds) + 1
    

In [97]:
df4.Reco_Policy_Premium.isna().sum()

0

In [98]:
df4.Reco_Policy_Premium = df4.Reco_Policy_Premium.apply(bin_premium)

In [99]:
df4.Reco_Policy_Premium.isna().sum()

0

# NOTE(review): no model is loaded in this section, so `model` is whatever an
# earlier cell last assigned — confirm it was trained on these binned features.
get_sub_file(model, df4)

# <center> VERSION 5 </center>

In [393]:
df5 = df2_cleaned.copy()

In [394]:
filename = 'pca5.pkl'
pca = load_model(f'Resources/{filename}')

In [395]:
temp = df5[['Upper_Age', 'Lower_Age', 'Holding_Policy_Duration', 'Reco_Policy_Premium']]
temp.head()

Unnamed: 0,Upper_Age,Lower_Age,Holding_Policy_Duration,Reco_Policy_Premium
0,30,30,6,11934
1,69,68,3,32204
2,28,28,2,9240
3,23,23,3,9086
4,75,75,0,22534


In [396]:
ulhr = pca.transform(temp)
ulhr

array([[-2249.99326472],
       [18020.07788272],
       [-4943.99207561],
       ...,
       [ 3652.14764824],
       [-2616.01111325],
       [ 8856.12887507]])

In [397]:
df5 = df5.drop(['Upper_Age', 'Lower_Age', 'Holding_Policy_Duration', 'Reco_Policy_Premium'], axis=1)

In [398]:
df5['ulhr'] = ulhr

In [399]:
filename = 'model_5xgb1.pkl'
model = load_model(f'Models/{filename}')
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=441,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [400]:
df5['ID'] = df2_cleaned.ID

In [404]:
get_sub_file(model, df5)

'Submissions/Submission_4302348.csv'

# <center> VERSION 6 </center>

In [116]:
df6 = df2_cleaned.copy()

In [117]:
filename = 'pca6.pkl'
pca = load_model(f'Resources/{filename}')

In [118]:
temp = df6[['Upper_Age', 'Lower_Age', 'Holding_Policy_Duration', 'Reco_Policy_Premium']]
temp.head()

Unnamed: 0,Upper_Age,Lower_Age,Holding_Policy_Duration,Reco_Policy_Premium
0,30,30,6,11934
1,69,68,3,32204
2,28,28,2,9240
3,23,23,3,9086
4,75,75,0,22534


In [119]:
ulhr = pca.transform(temp)
ulhr

array([[-2260.88643162],
       [18009.18469738],
       [-4954.88523185],
       ...,
       [ 3641.25456851],
       [-2626.90429292],
       [ 8845.23576252]])

In [120]:
df6 = df6.drop(['Upper_Age', 'Lower_Age', 'Holding_Policy_Duration', 'Reco_Policy_Premium'], axis=1)

In [121]:
df6['ulhr'] = ulhr

In [126]:
# Load the Version 6 model explicitly; otherwise a top-to-bottom run scores
# df6 with the model left over from the previous section.
filename = 'model_6xgb.pkl'
model = load_model(f'Models/{filename}')
get_sub_file(model, df6)

'Submissions/Submission_0009635.csv'

# <center> VERSION 7 </center>

In [188]:
df7 = df2_cleaned.copy()

In [189]:
def bin_age(x):
    """Return the ordinal age band (1-4) for ``x``.

    0-30 -> 1, (30, 48] -> 2, (48, 65] -> 3, everything else -> 4.
    An identical definition appears earlier in this notebook; this one
    rebinds the same name.
    """
    if 0 <= x <= 30:
        band = 1
    elif 30 < x <= 48:
        band = 2
    elif 48 < x <= 65:
        band = 3
    else:
        band = 4
    return band

In [190]:
df7.Upper_Age = df7.Upper_Age.apply(bin_age)

In [191]:
df7.Lower_Age = df7.Lower_Age.apply(bin_age)

In [192]:
df7.Reco_Policy_Premium = np.log10(df7.Reco_Policy_Premium)

In [194]:
filename = 'model_7xgb.pkl'
model = load_model(f'Models/{filename}')
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=441,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [195]:
get_sub_file(model, df7)

'Submissions/Submission_2020927.csv'

# <center>VERSION 9</center>

In [196]:
df9 = df2_cleaned.copy()

In [197]:
df9 = df9.astype(int)

In [198]:
df9['rented_indiv_nospouse_orother'] = (df9.Accomodation_Type+df9.Reco_Insurance_Type - df9.Is_Spouse)

In [199]:
# 1 when the two age bounds are equal, else 0. Vectorized comparison instead
# of a per-row Python lambda.
df9['is_single_person'] = (df9.Upper_Age == df9.Lower_Age).astype('int64')

In [203]:
filename = 'mm_ua.pkl'
mm_ua = load_model(f'Resources/{filename}')


filename = 'mm_la.pkl'
mm_la = load_model(f'Resources/{filename}')

In [204]:
df9.Upper_Age = mm_ua.transform(df9.Upper_Age.values.reshape(-1, 1))
df9.Lower_Age = mm_la.transform(df9.Lower_Age.values.reshape(-1, 1))

In [205]:
filename = 'model_9.pkl'
model = load_model(f'Models/{filename}')
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=441,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [207]:
get_sub_file(model, df9)

'Submissions/Submission_4807635.csv'

# <center>VERSION 10</center>

In [209]:
df10 = df9.copy()

In [213]:
filename = 'pca.pkl'
pca = load_model(f'Resources/{filename}')

In [216]:
temp = pca.transform(df10.drop('ID', axis=1))

In [211]:
filename = 'model_10.pkl'
model = load_model(f'Models/{filename}')
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=441,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [218]:
# 7-character suffix built from minutes, seconds and milliseconds. Stored as
# `stamp` rather than `id` so the builtin id() stays usable in later cells.
stamp = datetime.datetime.now().strftime('%M%S%f')[:7]
filename = f'Submissions/Submission_{stamp}.csv'

# NOTE(review): this writes the output of predict(), whereas get_sub_file
# writes predict_proba(...)[:, 1] — confirm which one this submission needs.
pred = model.predict(temp)

# IDs come from the raw test frame; `temp` holds only the PCA-transformed features.
pd.DataFrame(data={'ID':df.ID,'Response':pred}).to_csv(filename, index=False)

# <center>VERSION 11</center>

In [721]:
df11 = df.copy()

In [722]:
df11.drop('Region_Code', axis=1, inplace=True)

In [723]:
df11.head()

Unnamed: 0,ID,City_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium
0,50883,C1,Owned,Individual,30,30,No,,6.0,3.0,5,11934.0
1,50884,C4,Owned,Joint,69,68,Yes,X1,3.0,3.0,18,32204.8
2,50885,C1,Rented,Individual,28,28,No,X3,2.0,4.0,17,9240.0
3,50886,C3,Rented,Individual,23,23,No,X3,3.0,3.0,18,9086.0
4,50887,C1,Owned,Individual,75,75,No,X3,,,5,22534.0


In [724]:
# Maps a group label ('1'..'6') to the list of city codes in that group.
# set_city_value() reads this table. How the groups were chosen is not shown
# in this notebook.
city_value = {
    '1':['C29', 'C35', 'C34', 'C32', 'C20', 'C4'],
    '2':['C23', 'C25', 'C33', 'C16','C6', 'C13'], 
    '3':['C9', 'C14', 'C28', 'C27', 'C18', 'C10'],
    '4':['C5', 'C11', 'C3','C17', 'C22', 'C31'],
    '5':['C8', 'C12', 'C21', 'C7', 'C19', 'C2'],
    '6':['C15','C24', 'C26', 'C36', 'C1', 'C30'],
}

In [725]:
def set_city_value(x):
    """Return the group number (1-6) whose ``city_value`` list contains ``x``.

    Groups are checked in ascending order; ``None`` is returned when ``x``
    is in none of them.
    """
    return next((group for group in range(1, 7) if x in city_value[str(group)]), None)

In [726]:
df11.City_Code = df11.City_Code.apply(set_city_value)

In [727]:
df11.Holding_Policy_Duration.unique()

array(['6.0', '3.0', '2.0', nan, '14+', '5.0', '1.0', '4.0', '12.0',
       '11.0', '7.0', '9.0', '13.0', '8.0', '14.0', '10.0'], dtype=object)

In [728]:
# Recode the string value '14+' as '15'. The match is literal (regex=False),
# so '+' is never read as a regex quantifier and the result does not depend on
# the pandas version's default for `regex`.
df11.Holding_Policy_Duration = df11.Holding_Policy_Duration.str.replace('14+', '15', regex=False)

# Missing durations become 0.
df11.Holding_Policy_Duration = df11.Holding_Policy_Duration.fillna(0)

# Remaining values are strings such as '6.0', so go through float before int.
df11.Holding_Policy_Duration = df11.Holding_Policy_Duration.astype(float).astype('int64')

In [729]:
df11.Holding_Policy_Type.unique()

array([ 3.,  4., nan,  1.,  2.])

In [730]:
df11.Holding_Policy_Type = df11.Holding_Policy_Type.fillna(0).astype('category')

In [731]:
# Accomodation Type - 

# 0 - Rented
# 1 - Owned

In [732]:
df11.Accomodation_Type = df11.Accomodation_Type.apply(
    lambda x: 0 if (x == 'Rented') else 1).astype('category')

In [733]:
# Reco_Insurance_Type

# 0 - Individual
# 1 - Joint

In [734]:
df11.Reco_Insurance_Type = df11.Reco_Insurance_Type.apply(
    lambda x: 0 if (x == 'Individual') else 1).astype('category')

In [735]:
# Is_Spouse

# 0 - No
# 1 - Yes

In [736]:
df11.Is_Spouse = df11.Is_Spouse.apply(lambda x: 0 if (x == 'No') else 1).astype('category')

In [737]:
df11.Reco_Policy_Cat = df11.Reco_Policy_Cat.astype('category')

In [738]:
df11['Health Indicator'].isna().sum()

5027

In [739]:
df11['Health Indicator'].value_counts()

X1    5614
X2    4516
X3    2846
X4    2442
X5     681
X6     514
X7      96
X8      41
X9      28
Name: Health Indicator, dtype: int64

In [740]:
# Assign the filled column back explicitly. fillna(inplace=True) on a column
# selection acts on an intermediate object and does not reliably update df11
# under pandas copy-on-write.
df11['Health Indicator'] = df11['Health Indicator'].fillna('X0')

In [741]:
df11['Health Indicator'] = df11['Health Indicator'].str[1]

In [742]:
df11.Reco_Policy_Premium = np.log10(df11.Reco_Policy_Premium)

In [743]:
df11 = df11.astype(float)

In [744]:
df11.Holding_Policy_Type = df11.Holding_Policy_Type.astype('int')
df11.Holding_Policy_Type = df11.Holding_Policy_Type.astype('category')

In [745]:
df11.Upper_Age = np.log10(df11.Upper_Age)
df11.Lower_Age = np.log10(df11.Lower_Age)

In [746]:
df11.head()

Unnamed: 0,ID,City_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium
0,50883.0,6.0,1.0,0.0,1.477121,1.477121,0.0,0.0,6.0,3,5.0,4.076786
1,50884.0,1.0,1.0,1.0,1.838849,1.832509,1.0,1.0,3.0,3,18.0,4.507921
2,50885.0,6.0,0.0,0.0,1.447158,1.447158,0.0,3.0,2.0,4,17.0,3.965672
3,50886.0,4.0,0.0,0.0,1.361728,1.361728,0.0,3.0,3.0,3,18.0,3.958373
4,50887.0,6.0,1.0,0.0,1.875061,1.875061,0.0,3.0,0.0,0,5.0,4.352838


In [747]:
df11 = pd.get_dummies(df11)

In [748]:
df11.head()

Unnamed: 0,ID,City_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Reco_Policy_Cat,Reco_Policy_Premium,Holding_Policy_Type_0,Holding_Policy_Type_1,Holding_Policy_Type_2,Holding_Policy_Type_3,Holding_Policy_Type_4
0,50883.0,6.0,1.0,0.0,1.477121,1.477121,0.0,0.0,6.0,5.0,4.076786,0,0,0,1,0
1,50884.0,1.0,1.0,1.0,1.838849,1.832509,1.0,1.0,3.0,18.0,4.507921,0,0,0,1,0
2,50885.0,6.0,0.0,0.0,1.447158,1.447158,0.0,3.0,2.0,17.0,3.965672,0,0,0,0,1
3,50886.0,4.0,0.0,0.0,1.361728,1.361728,0.0,3.0,3.0,18.0,3.958373,0,0,0,1,0
4,50887.0,6.0,1.0,0.0,1.875061,1.875061,0.0,3.0,0.0,5.0,4.352838,1,0,0,0,0


In [749]:
df11 = df11.astype(float)

<br>

In [750]:
# 1 when the two age values are equal, else 0. Vectorized comparison instead
# of a per-row Python lambda.
df11['is_single_person'] = (df11.Upper_Age == df11.Lower_Age).astype('int64')

In [751]:
# City group multiplied by the accommodation flag, computed column-wise.
df11['living_lux'] = df11.City_Code * df11.Accomodation_Type

In [752]:
# Policy category multiplied by the (log10) premium, computed column-wise.
df11['reco_policy_premium'] = df11.Reco_Policy_Cat * df11.Reco_Policy_Premium

In [753]:
df11['sum'] = df11.apply(lambda x: x.sum(), 1)

df11.Reco_Policy_Cat = df11.Reco_Policy_Cat.astype('category')

df11 = pd.get_dummies(df11)

In [754]:
df11

Unnamed: 0,ID,City_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Reco_Policy_Cat,Reco_Policy_Premium,Holding_Policy_Type_0,Holding_Policy_Type_1,Holding_Policy_Type_2,Holding_Policy_Type_3,Holding_Policy_Type_4,is_single_person,living_lux,reco_policy_premium,sum
0,50883.0,6.0,1.0,0.0,1.477121,1.477121,0.0,0.0,6.0,5.0,4.076786,0.0,0.0,0.0,1.0,0.0,1,6.0,20.383930,50936.414959
1,50884.0,1.0,1.0,1.0,1.838849,1.832509,1.0,1.0,3.0,18.0,4.507921,0.0,0.0,0.0,1.0,0.0,0,1.0,81.142571,51001.321850
2,50885.0,6.0,0.0,0.0,1.447158,1.447158,0.0,3.0,2.0,17.0,3.965672,0.0,0.0,0.0,0.0,1.0,1,0.0,67.416424,50989.276412
3,50886.0,4.0,0.0,0.0,1.361728,1.361728,0.0,3.0,3.0,18.0,3.958373,0.0,0.0,0.0,1.0,0.0,1,0.0,71.250709,50993.932538
4,50887.0,6.0,1.0,0.0,1.875061,1.875061,0.0,3.0,0.0,5.0,4.352838,1.0,0.0,0.0,0.0,0.0,1,6.0,21.764191,50939.867152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21800,72683.0,4.0,1.0,0.0,1.653213,1.653213,0.0,1.0,4.0,18.0,4.200960,0.0,1.0,0.0,0.0,0.0,1,4.0,75.617278,72800.124663
21801,72684.0,1.0,1.0,0.0,1.770852,1.770852,0.0,5.0,6.0,18.0,4.330211,0.0,0.0,0.0,1.0,0.0,1,1.0,77.943794,72803.815709
21802,72685.0,5.0,1.0,0.0,1.869232,1.869232,0.0,3.0,0.0,1.0,4.251297,1.0,0.0,0.0,0.0,0.0,1,5.0,4.251297,72714.241058
21803,72686.0,3.0,0.0,0.0,1.397940,1.397940,0.0,4.0,3.0,19.0,4.063258,0.0,1.0,0.0,0.0,0.0,1,0.0,77.201907,72801.061046


# Load the pickled decomposition used for this version.
# NOTE(review): 'pca.pkl' is also loaded in the Version 10 section, where it is
# applied to a different set of columns — confirm the file on disk matches this usage.
filename = 'pca.pkl'
pca = load_model(f'Resources/{filename}')

# Columns handed to the transform; they are replaced by its three outputs.
cols = ['Reco_Policy_Premium', 'reco_policy_premium', 'sum']

# Wrap the transformed array in a frame aligned to df11's index.
tr = pd.DataFrame(pca.transform(df11[cols]), columns=['a', 'b', 'c'], index=df11.index)

# Append the components, then remove the source columns.
df11 = pd.concat([df11, tr], axis=1).drop(cols, axis=1)

In [757]:
filename = 'model_11vclf1.pkl'
model = load_model(f'Models/{filename}')
model

VotingClassifier(estimators=[('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=441, reg_alpha=None,
  

In [758]:
get_sub_file(model, df11)

'Submissions/Submission_5313307.csv'

# <center>VERSION 12</center>

In [801]:
df12 = df.copy()

In [802]:
df12.drop('Region_Code', axis=1, inplace=True)

In [803]:
df12.head()

Unnamed: 0,ID,City_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium
0,50883,C1,Owned,Individual,30,30,No,,6.0,3.0,5,11934.0
1,50884,C4,Owned,Joint,69,68,Yes,X1,3.0,3.0,18,32204.8
2,50885,C1,Rented,Individual,28,28,No,X3,2.0,4.0,17,9240.0
3,50886,C3,Rented,Individual,23,23,No,X3,3.0,3.0,18,9086.0
4,50887,C1,Owned,Individual,75,75,No,X3,,,5,22534.0


In [804]:
city_value = {
    '1':['C29', 'C35', 'C34', 'C32', 'C20', 'C4'],
    '2':['C23', 'C25', 'C33', 'C16','C6', 'C13'], 
    '3':['C9', 'C14', 'C28', 'C27', 'C18', 'C10'],
    '4':['C5', 'C11', 'C3','C17', 'C22', 'C31'],
    '5':['C8', 'C12', 'C21', 'C7', 'C19', 'C2'],
    '6':['C15','C24', 'C26', 'C36', 'C1', 'C30'],
}

In [805]:
def set_city_value(x):
    """Look up which ``city_value`` group (1-6) lists the city code ``x``.

    Returns the first matching group number, or ``None`` if no group lists
    ``x``. An identical definition appears earlier in this notebook.
    """
    group = 1
    while group < 7:
        if x in city_value[str(group)]:
            return group
        group += 1
    return None

In [806]:
df12.City_Code = df12.City_Code.apply(set_city_value)

In [807]:
df12.Holding_Policy_Duration.unique()

array(['6.0', '3.0', '2.0', nan, '14+', '5.0', '1.0', '4.0', '12.0',
       '11.0', '7.0', '9.0', '13.0', '8.0', '14.0', '10.0'], dtype=object)

In [808]:
# Recode the string value '14+' as '15'. The match is literal (regex=False),
# so '+' is never read as a regex quantifier and the result does not depend on
# the pandas version's default for `regex`.
df12.Holding_Policy_Duration = df12.Holding_Policy_Duration.str.replace('14+', '15', regex=False)

# Missing durations become 0.
df12.Holding_Policy_Duration = df12.Holding_Policy_Duration.fillna(0)

# Remaining values are strings such as '6.0', so go through float before int.
df12.Holding_Policy_Duration = df12.Holding_Policy_Duration.astype(float).astype('int64')

In [809]:
df12.Holding_Policy_Type.unique()

array([ 3.,  4., nan,  1.,  2.])

In [810]:
df12.Holding_Policy_Type = df12.Holding_Policy_Type.fillna(0).astype('category')

In [811]:
# Accomodation Type - 

# 0 - Rented
# 1 - Owned

In [812]:
df12.Accomodation_Type = df12.Accomodation_Type.apply(
    lambda x: 0 if (x == 'Rented') else 1).astype('category')

In [813]:
# Reco_Insurance_Type

# 0 - Individual
# 1 - Joint

In [814]:
df12.Reco_Insurance_Type = df12.Reco_Insurance_Type.apply(
    lambda x: 0 if (x == 'Individual') else 1).astype('category')

In [815]:
# Is_Spouse

# 0 - No
# 1 - Yes

In [816]:
df12.Is_Spouse = df12.Is_Spouse.apply(lambda x: 0 if (x == 'No') else 1).astype('category')

In [817]:
df12.Reco_Policy_Cat = df12.Reco_Policy_Cat.astype('category')

In [818]:
df12['Health Indicator'].isna().sum()

5027

In [819]:
df12['Health Indicator'].value_counts()

X1    5614
X2    4516
X3    2846
X4    2442
X5     681
X6     514
X7      96
X8      41
X9      28
Name: Health Indicator, dtype: int64

In [820]:
# Assign the filled column back explicitly. fillna(inplace=True) on a column
# selection acts on an intermediate object and does not reliably update df12
# under pandas copy-on-write.
df12['Health Indicator'] = df12['Health Indicator'].fillna('X0')

In [821]:
df12['Health Indicator'] = df12['Health Indicator'].str[1]

In [822]:
df12.Reco_Policy_Premium = np.log10(df12.Reco_Policy_Premium)

In [823]:
df12 = df12.astype(float)

In [824]:
df12.Holding_Policy_Type = df12.Holding_Policy_Type.astype('int')
df12.Holding_Policy_Type = df12.Holding_Policy_Type.astype('category')

In [825]:
# df12.Upper_Age = np.log10(df12.Upper_Age)
# df12.Lower_Age = np.log10(df12.Lower_Age)
df12.Upper_Age = df12.Upper_Age.apply(bin_age)
df12.Lower_Age = df12.Lower_Age.apply(bin_age)

In [826]:
df12.head()

Unnamed: 0,ID,City_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium
0,50883.0,6.0,1.0,0.0,1,1,0.0,0.0,6.0,3,5.0,4.076786
1,50884.0,1.0,1.0,1.0,4,4,1.0,1.0,3.0,3,18.0,4.507921
2,50885.0,6.0,0.0,0.0,1,1,0.0,3.0,2.0,4,17.0,3.965672
3,50886.0,4.0,0.0,0.0,1,1,0.0,3.0,3.0,3,18.0,3.958373
4,50887.0,6.0,1.0,0.0,4,4,0.0,3.0,0.0,0,5.0,4.352838


In [827]:
df12 = pd.get_dummies(df12)

In [828]:
df12.head()

Unnamed: 0,ID,City_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Reco_Policy_Cat,Reco_Policy_Premium,Holding_Policy_Type_0,Holding_Policy_Type_1,Holding_Policy_Type_2,Holding_Policy_Type_3,Holding_Policy_Type_4
0,50883.0,6.0,1.0,0.0,1,1,0.0,0.0,6.0,5.0,4.076786,0,0,0,1,0
1,50884.0,1.0,1.0,1.0,4,4,1.0,1.0,3.0,18.0,4.507921,0,0,0,1,0
2,50885.0,6.0,0.0,0.0,1,1,0.0,3.0,2.0,17.0,3.965672,0,0,0,0,1
3,50886.0,4.0,0.0,0.0,1,1,0.0,3.0,3.0,18.0,3.958373,0,0,0,1,0
4,50887.0,6.0,1.0,0.0,4,4,0.0,3.0,0.0,5.0,4.352838,1,0,0,0,0


In [829]:
df12 = df12.astype(float)

<br>

In [830]:
# 1 when the two age values are equal, else 0. Vectorized comparison instead
# of a per-row Python lambda.
# NOTE(review): Upper_Age and Lower_Age already hold bin_age bands at this
# point, so this flags every row whose two ages share a band, not only rows
# with equal ages — confirm this matches how the model was trained.
df12['is_single_person'] = (df12.Upper_Age == df12.Lower_Age).astype('int64')

In [831]:
# City group multiplied by the accommodation flag, computed column-wise.
df12['living_lux'] = df12.City_Code * df12.Accomodation_Type

In [832]:
# Policy category multiplied by the (log10) premium, computed column-wise.
df12['reco_policy_premium'] = df12.Reco_Policy_Cat * df12.Reco_Policy_Premium

In [833]:
df12['sum'] = df12.apply(lambda x: x.sum(), 1)

In [834]:
df12

Unnamed: 0,ID,City_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Reco_Policy_Cat,Reco_Policy_Premium,Holding_Policy_Type_0,Holding_Policy_Type_1,Holding_Policy_Type_2,Holding_Policy_Type_3,Holding_Policy_Type_4,is_single_person,living_lux,reco_policy_premium,sum
0,50883.0,6.0,1.0,0.0,1.0,1.0,0.0,0.0,6.0,5.0,4.076786,0.0,0.0,0.0,1.0,0.0,1,6.0,20.383930,50935.460716
1,50884.0,1.0,1.0,1.0,4.0,4.0,1.0,1.0,3.0,18.0,4.507921,0.0,0.0,0.0,1.0,0.0,1,1.0,81.142571,51006.650492
2,50885.0,6.0,0.0,0.0,1.0,1.0,0.0,3.0,2.0,17.0,3.965672,0.0,0.0,0.0,0.0,1.0,1,0.0,67.416424,50988.382095
3,50886.0,4.0,0.0,0.0,1.0,1.0,0.0,3.0,3.0,18.0,3.958373,0.0,0.0,0.0,1.0,0.0,1,0.0,71.250709,50993.209082
4,50887.0,6.0,1.0,0.0,4.0,4.0,0.0,3.0,0.0,5.0,4.352838,1.0,0.0,0.0,0.0,0.0,1,6.0,21.764191,50944.117030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21800,72683.0,4.0,1.0,0.0,2.0,2.0,0.0,1.0,4.0,18.0,4.200960,0.0,1.0,0.0,0.0,0.0,1,4.0,75.617278,72800.818238
21801,72684.0,1.0,1.0,0.0,3.0,3.0,0.0,5.0,6.0,18.0,4.330211,0.0,0.0,0.0,1.0,0.0,1,1.0,77.943794,72806.274005
21802,72685.0,5.0,1.0,0.0,4.0,4.0,0.0,3.0,0.0,1.0,4.251297,1.0,0.0,0.0,0.0,0.0,1,5.0,4.251297,72718.502595
21803,72686.0,3.0,0.0,0.0,1.0,1.0,0.0,4.0,3.0,19.0,4.063258,0.0,1.0,0.0,0.0,0.0,1,0.0,77.201907,72800.265166


In [835]:
filename = 'model_12xgb.pkl'
model = load_model(f'Models/{filename}')
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=441,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [836]:
get_sub_file(model, df12)

'Submissions/Submission_4121408.csv'