In [4]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.decomposition import PCA, NMF
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from IPython.display import display, HTML
from tqdm import tqdm_notebook as tqdm
import pickle

# Raise pandas' display limits so wide/long frames are rendered without
# truncation (up to 999 columns and 999 rows).
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)

In [6]:
# Read the raw policy and claim tables plus the train/test policy lists.
df_policy = pd.read_csv("policy_claim/policy_0702.csv")
df_claim = pd.read_csv("policy_claim/claim_0702.csv")
df_train = pd.read_csv("training-set.csv")

# The test frame gets a placeholder Next_Premium column filled with 0
# (presumably so it carries the same columns as the training frame).
df_test = pd.read_csv("testing-set.csv").assign(Next_Premium=0)

In [7]:
# Load the pickled row labels that define the train / validation split of
# df_train. Context managers guarantee the file handles are closed even when
# unpickling raises.
# NOTE: pickle.load can execute arbitrary code -- only load pickles that were
# produced by a trusted source.
with open('myself_train_index.pickle', 'rb') as file:
    myself_train_index = pickle.load(file)

with open('myself_valid_index.pickle', 'rb') as file:
    myself_valid_index = pickle.load(file)

In [8]:
# Select the training and validation rows of df_train by label, using the
# indices loaded from the pickles. `.copy()` yields independent frames, so
# later assignments on the splits do not write back into df_train.
myself_train_df = df_train.loc[myself_train_index].copy()
myself_valid_df = df_train.loc[myself_valid_index].copy()

In [9]:
# Feature tables stored under output/ (presumably written by an earlier
# preprocessing step -- not shown in this notebook).
insurance_coverages_df = pd.read_csv("output/insurance_coverages_df.csv")
df_policy_1 = pd.read_csv("output/policy_df_0804.csv")

In [10]:
# Replace every missing value in the coverage table with 0.
insurance_coverages_df = insurance_coverages_df.fillna(value=0)

In [11]:
# Fit a min-max scaler on every coverage column except the Policy_Number key.
# NOTE(review): despite its name this is a MinMaxScaler, not a StandardScaler,
# and the fitted scaler is not applied anywhere in the visible cells.
insurance_coverages_standardscaler = MinMaxScaler().fit(
    insurance_coverages_df.drop(columns=['Policy_Number'])
)

In [12]:
# Same data as insurance_coverages_df, with the Policy_Number key moved to the
# last column. The values are left unscaled.
insurance_coverages_df_1 = (
    insurance_coverages_df
    .drop(columns=['Policy_Number'])
    .assign(Policy_Number=insurance_coverages_df['Policy_Number'])
)

In [13]:
# Row/column count of the policy feature table as loaded.
df_policy_1.shape

(351273, 106)

In [14]:
# Remove the Main_Insurance_Coverage_Group_1 column from the policy features.
# Re-running this cell raises KeyError because the column is already gone.
df_policy_1 = df_policy_1.drop(columns=['Main_Insurance_Coverage_Group_1'])

In [15]:
# Derive the numeric column `ibirth_1` from the text column `ibirth`: each
# value is split on '/' and its second piece is converted to float. Rows where
# `ibirth` is missing keep NaN.
# (presumably `ibirth` is a month/year string, making this the year -- confirm)
#
# `np.nan` is used instead of `np.NaN`: the capitalised alias was removed in
# NumPy 2.0. A NaN-filled column is already float64, so no extra cast is needed.
has_ibirth = df_policy_1['ibirth'].notnull()
df_policy_1['ibirth_1'] = np.nan
df_policy_1.loc[has_ibirth, 'ibirth_1'] = (
    df_policy_1.loc[has_ibirth, 'ibirth'].str.split('/').str.get(1).astype(float)
)

In [16]:
# Derive the numeric column `dbirth_1` from the text column `dbirth`: split on
# '/', take the second piece, convert to float. Rows where `dbirth` is missing
# keep NaN.
has_dbirth = df_policy_1['dbirth'].notnull()
df_policy_1['dbirth_1'] = float('nan')
df_policy_1.loc[has_dbirth, 'dbirth_1'] = (
    df_policy_1.loc[has_dbirth, 'dbirth'].str.split('/').str.get(1).astype(float)
)

In [17]:
# Insured amount per unit of premium for each policy row.
# NOTE(review): a row with total_Premium == 0 yields inf (or NaN for 0/0)
# here -- confirm whether such rows exist.
df_policy_1['ratio_insured_premium'] = df_policy_1['total_all_insured_amount'].div(
    df_policy_1['total_Premium']
)

In [18]:
# A missing claim count is replaced by 0.
df_policy_1['count_Claim'] = df_policy_1['count_Claim'].fillna(value=0)

In [19]:
# Columns that are integer-encoded with LabelEncoder in the next cell
# (label encoding, not one-hot "dummy" columns, despite the name).
dummy_features = [
    # vehicle
    'Vehicle_Make_and_Model1',
    'Vehicle_Make_and_Model2',
    'Imported_or_Domestic_Car',
    'Coding_of_Vehicle_Branding_&_Type',
    # coverage
    'Main_Insurance_Coverage_Group',
    'Insurance_Coverage',
    'Coverage_Deductible_if_applied',
    # channel / insured party / area
    'Distribution_Channel',
    'fassured',
    'fsex',
    'fmarriage',
    'aassured_zip',
    'iply_area',
    # equipment flags
    'fequipment1',
    'fequipment2',
    'fequipment3',
    'fequipment4',
    'fequipment5',
    'fequipment6',
    'fequipment9',
]

In [20]:
# Replace each listed column with integer codes from a fresh LabelEncoder.
# Values are cast to str first, so missing values become the literal string
# 'nan' and receive their own code.
for dummy_feature in dummy_features:
    le = preprocessing.LabelEncoder()
    column_as_text = df_policy_1[dummy_feature].astype(str)
    df_policy_1[dummy_feature] = le.fit_transform(column_as_text)

In [21]:
# Row/column count of the policy feature table after the feature edits above.
df_policy_1.shape

(351273, 106)

In [22]:
# Summary of index, column count, dtypes and memory usage of the policy table.
df_policy_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351273 entries, 0 to 351272
Columns: 106 entries, Policy_Number to ratio_insured_premium
dtypes: float64(48), int64(49), object(9)
memory usage: 284.1+ MB


In [28]:
def _with_policy_features(base_df):
    """Inner-join `base_df` with the policy features, then with the coverage
    features, both on the Policy_Number key, and return the merged frame."""
    merged = pd.merge(base_df, df_policy_1, on="Policy_Number", how="inner")
    return pd.merge(merged, insurance_coverages_df_1, on="Policy_Number", how="inner")


# The train/validation names are overwritten with their merged versions, so
# re-running this cell would merge the already-merged frames a second time.
myself_train_df = _with_policy_features(myself_train_df)
myself_valid_df = _with_policy_features(myself_valid_df)

# The test frame is merged into a new name; df_test itself is left untouched.
df_test_1 = _with_policy_features(df_test)

In [24]:
# Row/column count of the training frame at the time this cell is executed.
myself_train_df.shape

(168610, 2)

In [25]:
# Columns removed from the train / validation / test frames before modelling.
drop_features = [
    'Policy_Number',
    "Insured's_ID",
    "Prior_Policy_Number",
    "Cancellation",
    'Vehicle_identifier',
    'fpt',
    'ibirth',
    'dbirth',
    'nequipment9',
]

In [29]:
# Strip the columns listed in drop_features from all three frames.
myself_train_df_1 = myself_train_df.drop(columns=drop_features)
myself_valid_df_1 = myself_valid_df.drop(columns=drop_features)

# df_test_1 is overwritten in place, so re-running this cell raises KeyError
# (the columns are already gone).
df_test_1 = df_test_1.drop(columns=drop_features)

In [30]:
# Print the (rows, columns) of the train split, then of the validation split.
for frame in (myself_train_df_1, myself_valid_df_1):
    print(frame.shape)

(168610, 518)
(42153, 518)


In [31]:
import lightgbm as lgb

In [32]:
# --- Validation run: fit on the train split, early-stop on the validation split ---
#
# The regression target is the ratio Next_Premium / total_Premium rather than
# the raw premium; total_Premium itself stays among the input features.
# NOTE(review): a row with total_Premium == 0 would produce an inf/NaN label --
# confirm that no such rows exist in the training data.
myself_train_df_y = myself_train_df_1['Next_Premium'] / myself_train_df_1['total_Premium']
myself_train_df_x = myself_train_df_1.drop('Next_Premium', axis=1)

myself_valid_df_y = myself_valid_df_1['Next_Premium'] / myself_valid_df_1['total_Premium']
myself_valid_df_x = myself_valid_df_1.drop('Next_Premium', axis=1)

test_x = df_test_1.drop('Next_Premium', axis=1)

train_data = lgb.Dataset(myself_train_df_x, label=myself_train_df_y)
valid_data = lgb.Dataset(myself_valid_df_x, label=myself_valid_df_y)

param_ = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'learning_rate': 0.015,
    'num_leaves': 128,
    'n_jobs': 4,
    'max_depth': 40,
    'feature_fraction': 0.6,
    'lambda_l1': 0.15,
    'lambda_l2': 0.0,
    'min_data_in_leaf': 70,
    'seed': 42,
    'bagging_fraction': 1,
    'metric': 'l1',  # alias for MAE
}

# The `early_stopping_rounds=` and `verbose_eval=` keyword arguments were
# removed from lgb.train() in LightGBM 4.0; the callbacks below request the
# same behaviour (stop after 40 rounds without improvement, log every 10
# rounds). `log_evaluation` was called `print_evaluation` in older releases,
# so fall back to that name when needed.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

lgbm = lgb.train(
    param_,
    train_data,
    num_boost_round=20000,  # upper bound; early stopping ends training sooner
    valid_sets=[valid_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=40),
        _log_evaluation(period=10),
    ],
)

# Predict the premium ratio for the test rows.
predictions_lgbm_prob = lgbm.predict(test_x)

Training until validation scores don't improve for 40 rounds.
[10]	valid_0's l1: 0.338704
[20]	valid_0's l1: 0.330219
[30]	valid_0's l1: 0.323119
[40]	valid_0's l1: 0.317124
[50]	valid_0's l1: 0.312023
[60]	valid_0's l1: 0.307718
[70]	valid_0's l1: 0.304087
[80]	valid_0's l1: 0.300939
[90]	valid_0's l1: 0.298256
[100]	valid_0's l1: 0.295959
[110]	valid_0's l1: 0.293936
[120]	valid_0's l1: 0.29217
[130]	valid_0's l1: 0.290705
[140]	valid_0's l1: 0.289364
[150]	valid_0's l1: 0.28816
[160]	valid_0's l1: 0.287126
[170]	valid_0's l1: 0.286303
[180]	valid_0's l1: 0.285514
[190]	valid_0's l1: 0.284824
[200]	valid_0's l1: 0.284255
[210]	valid_0's l1: 0.28375
[220]	valid_0's l1: 0.283315
[230]	valid_0's l1: 0.282926
[240]	valid_0's l1: 0.28253
[250]	valid_0's l1: 0.282195
[260]	valid_0's l1: 0.281915
[270]	valid_0's l1: 0.281666
[280]	valid_0's l1: 0.281467
[290]	valid_0's l1: 0.281263
[300]	valid_0's l1: 0.281041
[310]	valid_0's l1: 0.280893
[320]	valid_0's l1: 0.280733
[330]	valid_0's l1: 0.2

In [33]:
# --- Final run: refit on train + validation rows, then predict the test set ---
#
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# the stacked result (with a fresh 0..n-1 index) is the same.
all_train_df = pd.concat([myself_train_df_1, myself_valid_df_1], ignore_index=True)

# Target is the ratio Next_Premium / total_Premium.
all_train_df_y = all_train_df['Next_Premium'] / all_train_df['total_Premium']
all_train_df_x = all_train_df.drop('Next_Premium', axis=1)

test_x = df_test_1.drop('Next_Premium', axis=1)

train_data = lgb.Dataset(all_train_df_x, label=all_train_df_y)

param_ = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'learning_rate': 0.015,
    'num_leaves': 128,
    'n_jobs': 4,
    'max_depth': 40,
    'feature_fraction': 0.6,
    'lambda_l1': 0.15,
    'lambda_l2': 0.0,
    'min_data_in_leaf': 70,
    'seed': 42,
    'bagging_fraction': 1,
    'metric': 'l1',  # alias for MAE
}

# No validation set is passed here, so there is nothing to log per round; the
# former `verbose_eval=10` argument had no effect and is rejected by
# LightGBM >= 4.0, hence it is omitted.
# NOTE(review): the round count 1350 is hard-coded -- presumably chosen from the
# early-stopped validation run; confirm it still matches after any re-tuning.
lgbm = lgb.train(
    param_,
    train_data,
    num_boost_round=1350,
)

# Predict the premium ratio for the test rows.
predictions_lgbm_prob = lgbm.predict(test_x)

In [34]:
# Number of predictions produced for the test set.
predictions_lgbm_prob.shape

(140510,)

In [35]:
# Assemble the submission frame: one row per test policy, with the predicted
# ratio scaled back to a premium by multiplying with total_Premium and then
# clipped at 0 so no negative premium is submitted.
#
# Policy numbers come from df_test while features/predictions come from the
# inner-merged test frame, so the two must have the same number of rows.
# Fail loudly instead of silently writing misaligned or NaN rows.
n_test_rows = len(df_test)
assert len(test_x) == n_test_rows and len(predictions_lgbm_prob) == n_test_rows, (
    "test rows were dropped or duplicated by the feature merges; "
    "predictions no longer line up with df_test"
)

# Multiply on raw arrays so the result is positional and does not depend on
# pandas index alignment between df_test and test_x.
predicted_premium = predictions_lgbm_prob * test_x['total_Premium'].values

result = pd.DataFrame()
result['Policy_Number'] = df_test['Policy_Number']
result['Next_Premium'] = np.clip(predicted_premium, a_min=0, a_max=None)

In [38]:
# Write the submission file without the DataFrame index column.
result.to_csv(path_or_buf="submit_0810.csv", index=False)

In [37]:
# Preview the first rows only; the full frame has one row per test policy and
# is too large to be useful as cell output.
result.head(10)

Unnamed: 0,Policy_Number,Next_Premium
0,55789b8f86893761c9aa9e7bf17938e737decc68,3.292047e+03
1,b6df13a3384528ba6339c52b4fff7c149de68011,1.823577e+03
2,e112d926103147bcdcb6dab201b736185a3e2520,1.830716e+03
3,aa346fa4b1931d1c7a55f8e1bca40b0927dd65ac,6.352418e+03
4,39c4d5daaa791676ec5559c9066d7e8e8dfc51d7,3.150707e+03
5,8643863c7206bc0229bb70abdd3702ca5a7bf334,1.002685e+04
6,04fe319dc6aabf6fffb70150ba97796d17e10da1,2.119236e+04
7,1c7dc06ae177652f2b7161e05df124893853a184,1.451005e-06
8,90466dc895a97dadbf0b879f455b17689862bc11,3.884720e+03
9,0849e8db810e4bf75b5614dce3ef365d49acc950,9.441986e+03
