In [1]:
%matplotlib notebook
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import cm
from pandas.plotting import scatter_matrix
from sklearn.linear_model import Ridge

from sklearn import linear_model
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

import xgboost as xgb

from scipy.optimize import fmin_powell
from ml_metrics import quadratic_weighted_kappa
from sklearn.decomposition import PCA

In [3]:
# Widen pandas' display limits so wide/long frames can be inspected in full.
DISPLAY_LIMITS = {"display.max_rows": 150, "display.max_columns": 250}
for option_name, option_value in DISPLAY_LIMITS.items():
    pd.set_option(option_name, option_value)

# Echo the column limit as the cell's displayed value.
pd.get_option("display.max_columns")

250

In [57]:
# Read the train and test datasets from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [60]:
# Feature generation: only this interaction feature improved the predictive capacity of the model.
# Among the feature generations that did NOT help: summing up the variables of a particular "type",
# where variables whose names differ only by the trailing digit are treated as one type,
# e.g. "Medical_History_{any digit from 1 to 41}".
train["BMI_age"] = train["BMI"].mul(train["Ins_Age"])
test["BMI_age"] = test["BMI"].mul(test["Ins_Age"])

In [61]:
# Column names grouped by variable type; the groups are used to slice train and test
# into per-type dataframes. The order within each group is significant (it fixes column order).
CATEGORICAL_COLUMNS = (
    ["Product_Info_{}".format(i) for i in (1, 2, 3, 5, 6, 7)]
    + ["Employment_Info_{}".format(i) for i in (2, 3, 5)]
    + ["InsuredInfo_{}".format(i) for i in range(1, 8)]
    + ["Insurance_History_{}".format(i) for i in (1, 2, 3, 4, 7, 8, 9)]
    + ["Family_Hist_1"]
    # Medical_History_1..41 minus the five columns listed in DISCRETE_COLUMNS.
    + ["Medical_History_{}".format(i) for i in range(1, 42) if i not in (1, 10, 15, 24, 32)]
)
CONTINUOUS_COLUMNS = (
    ["Product_Info_4", "Ins_Age", "Ht", "Wt", "BMI", "BMI_age"]
    + ["Employment_Info_{}".format(i) for i in (1, 4, 6)]
    + ["Insurance_History_5"]
    + ["Family_Hist_{}".format(i) for i in range(2, 6)]
)
DISCRETE_COLUMNS = ["Medical_History_{}".format(i) for i in (1, 10, 15, 24, 32)]
DUMMY_COLUMNS = ["Medical_Keyword_{}".format(keyword_id) for keyword_id in range(1, 49)]

# For each variable type, stack the train rows on top of the test rows so both sets
# go through identical preprocessing. Row labels from both frames are kept as-is.
categorical_data, continuous_data, discrete_data, dummy_data = (
    pd.concat([train[group_columns], test[group_columns]])
    for group_columns in (CATEGORICAL_COLUMNS, CONTINUOUS_COLUMNS, DISCRETE_COLUMNS, DUMMY_COLUMNS)
)

In [62]:
# Feature reduction based on a chi-square test:
# Medical_Keyword_32 and Medical_Keyword_45 are removed from the dummy data.
KEYWORDS_TO_DROP = ["Medical_Keyword_32", "Medical_Keyword_45"]

print("before dropiing:", dummy_data.shape)
dummy_data = dummy_data.drop(KEYWORDS_TO_DROP, axis=1)
print("after dropiing:", dummy_data.shape)

before dropiing: (79146, 48)
after dropiing: (79146, 46)


In [17]:
# Scaling and clipping are not needed for this dataset: the continuous variables are already normalized.

In [63]:
# Min-max scale the discrete data column by column so every column spans 0 to 1.
# The minimum and range are computed over the stacked train+test rows.
discrete_min = discrete_data.min()
discrete_range = discrete_data.max() - discrete_min
scaled_discrete_data = (discrete_data - discrete_min) / discrete_range


In [64]:
# Target variable: the 'Response' column of the training set (one value per training row).
y = train['Response']

In [66]:
# One-hot encode Product_Info_2: one indicator column per distinct value.
product_info_2 = categorical_data.loc[:, "Product_Info_2"]
categorical_data_Product_Info_2 = pd.get_dummies(product_info_2)

In [1]:
# Merge all feature groups (train rows followed by test rows) side by side into one matrix.
# The raw Product_Info_2 column is dropped because its one-hot encoding is included instead.
feature_blocks = [
    categorical_data,
    categorical_data_Product_Info_2,
    scaled_discrete_data,
    continuous_data,
    dummy_data,
]
X = pd.concat(feature_blocks, axis=1).drop("Product_Info_2", axis=1)

In [69]:
# Imputation: only continuous and discrete data had missing values; no categorical
# feature had any. Each missing value is replaced by its column mean, computed over
# all rows of X (train and test together).
column_means = X.mean()
X = X.fillna(column_means)

In [2]:
# Sanity check after imputation: count remaining nulls per column, smallest first.
nulls_per_column = X.isnull().sum()
nulls_per_column.sort_values()

In [71]:
# Split the stacked feature matrix back into train and test parts for ML.
# Train rows come first, and there is exactly one per entry of y.
n_train_rows = len(y)
X_train = X.iloc[:n_train_rows]
X_test = X.iloc[n_train_rows:]

In [72]:
# XGBoost Model: hyper-parameters are collected in one dict, then the classifier is built from it.
# NOTE(review): early_stopping_rounds can only have an effect when validation data is
# supplied at fit time; depending on the installed xgboost version this constructor
# argument may be silently ignored -- confirm against the xgboost release in use.
xgb_params = dict(
    max_depth=3,
    learning_rate=0.2,
    gamma=0.0,
    min_child_weight=1,
    max_delta_step=0.0,
    subsample=1.0,
    colsample_bytree=1.0,
    colsample_bylevel=1.0,
    reg_alpha=0.0,
    reg_lambda=1.0,
    n_estimators=1000,
    silent=0,
    scale_pos_weight=1.0,
    base_score=0.5,
    seed=1337,
    missing=None,
    booster='gbtree',
    objective='multi:softprob',
    early_stopping_rounds=10,
)
clf = xgb.XGBClassifier(**xgb_params)

In [74]:
# Fitting the XGBoost Model
%time clf.fit(X_train,y)

CPU times: user 44min 18s, sys: 21.3 s, total: 44min 39s
Wall time: 47min 3s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
       colsample_bytree=1.0, early_stopping_rounds=10, gamma=0.0,
       learning_rate=0.2, max_delta_step=0.0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=1.0, seed=1337,
       silent=0, subsample=1.0)

In [89]:
# Per-class predicted probabilities for the test rows; the shape is shown as a quick check.
prob_z=clf.predict_proba(X_test)
prob_z.shape

In [75]:
# Predict the target variable (class labels) for the test rows with the fitted XGBoost model.
z = clf.predict(X_test)

In [76]:
# Quick look at the predictions. Only the last expression of a cell is displayed,
# so the shape on the first line is evaluated but not shown.
z.shape
z[:50]

array([7, 8, 6, 8, 8, 8, 8, 8, 7, 8, 8, 8, 4, 8, 6, 8, 7, 8, 8, 2, 8, 8, 8,
       8, 6, 7, 8, 8, 1, 8, 7, 4, 5, 8, 3, 4, 8, 8, 6, 8, 2, 6, 7, 8, 7, 3,
       8, 6, 8, 6])

In [77]:
# Pair each test Id with its XGBoost prediction and write the submission CSV.
submission_columns = {'Id': test['Id'], 'Response': z}
df = pd.DataFrame(submission_columns)
df.to_csv('submit_xgboost_758pm_june6.csv', index=False)

In [79]:
# Ridge regression on cleaned data
model = Ridge(alpha=1).fit(X_train, y)

# The regression output is continuous: round to the nearest integer, clamp the result
# into [1, 8], and store it as int64.
z_ridge = np.clip(np.round(model.predict(X_test)), 1, 8).astype(np.int64)

In [87]:
# Display the fitted Ridge estimator and its parameters.
model

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [80]:
# Pair each test Id with its rounded, clamped Ridge prediction and write the submission CSV.
ridge_submission_columns = {'Id': test['Id'], 'Response': z_ridge}
df = pd.DataFrame(ridge_submission_columns)
df.to_csv('submit_ridge_alpha1_812pm_june6.csv', index=False)  # score:

In [82]:
# Fit a logistic regression with default settings on the final feature matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
lr = linear_model.LogisticRegression().fit(X_train, y)
lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [84]:
# Predict with the logistic regression, plot the distribution of predicted classes,
# and write the submission CSV.
z_lg = lr.predict(X_test)
plt.hist(z_lg)
lg_submission_columns = {'Id': test['Id'], 'Response': z_lg}
df = pd.DataFrame(lg_submission_columns)
df.to_csv('submit_lg_818pm_june6.csv', index=False)  # score: 0.50426 at 9:08 pm on June 4

In [85]:
# Fit a multinomial logistic regression (newton-cg solver, up to 200 iterations)
# on the same training matrix.
mul_lr = linear_model.LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    max_iter=200,
)
mul_lr.fit(X_train, y);



In [86]:
# Predict with the multinomial logistic regression and write its submission CSV.
# Fix: this cell used to call lr.predict, i.e. the plain logistic regression, so the
# "multinomial" submission was a copy of the plain logistic one. It now uses mul_lr.
z_mul_lg = mul_lr.predict(X_test)
df1 = pd.DataFrame(data={'Id' : test['Id'], 'Response' : z_mul_lg})
# NOTE: the previously recorded score (0.50426 at 9:08 pm on June 4) was obtained from
# `lr` predictions; re-score this file now that it holds the multinomial predictions.
df1.to_csv('submit_mul_lg_842pm_june6.csv', index=False)