In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
%cd /kaggle/input/prudential-life-insurance-assessment
!cp train.csv.zip /kaggle
!cp test.csv.zip /kaggle
%cd /kaggle
!ls
!unzip train.csv.zip
!unzip test.csv.zip

In [3]:
# Load the extracted train and test CSVs (paths are relative to the current
# working directory) and preview the first rows of the training frame.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
train_df.head()

In [4]:
# Inspect the training data: print its shape, then the row and column counts of
# train_df and the row count of test_df.
# Note: the second number is the total column count of train_df, i.e. it counts
# every column of the frame, not only model features.

n_train_rows, n_train_cols = train_df.shape
print(train_df.shape)
print("我們共有 {} 筆訓練資料，包含 {} 特徵".format(n_train_rows, n_train_cols))
print("測試資料有 {} 筆".format(len(test_df)))

In [5]:
# Summary statistics of the numeric columns of the training frame as loaded
# (before any cleaning done in later cells).
train_df.describe()

In [6]:
import matplotlib.pyplot as plt
import plotly.express as px

# Most features are numerical. The target, Response, has 8 levels; look at its
# distribution first.

# Centre exactly one bin on each level. With matplotlib's default of 10
# equal-width bins, 8 distinct levels leave some bins empty and the chart shows
# misleading gaps.
# Assumes Response holds integer-valued levels - TODO confirm.
response = train_df["Response"]
level_edges = np.arange(response.min() - 0.5, response.max() + 1.5)
plt.hist(response, bins=level_edges)
plt.xlabel("Response")
plt.ylabel("Number of applicants")
plt.title("Train data Response distribution")

#### The target metric is the quadratic weighted kappa score

In [7]:
# The data describes insurance applicants.
# Reading the Response histogram: applicants with high risk scores are the most
# numerous, followed by low-risk ones; mid-range scores are the least common.
# Next, look at the features listed in the data description.

# Product_Info_1..6: features relating to the applied product; their actual
# content is not known.

# Features that may be of interest: age, BMI, employment history, applicant
# info, insurance history, family history, medical history.

# Box plot of Ins_Age grouped by Response level; the figure-level super-title
# is then blanked out.
train_df.boxplot(column="Ins_Age", by="Response")
plt.suptitle("")

In [8]:
# Reading the age box plot: the lowest-risk Response level shows the widest age
# spread, and age slowly decreases as the risk level rises. Above level 5 both
# the typical age and its spread increase again.

# Now look at BMI: box plot of BMI grouped by Response level, with the
# figure-level super-title blanked out.
train_df.boxplot(column="BMI", by="Response")
plt.suptitle("")

In [9]:
# Next, look at the employment features (noted by the author as categorical
# variables).

# Collect every column whose name contains "Employ" and display those columns.
employ_hist = list(filter(lambda column_name: "Employ" in column_name, train_df.columns))
train_df[employ_hist]

In [23]:
# Data cleaning: count missing values per column, keep only the columns that
# have at least one, and list them from most to fewest missing.
null_counts = train_df.isnull().sum()
is_null = null_counts[null_counts > 0].sort_values(ascending=False)

print(is_null)

In [27]:


# Remove columns where more than half of the rows are missing.
# Uses the per-column null counts in `is_null` computed by the preceding cell.
null_feature = is_null[is_null > len(train_df) // 2].index
# The format string needs two placeholders: the original had only one, so the
# names of the dropped columns were silently left out of the message.
print("Drop {} features: {}".format(len(null_feature), list(null_feature)))
train_df = train_df.drop(columns=null_feature)

# Recompute the null counts on the reduced frame so later cells see the
# up-to-date `is_null`.
print("Null feature after dropping:")
is_null = train_df.isnull().sum()
is_null = is_null[is_null > 0].sort_values(ascending=False)
print(is_null)

print("Left {} features".format(train_df.shape[1]))

In [35]:
# Impute the remaining missing values.
#
# Author's grouping of the affected feature families:
#   Family_Hist:        continuous
#   Insurance history:  categorical (nominal, no order)
#   Employment info:    categorical (no order)
#   Medical history:    categorical (no order)
#
# Simple strategy: continuous columns get the column median, categorical
# columns get the most frequent value.
# NOTE(review): the competition's data description appears to list
# Employment_Info_1/4/6 and Insurance_History_5 as continuous - confirm whether
# "most_frequent" is really the intended strategy for them.
from sklearn.impute import SimpleImputer

num_null_feature = ["Family_Hist_2", "Family_Hist_4"]
cat_null_feature = [
    "Employment_Info_1",
    "Employment_Info_4",
    "Employment_Info_6",
    "Medical_History_1",
    "Insurance_History_5",
]

num_imp = SimpleImputer(missing_values=np.nan, strategy="median")
cat_imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")

# Fit each imputer on its own column group and write the filled values back.
train_df[num_null_feature] = num_imp.fit_transform(train_df[num_null_feature])
train_df[cat_null_feature] = cat_imp.fit_transform(train_df[cat_null_feature])

# Re-check: list any columns that still contain missing values.
print("After impute:")
remaining_nulls = train_df.isnull().sum()
is_null = remaining_nulls[remaining_nulls > 0].sort_values(ascending=False)
print(is_null)

In [36]:
# Summary statistics of train_df at this point in the notebook, i.e. after the
# column dropping and imputation cells above.
train_df.describe()

In [39]:
# Show the dtype of every column of train_df.
train_df.dtypes

In [42]:
# Data preprocessing

from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Make every feature numerical: replace Product_Info_2 with the integer codes
# produced by a LabelEncoder fitted on that column.
# Presumably Product_Info_2 is the only non-numeric column - verify against
# train_df.dtypes.
encoder = LabelEncoder()
train_df["Product_Info_2"] = encoder.fit_transform(train_df["Product_Info_2"])



In [62]:
# Normalization: min-max scale every column except the "Id" identifier and the
# "Response" target.

norm_feature = [col for col in train_df.columns if col not in ("Id", "Response")]

scaler = MinMaxScaler()
# Assign through the DataFrame itself. The previous form,
# `train_df[norm_feature].values[:] = ...`, wrote into the array of a temporary
# copy returned by the column selection, so train_df was never actually scaled
# (and it raises on a read-only array when pandas copy-on-write is enabled).
train_df[norm_feature] = scaler.fit_transform(train_df[norm_feature])
train_df.describe()

In [68]:
# Show the column labels selected by norm_feature (the columns to be scaled).
train_df[norm_feature].columns

In [70]:
# Check the scaling on two columns only.
# Work on an explicit copy so train_df is not touched, and print the summary
# before scaling for comparison.
sub_df = train_df[["Product_Info_2", "Product_Info_6"]].copy()
print(sub_df.describe())

# Build a new frame from the scaled array instead of writing into
# `sub_df.values[:]`, which does not reliably update the DataFrame.
# A throwaway MinMaxScaler is used so the shared `scaler` is not re-fitted on
# just these two columns.
sub_df = pd.DataFrame(
    MinMaxScaler().fit_transform(sub_df),
    columns=sub_df.columns,
    index=sub_df.index,
)
sub_df.describe()

In [14]:
# The meaning of the employment features is unclear, so move straight on to
# modelling.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# SVR lives in sklearn.svm; importing it from sklearn.linear_model raises
# ImportError and stops the whole cell.
from sklearn.svm import SVR


# Split features / target: "Response" is the target, "Id" is dropped from the
# feature matrix.
y = train_df["Response"]
x = train_df.drop(["Id", "Response"], axis=1)

# Hold out 20% of the rows for validation; a fixed random_state makes the split
# reproducible across kernel restarts.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# class_weight="balanced" re-weights classes inversely to their frequency.
# max_iter is raised from the default because the solver may not converge
# within the default iteration budget on this many features - tune if needed.
logit = LogisticRegression(class_weight="balanced", max_iter=1000)
logit.fit(x_train, y_train)

