In [18]:
import pandas as pd
import numpy as np

In [19]:
class DataLoader:
    """
    Thin wrapper that remembers the location of a CSV file and reads it on demand.
    """

    def __init__(self, path):
        # Location handed to pandas when loader() is called.
        self.path = path

    def loader(self):
        """Read the CSV found at ``self.path`` and return it as a DataFrame."""
        return pd.read_csv(self.path)

In [20]:
# Load the train and test splits and stack them (train rows first) so that
# both go through the same preprocessing steps.
train = DataLoader(path='train.csv')
train_data = train.loader()
test = DataLoader(path='test.csv')
test_data = test.loader()
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# ignore_index is left at its default, so each split keeps its own row labels
# exactly as the former append call did.
data = pd.concat([train_data, test_data])

In [21]:
# Preprocess data #

# Replace the categorical Product_Info_2 values with integer codes
data['Product_Info_2'] = data['Product_Info_2'].factorize()[0]

# Remove the id and the response columns in a single pass
data = data.drop(columns=['Id', 'Response'])

# data.to_csv('complete_data.csv')

# feature scaling and standardisation/ normalisation
def feature_scale(df):
    """Standardise ``df`` to zero mean and unit sample standard deviation.

    Each column (or the whole Series) has its mean subtracted and is divided
    by its standard deviation computed with ``ddof=1``.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Numeric data to scale.

    Returns
    -------
    Same type as ``df``
        The centred and scaled values. Missing values stay missing.
    """
    center = df.mean()
    spread = df.std(ddof=1)
    # A zero spread means every observed value equals the mean. Dividing by it
    # would turn the column into NaN/inf, so treat it as 1: the column then
    # scales to all zeros instead.
    if np.isscalar(spread):
        if spread == 0:
            spread = 1
    else:
        spread = spread.mask(spread == 0, 1)
    return (df - center) / spread


# Standardise every remaining column of the combined frame
data = data.pipe(feature_scale)


# Dealing with missing values

def check_missing(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints the summary table and the number of affected columns, then
    returns the table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, with two columns:
        ``counts`` (number of nulls) and ``missing_percent`` (nulls divided
        by the number of rows, i.e. a fraction between 0 and 1, not 0-100).
    """
    null_counts = df.isnull().sum()
    n_rows = len(df)
    # Filter the Series first and only then build the frame, so the new
    # column below is added to a fresh object rather than to a slice of
    # another frame (which raises SettingWithCopyWarning).
    df_missing_data = null_counts[null_counts != 0].to_frame(name='counts')
    df_missing_data = df_missing_data.assign(
        missing_percent=df_missing_data['counts'] / n_rows
    )
    print(df_missing_data)
    print(len(df_missing_data))
    return df_missing_data


# check_missing(data)

# Create list of variable types

# Continuous variables
cont_variable_list = ['Product_Info_4', 'Ins_Age', 'Ht', 'Wt', 'BMI', 'Employment_Info_1', 'Employment_Info_4',
                      'Employment_Info_6', 'Insurance_History_5', 'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4',
                      'Family_Hist_5']

# Discrete variables: five Medical_History columns plus Medical_Keyword_1..48
dis_variable_list = ['Medical_History_1', 'Medical_History_10', 'Medical_History_15', 'Medical_History_24',
                     'Medical_History_32']
dis_variable_list.extend('Medical_Keyword_' + str(i) for i in range(1, 49))

# Categorical variables: every remaining column of `data`.
# The previous test `header in cont_variable_list and dis_variable_list` only
# checked membership in the continuous list (the second operand was just the
# truthiness of a non-empty list), so discrete columns were wrongly kept here.
cat_variable_list = [
    header for header in data.columns
    if header not in cont_variable_list and header not in dis_variable_list
]

# Columns that the missing-value helpers below operate on
missing_list = ['Employment_Info_1', 'Employment_Info_4', 'Employment_Info_6', 'Family_Hist_2', 'Family_Hist_3',
                'Family_Hist_4', 'Family_Hist_5', 'Insurance_History_5', 'Medical_History_1', 'Medical_History_10',
                'Medical_History_15', 'Medical_History_24', 'Medical_History_32']


# Recommended methods: PCA, interpolation, SVD, boosting

class MissingMethod:
    """
    Simple strategies for handling missing values in a DataFrame.

    The frame given to the constructor is stored as ``self.df`` without being
    copied. ``fill_mode`` and ``fill_avg`` assign columns on that same object,
    so the caller's frame is modified. ``drop_response`` and ``drop_col``
    rebind ``self.df`` to a new frame and leave the original untouched.

    The fill methods read the module-level lists ``missing_list``,
    ``dis_variable_list`` and ``cat_variable_list``.
    """

    def __init__(self, data):
        self.df = data

    def drop_response(self):
        """Drop the ``Response`` column and return the resulting frame."""
        # drop(..., inplace=True) returns None; assigning that result used to
        # wipe out self.df. Use the returned copy instead.
        self.df = self.df.drop('Response', axis=1)
        return self.df

    def fill_mode(self):
        """Fill nulls with the column mode for discrete/categorical columns.

        Only columns listed in ``missing_list`` that are also discrete or
        categorical are touched; the first mode is used when there are ties.
        """
        for var in missing_list:
            # Each membership test must be spelled out: `var in a and b`
            # only tests `a` and then the truthiness of `b`.
            if var in dis_variable_list or var in cat_variable_list:
                self.df[var] = self.df[var].fillna(self.df[var].mode()[0])
        return self.df

    def fill_avg(self):
        """Fill nulls in every ``missing_list`` column with that column's mean."""
        for var in missing_list:
            self.df[var] = self.df[var].fillna(self.df[var].mean())
        return self.df

    def drop_col(self):
        """Drop three hard-coded Medical_History columns and return the frame."""
        # Without an axis, drop() looks for these names among the row labels
        # and raises KeyError; they are column names.
        self.df = self.df.drop(columns=['Medical_History_10', 'Medical_History_24',
                                        'Medical_History_32'])
        return self.df


# preprocess = MissingMethod(data).fill_mode()
# preprocess = MissingMethod(data).fill_avg()
# check_missing(preprocess)
# preprocess = MissingMethod(data)

# Use SVD to fill in missing data.
# Please normalise the data before using this function.
# 1. If filling in missing data, please drop the response column.
# 2. If using it to predict the response, please keep the response column.
def fill_svd(df, sv_threshold=30, maxiter=100, tol=0.00001):
    """Fill non-finite entries of a 2-D numeric table by iterated truncated SVD.

    Missing entries start at their column mean. Each pass takes the SVD of the
    current matrix, zeroes every singular value that is ``<= sv_threshold``,
    rebuilds the matrix, and overwrites only the originally missing entries
    with the rebuilt values. Observed entries are never changed.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        2-D numeric data; NaN/inf mark the entries to fill.
    sv_threshold : float, optional
        Singular values less than or equal to this are set to zero
        (default 30, the value previously hard-coded).
    maxiter : int, optional
        Maximum number of passes (default 100). At least one pass always runs.
    tol : float, optional
        Stop once the Frobenius norm of (filled matrix - rebuilt matrix)
        drops below this value (default 1e-5).

    Returns
    -------
    filled : numpy.ndarray
        The matrix with missing entries replaced.
    normlist : list of float
        The norm computed on each pass.
    error : float
        Sum of squared differences between the last rebuilt matrix and the
        input, ignoring NaN entries.
    """
    observed = np.asarray(df, dtype=float)
    col_mean = np.nanmean(observed, axis=0, keepdims=True)
    valid = np.isfinite(observed)
    filled = np.where(valid, observed, col_mean)
    normlist = []
    passes = 0
    while True:
        passes += 1
        # np.linalg.svd returns the right factor already transposed.
        U, s, Vh = np.linalg.svd(filled, full_matrices=False)
        kept = np.where(s <= sv_threshold, 0.0, s)
        rebuilt = U.dot(np.diag(kept).dot(Vh))
        # Only the originally missing cells take the rebuilt values.
        filled = np.where(~valid, rebuilt, filled)
        norm = np.linalg.norm(filled - rebuilt)
        normlist.append(norm)
        if norm < tol or passes >= maxiter:
            break
    error = np.nansum((rebuilt - observed) ** 2)
    # Printed value is the number of passes plus one, unchanged from the
    # earlier implementation so recorded notebook output still matches.
    print(passes + 1)
    return filled, normlist, error

In [22]:
# Interaction feature: element-wise product of the BMI and Ins_Age columns
data['BMI_Ins_age'] = data['BMI'].mul(data['Ins_Age'])

In [23]:
# Choose features
#  1. use LinearSVC with L1 C=0.01
#   The number of the selected features = 89
#   NOTE(review): the recorded data_3.shape output for this cell shows 121
#   columns, not 89 - confirm which figure is correct.
# The list is written out by hand; it ends with the BMI_Ins_age column that an
# earlier cell adds to `data`.
features = ['Product_Info_1','Product_Info_2', 'Product_Info_3', 'Product_Info_4', 'Product_Info_5', 'Product_Info_6', 'Product_Info_7',
 'Ins_Age', 'Ht', 'Wt', 'BMI', 'Employment_Info_1', 'Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5', 'Employment_Info_6',
 'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5','InsuredInfo_6', 'InsuredInfo_7',
 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_7', 'Insurance_History_8',
 'Insurance_History_9', 'Family_Hist_1', 'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5', 'Medical_History_1',
 'Medical_History_2', 'Medical_History_3', 'Medical_History_4', 'Medical_History_5', 'Medical_History_6', 'Medical_History_7',
 'Medical_History_8', 'Medical_History_9', 'Medical_History_10', 'Medical_History_11', 'Medical_History_12', 'Medical_History_13',
 'Medical_History_14', 'Medical_History_15', 'Medical_History_16', 'Medical_History_17', 'Medical_History_18', 'Medical_History_19',
 'Medical_History_20', 'Medical_History_21', 'Medical_History_22', 'Medical_History_23', 'Medical_History_24', 'Medical_History_25',
 'Medical_History_26', 'Medical_History_27', 'Medical_History_28', 'Medical_History_29', 'Medical_History_30', 'Medical_History_31',
 'Medical_History_32', 'Medical_History_33', 'Medical_History_34', 'Medical_History_35', 'Medical_History_36', 'Medical_History_37',
 'Medical_History_38', 'Medical_History_39', 'Medical_History_40', 'Medical_History_41', 'Medical_Keyword_1', 'Medical_Keyword_2',
 'Medical_Keyword_3', 'Medical_Keyword_4', 'Medical_Keyword_5', 'Medical_Keyword_6', 'Medical_Keyword_7', 'Medical_Keyword_8', 'Medical_Keyword_9', 'Medical_Keyword_10', 'Medical_Keyword_12', 'Medical_Keyword_14', 'Medical_Keyword_15', 'Medical_Keyword_16',
 'Medical_Keyword_17', 'Medical_Keyword_18', 'Medical_Keyword_19', 'Medical_Keyword_20', 'Medical_Keyword_21', 'Medical_Keyword_22',
 'Medical_Keyword_23', 'Medical_Keyword_24', 'Medical_Keyword_25', 'Medical_Keyword_26', 'Medical_Keyword_28', 'Medical_Keyword_29',
 'Medical_Keyword_30', 'Medical_Keyword_31', 'Medical_Keyword_32', 'Medical_Keyword_33', 'Medical_Keyword_35', 'Medical_Keyword_36',
 'Medical_Keyword_37', 'Medical_Keyword_38', 'Medical_Keyword_39', 'Medical_Keyword_40', 'Medical_Keyword_41', 'Medical_Keyword_42',
 'Medical_Keyword_43', 'Medical_Keyword_44', 'Medical_Keyword_45', 'Medical_Keyword_46', 'Medical_Keyword_47', 'Medical_Keyword_48',
 'BMI_Ins_age']
# Keep only the selected columns, in the order listed above
data_3 = data[features]
# type(data_3)
# Display (rows, columns) of the selection as the cell output
data_3.shape

(79146, 121)

In [24]:
# fill missing data
# # average
# data_average = MissingMethod(data).fill_avg()
# data_average.head()
# SVD
# The second result used to be bound to the name `list`, which shadowed the
# builtin for every later cell in the kernel; it now has a descriptive name.
data, norm_history, error = fill_svd(data_3)
# NOTE(review): wrapping the bare array gives default integer column labels
# (0, 1, 2, ...), so the feature names of data_3 are not carried over -
# confirm that this is intended before relying on the exported file.
data = pd.DataFrame(data)

101


In [25]:
# Standardise the SVD-completed frame, then preview its first rows
data_3 = data.pipe(feature_scale)
data_3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111,112,113,114,115,116,117,118,119,120
0,-0.163431,-1.142966,-2.821329,-0.890333,-0.083038,-2.246073,-0.1492,1.18016,-1.683671,-1.611867,...,-0.245961,-0.102065,-0.218231,-0.103259,-0.088131,-0.117674,-0.092677,-0.141054,-0.241129,-1.547628
1,-0.163431,-0.861071,0.31518,-0.890333,-0.083038,0.445216,-0.1492,-1.756901,-1.438982,-1.799149,...,-0.245961,-0.102065,-0.218231,-0.103259,-0.088131,-0.117674,-0.092677,-0.141054,-0.241129,2.708761
2,-0.163431,-0.579176,0.31518,-0.890333,-0.083038,0.445216,-0.1492,-1.90752,0.518537,-0.043379,...,-0.245961,-0.102065,-0.218231,-0.103259,-0.088131,-0.117674,-0.092677,-0.141054,-0.241129,0.509513
3,-0.163431,-0.297282,-2.821329,0.565558,-0.083038,0.445216,-0.1492,-1.229736,-0.460222,-0.97979,...,-0.245961,-0.102065,-0.218231,-0.103259,-0.088131,-0.117674,-0.092677,-0.141054,-0.241129,1.050366
4,-0.163431,-0.015387,0.31518,-0.344374,-0.083038,0.445216,-0.1492,0.050521,-0.704912,-0.652046,...,-0.245961,-0.102065,-0.218231,-0.103259,-0.088131,-0.117674,-0.092677,-0.141054,-0.241129,-0.149234


In [26]:
# Persist the scaled feature table; the row index is written as the first CSV column
data_3.to_csv(path_or_buf='data_features2_SVD.csv')