In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Training and test set
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
SEED = 1

In [3]:
df = pd.read_csv('./data/train.csv')

## 1.  Data preprocessing [7 marks]

### missing data

In [4]:
array_columns = np.array(df.columns)
# replace ？ and ' ' as NaN
for column in array_columns:
    df[column] = df[column].apply(lambda x: np.NaN if (str(x).isspace() or (x == '?')) else x)

In [5]:
# calculate the proportion of NaN
i = 0
percent_of_null_list = np.array([])
for column in np.array(df.columns):
    df_null = df[df[column].isnull()]
    percent_of_null = df_null.shape[0]/df.shape[0]
    if percent_of_null != 0:
        print('The percent of Null in',array_columns[i],':',percent_of_null*100 ,"%")
    percent_of_null_list = np.append(percent_of_null_list,percent_of_null)
    i+=1

The percent of Null in Blind_Make : 0.05 %
The percent of Null in Blind_Model : 0.05 %
The percent of Null in Blind_Submodel : 0.05 %
The percent of Null in Cat1 : 0.16666666666666669 %
The percent of Null in Cat2 : 35.30333333333333 %
The percent of Null in Cat3 : 0.03666666666666667 %
The percent of Null in Cat4 : 43.28333333333333 %
The percent of Null in Cat5 : 43.32666666666667 %
The percent of Null in Cat6 : 0.16666666666666669 %
The percent of Null in Cat7 : 54.93333333333334 %
The percent of Null in Cat8 : 0.006666666666666667 %
The percent of Null in Cat10 : 0.03333333333333333 %
The percent of Null in Cat11 : 0.19333333333333333 %
The percent of Null in Cat12 : 0.17333333333333334 %
The percent of Null in OrdCat : 0.06333333333333332 %


In [6]:
# sort the index 
index_of_sorted_Null = np.argsort(percent_of_null_list)
index_of_sorted_Null

array([ 0, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 33, 16, 34,  1,
        2,  3,  4, 15, 17, 10,  5,  6,  7, 20,  8, 13, 19, 18,  9, 11, 12,
       14], dtype=int64)

In [7]:
# since the null rates of columns of 9 11 12 14 are very high, we drop these columns
df = df.drop(['Cat2','Cat4','Cat5','Cat7'],axis =1)
# ID、Calendar_Year、Blind_Make、Blind_Submodel,etc are not related to the target value
df = df.drop(['Household_ID','Row_ID'],axis =1)

In [8]:
data = df
# in the rest of the columns, the null rates are very low, we can regard the 'NaN' as an
# attributes and replace the 'NaN' with '0'
data = data.fillna('0')

In [9]:
for col in ['Vehicle','Calendar_Year','Model_Year','Var1','Var2','Var3','Var4','Var5','Var6',
           'Var7','Var8','NVVar1','NVVar2','NVVar3','NVVar4']:
    data[col] = data[col].astype('float64')

In [10]:
for col in ['Blind_Model','Blind_Submodel', 'Cat1', 'Cat3', 'Cat6', 'Cat8', 'Cat9', 'Cat10', 'Cat11', 'Cat12','NVCat']:
    data[col] = data[col].astype(object)

In [11]:
# split the data in data.csv into train_data and test_data before deal with the imblance.
train_data,test_data = \
    train_test_split(data,test_size = 0.15)

In [12]:
# list two lists of categorical_features and continuous_features
categorical_features = list(data.columns[data.dtypes == 'object'])
continuous_features = list(data.columns[data.dtypes == 'float64'])
continuous_features = continuous_features[:-1]

In [13]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# set a ColumnTransformer
full_transform = ColumnTransformer([
    ("num", StandardScaler(), continuous_features),
    ("cat", OneHotEncoder(handle_unknown = 'ignore'), categorical_features),
])

In [14]:
data_attributes = data.drop('Claim_Amount', axis=1)
data_targets = data['Claim_Amount']

In [15]:
data_attributes_prepared = full_transform.fit_transform(data_attributes)

In [16]:
train_data_attributes = train_data.drop('Claim_Amount', axis=1)
train_data_targets = train_data['Claim_Amount']

test_data_attributes = test_data.drop('Claim_Amount', axis=1)
test_data_targets = test_data['Claim_Amount']

### Deal with the imbalance of data

In [17]:
num_zero_claim = (train_data_targets == 0).sum()
num_not_zero_claim = (train_data_targets != 0).sum()
print('Zero claim in train data: ',num_zero_claim)
print('Non zero claim in train data: ',num_not_zero_claim)

Zero claim in train data:  17832
Non zero claim in train data:  7668


In [18]:
# use undersample to deal the imbalance
# get the indexes of 0
zero_claim_indices = np.array(train_data_targets[train_data_targets==0].index)
# get the indexes of non 0
not_zero_claim_indices = np.array(train_data_targets[train_data_targets!=0].index)
# get the same number indexes with non 0 index from 0 index
random_zero_claim_indices = np.random.choice(zero_claim_indices,num_not_zero_claim,replace = False)
# merge the two indexes
under_sample_indices = np.concatenate([random_zero_claim_indices,not_zero_claim_indices])

In [19]:
train_data_attributes_under_sample = train_data_attributes.loc[under_sample_indices,:]
train_data_targets = train_data_targets.loc[under_sample_indices]

### Convert categorical values 

In [20]:
# conlumns tranformer 
train_data_attributes_under_sample_prepared = full_transform.transform(train_data_attributes_under_sample)
test_data_attributes_prepared = full_transform.transform(test_data_attributes)

##  2. Performance using a single model [8 marks]

In [21]:
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=5,random_state = SEED)

###  Linear regression

In [22]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(train_data_attributes_under_sample_prepared, train_data_targets)

LinearRegression()

In [23]:
# the performance
linear_reg_predictions_on_X_val = lin_reg.predict(test_data_attributes_prepared)

In [24]:
from sklearn.metrics import mean_squared_error
error = np.sqrt(mean_squared_error(linear_reg_predictions_on_X_val, test_data_targets))
print('Linear regression RMSE: ',error)

Linear regression RMSE:  304.09815241610994


### Ridge regression

In [25]:
from sklearn.linear_model import Ridge,RidgeCV
ridge_reg = Ridge(alpha = 10)
ridge_reg.fit(train_data_attributes_under_sample_prepared, train_data_targets)

Ridge(alpha=10)

In [26]:
ridge_reg_predictions_on_X_val = ridge_reg.predict(test_data_attributes_prepared)

In [27]:
error_ridge = np.sqrt(mean_squared_error(ridge_reg_predictions_on_X_val, test_data_targets))
print('Ridge regression RMSE: ',error_ridge)

Ridge regression RMSE:  293.801252542613


In [28]:
# Use Grid search to find the best parameters
from sklearn.model_selection import GridSearchCV 
alpha_opts = np.array([0.1,1,5,10])
param_grid = dict(alpha = alpha_opts)
grid = GridSearchCV(Ridge(), param_grid=param_grid, cv=cv)
grid.fit(train_data_attributes_under_sample_prepared, train_data_targets)

GridSearchCV(cv=ShuffleSplit(n_splits=5, random_state=1, test_size=None, train_size=None),
             estimator=Ridge(),
             param_grid={'alpha': array([ 0.1,  1. ,  5. , 10. ])})

In [29]:
rig_grid = Ridge(alpha = grid.best_params_["alpha"])
rig_grid.fit(train_data_attributes_under_sample_prepared, train_data_targets)
ypred_rig_grid = rig_grid.predict(test_data_attributes_prepared)  

In [30]:
error_ridge_grid = np.sqrt(mean_squared_error(ypred_rig_grid, test_data_targets))
print('Ridge regression RMSE after Grid search : ',error_ridge_grid)

Ridge regression RMSE after Grid search :  293.801252542613


### Random forests for regression

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(
    n_estimators = 50) 

rf.fit(train_data_attributes_under_sample_prepared, train_data_targets)

In [None]:
rf_predictions_on_X_val = rf.predict(test_data_attributes_prepared)

In [None]:
error_rf = np.sqrt(mean_squared_error(rf_predictions_on_X_val, test_data_targets))
print('Random forests regression RMSE : ',error_rf)

In [None]:
# Use Grid search
# rf_param_grid = dict(n_estimators = [5,50,100],min_samples_split = [3,5,10])
# rf_grid = GridSearchCV(RandomForestRegressor(), param_grid = rf_param_grid, cv = cv)
# rf_grid.fit(train_data_attributes_under_sample_prepared, train_data_targets)

In [None]:
# rf_grid = RandomForestRegressor(n_estimators = rf_grid.best_params_["n_estimators"],min_samples_split = rf_grid.best_params_["min_samples_split"])
# rf_grid.fit(train_data_attributes_under_sample_prepared, train_data_targets)
# ypred_rf_grid = rf_grid.predict(test_data_attributes_prepared) 

In [None]:
# error_rf_grid = np.sqrt(mean_squared_error(ypred_rf_grid, test_data_targets))
# print('Random forests regression RMSE after Grid search: ',error_rf_grid)

###  Gradient tree boosting for regression

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor(n_estimators= 3, max_depth = 3)
gb.fit(train_data_attributes_under_sample_prepared, train_data_targets)

In [None]:
gb_predictions_on_X_val = gb.predict(test_data_attributes_prepared)

In [None]:
error_gb = np.sqrt(mean_squared_error(gb_predictions_on_X_val, test_data_targets))
print('Gradient tree boosting for regression RMSE: ',error_gb)

In [None]:
#  Grid search
# n_estimators_opts = [3, 13, 50, 100]
# max_depth_opts = [1,6,9,12]

# gb_param_grid = dict(n_estimators = n_estimators_opts, max_depth = max_depth_opts)
# gb_grid = GridSearchCV(GradientBoostingRegressor(), param_grid = gb_param_grid, cv = cv)
# gb_grid.fit(train_data_attributes_under_sample_prepared, train_data_targets)

In [None]:
# gb_grid = GradientBoostingRegressor(n_estimators = gb_grid.best_params_["n_estimators"], max_depth = gb_grid.best_params_["max_depth"])
# gb_grid.fit(train_data_attributes_under_sample_prepared, train_data_targets)
# ypred_gb_grid = gb_grid.predict(test_data_attributes_prepared) 

In [None]:
# error_gb_grid = np.sqrt(mean_squared_error(ypred_gb_grid, test_data_targets))
# print('Gradient tree boosting RMSE after Grid search: ',error_gb_grid)

## 3. Performance using a combination of two models [6 marks]

### a) binary classifier
- random  forests  for classification
- gradient boosting for classification 

#### random forests for classification

In [None]:
# transform train_data_targets and test_data_targets to 0 1 
train_data_targets_transformed = 1 * (train_data_targets != 0)
test_data_targets_transformed = 1 * (test_data_targets != 0)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(train_data_attributes_under_sample_prepared, train_data_targets_transformed)

In [None]:
rfc_predictions_on_X_val = rfc.predict(test_data_attributes_prepared)

In [None]:
Accuracy_rfc = (rfc_predictions_on_X_val == test_data_targets_transformed).sum()/len(test_data_targets_transformed)
print('Accuracy of RandomForestClassifier: ', Accuracy_rfc*100,'%')

#### gradient boosting for classification

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(train_data_attributes_under_sample_prepared, train_data_targets_transformed)

In [None]:
gbc_predictions_on_X_val = gbc.predict(test_data_attributes_prepared)

In [None]:
Accuracy_gbc = (gbc_predictions_on_X_val == test_data_targets_transformed).sum()/len(test_data_targets_transformed)
print('Accuracy of GradientBoostingClassifier: ',Accuracy_gbc*100,'%')

###   b) If  the  claim  was  different  from  zero, train  a  regression model to predict the actual value of the claim

In [None]:
non_zero_train_data_attributes_under_sample = train_data_attributes_under_sample[train_data_targets != 0]
non_zero_train_data_targets = train_data_targets[train_data_targets != 0]

In [None]:
non_zero_train_data_attributes_under_sample_prepared = full_transform.transform(non_zero_train_data_attributes_under_sample)

In [None]:
# Linear regression
lin_reg_in_Q3b = LinearRegression()
lin_reg_in_Q3b.fit(non_zero_train_data_attributes_under_sample_prepared, non_zero_train_data_targets)

In [None]:
# Ridge regression
ridge_reg_in_Q3b = Ridge(alpha = 10)
ridge_reg_in_Q3b.fit(non_zero_train_data_attributes_under_sample_prepared, non_zero_train_data_targets)

In [None]:
# RandomForestRegressor
rf_in_Q3b = RandomForestRegressor(n_estimators = 50)
rf_in_Q3b.fit(non_zero_train_data_attributes_under_sample_prepared, non_zero_train_data_targets)

In [None]:
# Gradient tree boosting for regression
gb_in_Q3b = GradientBoostingRegressor(n_estimators= 3, max_depth = 3)
gb_in_Q3b.fit(non_zero_train_data_attributes_under_sample_prepared, non_zero_train_data_targets)

### c) tandem  model 

In [None]:
# combine two models
def tandem_model(X_test, model1 = rfc , model2 = rf):
    tandem_pridiction  = np.empty((X_test.shape[0],))
    mask_of_not_zero = (model1.predict(X_test) != 0) 
    for i in range(len(gbc.predict(X_test))):
        if mask_of_not_zero[i]:
            tandem_pridiction[i] = model2.predict(X_test[i])
        else:
            tandem_pridiction[i] = 0
    return tandem_pridiction

In [None]:
for model1 in [rfc,gbc]:
    for model2 in [lin_reg_in_Q3b, ridge_reg_in_Q3b,rf_in_Q3b,gb_in_Q3b]:
        prediction_tan = tandem_model(test_data_attributes_prepared, model1 = model1 , model2 = model2)
        print(f'Prediction of {model1} and {model2}:')
        print('RMSE:', np.sqrt(mean_squared_error(prediction_tan,test_data_targets)))
        print()

## 4. Report the performance of the best models over the test set [2 marks]

In [None]:
# In step 2, the best model is the Gradient tree boosting model
prediction_of_test_data_via_gb = gb.predict(test_data_attributes_prepared)

In [None]:
# the performence
test_error_on_gradientTree = np.sqrt(mean_squared_error(prediction_of_test_data_via_gb,test_data_targets))
print('RMSE of Gradient tree: ',test_error_on_gradientTree)

In [None]:
# In step 3, the best model is the tandem model based on GradientBoostingClassifier and GradientBoostingRegressor
predictions_on_test = tandem_model(test_data_attributes_prepared,model1 = gbc, model2 = gb_in_Q3b)

In [None]:
# the performence
error_tandem = np.sqrt(mean_squared_error(predictions_on_test,test_data_targets))
print('RMSE of tandem model: ',error_tandem)

## 5. Present your solution [4 marks]
Provide four interesting and meaningful observations/comments about your machine learning pipeline, with minimum three sentences for each observation/comment.

1. The missing value

Due to various reasons, our original data is not complete, and the machine learning model cannot directly deal with the missing values, so it is necessary to deal with the missing values before fitting the data. There are many ways to deal with the missing values, each with advantages and disadvantages. In the training data involved in this question, there are four attributes whose missing values exceed 30%, so these four columns are directly discarded. And 0 is used to fill in the remaining missing values.

2. Data imbalance

A data imbalance is an imbalance of the target data. If not preprocessed, the data fitting will not be good enough. The model will treat the claim amount of 0 as the norm and the fitted model will not perform well when dealing with non-0 data. Usually, there are two ways to deal with data imbalance: undersample and oversample. In this paper, the method of Undersample is adopted. The advantage of this method is that all data are real values, while the disadvantage is that all data sets cannot be fully utilized.

3. GridSearchCV

Models for many regression and classification problems. When we don't know how the parameters of the model are set, we can use GridSearchCV to traverse the set of parameters we have given and filter out the best parameters from it. In most cases, results with GridSearchCV are better than results without GridSearchCV. It is worth noting that in the process the random forest, the number of decision trees is not as good as more, and the result will fluctuate up and down at a stable level as the number of trees reaches a certain value.

4. Categorical attributes

There are many categorical attributes in the data. The model cannot handle them directly, so it needs to transcode them into discrete encoding space that the model can handle. Onehotencoder is used to handle such data. Because there are many elements, a sparse matrix is formed which is much wider than the data

5.  Overfitting

In this paper, RMSE values are used to measure the performance of the models. However, a low RMSE value not equal to a good predictor. For the GB model, although RMSE results are good, overfitting occurs. The GB model is very powerful and easy to be overfitted completely.

## 6.  Create a function that contains the best model you built from Steps 1 to 4 that we will use to assess the performance of your design over an independent test set [3 marks].

In [None]:
def my_insurance_claim_predictor(Xtest):
    Xtest = Xtest.drop(['Cat2','Cat4','Cat5','Cat7'],axis =1)
    Xtest = Xtest.drop(['Household_ID','Row_ID'],axis =1)
    Xtest=Xtest.replace(to_replace='?', value=np.nan)
    Xtest = Xtest.fillna('0')
    for col in ['Vehicle','Calendar_Year','Model_Year','Var1','Var2','Var3','Var4','Var5','Var6', 'Var7','Var8','NVVar1','NVVar2','NVVar3','NVVar4']:
        Xtest[col] = Xtest[col].astype('float64')
    for col in ['Blind_Model','Blind_Submodel', 'Cat1', 'Cat3', 'Cat6', 'Cat8', 'Cat9', 'Cat10', 'Cat11', 'Cat12','NVCat','OrdCat']:
        Xtest[col] = Xtest[col].astype(object)
    Xtest_temp = full_transform.transform(Xtest)
    return gb.predict(Xtest_temp)

In [None]:
Xtest = pd.read_csv('./data/test.csv')

In [None]:
prediction = my_insurance_claim_predictor(Xtest)