
## Regression: experimenting with Lasso and Ridge
#### Project:  Allstate Claims Severity
#### Author:   Joshep Downs, James Peng, Megan Pera, Diana Rodenberger 
#### Purpose:  Predicting cost and severity of claims for Allstate
#### Updated: 12/4/16

### Team name in Kaggle: UCB_207_1

## Link to Leaderboard
https://www.kaggle.com/c/allstate-claims-severity/leaderboard


In [1]:
import unittest

# General libraries.
import re, os, sys
import numpy as np
import pandas as pd
import logging
import matplotlib.pyplot as plt

# Libraries for feature selection and model creation
from sklearn.pipeline import Pipeline
#from sklearn.feature_extraction import 
from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge # l2 regularization
from sklearn.linear_model import Lasso # l1 regularization

from sklearn.feature_selection import mutual_info_regression

In [2]:
# Load the raw training CSV (path is relative to the notebook's working directory).
df_data = pd.read_csv(filepath_or_buffer='./data_in/train.csv')

In [3]:
# Label-encode the categorical columns, build the feature matrix and the
# log-scaled target, then carve out train / dev / mini-train splits by row
# position.

# Number of leading rows used for training; every remaining row is the dev set.
TRAIN_SIZE = 168318
# Number of leading rows in the small subset used for quick experiments.
MINI_TRAIN_SIZE = 1000

# Work on a copy so the raw frame read from disk stays untouched.
df_data_encoded = df_data.copy()

# Replace every column whose name contains 'cat' with integer codes.
# fit_transform refits the encoder on each column, so every column gets its
# own category -> integer mapping.
le = preprocessing.LabelEncoder()
for c in df_data_encoded.columns:
    if 'cat' in c:
        df_data_encoded[c] = le.fit_transform(df_data_encoded[c])

np.random.seed(100)

# Feature columns: everything except the target ('loss') and the row identifier ('id').
col = [c for c in df_data_encoded.columns if c not in ('loss', 'id')]

X = df_data_encoded[col]             # features only
y = np.log10(df_data_encoded.loss)   # target on a log10 scale (invert with 10 ** y)
# NOTE(review): `id` shadows the Python builtin of the same name; the name is
# kept because other cells may rely on it.
id = df_data_encoded.id              # row identifiers
loss = df_data_encoded.loss          # untransformed target, kept for feature selection

# Positional row ranges for each split (.iloc makes the positional intent explicit).
train_rows = slice(None, TRAIN_SIZE)
dev_rows = slice(TRAIN_SIZE, None)
mini_rows = slice(None, MINI_TRAIN_SIZE)

# Set variables to hold dev and training data.
dev_data, dev_labels, dev_id, dev_loss = (
    part.iloc[dev_rows] for part in (X, y, id, loss)
)
train_data, train_labels, train_id, train_loss = (
    part.iloc[train_rows] for part in (X, y, id, loss)
)
mini_train_data, mini_train_labels, mini_train_id, mini_train_loss = (
    part.iloc[mini_rows] for part in (X, y, id, loss)
)



In [5]:
# Experimenting with mutual_info_regression
# Returns an array of estimated mutual information between each feature and its target
# this takes a LONG TIME for the whole dataset; not ideal, so only the
# mini training subset is used here.

# Defining inputs for the function
loss_array = np.asarray(mini_train_loss)  # this is the target vector
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# selecting the columns and taking .values yields the same ndarray.
mini_train_matrix = mini_train_data[col].values

# Running mutual_info_regression; the result holds one score per feature
# column, so only its shape is displayed.
mutual_info_regression(mini_train_matrix, loss_array).shape


(130,)

In [8]:
# Fit ordinary least squares, Ridge and Lasso on the training split and compare
# them on the dev split using mean squared error and mean absolute error.
#
# Naming caveat: the `_l1` / `_l2` suffixes below are historical and are kept so
# that any other cell referring to these variables still works. They do NOT
# match the penalty actually used: `lm_l1` is Ridge (L2 penalty) and `lm_l2`
# is Lasso (L1 penalty).

# Fitting a plain linear regression model
lm = LinearRegression()
lm.fit(train_data, train_labels)
dev_pred = lm.predict(dev_data)  # predict once, reuse for both metrics
lm_dev_mse = ((dev_pred - dev_labels) ** 2).mean()

# Fitting a linear regression model with L2 regularization using Ridge
lm_l1 = Ridge(alpha=0.05, normalize=False)
lm_l1.fit(train_data, train_labels)
dev_pred_l1 = lm_l1.predict(dev_data)
lml1_dev_mse = ((dev_pred_l1 - dev_labels) ** 2).mean()

# Fitting a linear regression model with L1 regularization using Lasso
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions these two constructors need to be replaced
# by an explicit scaling step -- confirm the target scikit-learn version.
lm_l2 = Lasso(alpha=0.05, normalize=True, max_iter=100)
lm_l2.fit(train_data, train_labels)
dev_pred_l2 = lm_l2.predict(dev_data)
lml2_dev_mse = ((dev_pred_l2 - dev_labels) ** 2).mean()

# Comparing the models based on MSE
print("Linear Regression, MSE on dev data:", lm_dev_mse)
print("Ridge regression (L2 regularization), MSE on dev data:", lml1_dev_mse)
print("Lasso regression (L1 regularization), MSE on dev data:", lml2_dev_mse)
print("\t")
print("Linear regression worse than Ridge:", lm_dev_mse > lml1_dev_mse)
print("Linear regression worse than Lasso:", lm_dev_mse > lml2_dev_mse)
print("Ridge worse than Lasso:", lml1_dev_mse > lml2_dev_mse)


# Calculating mean absolute error for each
mae = mean_absolute_error(dev_labels, dev_pred)
mae_l1 = mean_absolute_error(dev_labels, dev_pred_l1)
mae_l2 = mean_absolute_error(dev_labels, dev_pred_l2)

print("\t")
print("Mean absolute error, linear regression:", mae)
print("Mean absolute error, Ridge:", mae_l1)
print("Mean absolute error, Lasso:", mae_l2)


Linear Regression, MSE on dev data: 0.06337191367597368
Linear Regression with L1 regularization, MSE on dev data: 0.06337223222747933
Linear Regression with L2 regularization, MSE on dev data: 0.12403214372261884
	
Linear regression worse than L1: False
Linear regression worse than L2: False
L1 worse than L2: False
	
Mean squared error, original: 0.196981281532
Mean squared error, l1: 0.19698271581
Mean squared error, l2: 0.287028383645


True

James' original code below:

In [26]:
# Load the test CSV and label-encode its categorical columns with the SAME
# category -> integer mapping that the training data received.
#
# Fitting a fresh LabelEncoder on the test columns (as was done before) assigns
# codes from the test set's own sorted category list, so the same category can
# get a different integer than it had during training whenever the two sets of
# categories differ. The encoder is therefore fit on the training column
# (`df_data[c]`) and the test values are mapped through that fitted mapping.
df_test = pd.read_csv('./data_in/test.csv')
df_test_encoded = df_test.copy()

# encoding into the categorical value
le = preprocessing.LabelEncoder()
for c in df_test_encoded.columns:
    if 'cat' in c:
        le.fit(df_data[c])
        # Categorical codes index into le.classes_ exactly like
        # LabelEncoder.transform would, but categories never seen in training
        # become -1 instead of raising ValueError.
        codes = pd.Categorical(df_test_encoded[c], categories=le.classes_).codes
        df_test_encoded[c] = codes.astype(np.int64)

np.random.seed(100)

# Feature columns: everything except the row identifier.
col = [c for c in df_test_encoded.columns if c != 'id']

test_data = df_test_encoded[col]
test_id = df_test_encoded.id

In [61]:
# Score the test set with the linear regression model fit earlier in this
# notebook and write the submission CSV.
#
# The previous version of this cell referenced `lr`, `X_test` and `id_test`,
# none of which are defined in this notebook; it now uses the notebook's own
# `lm`, `test_data` and `test_id`, so it runs from a fresh kernel.

# Kept from the original cell: silences pandas' chained-assignment warning globally.
pd.options.mode.chained_assignment = None

# use same linear model previously fit with training data
y_test_log_pred = lm.predict(test_data)

# convert from log10 scale to linear scale (the model was trained on log10(loss))
y_test_pred = np.power(10, y_test_log_pred)

# assemble the final dataset: one row per test id with its predicted loss
y_test_predicted_loss = pd.DataFrame({
    "id": np.asarray(test_id),
    "loss": y_test_pred,
})

# check final output
print("number of ids: ", len(test_id))
print("number of rows in predicted test set: ", len(y_test_predicted_loss))

# create csv file to submit (make sure the output directory exists first)
os.makedirs('./data_out', exist_ok=True)
y_test_predicted_loss.to_csv('./data_out/AllState_LossPrediction_v1.csv', encoding='utf-8', index=False)


number of ids:  125546
number of rows in predicted test set:  125546
