![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

<h1 style="color: #00BFFF;">00 | Comparing regression models</h1>

For this lab, we will be using the same dataset we used in the previous labs. We recommend using the same notebook since you will be reusing the same variables you previous created and used in labs. 

In [37]:
# 📚 Basic libraries
import pandas as pd # data manipulation
import numpy as np # numerical operations
import matplotlib.pyplot as plt # 2D visualizations
import os # filemanagment
import seaborn as sns # high-resolution visualization

# 🤖 Machine Learning
from sklearn.preprocessing import StandardScaler # data normalization
from sklearn.model_selection import train_test_split # splitting data into train/test sets
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet # another regression model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error # model evaluation metrics

# ⚙️ Settings
pd.set_option('display.max_columns', None)

# 🔄 Functions
import sys # system path to our functions
module = "C:/Users/apisi/01. IronData/01. GitHub/01. IronLabs/usefulness/easy"
sys.path.append(os.path.abspath(module))

from functions import open_data  # quick data overview
from functions import snake_columns  # snake_case
from functions import explore_data  # checks for duplicates, NaN & empty spaces

In [14]:
file_path = os.path.join('C:/Users/apisi/01. IronData/01. GitHub/01. IronLabs/unit_4_py/lab-comparing-regression-models/01_data/result.csv')
data = pd.read_csv(file_path, index_col=0).fillna(0)
data.head(3)

Unnamed: 0,response,coverage,education,policy_type,policy,vehicle_size,year,month,day,state_Arizona,state_California,state_Nevada,state_Oregon,state_Washington,employmentstatus_Disabled,employmentstatus_Employed,employmentstatus_Medical Leave,employmentstatus_Retired,employmentstatus_Unemployed,gender_F,gender_M,location_code_Rural,location_code_Suburban,location_code_Urban,marital_status_Divorced,marital_status_Married,marital_status_Single,renew_offer_type_Offer1,renew_offer_type_Offer2,renew_offer_type_Offer3,renew_offer_type_Offer4,sales_channel_Agent,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Four-Door Car,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car,response.1,coverage.1,education.1,policy_type.1,policy.1,vehicle_size.1,year.1,month.1,day.1,customer_lifetime_value,total_claim_amount
0,0.0,0.0,1.0,1.0,5.0,1.0,2011.0,2.0,24.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,1.0,2011.0,2.0,24.0,-1.04711,0.003709
2,0.0,2.0,1.0,0.0,2.0,1.0,2011.0,2.0,19.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,0.0,2.0,1.0,2011.0,2.0,19.0,0.491933,0.711125
3,0.0,0.0,1.0,1.0,4.0,1.0,2011.0,1.0,20.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,4.0,1.0,2011.0,1.0,20.0,-1.031295,-1.199195


<h3 style="color: #008080;">1. In this final lab, we will model our data. Import sklearn `train_test_split` and separate the data.</h3>

In [16]:
Y = data['total_claim_amount']
X = data.drop(['total_claim_amount'], axis=1)

In [17]:
# We define train and test for X and Y
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42) 

# test_size = We give 30% for testing and 70% for testing
# random_state = it'll improve the model to divide always the model in the same way

<h3 style="color: #008080;">2. Try a simple linear regression with all the data to see whether we are getting good results.</h3>

In [24]:
lr = LinearRegression() # A simple Linear Regression model
lr.fit(X_train, y_train) # Train data for the model

 # Predictions
predictions = lr.predict(X_test)
r2_3 = r2_score(y_test, predictions)
RMSE_3 = mean_squared_error(y_test, predictions, squared=False)
MSE_3 = mean_squared_error(y_test, predictions)
MAE_3 = mean_absolute_error(y_test, predictions)

#Printing the results
print("R2 = ", round(r2_3, 4))
print("RMSE = ", round(RMSE_3, 4))
print("The value of the metric MSE is ", round(MSE_3, 4))
print("MAE = ", round(MAE_3, 4))

R2 =  0.0117
RMSE =  0.8513
The value of the metric MSE is  0.7248
MAE =  0.5953


<h3 style="color: #008080;">3. Great! Now define a function that takes a list of models and train (and tests) them so we can try a lot of them without repeating code.</h3>

In [42]:
def model_test(X_train, y_train, X_test):
    # Linear Regression
    lr = LinearRegression() # A simple Linear Regression model
    lr.fit(X_train, y_train) # Train data for the model
    # Predictions
    predictions = lr.predict(X_test)
    r2_3 = r2_score(y_test, predictions)
    RMSE_3 = mean_squared_error(y_test, predictions, squared=False)
    MSE_3 = mean_squared_error(y_test, predictions)
    MAE_3 = mean_absolute_error(y_test, predictions)
    #Printing the results
    print("Linear Regression Results")
    print("R2 = ", round(r2_3, 4))
    print("RMSE = ", round(RMSE_3, 4))
    print("The value of the metric MSE is ", round(MSE_3, 4))
    print("MAE = ", round(MAE_3, 4))
    print()
    
    
    # ElasticNet
    # Settings
    alpha = 1.0
    l1_ratio = 0.5
    elasticnet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    elasticnet.fit(X_train, y_train) # Train data for the model
    # Predictions
    predictions = elasticnet.predict(X_test)
    er2_3 = r2_score(y_test, predictions)
    eRMSE_3 = mean_squared_error(y_test, predictions, squared=False)
    eMSE_3 = mean_squared_error(y_test, predictions)
    eMAE_3 = mean_absolute_error(y_test, predictions)
    #Printing the results
    print("ElasticNet Results")
    print("R2 = ", round(er2_3, 4))
    print("RMSE = ", round(eRMSE_3, 4))
    print("The value of the metric MSE is ", round(eMSE_3, 4))
    print("MAE = ", round(eMAE_3, 4))
    print()
    
    # KNeighborsRegressor
    KN = KNeighborsRegressor() # A simple Linear Regression model
    KN.fit(X_train, y_train) # Train data for the model
    # Predictions
    predictions = KN.predict(X_test)
    kr2_3 = r2_score(y_test, predictions)
    kRMSE_3 = mean_squared_error(y_test, predictions, squared=False)
    kMSE_3 = mean_squared_error(y_test, predictions)
    kMAE_3 = mean_absolute_error(y_test, predictions)
    #Printing the results
    print("KNeighborsRegressor Results")
    print("R2 = ", round(kr2_3, 4))
    print("RMSE = ", round(kRMSE_3, 4))
    print("The value of the metric MSE is ", round(kMSE_3, 4))
    print("MAE = ", round(kMAE_3, 4))
    print()
    
    # MLPRegressor
    ML = KNeighborsRegressor() # A simple Linear Regression model
    ML.fit(X_train, y_train) # Train data for the model
    # Predictions
    predictions = ML.predict(X_test)
    mr2_3 = r2_score(y_test, predictions)
    mRMSE_3 = mean_squared_error(y_test, predictions, squared=False)
    mMSE_3 = mean_squared_error(y_test, predictions)
    mMAE_3 = mean_absolute_error(y_test, predictions)
    #Printing the results
    print("MLPRegressor Results")
    print("R2 = ", round(mr2_3, 4))
    print("RMSE = ", round(mRMSE_3, 4))
    print("The value of the metric MSE is ", round(mMSE_3, 4))
    print("MAE = ", round(mMAE_3, 4))
    print()

<h3 style="color: #008080;">4. Use the function to check `LinearRegressor` and `KNeighborsRegressor`.
</h3>

<h3 style="color: #008080;">5. You can check also the `MLPRegressor` for this task!</h3>

In [44]:
model_test(X_train, y_train, X_test)

Linear Regression Results
R2 =  0.0117
RMSE =  0.8513
The value of the metric MSE is  0.7248
MAE =  0.5953

ElasticNet Results
R2 =  -0.0012
RMSE =  0.8569
The value of the metric MSE is  0.7342
MAE =  0.5978

KNeighborsRegressor Results
R2 =  -0.105
RMSE =  0.9002
The value of the metric MSE is  0.8104
MAE =  0.6688

MLPRegressor Results
R2 =  -0.105
RMSE =  0.9002
The value of the metric MSE is  0.8104
MAE =  0.6688



<h3 style="color: #008080;">6. Check and discuss the results.
</h3>

In [45]:
# Data must be not right, because I got weird values as results.
# To be honest, I kind of did this lab in a rush and I will try to get back to it (having a a goodfunction to run different models it's a must)

# From the results above, Linear Regression still proves to have the best r2 score
# ElasticNet is one I used in the past, and it's more usefull for overfitted data

# Again, I'll come back and try to make it better... I just wanted to get this one done.