In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the already-encoded categorical features; the first CSV column is used
# as the row index instead of becoming a regular column.
categorical = pd.read_csv(
    'categoric.csv',
    index_col=0,
)
categorical.head()

Unnamed: 0,response,coverage,gender,education_Bachelor,education_College,education_Doctor,education_High School or Below,education_Master,employment_status_Disabled,employment_status_Employed,...,sales_channel_Agent,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Four-Door Car,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car
0,0,0,1,1,0,0,0,0,0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,1,1,1,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,2,1,1,0,0,0,0,0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,0,0,1,0,0,0,0,0,1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Load the numeric features; the first CSV column is used as the row index
# instead of becoming a regular column.
numerical = pd.read_csv(
    'numeric.csv',
    index_col=0,
)
numerical.head()

Unnamed: 0,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,0.999999,0.001226,0.000569,8.9e-05,0,1
1,0.999996,0.002496,0.000345,0.001115,0,8
2,0.999997,0.002215,0.000369,0.000779,0,2
3,0.999994,0.002815,0.000478,0.001726,0,7
4,0.999998,0.001665,0.000274,0.001004,0,1


In [4]:
# Join the numeric and categorical feature tables side by side.
# Both indexes are reset so rows are paired purely by position, which is only
# meaningful when the two tables have the same number of rows -- otherwise
# concat would silently pad the shorter one with NaN.
assert len(numerical) == len(categorical), "numeric/categorical row counts differ"

# With axis=1 (and no ignore_index) pd.concat keeps every frame's own column
# labels, numeric columns first, so no manual relabelling is needed afterwards.
data = pd.concat(
    [
        numerical.reset_index(drop=True),
        categorical.reset_index(drop=True),
    ],
    axis=1,
)
data.head()

Unnamed: 0,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,response,coverage,gender,education_Bachelor,...,sales_channel_Agent,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Four-Door Car,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car
0,0.999999,0.001226,0.000569,8.9e-05,0,1,0,0,1,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.999996,0.002496,0.000345,0.001115,0,8,0,1,1,1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.999997,0.002215,0.000369,0.000779,0,2,0,2,1,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.999994,0.002815,0.000478,0.001726,0,7,0,0,0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.999998,0.001665,0.000274,0.001004,0,1,0,0,0,1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Show the dtype of every column of the combined frame before modelling.
data.dtypes

income                             float64
monthly_premium_auto               float64
months_since_last_claim            float64
months_since_policy_inception      float64
number_of_open_complaints            int64
number_of_policies                   int64
response                             int64
coverage                             int64
gender                               int64
education_Bachelor                   int64
education_College                    int64
education_Doctor                     int64
education_High School or Below       int64
education_Master                     int64
employment_status_Disabled           int64
employment_status_Employed           int64
employment_status_Medical Leave      int64
employment_status_Retired            int64
employment_status_Unemployed         int64
location_code_Rural                  int64
location_code_Suburban               int64
location_code_Urban                  int64
week                                 int64
month      

In [6]:
#In this final lab, we will model our data. Import sklearn train_test_split and separate the data.
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [7]:
# 'monthly_premium_auto' is the regression target; every other column is a predictor.
y = data['monthly_premium_auto']
X = data.drop(columns='monthly_premium_auto')

# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=100,
)

In [8]:
# Baseline: a plain linear regression on all features, to see whether we are
# getting good results. The cell's value is the R2 on the held-out rows.
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)  # fit() returns the estimator itself
predictions = model.predict(X_test)
r2_score(y_test, predictions)

0.9080061086040625

In [9]:
#Great! Now define a function that takes a list of models and trains (and tests) them, so we can try many of them without repeating code.
#Use the function to check LinearRegression and KNeighborsRegressor.
#You can also check the MLPRegressor for this task!
#Check and discuss the results.

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

def train_test_models(models, X, y, test_size=0.4, random_state=42):
    """
    Train and test a list of regression models and return R2 scores.

    The data is split once with ``train_test_split``; every model is then
    fitted on the same training part and scored on the same held-out part,
    so the returned scores are directly comparable with each other.

    Parameters
    ----------
    models : iterable
        Estimators exposing ``fit(X, y)`` and ``predict(X)``. Each one is
        fitted in place.
    X, y
        Features and target, forwarded unchanged to ``train_test_split``.
    test_size : default 0.4
        Forwarded to ``train_test_split``.
    random_state : default 42
        Forwarded to ``train_test_split``. Note that the default differs from
        the ``random_state=100`` split used by the stand-alone linear
        regression cell; pass ``random_state=100`` to score on that split.

    Returns
    -------
    list
        One ``r2_score`` value per model, in the order of ``models``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    def _holdout_r2(model):
        # Fit on the training part, score the predictions on the held-out part.
        model.fit(X_train, y_train)
        return r2_score(y_test, model.predict(X_test))

    return [_holdout_r2(model) for model in models]


In [11]:
# First comparison: ordinary least squares against k-nearest neighbours,
# both with their default settings.
models = [
    LinearRegression(),
    KNeighborsRegressor(),
]
results = train_test_models(models, X, y)

# results[i] is the held-out R2 of models[i] (scores come back in input order).
for i, model in enumerate(models):
    print(f"{model.__class__.__name__} R2 score: {results[i]}")


LinearRegression R2 score: 0.8866934115050138
KNeighborsRegressor R2 score: 0.39715121630588057


In [12]:
# Second comparison: the same two models plus a multi-layer perceptron.
# The MLP starts from randomly initialised weights, so it is seeded here;
# without a seed its R2 changes on every Restart & Run All.
# NOTE(review): a strongly negative MLP R2 on this data most likely means the
# network needs scaled inputs and/or more iterations -- confirm before
# drawing conclusions from its score.
models = [
    LinearRegression(),
    KNeighborsRegressor(),
    MLPRegressor(random_state=42),
]
results = train_test_models(models, X, y)

# results[i] is the held-out R2 of models[i] (scores come back in input order).
for i, model in enumerate(models):
    print(f"{model.__class__.__name__} R2 score: {results[i]}")


LinearRegression R2 score: 0.8866934115050138
KNeighborsRegressor R2 score: 0.39715121630588057
MLPRegressor R2 score: -824.2756716914148
