<a href="https://colab.research.google.com/github/jeampierrjimenez/Internacional_Machine_Learning_Bootcamp_by_Global_AI_Hub-Classification_Machine_Learning_Project/blob/main/eda_and_regression_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## I. Importing Required Libraries

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost.sklearn import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error, r2_score

### Columns

* age: age of primary beneficiary

* sex: insurance contractor gender, female, male

* bmi: Body mass index, providing an understanding of body, weights that are relatively high or low relative to height, objective index of body weight (kg / m ^ 2) using the ratio of height to weight, ideally 18.5 to 24.9

* children: Number of children covered by health insurance / Number of dependents

* smoker: Smoking

* region: the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.

* charges: Individual medical costs billed by health insurance

In [None]:
# Load the insurance dataset (insurance.csv must be in the working directory)
# and preview the first five rows.
data = pd.read_csv("insurance.csv")
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
data.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


## II. Perform An Exploratory Data Analysis

### 1. Analyze the data and draw meaningful conclusions from the data.

In [None]:
# Create a column that buckets 'bmi' into weight-status categories.
# right=False makes each interval closed on the left, [low, high), so the
# boundary values fall in the conventional class:
# 18.5 -> healthy, 25 -> overweight, 30 -> obese.
bins = [-np.inf, 18.5, 25, 30, np.inf]
labels = ["underweight", "healthy", "overweight", "obese"]
data['bmi_cat'] = pd.cut(data['bmi'], bins=bins, labels=labels, right=False)

* Examine the distribution of Bmi (Body Mass Index)

In [None]:
# Histogram of the raw BMI values
fig = px.histogram(data_frame=data, x="bmi")
fig.show()

ANSWER: According to the histogram, the BMI variable has an approximately normal (bell-shaped) distribution.

* Examine the relationship between “smoker” and “charges”.

In [None]:
# Share of people in each smoker group (fraction of all rows)
smoker_counts = data['smoker'].value_counts()
(smoker_counts / len(data)).round(2)

no     0.8
yes    0.2
Name: smoker, dtype: float64

In [None]:
# Total charges billed to each smoker group, largest first
charges_by_smoker = (
    data.groupby('smoker')['charges']
    .sum()
    .sort_values(ascending=False)
)
charges_by_smoker.round()

smoker
no     8974061.0
yes    8781764.0
Name: charges, dtype: float64

In [None]:
# Bar chart of the total charges per smoker group
px.bar(
    data_frame=charges_by_smoker,
    x=charges_by_smoker.index,
    y=charges_by_smoker,
    title="Health insurance costs per smoker",
    text_auto='.',
)

ANSWER: According to the graph, non-smokers account for slightly more of the total billed charges than smokers (approx. $9.0 million vs. about $8.8 million). However, non-smokers are about 80% of the insured people (1,064 vs. 274), so the average charge per person is much higher for smokers (roughly $32K vs. $8.4K).

In [None]:
# Distribution of individual charges per smoker group, with every point drawn
fig = px.box(data_frame=data, x="smoker", y="charges", points="all")
fig.show()

ANSWER: Although the health insurance expenses of people who do not smoke stay below about 37K dollars, a much lower range than that of people who smoke, this client segment still accumulates the highest total amount of insurance expenses, since it contains approximately 80% of the people with health insurance.

* Examine the relationship between “smoker” and “region”

In [None]:
# Number of people per (region, smoker) pair, as a one-column frame
df_region_and_smoker = data.groupby(by=['region', 'smoker']).size().to_frame()

In [None]:
# Give the unnamed size column (labelled 0) a readable name
df_region_and_smoker = df_region_and_smoker.rename(columns={0: 'count'})
df_region_and_smoker

Unnamed: 0_level_0,Unnamed: 1_level_0,count
region,smoker,Unnamed: 2_level_1
northeast,no,257
northeast,yes,67
northwest,no,267
northwest,yes,58
southeast,no,273
southeast,yes,91
southwest,no,267
southwest,yes,58


In [None]:
# Flatten the (region, smoker) index into ordinary columns for plotting
reg_and_smok = df_region_and_smoker.reset_index()
reg_and_smok

Unnamed: 0,region,smoker,count
0,northeast,no,257
1,northeast,yes,67
2,northwest,no,267
3,northwest,yes,58
4,southeast,no,273
5,southeast,yes,91
6,southwest,no,267
7,southwest,yes,58


In [None]:
# Grouped bars: people per region, split by smoker status
fig = px.bar(
    data_frame=reg_and_smok,
    x="region",
    y='count',
    color='smoker',
    barmode='group',
    text_auto='.',
    height=400,
)
fig.show()

ANSWER: The region with the highest number of people who smoke is the "Southeast" region with a total of 91 smokers (against 273 non-smokers), while the regions with the fewest smokers are the Northwest and Southwest, tied at 58 people who smoke each.

* Examine the relationship between “bmi” and “sex”.

In [None]:
# Share of people by sex (fraction of all rows)
(data['sex'].value_counts() / len(data)).round(2)

male      0.51
female    0.49
Name: sex, dtype: float64

In [None]:
# BMI distribution for each sex, with every point drawn
fig = px.box(data_frame=data, x="sex", y="bmi", points="all")
fig.show()

ANSWER: From the proportions above we can see that the numbers of men and women with health insurance are nearly even, with shares of 51% for men and 49% for women.

In [None]:

# Mean BMI for every (bmi_cat, sex) pair, largest first, then as a flat frame
avg_bmi_by_bmi_cat_and_sex = (
    data.groupby(['bmi_cat', 'sex'])['bmi']
    .mean()
    .sort_values(ascending=False)
)
df_avg_bmi_by_bmi_cat_and_sex = avg_bmi_by_bmi_cat_and_sex.to_frame().reset_index()

In [None]:
df_avg_bmi_by_bmi_cat_and_sex

Unnamed: 0,bmi_cat,sex,bmi
0,obese,male,35.344272
1,obese,female,35.210614
2,overweight,female,27.630609
3,overweight,male,27.626296
4,healthy,female,22.673432
5,healthy,male,22.635787
6,underweight,female,17.771538
7,underweight,male,17.349375


In [None]:
# Grouped bars: mean BMI per weight category, split by sex
fig = px.bar(
    data_frame=df_avg_bmi_by_bmi_cat_and_sex,
    x="bmi_cat",
    y='bmi',
    color='sex',
    barmode='group',
    text_auto='.2f',
    height=400,
)
fig.show()

ANSWER: From the bar graph we can see that, within each weight category, the average BMI is nearly the same for men and women with health insurance.

* Find the "region" with the most "children".

In [None]:
# Total number of covered children per region, largest first
children_per_region = data.groupby('region')['children'].sum()
children_qty = children_per_region.sort_values(ascending=False)
children_qty

region
southeast    382
northwest    373
southwest    371
northeast    339
Name: children, dtype: int64

ANSWER: According to our consultation, the region with the most children covered by health insurance is the Southeast region.

* Examine the relationship between “age” and “bmi”.

In [None]:
# Share of people in each weight category (fraction of all rows)
(data['bmi_cat'].value_counts() / len(data)).round(2)

obese          0.53
overweight     0.29
healthy        0.17
underweight    0.02
Name: bmi_cat, dtype: float64

In [None]:
# Mean age per weight category, oldest group first
avg_age_by_bmi_cat = (
    data.groupby('bmi_cat')['age']
    .mean()
    .sort_values(ascending=False)
)
avg_age_by_bmi_cat

bmi_cat
obese          40.361702
overweight     38.823834
healthy        36.893805
underweight    32.380952
Name: age, dtype: float64

In [None]:
# Age distribution within each weight category, with every point drawn
fig = px.box(data_frame=data, x="bmi_cat", y="age", points='all')
fig.show()

ANSWER: From the proportions above we can see that the largest group of people who spend on health insurance is the obese category, which represents 53% of the total.

In [None]:
# Bar chart of the mean age per weight category
px.bar(
    data_frame=avg_age_by_bmi_cat,
    x=avg_age_by_bmi_cat.index,
    y=avg_age_by_bmi_cat,
    title="Average age by health condition",
    text_auto='.3f',
)

ANSWER: From the bar chart we can see that the highest average age by health status belongs to obese people, at 40.4 years. We can also note that the average age rises with each BMI category, from underweight up to obese. People with a healthy status average 36.9 years of age.

* Examine the relationship between “bmi” and “children”.

In [None]:
# Share of people by number of dependents (fraction of all rows)
(data['children'].value_counts() / len(data)).round(2)

0    0.43
1    0.24
2    0.18
3    0.12
4    0.02
5    0.01
Name: children, dtype: float64

In [None]:
# Total number of covered children per weight category, largest first
children_qty_by_bmi_cat = (
    data.groupby('bmi_cat')['children']
    .sum()
    .sort_values(ascending=False)
)
children_qty_by_bmi_cat

bmi_cat
obese          781
overweight     408
healthy        254
underweight     22
Name: children, dtype: int64

In [None]:
# Bar chart of the total number of children per weight category
px.bar(
    data_frame=children_qty_by_bmi_cat,
    x=children_qty_by_bmi_cat.index,
    y=children_qty_by_bmi_cat,
    title="Number of children by health condition of the insured person",
    text_auto='.',
)

ANSWER: According to the graph, we can see that people with obese health conditions have the largest number of dependent children, this being a total value of 781 children.

* Is there an outlier in the "bmi" variable? Please review

In [None]:
# Single box plot of BMI to eyeball potential outliers
fig = px.box(data_frame=data, y="bmi")
fig.show()

In [None]:
# Summary statistics of 'bmi' used for the IQR-based outlier check.

# First quartile (25th percentile)
Q1 = data['bmi'].quantile(0.25)
print('First Quartile (Q1) is: ', Q1)

# Third quartile (75th percentile); this is Q3, not the "second" quantile
Q3 = data['bmi'].quantile(0.75)
print('Third Quartile (Q3) is: ', Q3)

# Interquartile range: spread of the middle 50% of the values
IQR = Q3 - Q1
print('IQR is: ', IQR)

# Median (the second quartile)
med = data['bmi'].median()
print('Median is: ', med)

# Smallest observed value
minimum_value = data['bmi'].min()
print('Minimum value is: ', minimum_value)

# Largest observed value
maximum_value = data['bmi'].max()
print('Maximum value is: ', maximum_value)

First Quantile is:  26.29625
Second Quantile is:  34.69375
IQR is:  8.3975
Median is:  30.4
Minimum value is:  15.96
Maximum value is:  53.13


In [None]:
# Calculating lower whisker: 1.5 * IQR below the first quartile
lower_whisker = (Q1 - 1.5 * IQR)
print('lower_whisker value is: ', lower_whisker)

# Calculating upper whisker: 1.5 * IQR above the third quartile
upper_whisker = (Q3 + 1.5 * IQR)
print('upper_whisker value is: ', upper_whisker)

lower_whisker value is:  13.7
upper_whisker value is:  47.290000000000006


In [None]:
# LOCATING THE OUTLIERS PART1
# Boolean masks for rows whose BMI lies outside the whisker limits

lower_location = data['bmi'].lt(lower_whisker)
upper_location = data['bmi'].gt(upper_whisker)

In [None]:
# LOCATING THE OUTLIERS PART2
# Keep the rows flagged by either mask
is_outlier = lower_location | upper_location
data.loc[is_outlier]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,bmi_cat
116,58,male,49.06,0,no,southeast,11381.3254,obese
286,46,female,48.07,2,no,northeast,9432.9253,obese
401,47,male,47.52,1,no,southeast,8083.9198,obese
543,54,female,47.41,0,yes,southeast,63770.42801,obese
847,23,male,50.38,1,no,southeast,2438.0552,obese
860,37,female,47.6,2,yes,southwest,46113.511,obese
1047,22,male,52.58,1,yes,southeast,44501.3982,obese
1088,52,male,47.74,1,no,southeast,9748.9106,obese
1317,18,male,53.13,0,no,southeast,1163.4627,obese


ANSWER: From the table we can see that there are 9 obese people who exceed the upper limit of BMI. In addition to this, the maximum age that one of the people reaches is 58 years, while the minimum age was 18 years. We also note that people who exceed this limit at most have up to 2 dependents. We also noticed that of those people who exceed the upper limit of BMI, only 3 are women. Something that can also be noticed in these people is that more than half of them come from the Southeast region.

* Examine the relationship between “bmi” and “charges”.

In [None]:
# Total charges per weight category, largest first, rounded for display
expense_by_bmi_cat = (
    data.groupby('bmi_cat')['charges']
    .sum()
    .sort_values(ascending=False)
)
expense = expense_by_bmi_cat.round()
expense

bmi_cat
obese          10970453.0
overweight      4245152.0
healthy         2358410.0
underweight      181810.0
Name: charges, dtype: float64

In [None]:
# Bar chart of the total charges per weight category
px.bar(
    data_frame=expense_by_bmi_cat,
    x=expense_by_bmi_cat.index,
    y=expense_by_bmi_cat,
    title="Total expenses per health condition",
    text_auto='.3s',
)

ANSWER: The most important health insurance expenses are made by obese people with a value of approximately 11M dollars.

In [None]:
# Distribution of individual charges within each weight category
fig = px.box(data_frame=data, x="bmi_cat", y="charges", points='all')
fig.show()

ANSWER: We see from the box plot that most people are concentrated in the obese category. This segment also has the largest number of outliers compared with the other health states.

* Examine the relationship between “region”, “smoker” and “bmi” using bar plot.

In [None]:
# Sum of BMI values for every (region, smoker) pair, as a flat frame
relation = data.groupby(['region', 'smoker'])['bmi'].sum()
df_relation = relation.to_frame().reset_index()

In [None]:
df_relation

Unnamed: 0,region,smoker,bmi
0,northeast,no,7538.345
1,northeast,yes,1913.87
2,northwest,no,7799.785
3,northwest,yes,1690.145
4,southeast,no,9129.78
5,southeast,yes,3011.8
6,southwest,no,8145.6
7,southwest,yes,1798.3


In [None]:
# Grouped bars: summed BMI per region, split by smoker status
fig = px.bar(
    data_frame=df_relation,
    x="region",
    y="bmi",
    color='smoker',
    barmode='group',
    text_auto='.3s',
    height=400,
)
fig.show()

ANSWER: This bar chart shows us the sum of all the BMI of each of the inhabitants by region, segmented by smoker. In this graph we can see that the highest BMI sum is presented by the Southeast region with approximately 12.1K, of which 75% do not smoke.

### 2. Try to use data visualization techniques as much as possible while examining the data.

#### REGION - CHARGES - SMOKER (BOXPLOTS)

In [None]:
# Charges per region, one box per smoker group
fig = px.box(data_frame=data, x="region", y="charges", color="smoker")
fig.show()

#### REGION - CHARGES - SEX (BOXPLOTS)

In [None]:
# Charges per region, one box per sex
fig = px.box(data_frame=data, x="region", y="charges", color="sex")
fig.show()

#### REGION - CHARGES - BMI_CAT (BOXPLOTS)

In [None]:
# Charges per region, one box per weight category
fig = px.box(data_frame=data, x="region", y="charges", color="bmi_cat")
fig.show()

#### REGION - BMI - SMOKER (BOXPLOTS)

In [None]:
# BMI per region, one box per smoker group
fig = px.box(data_frame=data, x="region", y="bmi", color="smoker")
fig.show()

#### REGION - BMI - SEX (BOXPLOTS)

In [None]:
# BMI per region, one box per sex
fig = px.box(data_frame=data, x="region", y="bmi", color="sex")
fig.show()

#### REGION - BMI - BMI_CAT (BOXPLOTS)

In [None]:
# BMI per region, one box per weight category
fig = px.box(data_frame=data, x="region", y="bmi", color="bmi_cat")
fig.show()

#### IMPORTANT DATA

In [None]:
# SHARE OF PEOPLE BY SEX (fraction of all rows)
(data['sex'].value_counts() / len(data)).round(2)

male      0.51
female    0.49
Name: sex, dtype: float64

In [None]:
# SHARE OF PEOPLE BY REGION (fraction of all rows)
(data['region'].value_counts() / len(data)).round(2)

southeast    0.27
southwest    0.24
northwest    0.24
northeast    0.24
Name: region, dtype: float64

In [None]:
# SHARE OF PEOPLE BY SMOKER STATUS (fraction of all rows)
(data['smoker'].value_counts() / len(data)).round(2)

no     0.8
yes    0.2
Name: smoker, dtype: float64

In [None]:
# SHARE OF PEOPLE BY HEALTH STATE (fraction of all rows)
(data['bmi_cat'].value_counts() / len(data)).round(2)

obese          0.53
overweight     0.29
healthy        0.17
underweight    0.02
Name: bmi_cat, dtype: float64

In [None]:
# SHARE OF PEOPLE BY THE NUMBER OF DEPENDENTS (fraction of all rows)
(data['children'].value_counts() / len(data)).round(2)

0    0.43
1    0.24
2    0.18
3    0.12
4    0.02
5    0.01
Name: children, dtype: float64

### 3. Please add the meanings you deduced from the analyzes as a comment line

* 51% of people who spend on health insurance are men.
* The largest number of people spending on health insurance comes from the Southeast region at 27%.
* People who spend on health insurance and who do not smoke represent 80% of the total.
* 53% of people who spend on health insurance are obese.
* 43% of people who spend on health insurance have no dependents (zero children).

## III. Data Preprocessing

### 1. In this section, prepare the data you have, for training the model.

In [None]:
# Reload the raw CSV so preprocessing starts from the original columns;
# this rebinds `data`, so the 'bmi_cat' column added during the EDA is gone.
data = pd.read_csv("insurance.csv")
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### 2. Use Label Encoding and One-Hot Encoding techniques to deal with categorical variables.

In [None]:
# Work on a copy so the encoding steps leave the freshly loaded `data` untouched
preprocessed_data = data.copy()

In [None]:
preprocessed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:
# Binary categorical columns that get an integer code
var_to_le = ['sex', 'smoker']

# fit_transform is applied column by column, so the single encoder is refit for
# each column and afterwards only holds the classes of the last one ('smoker').
le = LabelEncoder()
encoded_columns = preprocessed_data[var_to_le].apply(le.fit_transform)
preprocessed_data[var_to_le] = encoded_columns

In [None]:
preprocessed_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [None]:
# One-hot encode 'region' into region_<name> indicator columns
dummy = pd.get_dummies(preprocessed_data['region'], prefix='region')

# Swap the original 'region' column for its indicator columns
preprocessed_data = pd.concat(
    [preprocessed_data.drop(columns='region'), dummy],
    axis=1,
)

# Preview the encoded frame
preprocessed_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


### 3. Split your dataset into X_train,X_test, y_train, y_test.

In [None]:
# 70 / 30 train-test split with a fixed seed so the partition is reproducible
df_train, df_test = train_test_split(
    preprocessed_data,
    test_size=0.3,
    train_size=0.7,
    random_state=100,
)

### 4. Scale the dataset by normalizing it(Min-Max Scaling or Standard Scaling).

In [None]:
# Standard scaling: each selected column is centred to mean 0 and scaled to unit variance
scaler = StandardScaler()

In [None]:
# Columns that will be standardized. The list includes the target 'charges'
# and leaves 'children' and the encoded categorical columns unscaled.
numeric_vars = ['age', 'bmi', 'charges']

In [None]:
# Fit the scaler on the training split and standardize its numeric columns.
# 'charges' (the target) is standardized as well, so y_train is in
# standard-deviation units rather than dollars.
df_train[numeric_vars] = scaler.fit_transform(df_train[numeric_vars])
df_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
966,0.847399,1,-0.966302,2,1,0.890355,0,1,0,0
522,0.847399,0,0.502693,0,0,-0.27234,1,0,0,0
155,0.349938,1,1.405512,0,0,-0.51291,0,1,0,0
671,-0.716049,0,0.058934,0,0,-0.760694,1,0,0,0
1173,-0.076457,1,-0.247106,2,0,-0.553383,0,1,0,0


In [None]:
# Standardize the test split with the mean / std learned from the training split.
# transform() is used instead of fit_transform(): refitting on the test rows would
# leak test-set statistics and put train and test data on different scales.
df_test[numeric_vars] = scaler.transform(df_test[numeric_vars])
df_test.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
12,-1.181362,1,0.694997,0,0,-0.969244,0,0,0,1
306,-0.823643,0,-0.491718,2,0,0.553671,0,0,0,1
318,0.321057,0,-0.46678,0,0,-0.504975,0,1,0,0
815,-1.395994,0,0.189353,0,0,-0.965005,0,0,1,0
157,-1.539081,1,-0.891589,0,1,0.166985,1,0,0,0


In [None]:
# Predictor columns, listed once so both splits use exactly the same layout
feature_columns = [
    'age', 'sex', 'bmi', 'children', 'smoker',
    'region_northeast', 'region_northwest',
    'region_southeast', 'region_southwest',
]

# divide into X_train, y_train
X_train = df_train[feature_columns]
y_train = df_train['charges']

# divide into X_test, y_test
X_test = df_test[feature_columns]
y_test = df_test['charges']

## IV. Model Selection

### 1. Select several regression models and train them with the preprocessed data.

In [None]:
# ----------A. LinearRegression----------

# ordinary least-squares baseline fitted on the training split
lin_reg = LinearRegression()
lin_reg.fit(X=X_train, y=y_train)

In [None]:
# ----------B. DecisionTreeRegressor----------

# train model; random_state is fixed so the fitted tree (and its
# cross-validation scores) are the same on every run
tree_reg = DecisionTreeRegressor(random_state=100)
tree_reg.fit(X_train, y_train)

In [None]:
# ----------C. SVR----------

# linear-kernel support vector regressor fitted on the training split
svr_reg = SVR(kernel='linear', C=0.1, epsilon=0.02)
svr_reg.fit(X=X_train, y=y_train)

In [None]:
# ----------D. GradientBoostingRegressor----------

# train model; random_state is fixed so the fit and its
# cross-validation scores are reproducible
gb_reg = GradientBoostingRegressor(random_state=100)
gb_reg.fit(X_train, y_train)

In [None]:
# ----------E. CatBoostRegressor----------

# train model; verbose=0 silences the per-iteration training log, which
# otherwise floods the notebook output here and again during cross validation
cb_reg = CatBoostRegressor(verbose=0)
cb_reg.fit(X_train, y_train)

Learning rate set to 0.040517
0:	learn: 0.9686903	total: 134ms	remaining: 2m 14s
1:	learn: 0.9427364	total: 140ms	remaining: 1m 9s
2:	learn: 0.9170730	total: 144ms	remaining: 48s
3:	learn: 0.8908031	total: 149ms	remaining: 37s
4:	learn: 0.8672689	total: 153ms	remaining: 30.5s
5:	learn: 0.8445081	total: 159ms	remaining: 26.3s
6:	learn: 0.8220426	total: 163ms	remaining: 23.1s
7:	learn: 0.8012332	total: 166ms	remaining: 20.6s
8:	learn: 0.7803752	total: 168ms	remaining: 18.5s
9:	learn: 0.7608857	total: 171ms	remaining: 16.9s
10:	learn: 0.7433680	total: 171ms	remaining: 15.4s
11:	learn: 0.7271559	total: 173ms	remaining: 14.3s
12:	learn: 0.7110936	total: 174ms	remaining: 13.2s
13:	learn: 0.6951385	total: 176ms	remaining: 12.4s
14:	learn: 0.6780657	total: 177ms	remaining: 11.6s
15:	learn: 0.6661210	total: 177ms	remaining: 10.9s
16:	learn: 0.6518789	total: 179ms	remaining: 10.4s
17:	learn: 0.6392959	total: 180ms	remaining: 9.82s
18:	learn: 0.6250503	total: 182ms	remaining: 9.39s
19:	learn: 0.6

<catboost.core.CatBoostRegressor at 0x29dc59c8160>

In [None]:
# ----------F. LGBMRegressor----------

# LightGBM regressor with default settings, fitted on the training split
lgbm_reg = LGBMRegressor()
lgbm_reg.fit(X=X_train, y=y_train)

In [None]:
# ----------G. XGBRegressor----------

# XGBoost regressor with default settings, fitted on the training split
xgb_reg = XGBRegressor()
xgb_reg.fit(X=X_train, y=y_train)

In [None]:
# ----------H. RandomForestRegressor----------

# train model; random_state is fixed so the bootstrap samples and feature
# draws (and therefore the scores) are the same on every run
rf_reg = RandomForestRegressor(random_state=100)
rf_reg.fit(X_train, y_train)

### 2. Examine the performances of the selected models using cross validation.

In [None]:
# 5-fold shuffled splitter shared by every model so the folds are identical
folds = KFold(n_splits=5, shuffle=True, random_state=100)


def _cv_r2(model):
    """Return the per-fold R^2 scores of `model` on the training split."""
    return cross_val_score(model, X_train, y_train, scoring='r2', cv=folds)


scores_lin_reg = _cv_r2(lin_reg)    # A. LinearRegression
scores_tree_reg = _cv_r2(tree_reg)  # B. DecisionTreeRegressor
scores_svr_reg = _cv_r2(svr_reg)    # C. SVR
scores_gb_reg = _cv_r2(gb_reg)      # D. GradientBoostingRegressor
scores_cb_reg = _cv_r2(cb_reg)      # E. CatBoostRegressor
scores_lgbm_reg = _cv_r2(lgbm_reg)  # F. LGBMRegressor
scores_xgb_reg = _cv_r2(xgb_reg)    # G. XGBRegressor
scores_rf_reg = _cv_r2(rf_reg)      # H. RandomForestRegressor

Learning rate set to 0.039107
0:	learn: 0.9604339	total: 1.42ms	remaining: 1.42s
1:	learn: 0.9358584	total: 2.71ms	remaining: 1.35s
2:	learn: 0.9121842	total: 3.74ms	remaining: 1.24s
3:	learn: 0.8873772	total: 4.87ms	remaining: 1.21s
4:	learn: 0.8652935	total: 5.95ms	remaining: 1.18s
5:	learn: 0.8441663	total: 7.1ms	remaining: 1.18s
6:	learn: 0.8236691	total: 8.2ms	remaining: 1.16s
7:	learn: 0.8045257	total: 9.34ms	remaining: 1.16s
8:	learn: 0.7859765	total: 10.4ms	remaining: 1.15s
9:	learn: 0.7684389	total: 11.5ms	remaining: 1.14s
10:	learn: 0.7511801	total: 12.6ms	remaining: 1.14s
11:	learn: 0.7336436	total: 13.3ms	remaining: 1.1s
12:	learn: 0.7169673	total: 14.5ms	remaining: 1.1s
13:	learn: 0.7007563	total: 15.7ms	remaining: 1.1s
14:	learn: 0.6861481	total: 17.2ms	remaining: 1.13s
15:	learn: 0.6712824	total: 18.2ms	remaining: 1.12s
16:	learn: 0.6579559	total: 19ms	remaining: 1.1s
17:	learn: 0.6446547	total: 20.2ms	remaining: 1.1s
18:	learn: 0.6324204	total: 21.2ms	remaining: 1.1s
19

In [None]:
# One column per model, one row per fold
per_model_scores = [
    scores_lin_reg, scores_tree_reg,
    scores_svr_reg, scores_gb_reg,
    scores_cb_reg, scores_lgbm_reg,
    scores_xgb_reg, scores_rf_reg,
]
combined_array = np.column_stack(per_model_scores)

In [None]:
# Column labels, in the same order as the columns of combined_array.
# All labels share the '<model>_r2' pattern ('xgb_reg_2' and 'rf_reg' did not).
score_columns = ['lin_reg_r2', 'tree_reg_r2',
                 'svr_reg_r2', 'gb_reg_r2',
                 'cb_reg_r2', 'lgbm_reg_r2',
                 'xgb_reg_r2', 'rf_reg_r2']

# Per-fold R^2 table: one row per fold, one column per model
df_r2 = pd.DataFrame(combined_array, columns=score_columns)

df_r2

Unnamed: 0,lin_reg_r2,tree_reg_r2,svr_reg_r2,gb_reg_r2,cb_reg_r2,lgbm_reg_r2,xgb_reg_2,rf_reg
0,0.780059,0.67699,0.68894,0.882141,0.852074,0.854782,0.827916,0.853687
1,0.706203,0.659461,0.659554,0.793916,0.787095,0.778234,0.737468,0.789918
2,0.683659,0.544992,0.647733,0.811915,0.800167,0.786509,0.753496,0.794373
3,0.721112,0.750857,0.657948,0.841886,0.824331,0.821181,0.797102,0.826986
4,0.736396,0.722546,0.63226,0.833293,0.834869,0.822719,0.812275,0.837763


In [None]:
# Mean R^2 across folds for each model, best model first
mean_r2_per_model = df_r2.mean(axis=0)
models_performance = mean_r2_per_model.sort_values(ascending=False)
models_performance

gb_reg_r2      0.832630
rf_reg         0.820546
cb_reg_r2      0.819707
lgbm_reg_r2    0.812685
xgb_reg_2      0.785651
lin_reg_r2     0.725486
tree_reg_r2    0.670969
svr_reg_r2     0.657287
dtype: float64

### 3. Choose the best performing model

In [None]:
# Bar chart of the mean cross-validated R^2 per model
px.bar(
    data_frame=models_performance,
    x=models_performance.index,
    y=models_performance,
    title="models_performance",
    text_auto='.3f',
)

Answer:

The best model that we are going to select is: GradientBoostingRegressor

## V. Hyper-parameter Optimization

### 1. Optimize the hyper-parameters of the model selected in the previous step.

In [None]:
# Fresh estimator for the grid search. random_state is fixed because the search
# tries subsample < 1.0, which makes the fit stochastic; 0 matches the seed used
# for the final validation model, so both fits are comparable.
gb_reg = GradientBoostingRegressor(random_state=0)

In [None]:
# Hyper-parameter candidates explored by the grid search
grid = {
    'n_estimators': [10, 50, 100, 500],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'subsample': [0.5, 0.7, 1.0],
    'max_depth': [3, 7, 9],
}

In [None]:
# define the evaluation procedure: 10-fold CV repeated 3 times (30 fits per
# candidate), seeded so the folds are reproducible
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

### 2. Optimize parameters with Grid Search. (Grid Search or Randomized Search)


In [None]:
# Exhaustive search over `grid`, scored with the estimator's default score
# and run on all available cores
gscv = GridSearchCV(
    estimator=gb_reg,
    param_grid=grid,
    cv=cv,
    n_jobs=-1,
)

# Run the search on the training split; fit() returns the fitted search object
gb_reg_cv = gscv.fit(X_train, y_train)

In [None]:
# Report the best mean CV score and the parameters that achieved it
best_cv_score = gb_reg_cv.best_score_
best_cv_params = gb_reg_cv.best_params_
print("Best: %f using %s" % (best_cv_score, best_cv_params))

Best: 0.843644 using {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}


## VI.  Model Evaluation

### 1. Evaluate the optimized model using regression model evaluation metrics. (Ex. Mean Squared Error, Mean Absolute Error etc.)

In [None]:
# Predict on the test split with the refitted best estimator of the search
best_gb_reg = gb_reg_cv.best_estimator_
y_pred_gb_reg = best_gb_reg.predict(X_test)

In [None]:
# Test-set metrics of the grid-search model. 'charges' was standardized earlier,
# so MSE and RMSE are in standardized units, not dollars.
mse_gb_reg = mean_squared_error(y_test, y_pred_gb_reg)
print("Mean Squared Error(MSE):", mse_gb_reg)

# RMSE is the square root of the MSE computed above
print("Root Mean Squared Error(RMSE):", np.sqrt(mse_gb_reg))

# Share of the target variance explained by the predictions
print("R-Square:", r2_score(y_test, y_pred_gb_reg))

Mean Squared Error(MSE): 0.131796538972961
Root Mean Squared Error(RMSE): 0.36303793048793265
R-Square: 0.868203461027039


#### Validating the model with the best parameters

In [None]:
# Refit a standalone model with the best parameters reported by the grid search
tuned_params = {
    "learning_rate": 0.01,
    "max_depth": 3,
    "n_estimators": 500,
    "subsample": 0.7,
}
gb_reg_opt = GradientBoostingRegressor(random_state=0, **tuned_params)

gb_reg_opt.fit(X_train, y_train)

In [None]:
# Predict the (standardized) charges of the test split with the refitted model
y_pred_gb_reg_opt = gb_reg_opt.predict(X_test)

In [None]:
# Test-set metrics of the refitted model. 'charges' was standardized earlier,
# so MSE and RMSE are in standardized units, not dollars.
mse_gb_reg_opt = mean_squared_error(y_test, y_pred_gb_reg_opt)
print("Mean Squared Error(MSE):", mse_gb_reg_opt)

# RMSE is the square root of the MSE computed above
print("Root Mean Squared Error(RMSE):", np.sqrt(mse_gb_reg_opt))

# Share of the target variance explained by the predictions
print("R-Square:", r2_score(y_test, y_pred_gb_reg_opt))

Mean Squared Error(MSE): 0.12954053405857002
Root Mean Squared Error(RMSE): 0.3599173989383814
R-Square: 0.87045946594143
