João Victor Nascimento da Silva

Consider the dataset below, which contains the **heights** (in inches) and **weights** (in pounds, lbs) of fictitious people. <br/>
https://www.kaggle.com/mustafaali96/weight-height

Perform the following tasks:
- Split the dataset into 80% for training and 20% for testing
- Compute the correlation between the training set variables
- Train a regression model considering **height** as the independent variable and **weight** as the dependent one
   + Compute the model's coefficient of determination (R²)
   + Plot a scatterplot of the two variables containing the regression model (line)
- Predict the test set
   + Plot a scatterplot of the two variables containing the regression model (line)
   + Compute error metrics for regression

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global matplotlib defaults for the notebook: extra-large text for the legend,
# axis labels, titles and tick labels, plus a wide default figure.
params = {
    text_setting: 'x-large'
    for text_setting in (
        'legend.fontsize',
        'axes.labelsize',
        'axes.titlesize',
        'xtick.labelsize',
        'ytick.labelsize',
    )
}
params['figure.figsize'] = (15, 5)
plt.rcParams.update(params)

In [None]:
# Load the height/weight dataset from the local CSV copy.
df = pd.read_csv('datasets/weight-height.csv')
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# Restrict the frame to the two columns used by the regression
# (all rows, Height and Weight only), then echo it for inspection.
df = df.loc[:, ['Height', 'Weight']]
df

In [None]:
# (number of rows, number of columns) of the reduced frame.
df.shape

In [None]:
# Pairwise correlation between Height and Weight over the full dataset
# (computed here before any train/test split).
df.corr()

In [None]:
# Independent variable. The double brackets keep Height as a one-column
# DataFrame rather than a Series.
X = df[['Height']]
X

In [None]:
# Dependent variable: Weight as a Series sharing df's index.
y = df['Weight']
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the dataset into 80% for training and 20% for testing.
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)  # NOTE(review): original note "#42" is presumably a previously tried seed — confirm

In [None]:
# Rebuild a two-column training frame (Height, Weight) so the training
# subset can be analysed and plotted as a single table.
# `assign` aligns y_train to X_train by index.
df_train = X_train[['Height']].assign(Weight=y_train)
df_train

In [None]:
# Compute the correlation between the training set variables
# (Height vs. Weight, training rows only).
df_train.corr()

In [None]:
# Plot a scatterplot of the two training variables with a regression line.
# NOTE: the line drawn here is fitted by seaborn itself; the scikit-learn
# model `reg` is only created and trained in later cells.
sns.regplot(data=df_train, x='Height', y='Weight')

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create an (unfitted) linear regression model with default settings
# and echo it so the notebook shows its representation.
reg = LinearRegression()
reg

In [None]:
# Train a regression model considering height as the independent variable
# and weight as the dependent one. Only the training split is used.
reg.fit(X_train, y_train)

In [None]:
# Compute the model's coefficient of determination (R²) on the training set.
# NOTE: `reg.coef_` is the fitted slope, not R²; for regressors,
# `score(X, y)` returns the coefficient of determination of the prediction.
r2_train = reg.score(X_train, y_train)
r2_train

In [None]:
# Predict the test set: estimated Weight for every Height in X_test.
y_pred = reg.predict(X_test)
# Echo the predictions for inspection.
y_pred

In [None]:
# Checking the prediction: put each test Height next to its predicted
# weight, its true weight, and the signed and absolute residuals
# (all rounded to two decimals).
residual = (y_test - y_pred).round(decimals=2)
comparison_columns = {
    'Height': X_test['Height'],
    'Weight': y_pred.round(decimals=2),
    'Weight (true)': y_test.round(decimals=2),
    'Diferença residual': residual,
    'Diferença absoluta': residual.abs(),
}
res = pd.DataFrame(comparison_columns)
res

In [None]:
# Plot a scatterplot of the two variables containing the regression model (line):
# training points on the left panel, test points on the right, with shared axes
# so the two panels are directly comparable.
fig, axs = plt.subplots(1, 2, figsize=(20, 6), sharex=True, sharey=True)

# Regression line evaluated over every Height in the dataset; these two names
# are reused by the combined train/test plot in the following cell.
regression_line_x = X['Height']
regression_line_y = reg.predict(X)

# One entry per panel: (axis, heights, weights, subset name, scatter styling).
panels = [
    (axs[0], X_train['Height'], y_train, 'Training', {}),
    (axs[1], X_test['Height'], y_test, 'Testing',
     {'color': "green", 'marker': '*', 's': 200}),
]
for ax, heights, weights, subset, point_style in panels:
    sns.scatterplot(x=heights, y=weights, ax=ax, **point_style)
    sns.lineplot(x=regression_line_x, y=regression_line_y, color="red", ax=ax)
    ax.set_title(f'Height vs Weight ({subset} Set)')
    ax.set_xlabel('Height')
    ax.set_ylabel('Weight')

# Render the figure explicitly; a bare `display()` call shows nothing.
plt.show()

In [None]:
# Single plot with training and test points told apart by colour and marker,
# overlaid with the fitted regression line.
# Relies on regression_line_x / regression_line_y from the previous plotting cell.
X_labeled = X.copy()
X_labeled['Label'] = 'Train'
# Rows that ended up in the held-out split are relabelled as test points.
X_labeled.loc[X_test.index, 'Label'] = 'Test'

sns.scatterplot(data=X_labeled, x='Height', y=y, hue='Label', style='Label', s=100)
sns.lineplot(x=regression_line_x, y=regression_line_y, color="red")
plt.title('Height vs Weight (Training and Testing Set)')
plt.xlabel('Height')
plt.ylabel('Weight')

# Render the figure explicitly (replaces the argument-less `display()` call,
# which displayed nothing).
plt.show()

In [None]:
# Compute error metrics for regression.
# Mean absolute error between the true and predicted test weights.
from sklearn.metrics import mean_absolute_error
MAE = mean_absolute_error(y_test, y_pred)
MAE

In [None]:
# Mean squared error between the true and predicted test weights.
from sklearn.metrics import mean_squared_error
MSE = mean_squared_error(y_test, y_pred)
MSE

In [None]:
# Root mean squared error on the test set (same units as Weight).
# The square root of the MSE is taken explicitly because the `squared=False`
# argument of `mean_squared_error` was deprecated and later removed from
# scikit-learn, so this form works regardless of the installed version.
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
RMSE