In [None]:
# MBA DATA SCIENCE & ANALYTICS USP/Esalq
#Supervised ML - SIMPLE AND MULTIPLE REGRESSION ANALYSIS
#Isabela Pereira Lima Dias

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
from statsmodels.iolib.summary2 import summary_col
from skimage import io
import plotly.graph_objs as go
from scipy.stats import pearsonr
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the dataset and rename 'comprimento'/'idade' to 'length'/'age'
df = (
    pd.read_csv("babies.csv", delimiter=",")
    .rename(columns={'comprimento': 'length', 'idade': 'age'})
)
df

In [None]:
# Age vs. length scatter, with the top/right frame lines hidden
ax = df.plot.scatter(x='age', y='length', title='Dispersion', s=32, alpha=.5)
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Distribution of the observed lengths
ax = df['length'].plot.hist(bins=20, title='length')
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Distribution of the observed ages
ax = df['age'].plot.hist(bins=20, title='age')
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Simple OLS regression: length explained by age
linear_model = sm.OLS.from_formula(formula="length ~ age", data=df).fit()
linear_model.summary()

In [None]:
# Per-age mean lengths with a log-x regression curve (plum), plus the
# straight line fitted by linear_model (lightblue)
fig, ax = plt.subplots(figsize=(10, 10))
sns.regplot(data=df, x="age", y="length",
            x_estimator=np.mean, logx=True, color='plum', ax=ax)
ax.plot(df['age'], linear_model.fittedvalues, color='lightblue')
ax.set_title('Dispersion', fontsize=17)
ax.set_xlabel('Age(weeks)', fontsize=16)
ax.set_ylabel('Length(cm)', fontsize=16)
plt.show()


Technical analysis

In [None]:
# Shapiro-Wilk normality test on the OLS residuals (the test used when n < 30)
from scipy.stats import shapiro

ols_residuals = linear_model.resid
shapiro(ols_residuals)

In [None]:
# Shapiro-Francia normality test on the OLS residuals (the test used when n >= 30)
# Function 'shapiroFrancia' from the package 'sfrancia'
# Authors: Luiz Paulo Fávero and Helder Prado Santos
#!pip install sfrancia==1.0.8
from sfrancia import shapiroFrancia

ols_residuals = linear_model.resid
shapiroFrancia(ols_residuals)


In [None]:
# Histogram (with KDE overlay) of the OLS residuals
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(data=linear_model.resid, kde=True, bins=30, ax=ax)
ax.set_xlabel('Residuals', fontsize=16)
ax.set_ylabel('Frequency', fontsize=16)
plt.show()

In [None]:
# Box-Cox transform of the response variable. Box and Cox (1964) developed a
# family of transformations designed to reduce non-normality of the errors in
# a linear model.
# Source: https://www.css.cornell.edu/faculty/dgr2/_static/files/R_html/Transformations.html
# boxcox returns the transformed values (x) and the fitted Box-Cox lambda (lambda_).
# lambda_ is reused further down to map predictions back to the original scale.
from scipy.stats import boxcox
x, lambda_  = boxcox(df['length'])
df['length_bc'] = x  # keep the transformed length next to the original column
df

In [None]:
# Age against the Box-Cox-transformed length and against the raw length.
# Each df.plot call opens its own figure, so the spines are hidden on the
# Axes that call returns (plt.gca() would only reach the last figure).
for length_col in ('length_bc', 'length'):
    ax = df.plot(kind='scatter', x=length_col, y='age', s=32, alpha=.5)
    ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Distribution of the Box-Cox-transformed lengths
ax = df['length_bc'].plot.hist(bins=20, title='length_bc')
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Distribution of the untransformed lengths, for comparison with length_bc
ax = df['length'].plot.hist(bins=20, title='length')
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Refit the regression with the Box-Cox-transformed length as the response
bc_model = sm.OLS.from_formula(formula='length_bc ~ age', data=df).fit()
bc_model.summary()

In [None]:
# Side-by-side coefficient table for linear_model and bc_model,
# with significance stars and the number of observations (N) per model
summary_col(
    results=[linear_model, bc_model],
    model_names=["Linear model", "Box-Cox Model "],
    stars=True,
    info_dict={'N': lambda fitted: "{0:d}".format(int(fitted.nobs))},
)

In [None]:
# One-row table with the R-squared of each model, rounded to 4 decimals
r_squared = {
    'R-squared OLS': [round(linear_model.rsquared, 4)],
    'R-squared Box-Cox': [round(bc_model.rsquared, 4)],
}
pd.DataFrame(r_squared)

In [None]:
# Shapiro-Francia normality test on the Box-Cox model residuals
bc_residuals = bc_model.resid
shapiroFrancia(bc_residuals)

In [None]:
# Histogram (with KDE overlay) of the Box-Cox model residuals
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(data=bc_model.resid, kde=True, bins=30, ax=ax)
ax.set_title('Histogram')
ax.set_xlabel('Residuals', fontsize=16)
ax.set_ylabel('Frequency', fontsize=16)
plt.show()


Predictions

In [None]:
# OLS prediction of length for age = 52
age_52 = pd.DataFrame({'age': [52]})
linear_model.predict(age_52)

In [None]:
# Box-Cox model prediction for age = 52, converted back to the original
# length scale. The back-transform is applied to the value the model actually
# predicts (not a pasted constant), so the result stays correct when the data
# or the fitted lambda change.
from scipy.special import inv_boxcox

age_52 = pd.DataFrame({'age': [52]})
inv_boxcox(bc_model.predict(age_52), lambda_)

In [None]:
# Fitted values for both models, on the original length scale.
# bc_model was fitted on the Box-Cox scale, so its fitted values are inverted
# with scipy's inv_boxcox, which also covers the lambda == 0 (log) case that a
# hand-written (y * lambda + 1) ** (1 / lambda) cannot.
from scipy.special import inv_boxcox

df['yhat_linear'] = linear_model.fittedvalues
df['yhat_bc_model'] = inv_boxcox(bc_model.fittedvalues, lambda_)

In [None]:
# Display the frame, now including the fitted-value columns of both models
df

In [None]:
# Scatter plots drawn from df (the previous `_df_7` name is never defined in
# this notebook and raises NameError on a fresh kernel).
# Spines are hidden on the Axes returned by each call, so both figures are styled.
# NOTE(review): the first plot pairs length_bc (Box-Cox scale) with
# yhat_bc_model (back-transformed scale) - confirm this pairing is intended.
for x_col, y_col in (('length_bc', 'yhat_bc_model'), ('length', 'age')):
    ax = df.plot(kind='scatter', x=x_col, y=y_col, s=32, alpha=.5)
    ax.spines[['top', 'right']].set_visible(False)

Fitted values vs. observed values

In [None]:
from scipy.optimize import curve_fit

def objective(x, a, b, c, d, e):
    """Quartic polynomial used as the model function for curve_fit.

    Evaluates a*x + b*x**2 + c*x**3 + d*x**4 + e, where ``e`` is the
    constant term. Operates element-wise when ``x`` is an array.
    """
    linear = a * x
    quadratic = b * x**2
    cubic = c * x**3
    quartic = d * x**4
    return linear + quadratic + cubic + quartic + e

# Fitted values of both models plotted against the observed lengths.
# A quartic curve (objective) is fitted through each model's fitted values
# to show the trend; the gray 45-degree line marks fitted == observed.
xdata = df['length']
ydata_linear = df['yhat_linear']
ydata_bc = df['yhat_bc_model']

plt.figure(figsize=(10,10))

# Evaluation grid for the smoothed curves. linspace includes the largest
# observed length (np.arange with step 1 stopped short of it) and stays
# smooth regardless of the data range.
x_line = np.linspace(min(xdata), max(xdata), 200)

# One pass per model: fit the quartic, draw its dashed curve, then the points.
# `_pcov` is used instead of `_` so IPython's last-output variable is untouched.
for fitted, color, label in ((ydata_linear, 'plum', 'OLS Linear'),
                             (ydata_bc, 'lightblue', 'Box-Cox')):
    popt, _pcov = curve_fit(objective, xdata, fitted)
    plt.plot(x_line, objective(x_line, *popt), '--', color=color, linewidth=3,
             label=label)
    plt.scatter(xdata, fitted, alpha=0.5, s=100, color=color)

plt.plot(xdata, xdata, color='gray', linestyle='-', label='45º')
plt.xlabel('Length', fontsize=16)
plt.ylabel('Fitted Values', fontsize=16)
# Labels are attached to the artists above, so the legend no longer depends
# on the order in which lines were added to the axes.
plt.legend(fontsize=17)
plt.title('Dispersion and Fitted Values', fontsize=16)
plt.show()