# IT Academy - Data Science with Python
## Sprint 12: Supervised Regressions
### [Github Supervised Regressions](https://github.com/jesussantana/Supervised-Regression)

[![forthebadge made-with-python](http://ForTheBadge.com/images/badges/made-with-python.svg)](https://www.python.org/)  
[![Made with Jupyter](https://img.shields.io/badge/Made%20with-Jupyter-orange?style=for-the-badge&logo=Jupyter)](https://jupyter.org/try)  
[![wakatime](https://wakatime.com/badge/github/jesussantana/Supervised-Regression.svg)](https://wakatime.com/badge/github/jesussantana/Supervised-Regression)

In [None]:
import numpy as np
import pandas as pd
import warnings

from matplotlib import pyplot as plt
import seaborn as sns
from PIL import Image
%matplotlib inline

# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')
# Set the default figure size through the theme's rc overrides. The previous
# plt.figure(figsize=...) call only opened an empty figure that nothing drew
# on, and did not change the size of the plots created later.
sns.set_theme(style='darkgrid', palette='deep', rc={'figure.figsize': (16, 6)})

In [None]:
# Relative directory prefix and file name of the processed flights CSV;
# the two strings are concatenated when the file is loaded.
path = "../data/"
file = "processed/DelayedFlightsProcessed.csv"

In [None]:
# Load the processed CSV; df_raw is kept as the untouched original.
df_raw = pd.read_csv(path + file)

### Exercise 1: 
  - Create at least three different regression models to try to best predict DelayedFlights.csv flight delay (ArrDelay).

### Exercise 2: 
  - Compare them based on MSE and R2.

### Exercise 3: 
  - Train them using the different parameters they support

### Exercise 4: 
  - Compare their performance using the train / test approach or using all data (internal validation)

In [None]:
# Work on a copy so df_raw stays unmodified.
df = df_raw.copy()

In [None]:
# Preview the first rows.
df.head()

In [None]:
# List the column labels.
df.columns

In [None]:
# Print the per-column dtype and non-null summary.
df.info()

In [None]:
# NOTE(review): repeats the df.head() preview from a few cells earlier.
df.head()

In [None]:
# Show the container type of df.
type(df)

### Simple Linear Regression

In [None]:
# Feature matrix: the single column at position 6, selected with a list so
# the result is already two-dimensional, shape (n_samples, 1).
# NOTE(review): presumably the departure delay, judging by the axis labels
# of the plot below; confirm against df.columns.
X = df.iloc[:, [6]].values
# Target vector: the column at position 0.
y = df.iloc[:, 0].values

- Divide the data set into training set and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

- Create Simple Linear Regression model with training set

In [None]:
# Disabled cell: the model fit is wrapped in a bare string literal, so none
# of it is executed.
"""from sklearn.linear_model import LinearRegression

regression = LinearRegression()
regression.fit(X_train, y_train)"""

- Linear regression with internal and external validation

In [None]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Internal validation: fit on every row and score on those same rows.
regrINT = LinearRegression().fit(X, y)
predINT = regrINT.predict(X)
print("R2 Intern: ", r2_score(y, predINT))

# External validation: fit on the training split, score on the held-out split.
regrEXT = LinearRegression().fit(X_train, y_train)
predEXT = regrEXT.predict(X_test)
print("R2 Extern: ", r2_score(y_test, predEXT))

- View training results

In [None]:
# Training observations in red, the externally validated model's fitted
# line in blue.
plt.scatter(X_train, y_train, color="red")
plt.plot(X_train, regrEXT.predict(X_train), color="blue")
# Axis labels first, then the title; all act on the same current axes.
plt.xlabel("Departure Delay")
plt.ylabel("Arrived Delay")
plt.title("Departure Delay vs Arrived Delay (Trainning Set)")
plt.show()

## Multiple Linear Regression

In [None]:
# Feature matrix: every column except the last one.
# NOTE(review): this slice still contains column 0, the very column used as
# the target below, so a model fitted on X as-is is handed the answer.
X = df.iloc[:, :-1].values
# Target vector: the column at position 0.
y = df.iloc[:, 0].values

In [None]:
# Column 0 of X is the same column as y, so it is dropped from the features
# before splitting; otherwise the regression is trained on its own target.
X_train, X_test, y_train, y_test = train_test_split(X[:, 1:], y, test_size=0.2, random_state=0)

In [None]:
# Fit an ordinary least-squares linear model on the training split.
regression = LinearRegression()
regression.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test rows.
y_pred = regression.predict(X_test)

- Build the optimal MLR (multiple linear regression) model

In [None]:
import statsmodels.api as sm

In [None]:
# Manual backward elimination: refit OLS on columns 1..k of X for
# k = 6, 5, 4, 3, 2 (the trailing column is dropped each round).
# Every summary is printed explicitly: a bare `summary()` in the middle of a
# cell is discarded, only a cell's last expression is displayed.
# NOTE: sm.OLS does not add an intercept by itself, so these models have no
# constant term unless one of the selected columns already is a constant.
for last_col in range(6, 1, -1):
    X_opt = X[:, list(range(1, last_col + 1))]
    regression_OLS = sm.OLS(endog=y, exog=X_opt.tolist()).fit()
    print(regression_OLS.summary())

- Build the optimal MLR (multiple linear regression) model using automatic backward elimination

In [None]:
def backwardElimination(x, sl, target=None):
    """Drop predictors by backward elimination and summarise the final fit.

    An OLS model is fitted with statsmodels. While the largest p-value is
    above ``sl``, the corresponding column is removed and the model is
    refitted, so the returned summary always describes the columns that are
    actually left.

    Args:
        x: 2-D NumPy array of predictors, one column per variable.
        sl: Significance level; the predictor with the largest p-value is
            removed while that p-value exceeds this threshold.
        target: Response vector. Defaults to the module-level ``y``, which
            is what this function has always regressed on.

    Returns:
        The statsmodels summary of the model fitted on the retained columns.
    """
    endog = y if target is None else target
    regressor_OLS = sm.OLS(endog, x.tolist()).fit()
    # Keep at least one predictor: there is nothing to fit on zero columns.
    while x.shape[1] > 1:
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))
        if pvalues[worst] <= sl:
            # Every remaining predictor is significant; stop refitting.
            break
        # Remove exactly one column per round. Deleting several columns
        # against the same p-value vector would shift the later indices.
        x = np.delete(x, worst, 1)
        regressor_OLS = sm.OLS(endog, x.tolist()).fit()

    return regressor_OLS.summary()
 
# Significance level passed to the elimination.
SL = 0.05
# Start from columns 1-6 of X.
X_opt = X[:, [1, 2, 3, 4, 5, 6]]
# NOTE: despite its name, X_Modeled holds the summary object returned by
# backwardElimination, not a reduced feature matrix.
X_Modeled = backwardElimination(X_opt, SL)

In [None]:
# Display the summary returned by the backward elimination.
X_Modeled

## KFold

In [None]:
# Shuffle all rows and keep at most the first 5000 of them.
df = df.sample(frac=1).head(5000)

# Renumber the rows 0..n-1 after the random selection. drop=True discards the
# old index; without it the old labels are inserted as a new first column
# named "index", which shifts every positional (iloc) column lookup on df.
df = df.reset_index(drop=True)

# Predictors and target selected by column name.
X = df[["ArrTime", "Distance", "DepDelay"]]
y = df["ArrDelay"]

In [None]:
# Repeat the previous train/test procedure multiple times
from sklearn.model_selection import KFold 

kf = KFold(n_splits= 10, shuffle = True)  # 10 folds, rows shuffled before splitting

kf.get_n_splits(X) # number of splitting iterations

In [None]:
# Fit a linear regression on each set of training folds and score it on the
# held-out fold.
regr = linear_model.LinearRegression()

resultados = []

for train_index, test_index in kf.split(X):
    # kf.split yields row positions, so select with iloc; this stays correct
    # even when the index labels are not the default 0..n-1 range.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    regr.fit(X_train, y_train)  # fit on the training folds
    predicciones = regr.predict(X_test)  # predict the held-out fold
    score = r2_score(y_test, predicciones)  # computed once, reused below
    print("R2: ", score)
    resultados.append(score)

print("R2 medio: ", np.mean(resultados))

In [None]:
# How strongly the model's evaluation depends on the particular train/test split that was used

## Polynomial Regression

In [None]:
# Preview the first rows of the current df.
df.head()

In [None]:
# Draw a fixed-seed random sample of 25 rows.
df_sample = df.sample(n=25, random_state=1)

In [None]:
# Select predictor and target by name rather than by position, so the choice
# does not depend on column order (a reset_index() without drop=True, for
# example, inserts an "index" column and shifts every position by one).
# Feature matrix, shape (n_samples, 1)
X = df_sample[["DepDelay"]].values

# Target, kept two-dimensional with shape (n_samples, 1)
y = df_sample[["ArrDelay"]].values


In [None]:
# Show both shapes together; as two separate expression statements only the
# last one (y.shape) was displayed by the cell.
X.shape, y.shape

- Fit Polynomial regression with the dataset

In [None]:
# Plain linear fit on the sample, kept as the baseline for the comparison
# with the polynomial model.
lin_reg = LinearRegression()
lin_reg.fit(X, y)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand X into degree-2 polynomial features.
poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X)

# A linear model on the expanded features is the polynomial regression.
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y)

## Visualization of the results of the Polynomial Model vs Linear Regression

In [None]:
# Sample points in red, the plain linear fit in blue.
plt.scatter(X, y, color="red")
plt.plot(X, lin_reg.predict(X), color="blue")
# NOTE(review): the x label reads "Deep Delay"; "Departure Delay" is
# presumably what was meant.
plt.xlabel("Deep Delay")
plt.ylabel("Arrived Delay")
plt.title("Linear Regression Model")
plt.show()

In [None]:
# Evenly spaced grid (step 0.1) over the observed range of X, as a column.
# X is two-dimensional, so the scalar bounds come from X.min()/X.max(); the
# builtin min()/max() return one-element arrays, which np.arange should not
# be given as start/stop.
X_grid = np.arange(X.min(), X.max(), 0.1).reshape(-1, 1)

plt.scatter(X, y, color = "red")
# Reuse the already fitted transformer: transform(), not fit_transform().
plt.plot(X_grid, lin_reg_2.predict(poly_reg.transform(X_grid)), color = "blue")
plt.title("Polynomial Regression Model")
plt.xlabel("Deep Delay")
plt.ylabel("Arrived Delay")
plt.show()