# IT Academy - Data Science with Python
## Sprint 12: Supervised Regressions
### [Github Supervised Regressions](https://github.com/jesussantana/Supervised-Regression)

[![forthebadge made-with-python](http://ForTheBadge.com/images/badges/made-with-python.svg)](https://www.python.org/)  
[![Made with Jupyter](https://img.shields.io/badge/Made%20with-Jupyter-orange?style=for-the-badge&logo=Jupyter)](https://jupyter.org/try)  
[![wakatime](https://wakatime.com/badge/github/jesussantana/Supervised-Regression.svg)](https://wakatime.com/badge/github/jesussantana/Supervised-Regression)

In [None]:
import pandas as pd
import numpy as np
import warnings

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

from matplotlib import pyplot as plt
import seaborn as sns
from PIL import Image
%matplotlib inline

# Silence library warnings so they do not clutter the notebook output.
# NOTE(review): this blanket filter also hides pandas/sklearn deprecation
# and chained-assignment warnings.
warnings.filterwarnings('ignore')
# Set the default figure size through the theme's rc overrides rather than
# calling plt.figure() here: that call only opened an empty figure in this
# cell and did not set a default size for figures drawn in later cells.
sns.set_theme(style='darkgrid', palette='deep', rc={'figure.figsize': (16, 6)})

In [None]:
# Load the raw flight records from the CSV file (relative path).
raw_csv_path = "../data/raw/DelayedFlights.csv"
df_raw = pd.read_csv(raw_csv_path)

### Exercise 1: 
  - Create at least three different regression models to predict the flight arrival delay (`ArrDelay`) in DelayedFlights.csv as accurately as possible.

### Linear Regression

In [None]:
# Keep only rows with a known arrival delay (the response variable).
df = df_raw.dropna(subset=["ArrDelay"])
# Shuffle and keep at most 100000 rows. The fixed seed makes the sample, and
# therefore every number computed from it, reproducible across runs.
df = df.sample(frac=1, random_state=42).head(100000)
Y = df["ArrDelay"]    # response
X = df[["DepDelay"]]  # single predictor, selected as a one-column DataFrame

In [None]:
# Preview the first five rows of the sampled frame.
df.head(n=5)

In [None]:
# Show the column labels available in the sampled frame.
df.columns

In [None]:
# Ordinary least-squares fit of Y on X.
regr = linear_model.LinearRegression()
regr.fit(X, Y)

# Beta coefficient (slope) of the fitted line.
print("Coefficients: ", regr.coef_)
Y_pred = regr.predict(X)
# Report both metrics requested by Exercise 2 (MSE and R2).
# NOTE(review): both are computed on the same rows used for fitting, so they
# measure in-sample fit, not generalisation.
print("Mean squared error: ", mean_squared_error(Y, Y_pred))
print("R Square: ", r2_score(Y, Y_pred))

In [None]:
# Plot the first 10000 sampled observations together with their fitted values.
# Explicit positional slices starting at 0 are used so the first observation
# is no longer skipped (the original slices began at position 1).
n_shown = 10000
plt.scatter(X.iloc[:n_shown], Y.iloc[:n_shown], color="black")
plt.plot(X.iloc[:n_shown], Y_pred[:n_shown], color="Blue")
plt.show()

### Logistic Regression

In [None]:
# Keep only rows with a known arrival delay.
df = df_raw.dropna(subset=["ArrDelay"])
# Shuffle and keep at most 100000 rows. The fixed seed makes the sample, and
# the accuracy figures derived from it, reproducible across runs.
df = df.sample(frac=1, random_state=42).head(100000)
# Binary response: True (1) = arrival delay above 30, False (0) = no notable delay.
Y = df["ArrDelay"] > 30
X = df[["ArrTime"]]

In [None]:
# Fit a logistic regression with default settings and predict the class of
# every row it was fitted on.
logreg = LogisticRegression().fit(X, Y)
Y_pred = logreg.predict(X)

In [None]:
# Per-row class probabilities, rounded to three decimals.
logreg.predict_proba(X).round(3)

In [None]:
# Fraction of rows classified correctly (the author recorded about 88%).
# The next cell checks whether the two classes are balanced.
(Y_pred == Y).mean()

In [None]:
# Share of positive (True) labels; the author's remark here: the model is not
# as good as it first looked.
Y.mean()

In [None]:
# Store the result under its own name: assigning it to `confusion_matrix`
# replaced the imported sklearn function with an array, so re-running this
# cell (or calling confusion_matrix again later) raised a TypeError.
conf_matrix = confusion_matrix(Y, Y_pred)
print(conf_matrix)

### Naive Bayes

In [None]:
# Discard any row whose arrival delay is missing.
df = df.dropna(axis="index", how="any", subset=["ArrDelay"])

In [None]:
# Binary response: True when the arrival delay is greater than zero.
Y = df["ArrDelay"].gt(0)

In [None]:
# Cast the calendar fields and the tail number to strings so that they are
# handled as categorical values.
for categorical_col in ("Month", "DayofMonth", "DayOfWeek", "TailNum"):
    df[categorical_col] = df[categorical_col].map(str)

# Build the indicator (dummy) columns from the categorical predictors.
dummy_source_cols = ['Month', 'DayofMonth', 'TailNum', 'DayOfWeek', 'Origin', 'Dest', 'UniqueCarrier']
X = pd.get_dummies(data=df[dummy_source_cols])

In [None]:
# Preview the first five rows of the indicator matrix.
X.head(n=5)

In [None]:
# Ajustar el modelo
clf= BernoulliNB()
#clf = MultinomialNB()
clf.fit(X,Y)
Y_pred = clf.predict(X)

In [None]:
# Fraction of rows whose predicted class matches the true class.
(Y == Y_pred).mean()

In [None]:
# Continuous predictors for the Gaussian naive Bayes model.
# 'ArrDelay' must not be a feature: the target is ArrDelay > 0, so including
# it leaks the answer into the model. 'DepDelay' (the alternative noted in the
# original comment) is used instead.
X = df[['AirTime', 'Distance', 'DepDelay', 'TaxiOut']]
clf = GaussianNB()
clf.fit(X, Y)
Y_pred = clf.predict(X)

In [None]:
# Fraction of rows whose predicted class matches the true class.
(Y == Y_pred).mean()

### K-Nearest Neighbours


In [None]:
# Work on the four numeric columns only, discarding incomplete rows.
knn_columns = ["AirTime", "Distance", "TaxiOut", "ArrDelay"]
newdf = df[knn_columns].dropna()
# Predictor matrix: every selected column except the response.
cols = newdf.drop(columns="ArrDelay")

In [None]:
# Boolean mask for the binary response: True where the arrival delay exceeds 10.
filtro = newdf["ArrDelay"].gt(10)

In [None]:
# Replace the numeric delay with readable class labels in one column
# assignment. The original chained form (newdf["ArrDelay"][mask] = ...) may
# write to a temporary copy and also mixes strings into a numeric column.
newdf["ArrDelay"] = np.where(filtro, "Delayed", "Not Delayed")

In [None]:
# Preview the first five class labels.
newdf["ArrDelay"].head(n=5)

In [None]:
# Classifier that uses three neighbours; n_jobs=-1 asks for all available cores.
nbrs_3 = KNeighborsClassifier(n_jobs=-1, n_neighbors=3)

In [None]:
# Fit the 3-neighbour model on the predictors and the class labels.
nbrs_3.fit(X=cols, y=newdf["ArrDelay"])

In [None]:
# Predict a class label for every row the model was fitted on.
predicciones_3 = nbrs_3.predict(X=cols)

In [None]:
# Fraction of rows classified correctly (the author recorded almost 80%).
(predicciones_3 == newdf["ArrDelay"]).mean()

In [None]:
# Share of "Not Delayed" rows, used as a baseline for the accuracy above.
# Author's remark: the model improved the prediction by more than 50%.
newdf["ArrDelay"].eq("Not Delayed").mean()

In [None]:
# Repeat the experiment with a single neighbour.
# NOTE(review): accuracy is measured on the same rows used for fitting; with
# one neighbour a row is typically its own nearest neighbour, so this score
# is likely optimistic - confirm on held-out data.
target_labels = newdf["ArrDelay"]
nbrs_1 = KNeighborsClassifier(n_jobs=-1, n_neighbors=1).fit(cols, target_labels)
predicciones_1 = nbrs_1.predict(cols)
# Author's remark: roughly a 2% improvement with a single neighbour.
(predicciones_1 == target_labels).mean()

In [None]:
# Share of "Not Delayed" rows (baseline for the single-neighbour accuracy).
newdf["ArrDelay"].eq("Not Delayed").mean()

In [None]:

#confusion_matrix = confusion_matrix(newdf["ArrDelay"], predicciones_1)

### Random Forest

In [None]:
df = df.dropna(subset=["ArrDelay"])
df = df.sample(frac=1)  # shuffle the rows
# Take disjoint train/test partitions of up to 500000 rows each. The size is
# capped at half of the frame: `df` was cut to at most 100000 rows earlier in
# the notebook, so tail(500000) and head(500000) would both return every row
# and the model would be tested on its own training data.
split_size = min(500000, len(df) // 2)
dftest = df.tail(split_size)
df = df.head(split_size)

In [None]:
# Predictors and binary target (arrival delay above 10) for both partitions.
feature_names = ["Distance", "AirTime", "DepTime", "TaxiIn", "TaxiOut", "DepDelay"]
X, X_test = df[feature_names], dftest[feature_names]
Y, Y_test = df["ArrDelay"] > 10, dftest["ArrDelay"] > 10

# Train a single decision tree, then predict the test partition.
clf = tree.DecisionTreeClassifier().fit(X, Y)
Y_pred_test = clf.predict(X_test)

# Fraction of test rows predicted correctly (shown as the cell output).
np.mean(Y_test == Y_pred_test)

In [None]:
# Random forest of 100 trees; n_jobs=-1 asks for all available cores.
clf = RandomForestClassifier(n_jobs=-1, n_estimators=100).fit(X, Y)
Y_pred_test = clf.predict(X_test)

# Parameters the author listed as worth exploring:
# n_estimators / max_features / bootstrap / n_jobs
# Importance of each predictor in the fitted forest (shown as the cell output).
clf.feature_importances_

In [None]:
# Fraction of test rows predicted correctly by the forest.
# Author's remark: about a 4% improvement over the previous model.
(Y_test == Y_pred_test).mean()

In [None]:
#RandomForestRegressor()

### Exercise 2: 
  - Compare them based on MSE and R2.