<a href="https://colab.research.google.com/github/leobioinf0/Supervisat_Classificacio/blob/main/S10_T01_Aprenentage_Supervisat_NO_OUTPUTS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [**Supervised Classification**](https://github.com/leobioinf0/Supervisat_Classificacio)
Course: *Data Science amb Python*

Sprint: 10. Supervised Learning - classification 


Task: S10 T01: Supervised Learning - Classification 

Date: *2022-01-17*

[Leo Madsen](https://github.com/leobioinf0)



#### Exercises: 
- Level 1
    - Exercise 1: 
    
    Create at least three different classification models to try to best predict DelayedFlights.csv flight delay (ArrDelay). Consider whether the flight is late or not (ArrDelay> 0).
    - Exercise 2: 
    
    Compare classification models using accuracy, a confusion matrix, and other more advanced metrics.
    - Exercise 3: 
    
    Train them using the different parameters they support.
    - Exercise 4: 
    
    Compare their performance using the train / test or cross-validation approach.
- Level 2
    - Exercise 5: 
    
    Perform some variable engineering process to improve prediction
- Level 3
    - Exercise 6: 
    
    Do not use the DepDelay variable when making predictions

# Prerequisites

## Upgrade modules

In [None]:
# Upgrade pip itself first, then upgrade each package below with pip (one shell command per line).
!pip install --upgrade pip
!pip install --upgrade lightgbm 
!pip install --upgrade xgboost
!pip install --upgrade feature-engine
!pip install --upgrade matplotlib
!pip install --upgrade scikit-learn
!pip install --upgrade numpy
!pip install --upgrade keras
!pip install --upgrade tensorflow
!pip install --upgrade pandas

## Load modules

In [None]:
## System
import os
import pickle
import glob

## Data treatment
import pandas as pd
import numpy as np
import datetime
import math 
from scipy.stats import sem

## Graphics
import missingno as msno #missing data visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image, display
from tqdm import tqdm_notebook
from google.colab import data_table

## Metrics
from sklearn.metrics import euclidean_distances
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import zero_one_loss
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

## Data preprocessing
from feature_engine.creation import CyclicalTransformer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

## Linear Models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import SGDClassifier

## Preprocessing and modeling
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

## Neural Network
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier

## Tree Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier

## Ensemble methods
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier

## Naive Bayes algorithms
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

## Gradient boosting
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

## Gaussian Process
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

## Support Vector Machine algorithms
from sklearn.svm import SVC
from sklearn.svm import LinearSVC 
from sklearn.svm import NuSVC 

## k-nearest neighbors algorithm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid

## semi-supervised learning algorithms.
from sklearn.semi_supervised import LabelSpreading
from sklearn.semi_supervised import LabelPropagation

## DummyClassifier makes predictions that ignore the input features.
from sklearn.dummy import DummyClassifier

## Calibration of predicted probabilities.
from sklearn.calibration import CalibratedClassifierCV

## Linear Discriminant Analysis and Quadratic Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

## Warnings configuration
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Turn on the Colab data-table formatter for DataFrames.
data_table.enable_dataframe_formatter()
# Class-level display limits of the Colab DataTable.
for option_name, option_value in (
    ("max_columns", 50),
    ("num_rows_per_page", 55),
    ("max_rows", 56),
):
    setattr(data_table.DataTable, option_name, option_value)

In [None]:
# Select the inline matplotlib backend for this notebook.
%matplotlib inline
# Default figure size for seaborn/matplotlib plots: 15 x 7.
sns.set(rc={'figure.figsize':(15,7)})

## Define functions

In [None]:
def timenow():
    '''
    Return the current local wall-clock time as a datetime.time,
    truncated to whole seconds (microseconds set to zero).
    '''
    current = datetime.datetime.now()
    return current.time().replace(microsecond=0)

def mk_dir(name):
    '''
    Create the directory `name` (relative to the current working directory)
    if it does not exist yet, and return its full path.

    Parameters
    ----------
    name : str
        Directory name or relative path; intermediate directories are
        created as needed.

    Returns
    -------
    str
        os.path.join(os.getcwd(), name)

    Raises
    ------
    OSError
        If the directory cannot be created for a reason other than it
        already existing.
    '''
    path = os.path.join(os.getcwd(), name)
    if os.path.exists(path):
        print("Directory already existing: {}".format(path))
        return path
    # exist_ok=True tolerates the directory appearing between the check above
    # and this call, without needing the errno module for EEXIST handling.
    os.makedirs(path, 0o700, exist_ok=True)
    print("Directory created: {}".format(path))
    return path

def rounder(n):
    """
    Round `n` so that its first non-zero digit is kept.

    Zero is returned unchanged. Values greater than 1 are rounded to one
    decimal place; other positive values are rounded to the number of decimals
    derived from their base-10 exponent.
    """
    if n == 0:
        return n
    exponent = int(math.log10(n))
    decimals = 1 if n > 1 else 1 - exponent
    return round(n, decimals)

def describer(dataframe):
    """
    Build a per-column summary of `dataframe`.

    The result is the transposed, 2-decimal-rounded `describe(include="all")`
    table extended with the number of unique values, null indicators
    (any / count / percentage) and the dtype of every column, sorted by dtype.
    """
    null_mask = dataframe.isnull()
    summary = round(dataframe.describe(include="all"), 2).T
    summary["unique"] = dataframe.nunique()
    summary["NullAny"] = null_mask.any()
    summary["NullSum"] = null_mask.sum()
    summary["NullPct"] = (null_mask.sum() / len(dataframe) * 100).apply(rounder)
    summary["dtypes"] = dataframe.dtypes
    return summary.sort_values(by="dtypes")


def kfcv_evaluator(X, y, model, n_s):
    '''
    Score `model` with shuffled K-Fold cross-validation using R2.

    `n_s` is the number of folds; the fold shuffle is seeded with 1.
    Returns the per-fold scores rounded to 4 decimals.
    '''
    folds = KFold(n_splits=n_s, shuffle=True, random_state=1)
    fold_scores = cross_val_score(model, X, y, scoring='r2', cv=folds, verbose=3)
    return fold_scores.round(4)

def kfcv_evaluator_class(X, y, model, n_s):
    '''
    Score `model` with shuffled K-Fold cross-validation using accuracy.

    `n_s` is the number of folds; the fold shuffle is seeded with 1.
    Returns the per-fold scores rounded to 4 decimals.
    '''
    folds = KFold(n_splits=n_s, shuffle=True, random_state=1)
    fold_scores = cross_val_score(model, X, y, scoring='accuracy', cv=folds)
    return fold_scores.round(4)

def rkfcv_evaluator(X, y, model, n_s, n_r):
    '''
    Score `model` with Repeated K-Fold cross-validation using R2.

    `n_s` is the number of folds and `n_r` the number of repetitions
    (random_state=1). Returns one score per fold and repetition,
    rounded to 4 decimals.
    '''
    repeated_folds = RepeatedKFold(n_splits=n_s, n_repeats=n_r, random_state=1)
    fold_scores = cross_val_score(model, X, y, scoring='r2', cv=repeated_folds, verbose=3)
    return fold_scores.round(4)

def rkfcv_evaluator_class(X, y, model, n_s, n_r):
    '''
    Score `model` with Repeated K-Fold cross-validation using accuracy.

    `n_s` is the number of folds and `n_r` the number of repetitions
    (random_state=1). Returns one score per fold and repetition,
    rounded to 4 decimals.
    '''
    repeated_folds = RepeatedKFold(n_splits=n_s, n_repeats=n_r, random_state=1)
    fold_scores = cross_val_score(model, X, y, scoring='accuracy', cv=repeated_folds)
    return fold_scores.round(4)

def numToTime(x):
    """
    Convert an hhmm-encoded number (e.g. 1230, 5, 2400) to a datetime.time.

    Null input yields np.nan. The last two digits are the minutes and the
    leading digits the hour; a missing hour or an hour of "24" maps to 0.
    """
    from datetime import time
    if pd.isnull(x):
        return np.nan
    digits = str(int(x))
    hours, minutes = digits[:-2], digits[-2:]
    if hours in ("", "24"):
        hours = "0"
    return time(int(hours), int(minutes))

def to_flat(regular_list):
    '''
    Flatten one level of nesting: return a single list holding the items
    of every sub-iterable of `regular_list`, in order.
    '''
    flattened = []
    for sublist in regular_list:
        flattened.extend(sublist)
    return flattened

def class_metrics(model, name, X_train, X_test, y_train, y_test, plt_path):
    '''
    Evaluate a fitted binary classifier on the test data and return its metrics.

    The function predicts on `X_test`, prints the raw and the row-normalized
    confusion matrices, draws the normalized matrix as a heatmap flanked by a
    table of metrics and a table of the model parameters, saves the figure to
    `plt_path` and shows it.

    Parameters
    ----------
    model : fitted estimator exposing `predict` and `get_params`.
    name : str
        Label stored under 'Model' and used as the plot title.
    X_train, y_train :
        Not used by this function; kept so existing callers keep working.
    X_test, y_test :
        Test features and true labels (labels 0 and 1 are expected, since
        the confusion matrix is indexed as 2 x 2).
    plt_path : str
        File path where the figure is saved.

    Returns
    -------
    dict
        Metric name -> value, with 'Model' holding `name`.
    '''
    y_pred = model.predict(X_test)
    conf_matrix = confusion_matrix(y_test, y_pred)
    # Specificity is the recall of the negative class (label 0).
    specificity = recall_score(y_test, y_pred, pos_label=0)

    # The built-in round() is used instead of the .round() method so the code
    # works whether a metric function returns a NumPy scalar or a plain float.
    scores = {'Model': name,
        'Accuracy': round(accuracy_score(y_test, y_pred), 4),
        'B_Accuracy': round(balanced_accuracy_score(y_test, y_pred), 4),
        'F1_Score': round(f1_score(y_test, y_pred), 4),
        'Precision': round(precision_score(y_test, y_pred), 4),
        'Recall': round(recall_score(y_test, y_pred), 4),
        'Specificity': round(specificity, 4),
        'FP_rate': round(1 - specificity, 4),
        'Misclass_rate': round(zero_one_loss(y_test, y_pred), 4),
        'Misclass': round(zero_one_loss(y_test, y_pred, normalize=False), 4),
        'TP': conf_matrix[1, 1].round(4),
        'TN': conf_matrix[0, 0].round(4),
        'FP': conf_matrix[0, 1].round(4),
        'FN': conf_matrix[1, 0].round(4)}

    print("Confusion matrix :\n{}".format(conf_matrix))

    # From here on conf_matrix holds the row-normalized matrix (used for the plot).
    conf_matrix = confusion_matrix(y_test, y_pred, normalize='true')
    print("Normalized Confusion matrix:\n{}".format(conf_matrix))

    scores_table = pd.DataFrame(data=scores.items(), columns=["METRIC", "VALUES"])
    desc_table = pd.DataFrame(data=model.get_params().items(), columns=["PARAMS", "VALUES"])

    ax = plt.subplot()
    sns.heatmap(conf_matrix, annot=True, fmt='g', ax=ax, square=True)
    # Model parameters go to the right of the heatmap (bbox is in axes coordinates).
    desc_t = ax.table(cellText=desc_table.values,
                colLabels=desc_table.columns,
                bbox=[1.5, 0, 1.0, 1.0])
    desc_t.auto_set_column_width(col=list(range(len(desc_table.columns))))

    # Metric values go to the left of the heatmap.
    scores_t = ax.table(cellText=scores_table.values,
                colLabels=scores_table.columns,
                bbox=[-1.1, 0, 1.0, 1.0])
    scores_t.auto_set_column_width(col=list(range(len(scores_table.columns))))
    plt.title(name)
    plt.savefig(plt_path)
    plt.show()

    return scores

def reg_metrics(name, model, X_test, y_true):
    '''
    Evaluate a fitted regression model on `X_test` against `y_true`.

    Returns a one-row dataframe indexed by `name` with the columns
    R2, MSE, MAE, MAPE, EV (explained variance), ME (max error) and
    MedAE (median absolute error).
    '''
    predicted = model.predict(X_test)
    metric_values = {
        "R2": r2_score(y_true, predicted),
        "MSE": mean_squared_error(y_true, predicted),
        "MAE": mean_absolute_error(y_true, predicted),
        "MAPE": mean_absolute_percentage_error(y_true, predicted),
        "EV": explained_variance_score(y_true, predicted),
        "ME": max_error(y_true, predicted),
        "MedAE": median_absolute_error(y_true, predicted),
    }
    return pd.DataFrame(metric_values, index=[name])

def rkfcv_to_df(results):
    """
    Tabulate Repeated K-Folds cross-validation scores and save them as CSV.

    Parameters
    ----------
    results : dict
        Nested mapping {model name: {n_splits: scores}}; each `scores` entry
        must hold one item per element of the global `n_repeats`.

    Returns
    -------
    pandas.DataFrame
        Columns "model", "n_splits" and one column per entry of `n_repeats`.

    Notes
    -----
    Reads the notebook globals `n_repeats` (column labels) and
    `rkfcv_filename_path` (output path without the ".csv" extension).
    """
    # Use the argument that was passed in rather than a notebook global, so
    # the function tabulates exactly what the caller provides.
    flattened = {(model_name, n_s): scores
                 for model_name, per_split in results.items()
                 for n_s, scores in per_split.items()}
    df = pd.DataFrame.from_dict(flattened, orient='index')
    # The tuple keys form a two-level index; turn both levels into columns.
    df.reset_index(level=[0,1], inplace=True)
    df.columns = ["model","n_splits"] + [str(i) for i in n_repeats]
    df.to_csv(rkfcv_filename_path+".csv")
    return df
    
def plot_rkfcv(df):
    """
    Draw a grid of boxplots of Repeated K-Folds cross-validation scores.

    One row per model and one column per number of splits; each panel shows
    one box per entry of the global `n_repeats`. The figure is saved as
    "All_Repeated_K-fold_cv.png" inside the global `exe04_plots_path` and
    then shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with columns "model", "n_splits" and one column per entry of
        `n_repeats` (named by its string form), as built by `rkfcv_to_df`.
    """
    models = df.model.unique()
    split_counts = df.n_splits.unique()
    repeat_cols = [str(rep) for rep in n_repeats]

    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # model or a single split count, so axes[i][j] is always valid.
    fig, axes = plt.subplots(nrows=len(models),
                            ncols=len(split_counts),
                            figsize=(len(split_counts)*8, len(models)*4),
                            sharex=True, sharey=True,
                            squeeze=False)

    for i, modl in enumerate(models):
        for j, splts in enumerate(split_counts):
            data = df[(df.model==modl) & (df.n_splits==splts)][repeat_cols].apply(to_flat)
            axes[i][j].boxplot(data, showmeans=True)
            ttl="{} ({} splits)".format(modl, splts)
            axes[i][j].set_title(ttl, fontweight = "bold")

    fig.suptitle(t='Number of Repeats Performance Comparison in \nRepeated k-Fold Cross-Validation ', 
                fontsize = 18, fontweight = "bold")
    fig.supxlabel('Nº repeats',fontsize = 18, fontweight = "bold")
    fig.supylabel('Score',fontsize = 18, fontweight = "bold")

    filename = "All_Repeated_K-fold_cv.png"
    plt.savefig(os.path.join(exe04_plots_path, filename))
    plt.show()

## Directories

In [None]:
# working directory

# Relative path: stay in the current directory.
cwd = './'
# Alternative location on a mounted Google Drive (disabled).
#cwd = '/content/drive/MyDrive/Data_Science_amb_Python/Sprint10-Aprenentatge_Supervisat_Classificacio/Supervisat_Classificacio/'
os.chdir(cwd)
# Show the resolved working directory.
print(os.getcwd())

# Read Data

In [None]:
# Read a smaller version of the dataset from raw.githubusercontent
filepath = "https://raw.githubusercontent.com/leobioinf0/Supervitat_Regressio/main/DelayedFlights3mb.csv"
# The first CSV column is used as the DataFrame index.
df = pd.read_csv(filepath, index_col=0)
# Preview the first rows (rendered as the cell output).
df.head()

____________________
This dataset is composed by the following variables: 

1.	**Year**	2016
2.	**Month**	1-12
3.	**DayofMonth**	1-31
4.	**DayOfWeek**	1 (Monday) - 7 (Sunday)
5.	**DepTime**	actual departure time (local, hhmm)
6.	**CRSDepTime**	scheduled departure time (local, hhmm)
7.	**ArrTime**	actual arrival time (local, hhmm)
8.	**CRSArrTime**	scheduled arrival time (local, hhmm)
9.	**UniqueCarrier**	unique carrier code
10.	**FlightNum**	flight number
11.	**TailNum** plane tail number: aircraft registration, unique aircraft identifier
12.	**ActualElapsedTime**	in minutes
13.	**CRSElapsedTime**	in minutes
14.	**AirTime**	in minutes
15.	**ArrDelay**	arrival delay, in minutes: **A flight is counted as "on time" if it operated less than 15 minutes later than the scheduled time shown in the carriers' Computerized Reservations Systems (CRS).** 
16.	**DepDelay**	departure delay, in minutes
17.	**Origin**	origin IATA airport code
18.	**Dest**	destination IATA airport code
19.	**Distance**	in miles
20.	**TaxiIn**	taxi in time, in minutes
21.	**TaxiOut**	taxi out time in minutes
22.	**Cancelled**	was the flight cancelled (1 = yes, 0 = no)
23.	**CancellationCode**	reason for cancellation (A = carrier, B = weather, C = NAS, D = security)
24.	**Diverted**	1 = yes, 0 = no
25.	**CarrierDelay**	in minutes: Carrier delay is within the control of the air carrier. Examples of occurrences that may determine carrier delay are: aircraft cleaning, aircraft damage, awaiting the arrival of connecting passengers or crew, baggage, bird strike, cargo loading, catering, computer, outage-carrier equipment, crew legality (pilot or attendant rest), damage by hazardous goods, engineering inspection, fueling, handling disabled passengers, late crew, lavatory servicing, maintenance, oversales, potable water servicing, removal of unruly passenger, slow boarding or seating, stowing carry-on baggage, weight and balance delays.
26.	**WeatherDelay**	in minutes: Weather delay is caused by extreme or hazardous weather conditions that are forecasted or manifest themselves on point of departure, enroute, or on point of arrival.
27.	**NASDelay**	in minutes: Delay that is within the control of the National Airspace System (NAS) may include: non-extreme weather conditions, airport operations, heavy traffic volume, air traffic control, etc. 
28.	**SecurityDelay**	in minutes: Security delay is caused by evacuation of a terminal or concourse, re-boarding of aircraft because of security breach, inoperative screening equipment and/or long lines in excess of 29 minutes at screening areas.
29.	**LateAircraftDelay**	in minutes: Arrival delay at an airport due to the late arrival of the same aircraft at a previous airport. The ripple effect of an earlier delay at downstream airports is referred to as delay propagation.

In [None]:
# Full description of dataframe:
# describe() statistics plus unique counts, null statistics and dtypes per column.
describer(df)

# Preprocessing

## Missing in ArrDelay

- We eliminate the flights that do not have data in the response variable.

In [None]:
# Keep only the rows that have a value in the response variable.
df = df.dropna(subset=['ArrDelay']).copy()

## Cancelled/CancellationCode

- First we eliminate all the flights that were canceled since they do not provide information.
- Then we remove the columns ["Cancelled", "CancellationCode"]

In [None]:
# Keep the flights that were not cancelled, then discard both cancellation columns.
df = df.loc[df["Cancelled"] == 0].copy()
df = df.drop(columns=["Cancelled", "CancellationCode"])

## Diverted
- First we eliminate all the flights that were diverted since they do not provide information.
- Then we remove the column ["Diverted"]

In [None]:
# Keep the flights that were not diverted, then discard the indicator column.
df = df.loc[df["Diverted"] == 0].copy()
df = df.drop(columns=["Diverted"])

## Reindex

In [None]:
# Renumber the rows 0..n-1 after the filtering above; the old index is discarded.
df = df.reset_index(drop=True)

## Missing values

In [None]:
# Per column: does it still contain at least one missing value?
df.isna().any()

In [None]:
# Plot the missing-value matrix; the trailing semicolon suppresses the cell's text output.
msno.matrix(df);

The variables that still contain nan are the ones that describe the delay.
We assume that these nan are due to the "ArrDelay" being low enough not to be described, so we will replace them with zeros.


In [None]:
# Replace every remaining missing value with zero.
df = df.fillna(0)

In [None]:
# Total number of missing values left in the whole dataframe.
df.isnull().sum().sum()

## Target Binarizing
 
Create a variable Target depending on whether the flight was late or not (ArrDelay> 0)

In [None]:
# Target encoding: 1 = delayed (ArrDelay > 0), 0 = not delayed.
transformer = Binarizer()
binary = transformer.fit_transform(df[['ArrDelay']])
# The binarizer returns a single-column 2-D array; flatten it to 1-D
# before storing it back as integers.
df['ArrDelay'] = pd.Series(binary.ravel()).astype(int)

We can see that the dataset is unbalanced. Although we know that this is a problem, we will solve it in exercise 5, so we can see the effect that the balance of the data has on the predictions.

In [None]:
# Class balance of the target: relative frequencies first, then absolute counts.
target = df['ArrDelay']
print(target.value_counts(normalize=True).round(2))
print(target.value_counts())

## Save preprocessed data.

In [None]:
# Write the preprocessed dataframe (including its index) to a CSV file in the working directory.
df.to_csv("DelayedFlights_Processed.csv")

#  Exercise 1: 
  - Create at least three different classification models to try to best predict DelayedFlights.csv flight delay (ArrDelay). Consider whether the flight is late or not (ArrDelay> 0).

Classification models we are going to use:

In [None]:
# One default-configured instance of every classifier to compare, in this order.
classifiers = [
    estimator_class()
    for estimator_class in (
        DummyClassifier,
        KNeighborsClassifier,
        SVC,
        DecisionTreeClassifier,
        RandomForestClassifier,
        MLPClassifier,
        AdaBoostClassifier,
        GaussianNB,
        QuadraticDiscriminantAnalysis,
        NearestCentroid,
        BaggingClassifier,
        Perceptron,
        PassiveAggressiveClassifier,
        ExtraTreeClassifier,
        LabelSpreading,
        LabelPropagation,
        LogisticRegression,
        SGDClassifier,
        LinearDiscriminantAnalysis,
        CalibratedClassifierCV,
        RidgeClassifier,
        RidgeClassifierCV,
        LinearSVC,
        BernoulliNB,
        ExtraTreesClassifier,
        LGBMClassifier,
        XGBClassifier,
    )
]

We read the already processed data

In [None]:
# Reload the preprocessed data; the first CSV column is restored as the index.
df = pd.read_csv("DelayedFlights_Processed.csv", index_col=0)

We create the directory in which we will save the results.

In [None]:
# Output directories of exercise 1: root, pickled models, metric tables, plots.
exe01_path, exe01_models_path, exe01_tables_path, exe01_plots_path = (
    mk_dir(subdir)
    for subdir in ("exe01",
                   "exe01/exe01_models",
                   "exe01/exe01_tables",
                   "exe01/exe01_plots")
)

Transform Categorical Variables into ordinal values

In [None]:
# Columns holding categorical codes that are replaced by ordinal numbers.
categorical_columns = ['UniqueCarrier', 'Origin', 'Dest', 'TailNum']
encoder = OrdinalEncoder()
df[categorical_columns] = encoder.fit_transform(df[categorical_columns])

Divide data into Explanatory and Response variables. 

In [None]:
# Response variable and explanatory variables (every other column).
y = df['ArrDelay']
X = df.drop(columns=['ArrDelay'])

We scale the data and divide it into training and test sets

We use MinMaxScaler because we do not assume that the shape of all our features follows a normal distribution.

In [None]:
# Rescale every feature with min-max scaling.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed afterwards - confirm this is intended.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [None]:
# Hold out 40% of the rows as the test set; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=130)



1.   For each model we create the name of the model and the name of the file where the trained model will be saved.
2.   If the file already exists we load it and if it does not exist we train it and save it.

3.  Then we evaluate the model and store the metrics in a list.

4.  Each model with its metrics are tabulated in a dataframe and sorted in descending order according to its Accuracy.

5.  Finally we save the table in a file.

Train models and make predictions.

In [None]:
# Fit (or reload) every classifier, evaluate it on the test set and collect its metrics.
metrics_lst = []
for model in classifiers:
    # Model name: the estimator's class name prefixed with the exercise number
    model_str = str(model).split("(")[0]
    model_name = '{}-{}'.format("01", model_str)
    # File name of the pickled model
    model_filename = model_name + ".pkl"
    model_filename_path = os.path.join(exe01_models_path, model_filename)
    if os.path.isfile(model_filename_path):
        # Load the existing model.
        # NOTE: pickle files must come from a trusted source - unpickling runs arbitrary code.
        print("\n{}: Loading model:\t{}".format(timenow(), model_filename))
        with open(model_filename_path, 'rb') as model_file:
            model = pickle.load(model_file)
    else:
        # Fit the model and report how long it took
        print("\n{}: Start Fitting model:\t{}".format(timenow(), model_name))
        tiempo_inicio_fit = datetime.datetime.now()
        model.fit(X_train, y_train)
        fit_time = datetime.datetime.now() - tiempo_inicio_fit
        print("Fitting time:\t{}".format(fit_time))

        # Save the model to disk; the context manager closes the file handle
        with open(model_filename_path, 'wb') as model_file:
            pickle.dump(model, model_file)

    # Evaluate model
    plt_path = os.path.join(exe01_plots_path, model_name + "_conf.png")
    print("\n{}: Evaluate model:\n".format(timenow()))
    metrics_lst.append(class_metrics(model, model_name, X_train, X_test, y_train, y_test, plt_path))
# Tabulate results: one row per model, best accuracy first
class_metrics_df = pd.DataFrame(metrics_lst)
class_metrics_df.set_index(keys="Model", inplace=True)
class_metrics_df.sort_values(by=["Accuracy"], ascending=False, inplace=True)
# Save results
class_metrics_df.to_csv(os.path.join(exe01_tables_path, "01-classification_metrics.csv"))

# Exercise 2: 
  - Compare classification models using accuracy, a confusion matrix, and other more advanced metrics.

We create the directory in which we will save the results.

In [None]:
# Output directories of exercise 2: root and plots.
exe02_path, exe02_plots_path = (
    mk_dir(subdir) for subdir in ("exe02", "exe02/exe02_plots")
)

We read the results of exercise 1 and plot them.

In [None]:
# Metrics table written at the end of exercise 1.
metrics_01_path = os.path.join(exe01_tables_path, "01-classification_metrics.csv")
metrics_lst = []
# Read every listed metrics file (currently only one) with the model name as index.
for metrics_file in [metrics_01_path]:
    all_metrics = pd.read_csv(metrics_file, index_col=0)
    metrics_lst.append(all_metrics)
# Stack the tables into a single dataframe.
final_metrics_df_1 = pd.concat(metrics_lst)
data_table.enable_dataframe_formatter()
# Show the whole table on one page (rendered as the cell output).
data_table.DataTable(final_metrics_df_1, num_rows_per_page=final_metrics_df_1.shape[0])

In [None]:
# One horizontal bar chart per metric, models ordered from best to worst.
for metric in final_metrics_df_1.columns:
    # For error-type metrics a lower value is better, so they are sorted ascending.
    ascend = metric in ["FP_rate","Misclass_rate","Misclass","FP","FN"]
    final_metrics_df_1.sort_values(by=metric, inplace=True, ascending=ascend)
    # Figure height grows with the number of models.
    plt.figure(figsize=(15, final_metrics_df_1.shape[0]/4))
    sns.barplot(data=final_metrics_df_1, x=metric, y=final_metrics_df_1.index)
    plt.title(metric)
    plt_path = os.path.join(exe02_plots_path, metric + "_summary.png")
    plt.savefig(plt_path)
    plt.show()

# Exercise 3: 
  - Train them using the different parameters they support

We create the directory in which we will save the results.

In [None]:
# Output directories of exercise 3: root (path not kept), grid-search models, result tables.
mk_dir("exe03")
exe03_models_path, exe03_tables_path = (
    mk_dir(subdir) for subdir in ("exe03/exe03_models", "exe03/exe03_tables")
)

- [***LogisticRegression***](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logisticregression#sklearn.linear_model.LogisticRegression)
    - **solver** (Algorithm to use in the optimization problem.):
        - newton-cg
        - lbfgs
        - liblinear
        - sag
        - saga
    - **C** (Inverse of regularization strength.):
        - 10
        - 1.0
        - 0.1
        - 0.01
    - **penalty** (Specify the norm of the penalty):
        - none
        - l2
        - l1
        - elasticnet

- [***RidgeClassifier***](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html?highlight=ridgeclassifier#sklearn.linear_model.RidgeClassifier)
    - **alpha** (Regularization strength):
        - 0.1
        - 0.2
        - 0.3
        - 0.4
        - 0.5
        - 0.6
        - 0.7
        - 0.8
        - 0.9
        - 1.0

- [***KNeighborsClassifier***](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html?highlight=kneighborsclassifier#sklearn.neighbors.KNeighborsClassifier)
    - **weights** (Weight function used in prediction.):
        - uniform
        - distance
    - **metric** (The distance metric to use for the tree.):
        - euclidean
        - manhattan
        - minkowski

- [***SVC***](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html?highlight=svc#sklearn.svm.SVC)
    - **kernel** (Specifies the kernel type to be used in the algorithm.):
        - linear
        - poly
        - rbf
        - sigmoid
    - **C** (Regularization parameter.):
        - 1
        - 10
        - 100
    - **gamma** (Kernel coefficient):
        - scale
        - auto
        
    
- [***BaggingClassifier***](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html?highlight=baggingclassifier#sklearn.ensemble.BaggingClassifier)
    - **n_estimators** (The number of base estimators in the ensemble.):
        - 10
        - 100
        - 1000

- [***RandomForestClassifier***](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html?highlight=randomforestclassifier#sklearn.ensemble.RandomForestClassifier)
    - **max_depth** (The maximum depth of the tree.):
        - 3
        - 5
        - 10
        - None
    - **min_samples_split** (The minimum number of samples required to split an internal node):
        - 2
        - 5
        - 10
    - **max_features** (The number of features to consider when looking for the best split):
        - sqrt
        - log2
        - None


In [None]:
# Estimators to tune; the seed is fixed wherever the estimator accepts one.
selected_classifiers = [
    LogisticRegression(random_state=130),
    RidgeClassifier(random_state=130),
    KNeighborsClassifier(),
    SVC(random_state=130),
    BaggingClassifier(random_state=130),
    RandomForestClassifier(random_state=130),
]

# One search grid per estimator: parameter name -> candidate values.
LogisticRegression_param_grid = dict(
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    C=[10, 1.0, 0.1, 0.01],
    penalty=['l1', 'l2', 'elasticnet', 'none'],
)
RidgeClassifier_param_grid = dict(
    alpha=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
)
KNeighborsClassifier_param_grid = dict(
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan', 'minkowski'],
)
SVC_param_grid = dict(
    C=[1, 10, 100],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
)
BaggingClassifier_param_grid = dict(n_estimators=[10, 100, 1000])
RandomForestClassifier_param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    max_features=['sqrt', 'log2', None],
)

# Same order as selected_classifiers, so both lists can be zipped together.
param_grids = [
    LogisticRegression_param_grid,
    RidgeClassifier_param_grid,
    KNeighborsClassifier_param_grid,
    SVC_param_grid,
    BaggingClassifier_param_grid,
    RandomForestClassifier_param_grid,
]

1.   First we define the estimator that we will use in the search for parameters.

1.   Create a dictionary with parameters names as keys and lists of parameter settings to try as values.

1.   Then we define the name of the GridSearchCV object, the name of the file where the search result is saved and the path of the file.

1.   If the GridSearchCV has already been trained and saved, that is, if the file already exists, it is read.

1.   If it has not been previously trained, we define the GridSearchCV object with the estimator and the indicated parameters.

1.   Then it is trained, and saved to disk.

1.   Finally we tabulate the results and save the results in a csv file.
1.   In the last step we show the table with the results (they are not ordered)

In [None]:
# Run (or reload) one grid search per estimator and tabulate its results.
for model, param_grid in zip(selected_classifiers, param_grids):
    print(model)
    print(param_grid)
    # GridSearchCV name and filename
    model_name = '{}-{}'.format("03", "GSCV_" + str(model).split("(")[0])
    model_filename = model_name + ".pkl"
    model_filename_path = os.path.join(exe03_models_path, model_filename)

    if os.path.isfile(model_filename_path):
        # Load the existing search.
        # NOTE: pickle files must come from a trusted source - unpickling runs arbitrary code.
        print("\n{}: Loading model:\t{}".format(timenow(), model_filename))
        with open(model_filename_path, 'rb') as model_file:
            GSCV_model = pickle.load(model_file)
    else:
        # Set up the GridSearchCV (2-fold cross-validation)
        GSCV_model = GridSearchCV(estimator=model, 
                                  param_grid=param_grid, 
                                  verbose=1, 
                                  cv = 2)
        print("\n{}: Start Fitting model:\t{}".format(timenow(), model_name))
        tiempo_inicio_fit = datetime.datetime.now()
        # Fit GridSearchCV on the full (scaled) data
        GSCV_model.fit(X, y)
        print("\n{}: Finish. Fitting time:\t{}".format(timenow(), datetime.datetime.now() - tiempo_inicio_fit))
        # Save the GridSearchCV to disk; the context manager closes the file handle
        with open(model_filename_path, 'wb') as model_file:
            pickle.dump(GSCV_model, model_file)

    # Results filename
    GSCV_results_filename = model_name + "_metrics.csv"
    # Results dataframe
    GSCV_model_results = pd.DataFrame(GSCV_model.cv_results_)
    # Row label: search name followed by the parameter values joined with "_"
    GSCV_model_results.index =  model_name + '-'+GSCV_model_results.filter(regex="param_").astype(str).agg('_'.join, axis=1)
    GSCV_model_results.sort_values(by="rank_test_score", inplace=True)
    # Reverse the column order of the results table
    cols = list(GSCV_model_results.columns)
    cols.reverse()
    GSCV_model_results = GSCV_model_results[cols]
    # Results to file
    print("\n{}: Save GSCV results in:\t{}".format(timenow(), GSCV_results_filename))
    GSCV_model_results.to_csv(os.path.join(exe03_tables_path, GSCV_results_filename))
    display(GSCV_model_results)

In a similar way to the exercise 2, we read the results of each GridSearchCV.

For each model we plot its Accuracy score, hued by each of its grid-searched parameters. 

In [None]:
# Plot the mean test score of every grid-search table found in the tables directory.
for GSCV_results_filename in os.listdir(exe03_tables_path):
    table_path = os.path.join(exe03_tables_path, GSCV_results_filename)
    GSCV_results_df = pd.read_csv(table_path, index_col=0)
    GSCV_name = GSCV_results_filename.split(".")[0]
    print('\n\t',GSCV_name,'\n')
    if GSCV_name == "03-GSCV_RandomForestClassifier_metrics":
        # For this table, missing values are replaced with the string "none".
        GSCV_results_df = GSCV_results_df.fillna("none")
    params = GSCV_results_df.filter(regex="param_").columns.to_list()
    if len(params) == 1:
        # A single searched parameter: plain bars, no hue.
        sns.barplot(data=GSCV_results_df, x="mean_test_score", y=GSCV_results_df.index)
        plt.show()
        continue
    # Several searched parameters: one chart per parameter, used as hue.
    for parm_to_hue in params:
        plt.figure(figsize=(15, GSCV_results_df.shape[0]/3))
        sns.barplot(data=GSCV_results_df, x="mean_test_score", y=GSCV_results_df.index, hue=parm_to_hue)
        plt.show()

- LogisticRegression:
     - **solver**: `newton-cg` and `lbfgs` produce the best results.
     - **C**: all tested values (`10`, `1.0`, `0.1`, `0.01`) produce similar results.
     - **penalty**: `none` produces the best results, `elasticnet` does not get results.
- RidgeClassifier:
     - **alpha**: all produce similar results.
- KNeighborsClassifier:
     - **weights**: all produce similar results.
     - **metric**: all produce similar results.
- SVC:
     - **kernel**: `linear` > `rbf` > `poly` > `sigmoid`.
     - **C**: the higher the value, the better results are produced.
     - **gamma**: `scale` and `auto` all produce similar results.
- BaggingClassifier:
     - **n_estimators**: the higher the value, the better results are produced.
- RandomForestClassifier:
     - **max_depth**: the higher the value, the better results are produced.
     - **min_samples_split**: all produce similar results.
     - **max_features**: `None` produces the best results.

We sort the models by their Accuracy score in descending order, and finally we show the table and plot them.

In [None]:
exe03_tables_path = mk_dir("exe03/exe03_tables")
# Collect the mean test score column of every grid-search table.
metrics_lst = []
for GSCV_results_filename in os.listdir(exe03_tables_path):
    table_path = os.path.join(exe03_tables_path, GSCV_results_filename)
    GSCV_results_df = pd.read_csv(table_path, index_col=0)
    metrics_lst.append(GSCV_results_df[["mean_test_score"]])
# Stack all tables and rank the parameter combinations, best score first.
all_metrics_df = pd.concat(metrics_lst).sort_values(by=["mean_test_score"], ascending=[False])
display(all_metrics_df)
# Figure height grows with the number of rows.
fig_height = int(all_metrics_df.shape[0]/3.5)
plt.figure(figsize=(20, fig_height))
sns.barplot(data=all_metrics_df, x="mean_test_score", y=all_metrics_df.index)
plt.show()

The best results were obtained by the LogisticRegression estimator with the following parameters:
- solver: newton-cg
- penalty: none

# Exercise 4: 
  - Compare your performance using the train / test approach or using all data (internal validation)

We create the directory in which we will save the results.

In [None]:
# Output directories of exercise 4: root (path not kept), plots, tables.
mk_dir("exe04")
exe04_plots_path, exe04_tables_path = (
    mk_dir(subdir) for subdir in ("exe04/exe04_plots", "exe04/exe04_tables")
)

## Train Test approach

We read the results from the exercise 1

In [None]:
# Reload the train/test metrics table saved in exercise 1 (model name as index).
class_metrics_df = pd.read_csv(os.path.join(exe01_tables_path, "01-classification_metrics.csv"), index_col=0)
# Show the table rounded to 5 decimals.
display(class_metrics_df.round(5))

In [None]:
# Horizontal bar chart of the test accuracy per model; height grows with the number of models.
n_models = class_metrics_df.shape[0]
plt.figure(figsize=(20, int(n_models/3.5)))
sns.barplot(data=class_metrics_df, x="Accuracy", y=class_metrics_df.index)
plt.title("Accuracy")
plt.show()

## k-Fold Cross-Validation (internal validation)

Split dataset into k consecutive folds.

Each fold is then used once as a validation while the k-1 remaining folds form the training set.

We will increase K from 2 to 9 to see the effect on the results.

In [None]:
# k-fold cross-validation of every selected classifier for each k in n_splits.
# The raw fold scores are cached on disk (pickle + CSV): when the pickle exists
# the scores are reloaded and reported instead of being computed again.
max_splits = 10
n_splits = range(2,max_splits)  # upper bound is exclusive: k = 2 .. max_splits - 1
kfcv_filename = 'kfcv_{}'.format(max_splits)
kfcv_filename_path = os.path.join(exe04_tables_path, kfcv_filename)


def _plot_kfcv_boxplot(model_str, kfcv_results):
    """Draw, save and show the accuracy box plot of one model.

    Parameters
    ----------
    model_str : str
        Model name; used as the plot title and in the PNG file name.
    kfcv_results : list
        One collection of fold scores per entry of ``n_splits``.
    """
    plt.boxplot(kfcv_results, showmeans=True, labels=n_splits)
    plt.xlabel("Nº splits")
    plt.ylabel("Accuracy score")
    plt.title(model_str, fontweight = "bold")
    filename = model_str+"-K-fold_cv.png"
    plt.savefig(os.path.join(exe04_plots_path, filename))
    plt.show()


if os.path.isfile(kfcv_filename_path+".pkl"):
    # Cached scores exist: reload them and replay the per-model report.
    print("\n{}: loading k-Fold Cross-Validation Scores:\t{}".format(timenow(), kfcv_filename))
    kfcv_df = pd.read_csv(kfcv_filename_path+".csv", index_col=0)
    # The context manager releases the handle even if unpickling fails.
    with open(kfcv_filename_path+".pkl", "rb") as kfcv_file:
        model_kfcv_results = pickle.load(kfcv_file)
    kfcv_means_df = pd.DataFrame(index=model_kfcv_results.keys(), columns=n_splits)
    for model_str, kfcv_results in model_kfcv_results.items():
        print("\n{}: Model: {}".format(timenow(), model_str))
        for score in kfcv_results:
            # len(score) is the number of folds, i.e. the k used for this run.
            print("{}:  N splits: {}".format(timenow(),len(score)))
            print("{}:    Accuracy scores: {}\n\t     Mean (std): \t{:.3f} ({:.5f})".format(timenow(),score.round(3), np.mean(score), sem(score)))
            kfcv_means_df.loc[model_str,len(score)] = np.mean(score)
        _plot_kfcv_boxplot(model_str, kfcv_results)
else:
    # No cache: evaluate every model for every k, then persist the scores.
    model_kfcv_results = dict()
    for model in selected_classifiers:
        # Class name of the estimator, e.g. "Foo(bar=1)" -> "Foo".
        model_str = str(model).split("(")[0]
        print("\n{}: Model: {}".format(timenow(), model_str))
        kfcv_results = list()
        for n_s in n_splits:
            print("{}:  N splits: {}".format(timenow(),n_s))
            # evaluate using the given number of splits
            scores = kfcv_evaluator_class(X, y,model, n_s)
            # summarize
            print("{}:    Accuracy scores: {}\n\t     Mean (std): \t{:.3f} ({:.5f})".format(timenow(),scores.round(3), np.mean(scores), sem(scores)))
            # store
            kfcv_results.append(scores)
        model_kfcv_results[model_str] = kfcv_results
        _plot_kfcv_boxplot(model_str, kfcv_results)

    # The pickle is written first because its presence is what marks the cache as available.
    with open(kfcv_filename_path+".pkl", "wb") as kfcv_file:
        pickle.dump(model_kfcv_results, kfcv_file)
    kfcv_df = pd.DataFrame.from_dict(model_kfcv_results, orient='index', columns=n_splits)
    kfcv_df.to_csv(kfcv_filename_path+".csv")

display(kfcv_df)

1.   We calculate the mean of the results obtained in each model and by number of splits.
2. The results are plotted to see the effect of the number of splits on each model.

In [None]:
# Reload the raw fold scores and reduce them to one mean accuracy per
# (model, number of splits) cell, then plot one line per model.
# The context manager closes the pickle file once it has been read.
with open(kfcv_filename_path+".pkl", "rb") as kfcv_file:
    model_kfcv_results = pickle.load(kfcv_file)

kfcv_means_df = pd.DataFrame(index=model_kfcv_results.keys(), columns=n_splits)
for model_str, kfcv_results in model_kfcv_results.items():
    for score in kfcv_results:
        # len(score) is the number of folds, which is also the column label.
        kfcv_means_df.loc[model_str,len(score)] = np.mean(score)
# Persist the unrounded means.
kfcv_means_df.to_csv(kfcv_filename_path+"_means.csv")
# The frame was created without data, so its columns are object-typed and
# DataFrame.round() would leave them untouched; cast to float before rounding.
kfcv_means_df = kfcv_means_df.astype(float).round(3)
display(kfcv_means_df)

for model in kfcv_means_df.index:
    plt.plot(kfcv_means_df.loc[model], label = model)

plt.xlabel("Nº splits (K-folds)")
plt.ylabel("Accuracy")
plt.title("K-Fold Cross-Validation results\n Scores by number of splits ")
plt.legend(title="Models",loc='upper right', bbox_to_anchor=(0.7, 0.6, 0.5, 0.5))
plt.show()

With this internal validation method we obtain better results than with the train/test approach.

We see that increasing k has a small positive effect.

## Repeated k-Fold Cross-Validation (internal validation)

Repeats K-Fold n times with different randomization in each repetition.

We will increase k from 2 to 4 and the number of repetitions from 1 to 3.

In [None]:
# Repeated k-fold cross-validation of every selected classifier over a grid of
# (number of splits, number of repeats). The scores of each model are cached in
# their own pickle file, so models that were already evaluated are reloaded.

max_splits = 5
max_repeats = 4
n_splits = range(2,max_splits)    # upper bound is exclusive: 2 .. max_splits - 1
n_repeats = range(1,max_repeats)  # upper bound is exclusive: 1 .. max_repeats - 1

rkfcv_filename = 'rkfcv_{}_{}'.format(max_splits,max_repeats)
rkfcv_filename_path = os.path.join(exe04_tables_path, rkfcv_filename)
rkfcv_results = dict()
for model in selected_classifiers:
    # Class name of the estimator, e.g. "Foo(bar=1)" -> "Foo".
    model_str = str(model).split("(")[0]
    model_filename = rkfcv_filename_path+'-'+model_str+".sav"
    print("\n{}: Model: {}".format(timenow(), model_str))
    if os.path.isfile(model_filename):
        print("\n{}: loading Repeated k-Fold Cross-Validation Scores:\t{}".format(timenow(), model_filename))
        # The context manager closes the file even if unpickling raises.
        with open(model_filename, "rb") as model_file:
            model_rkfcv_scores = pickle.load(model_file)
        rkfcv_results[model_str] = model_rkfcv_scores
    else:
        # Nested mapping: n_splits -> n_repeats -> scores.
        model_rkfcv_scores = dict()
        for n_s in n_splits:
            print("{}:\tN splits: {}".format(timenow(),n_s))
            reps_rkfcv_results = dict() 
            for n_r in n_repeats:
                print("{}:\t\tN repeats: {}".format(timenow(),n_r))
                scores = rkfcv_evaluator_class(X, y,model, n_s, n_r)
                print("{}:\t\t\tAccuracy scores:\t{}\n\t\t\t\tMean (std):\t{:.6f} ({:.6f})".format(timenow(),scores, np.mean(scores), sem(scores)))
                reps_rkfcv_results[n_r] = scores
            model_rkfcv_scores[n_s] = reps_rkfcv_results
        # Persist the scores of this model; the handle is closed even if dumping raises.
        with open(model_filename, "wb") as model_file:
            pickle.dump(model_rkfcv_scores, model_file)
        rkfcv_results[model_str] = model_rkfcv_scores

rkfcv_df = rkfcv_to_df(rkfcv_results)
display(rkfcv_df)
plot_rkfcv(rkfcv_df)

1.   We calculate the mean of the results obtained in each model and by number of splits and number of repeats.
2. The results are plotted to see the effect of the number of repeats on each model.

In [None]:
# Replace every cell from the column labelled "1" onwards by the mean of its
# scores (presumably one column per number of repeats; confirm against rkfcv_to_df).
rkfcv_df.loc[:,"1":] = rkfcv_df.loc[:,"1":].applymap(np.mean)

# One line plot per model: x = columns of the table, one line per number of splits.
for model in selected_classifiers:
    model_str = str(model).split("(")[0]
    print(model_str)
    # Build the per-model table by chaining instead of editing the filtered
    # slice in place, which raises pandas' SettingWithCopyWarning.
    rkfcv_model_results = (
        rkfcv_df[rkfcv_df.model == model_str]
        .drop('model', axis=1)
        .set_index('n_splits')
    )
    # Transposed so that each n_splits value becomes one line.
    sns.lineplot(data=rkfcv_model_results.T)
    plt.xlabel("Nº Repeats")
    plt.ylabel("Accuracy")
    plt.title("{}\nRepeated K-Fold Cross-Validation results\n Scores by number of Repeats hued by number of splits".format(model_str))
    plt.legend(title="Nº Splits",loc='upper right', bbox_to_anchor=(0.55, 0.5, 0.5, 0.5))
    plt.show()

For a given number of splits, increasing the number of repetitions does not improve the results of any model except `KNeighborsClassifier`.


# Exercise 5: 
  - Perform some variable engineering process to improve prediction

In [None]:
# Output folders for exercise 5: the parent first, then one sub-folder per
# artefact type, keeping what mk_dir returns for each of them.
mk_dir("exe05")
exe05_models_path, exe05_tables_path, exe05_plots_path = map(
    mk_dir,
    ("exe05/exe05_models", "exe05/exe05_tables", "exe05/exe05_plots"),
)

In [None]:
# Load the pre-processed flights table; the first CSV column is the row index.
df = pd.read_csv(filepath_or_buffer="DelayedFlights_Processed.csv", index_col=0)


From "DepTime" we extract the hour and minutes and assemble them with the variables "Year", "Month" and "DayofMonth" to then convert DepTime to datetime format (YYYY-MM-DD HH:MM:SS).

In [None]:
# Rebuild "DepTime" as a full timestamp from the date columns plus the hour and
# minute of the original numeric departure time.
# numToTime is applied once and the result reused for both components, so the
# column is not converted twice.
dep_clock = df.DepTime.apply(numToTime)
df["DepTime"] = pd.to_datetime(dict(year=df.Year,
                                    month=df.Month,
                                    day=df.DayofMonth,
                                    hour=[t.hour for t in dep_clock],
                                    minute=[t.minute for t in dep_clock]))

Convert "DepDelay" to timedelta, so we can use it to obtain absolute differences in times.

In [None]:
# Interpret the numeric "DepDelay" values as minutes and store them as timedeltas.
df["DepDelay"] = pd.to_timedelta(arg=df.DepDelay, unit='m')

Now we can get the "CRSDepTime" in datetime format by subtracting the "DepDelay" from the "DepTime"

In [None]:
# Scheduled departure = actual departure minus the departure delay.
df["CRSDepTime"] = df.DepTime.sub(df.DepDelay)

Now we transform "DepDelay" from timedelta back to minutes.

In [None]:
# Convert the timedelta back to minutes.
# total_seconds() is the whole duration; .dt.seconds is only the sub-day
# component (0..86399), which would wrap delays of 24 h or more and turn
# negative delays into large positive values.
df["DepDelay"] = df.DepDelay.dt.total_seconds()/60

We sort the records, reindex and eliminate the variables "Year", "Month", "DayofMonth", "DayOfWeek".

In [None]:
# Order the flights chronologically (scheduled time first, actual time as the
# tie-breaker), renumber the rows from 0 and discard the raw date columns.
df = (
    df.sort_values(by=["CRSDepTime","DepTime"])
      .reset_index(drop=True)
      .drop(columns=["Year","Month","DayofMonth","DayOfWeek"])
)

Now that we have "DepTime" and "CRSDepTime" in datetime format we can extract new features.

Extract cyclic time related features

In [None]:
# Calendar and clock components of the actual departure time.
df["DepTime_hour"] = df.DepTime.dt.hour
df["DepTime_minute"] = df.DepTime.dt.minute
df["DepTime_month"] = df.DepTime.dt.month
df["DepTime_day"] = df.DepTime.dt.day
df["DepTime_dayofweek"] = df.DepTime.dt.dayofweek
# Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is its replacement. It returns a nullable UInt32 column,
# so it is cast back to a plain integer dtype.
df["DepTime_weekofyear"] = df.DepTime.dt.isocalendar().week.astype("int64")
df["DepTime_dayofyear"] = df.DepTime.dt.dayofyear
df["DepTime_quarter"] = df.DepTime.dt.quarter

# Day, hour and minute of the scheduled departure time.
df["CRSDepTime_day"] = df.CRSDepTime.dt.day
df["CRSDepTime_hour"] = df.CRSDepTime.dt.hour
df["CRSDepTime_minute"] = df.CRSDepTime.dt.minute

Extract number of days in the month.

In [None]:
# Length, in days, of the month in which the flight departed.
df["DepTime_daysinmonth"] = df["DepTime"].dt.days_in_month

Extract boolean features

In [None]:
# One boolean column per calendar-boundary flag of the departure timestamp;
# each column is named "DepTime_" + the name of the datetime accessor attribute.
for flag in ("is_month_start", "is_month_end",
             "is_quarter_start", "is_quarter_end",
             "is_year_start", "is_year_end"):
    df["DepTime_" + flag] = getattr(df.DepTime.dt, flag)

Now we can drop "DepTime","CRSDepTime" 

In [None]:
# The two timestamp columns are removed now that their components have been extracted.
df = df.drop(columns=["DepTime","CRSDepTime"])

Convert the boolean values into integer value

In [None]:
# Names of all boolean-typed columns, then cast them to integers (True -> 1, False -> 0).
bool_cols = list(df.select_dtypes(include=["bool"]).columns)
df[bool_cols] = df[bool_cols].astype(int)

Encode cyclical features (all the time-related features) as a position on a circle by computing the sine and cosine of the corresponding angle.


In [None]:
# Columns treated as periodic quantities. The order of the list is kept as is,
# since it may determine the order of the generated columns.
cyclic_cols = [
    'ArrTime', 'CRSArrTime',
    'DepTime_month', 'DepTime_day', 'DepTime_hour', 'DepTime_minute',
    'DepTime_weekofyear', 'DepTime_dayofweek', 'DepTime_dayofyear',
    'DepTime_quarter',
    'CRSDepTime_day', 'CRSDepTime_hour', 'CRSDepTime_minute',
]

# drop_original=True: the source columns are removed and only the encoded ones kept.
cyclical = CyclicalTransformer(variables=cyclic_cols, drop_original=True)
cyclical.fit(df)
df = cyclical.transform(df)

Drop Categorical Variables we are not going to use

In [None]:
# Remove the identifier-like columns that will not be used as features.
df = df.drop(columns=["TailNum","Origin", "Dest", "FlightNum"])

Transform the categorical variables ("UniqueCarrier") to dummies.

In [None]:
# One-hot encode the remaining categorical columns; the other columns pass through unchanged.
df = pd.get_dummies(data=df)

Save engineered data.

In [None]:
# Persist the engineered table (row index included) for the following exercise.
df.to_csv(path_or_buf="DelayedFlights_Engineered.csv")

Divide data into Explanatory and Response variables. 

In [None]:
# Response variable, then the explanatory variables (every other column).
y = df['ArrDelay']
X = df.drop(columns=['ArrDelay'])

We scale the data and divide it into training and test sets

In [None]:
# Min-max scale all the explanatory variables.
# NOTE(review): the scaler is fitted on the whole dataset before the train/test
# split, so the test rows influence the scaling - consider fitting it on the
# training partition only.
X = MinMaxScaler().fit(X).transform(X)

In [None]:
# Hold out 40 % of the rows for testing; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=130, test_size=0.4
)

We perform over-sampling using SMOTE.

In [None]:
# Oversample with SMOTE; only the training partition is resampled, the test
# partition is left untouched.
sm = SMOTE(random_state=130)
X_train, y_train = sm.fit_resample(X=X_train, y=y_train)

Train models and make predictions.

In [None]:
# Fit every classifier on the engineered training data (or reload it from disk
# when a pickled model already exists), evaluate it and tabulate the metrics.
metrics_lst = []
for model in classifiers:
    # Model name: class name of the estimator prefixed with the exercise number
    model_str = str(model).split("(")[0]
    model_name = '{}-{}'.format("05", model_str)
    # File name
    model_filename = model_name + ".pkl"
    model_filename_path = os.path.join(exe05_models_path, model_filename)
    if os.path.isfile(model_filename_path):
        # load existing model
        # NOTE: unpickling can execute arbitrary code; only load model files
        # written by this notebook.
        print("\n{}: Loading model:\t{}".format(timenow(), model_filename))
        with open(model_filename_path, 'rb') as model_file:
            model = pickle.load(model_file)
    else:
        # Fit model and report how long it took
        print("\n{}: Start Fitting model:\t{}".format(timenow(), model_name))
        tiempo_inicio_fit = datetime.datetime.now()
        model.fit(X_train, y_train)
        fit_time = datetime.datetime.now() - tiempo_inicio_fit
        print("Fitting time:\t{}".format(fit_time))

        # save the model to disk; the context manager flushes and closes the file
        with open(model_filename_path, 'wb') as model_file:
            pickle.dump(model, model_file)

    # Evaluate model
    plt_path = os.path.join(exe05_plots_path, model_name + "_conf.png")
    print("\n{}: Evaluate model:\n".format(timenow()))
    metrics_lst.append(class_metrics(model, model_name, X_train, X_test, y_train, y_test, plt_path))
# Tabulate results: one row per model, best accuracy first
class_metrics_df = pd.DataFrame(metrics_lst)
class_metrics_df.set_index(keys="Model", inplace=True)
class_metrics_df.sort_values(by=["Accuracy"], ascending=False, inplace=True)
# Save results
class_metrics_df.to_csv(os.path.join(exe05_tables_path, "05-classification_metrics.csv"))

To see the effect of engineering on the predictions we will compare the results with those of exercise 1.

We combine the results of the two exercises.

In [None]:
metrics_01_path = os.path.join(exe01_tables_path, "01-classification_metrics.csv")
metrics_05_path = os.path.join(exe05_tables_path, "05-classification_metrics.csv")

# Read both metric tables (exercise 1 first) and stack them into one frame.
metrics_lst = [
    pd.read_csv(metrics_file, index_col=0)
    for metrics_file in (metrics_01_path, metrics_05_path)
]
final_metrics_df_1_5 = pd.concat(metrics_lst)

# Show the combined table with all of its rows on a single page.
data_table.enable_dataframe_formatter()
data_table.DataTable(final_metrics_df_1_5, num_rows_per_page=len(final_metrics_df_1_5))

We print the model that obtains the best metric.

In [None]:
# For every classifier, print which row of the combined table is best for each metric.
# Metrics for which a smaller value is better; every other column is ranked
# from highest to lowest.
lower_is_better = ("FP_rate","Misclass_rate","Misclass","FP","FN")
for model in classifiers:
    # Model name 
    model_str = str(model).split("(")[0]
    # Keep the rows whose index label matches the model name (regex search).
    models_metric = final_metrics_df_1_5.filter(regex=model_str, axis=0)
    print("\n{}: ".format(model_str))
    print("{:<15} {:<10}".format("METRIC","BEST MODEL"))
    print("{:<15} {:<10}".format("-"*10,"-"*10))
    for metric in models_metric.columns:
        ascend = metric in lower_is_better
        # Rebind the sorted frame instead of sorting in place: mutating the
        # result of filter() raises pandas' SettingWithCopyWarning.
        models_metric = models_metric.sort_values(by=metric, ascending=ascend)
        # After sorting, the first row is the best one for this metric.
        print("{:<15} {:<10}".format(metric,models_metric.index[0]))

For each metric we plot the models ordered from best to worst result.

In [None]:
# One horizontal bar chart per metric, rows ordered from best to worst, saved
# as "<metric>_summary.png" in the exercise-5 plots folder.
lower_is_better = ["FP_rate","Misclass_rate","Misclass","FP","FN"]
for metric in final_metrics_df_1_5.columns:
    # Ascending order puts the best row first for "lower is better" metrics.
    final_metrics_df_1_5.sort_values(by=metric, inplace=True, ascending=(metric in lower_is_better))
    n_rows = final_metrics_df_1_5.shape[0]
    plt.figure(figsize = (15, n_rows/4))
    sns.barplot(data=final_metrics_df_1_5, x = metric, y = final_metrics_df_1_5.index)
    plt.title(metric)
    plt_path = os.path.join(exe05_plots_path, metric + "_summary.png")
    plt.savefig(plt_path)
    plt.show()

# Exercise 6: 
  - Do not use the DepDelay variable when making predictions

In [None]:
# Output folders for exercise 6: the parent first, then one sub-folder per
# artefact type, keeping what mk_dir returns for each of them.
mk_dir("exe06")
exe06_models_path, exe06_tables_path, exe06_plots_path = map(
    mk_dir,
    ("exe06/exe06_models", "exe06/exe06_tables", "exe06/exe06_plots"),
)

In [None]:
# Reload the engineered table; the first CSV column is the row index.
df = pd.read_csv(filepath_or_buffer="DelayedFlights_Engineered.csv", index_col=0)

In [None]:
# Response variable, then the explanatory variables: every other column
# except "DepDelay", which this exercise excludes from the predictors.
y = df['ArrDelay']
X = df.drop(columns=['ArrDelay', 'DepDelay'])

We scale the data and divide it into training and test sets

In [None]:
# Min-max scale all the explanatory variables.
# NOTE(review): the scaler is fitted on the whole dataset before the train/test
# split, so the test rows influence the scaling - consider fitting it on the
# training partition only.
X = MinMaxScaler().fit(X).transform(X)

In [None]:
# Hold out 40 % of the rows for testing; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=130, test_size=0.4
)

We perform over-sampling using SMOTE.

In [None]:
# Oversample with SMOTE; only the training partition is resampled, the test
# partition is left untouched.
sm = SMOTE(random_state=130)
X_train, y_train = sm.fit_resample(X=X_train, y=y_train)

Train models and make predictions.

In [None]:
# Fit every classifier on the training data without "DepDelay" (or reload it
# from disk when a pickled model already exists), evaluate it and tabulate the metrics.
metrics_lst = []
for model in classifiers:
    # Model name: class name of the estimator prefixed with the exercise number
    model_str = str(model).split("(")[0]
    model_name = '{}-{}'.format("06", model_str)
    # File name
    model_filename = model_name + ".pkl"
    model_filename_path = os.path.join(exe06_models_path, model_filename)
    if os.path.isfile(model_filename_path):
        # load existing model
        # NOTE: unpickling can execute arbitrary code; only load model files
        # written by this notebook.
        print("\n{}: Loading model:\t{}".format(timenow(), model_filename))
        with open(model_filename_path, 'rb') as model_file:
            model = pickle.load(model_file)
    else:
        # Fit model and report how long it took
        print("\n{}: Start Fitting model:\t{}".format(timenow(), model_name))
        tiempo_inicio_fit = datetime.datetime.now()
        model.fit(X_train, y_train)
        fit_time = datetime.datetime.now() - tiempo_inicio_fit
        print("Fitting time:\t{}".format(fit_time))

        # save the model to disk; the context manager flushes and closes the file
        with open(model_filename_path, 'wb') as model_file:
            pickle.dump(model, model_file)

    # Evaluate model
    plt_path = os.path.join(exe06_plots_path, model_name + "_conf.png")
    print("\n{}: Evaluate model:\n".format(timenow()))
    metrics_lst.append(class_metrics(model, model_name, X_train, X_test, y_train, y_test, plt_path))
# Tabulate results: one row per model, best accuracy first
class_metrics_df = pd.DataFrame(metrics_lst)
class_metrics_df.set_index(keys="Model", inplace=True)
class_metrics_df.sort_values(by=["Accuracy"], ascending=False, inplace=True)
# Save results
class_metrics_df.to_csv(os.path.join(exe06_tables_path, "06-classification_metrics.csv"))

To see the effect of omitting the "DepDelay" variable on the predictions we will compare the results with those of exercise 5.

We combine the results of the two exercises.

In [None]:
metrics_05_path = os.path.join(exe05_tables_path, "05-classification_metrics.csv")
metrics_06_path = os.path.join(exe06_tables_path, "06-classification_metrics.csv")

# Read both metric tables (exercise 6 first) and stack them into one frame.
metrics_lst = [
    pd.read_csv(metrics_file, index_col=0)
    for metrics_file in (metrics_06_path, metrics_05_path)
]
final_metrics_df_5_6 = pd.concat(metrics_lst)

# Show the combined table with all of its rows on a single page.
data_table.enable_dataframe_formatter()
data_table.DataTable(final_metrics_df_5_6, num_rows_per_page=len(final_metrics_df_5_6))

We print the model that obtains the best metric.

In [None]:
# For every classifier, print which row of the combined table is best for each metric.
# Metrics for which a smaller value is better; every other column is ranked
# from highest to lowest.
lower_is_better = ("FP_rate","Misclass_rate","Misclass","FP","FN")
for model in classifiers:
    # Model name 
    model_str = str(model).split("(")[0]
    # Keep the rows whose index label matches the model name (regex search).
    models_metric = final_metrics_df_5_6.filter(regex=model_str, axis=0)
    print("\n{}: ".format(model_str))
    print("{:<15} {:<10}".format("METRIC","BEST MODEL"))
    print("{:<15} {:<10}".format("-"*10,"-"*10))
    for metric in models_metric.columns:
        ascend = metric in lower_is_better
        # Rebind the sorted frame instead of sorting in place: mutating the
        # result of filter() raises pandas' SettingWithCopyWarning.
        models_metric = models_metric.sort_values(by=metric, ascending=ascend)
        # After sorting, the first row is the best one for this metric.
        print("{:<15} {:<10}".format(metric,models_metric.index[0]))

For each metric we plot the models ordered from best to worst result.

In [None]:
# One horizontal bar chart per metric, rows ordered from best to worst, saved
# as "<metric>_summary.png" in the exercise-6 plots folder.
lower_is_better = ["FP_rate","Misclass_rate","Misclass","FP","FN"]
for metric in final_metrics_df_5_6.columns:
    # Ascending order puts the best row first for "lower is better" metrics.
    final_metrics_df_5_6.sort_values(by=metric, inplace=True, ascending=(metric in lower_is_better))
    n_rows = final_metrics_df_5_6.shape[0]
    plt.figure(figsize = (15, n_rows/4))
    sns.barplot(data=final_metrics_df_5_6, x = metric, y = final_metrics_df_5_6.index)
    plt.title(metric)
    plt_path = os.path.join(exe06_plots_path, metric + "_summary.png")
    plt.savefig(plt_path)
    plt.show()