In [1]:
import pandas as pd
import numpy as np
import pickle
import logging

from sklearn.model_selection import train_test_split
from app.data_engineering.data_access import read_db
from app.utils.multi_column_label_encode import MultiColumnLabelEncoder
from app.data_engineering.feature_engineering import FeatureEngineering
from app.model import Model
import math


pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
datasets = read_db()

In [3]:
datasets.keys()

dict_keys(['batch1', 'batch2', 'test'])

In [4]:
df_batch1 = datasets['batch1']
df_batch2 = datasets['batch2']
df_test = datasets['test']

## Définition de nos labels et de nos features

In [5]:
FEATURES = df_test['vols'].columns.tolist()
FEATURES.remove('NIVEAU DE SECURITE')
FEATURES

['IDENTIFIANT',
 'VOL',
 'CODE AVION',
 'AEROPORT DEPART',
 'AEROPORT ARRIVEE',
 'DEPART PROGRAMME',
 'TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE',
 'TEMPS PROGRAMME',
 'DISTANCE',
 "TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE",
 'ARRIVEE PROGRAMMEE',
 'COMPAGNIE AERIENNE',
 'NOMBRE DE PASSAGERS',
 'DATE']

In [6]:
LABELS = list(set(df_batch1['vols'].columns.tolist()) - set(df_test['vols'].columns.tolist()))
LABELS

['DECOLLAGE',
 'RETARD SYSTEM',
 'RETARD SECURITE',
 'ATTERRISSAGE',
 'HEURE DE DEPART',
 "RAISON D'ANNULATION",
 "RETARD A L'ARRIVEE",
 'RETARD METEO',
 'DETOURNEMENT',
 'RETARD AVION',
 "HEURE D'ARRIVEE",
 'RETART DE DEPART',
 'TEMPS DE VOL',
 'TEMPS PASSE',
 'RETARD COMPAGNIE',
 'ANNULATION']

In [7]:
df_batch1.keys(), df_batch2.keys(), df_test.keys()

(dict_keys(['vols', 'aeroports', 'compagnies', 'prix_fuel']),
 dict_keys(['vols', 'aeroports', 'compagnies', 'prix_fuel']),
 dict_keys(['vols']))

# Feature Engineering

In [8]:
# la valeur a prédire 
label = "RETARD A L'ARRIVEE"

In [9]:
def train_test_split_80_20(X, y): 
    return train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
vol_batch1_smaller = pd.concat([df_batch1['vols'], df_batch2['vols']]).tail(250000)#.head(1200000)
df_airport = pd.concat([df_batch1['aeroports'], df_batch2['aeroports']])

In [11]:
vol_batch1_smaller[FEATURES+[label]].head()

Unnamed: 0,IDENTIFIANT,VOL,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,DEPART PROGRAMME,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS PROGRAMME,DISTANCE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,DATE,RETARD A L'ARRIVEE
1082914,3178493,2790,349cac862e,DMM,CRL,1300,12.0,100.0,543,6.0,1440,WKEUW,34,5/3/2016,-9.0
1082915,3178494,4297,df4ff9876a,DMM,CRL,1625,11.0,100.0,543,9.0,1805,WKEUW,43,5/3/2017,4.0
1082916,3178495,3479,c2423a929a,DMM,CRL,1835,11.0,100.0,543,6.0,2015,WKEUW,42,5/3/2017,-8.0
1082917,3178496,574,c6b264ebcd,DMM,CRL,830,12.0,100.0,543,5.0,1010,WKEUW,37,6/3/2017,-7.0
1082918,3178497,4175,71e5cdb634,DMM,CRL,1130,11.0,100.0,543,5.0,1310,WKEUW,32,6/3/2018,-5.0


In [12]:
vol_batch1_smaller[FEATURES+[label]].describe(include='all')

Unnamed: 0,IDENTIFIANT,VOL,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,DEPART PROGRAMME,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS PROGRAMME,DISTANCE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,DATE,RETARD A L'ARRIVEE
count,250000.0,250000.0,250000.0,250000,250000,250000.0,246678.0,250000.0,250000.0,246529.0,250000.0,250000,250000.0,250000,246046.0
unique,,,914.0,64,51,,,,,,,4,,1002,
top,,,618131665.0,TPE,GLA,,,,,,,WKEUW,,29/11/2016,
freq,,,706.0,30531,25886,,,,,,,248177,,328,
mean,3314433.0,1859.049268,,,,1395.491228,11.901783,115.717476,677.213232,5.092046,1533.738064,,129.225844,,5.969209
std,162490.3,1456.304127,,,,471.06172,6.27415,50.260323,408.424319,3.318512,530.240942,,70.150542,,33.382532
min,3178493.0,1.0,,,,500.0,1.0,45.0,159.0,1.0,5.0,,-1.0,,-70.0
25%,3240993.0,586.0,,,,1010.0,8.0,75.0,358.0,4.0,1145.0,,47.0,,-10.0
50%,3303492.0,1510.0,,,,1350.0,10.0,100.0,516.0,5.0,1545.0,,145.0,,-3.0
75%,3365992.0,2905.0,,,,1815.0,13.0,150.0,928.0,6.0,1950.0,,200.0,,10.0


In [13]:
vol_batch1_smaller[FEATURES+[label]].corr()

Unnamed: 0,IDENTIFIANT,VOL,DEPART PROGRAMME,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS PROGRAMME,DISTANCE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,NOMBRE DE PASSAGERS,RETARD A L'ARRIVEE
IDENTIFIANT,1.0,0.0824,0.002726,0.084025,0.010916,-0.001093,0.03404,0.013296,0.006257,0.015195
VOL,0.0824,1.0,-0.040857,0.006223,0.048288,0.042889,0.015683,-0.023318,-0.000765,-0.010587
DEPART PROGRAMME,0.002726,-0.040857,1.0,-0.048187,-0.097867,-0.078113,-0.000625,0.667698,0.001892,0.197419
TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,0.084025,0.006223,-0.048187,1.0,0.108786,0.078482,0.0165,-0.025172,0.008202,0.189139
TEMPS PROGRAMME,0.010916,0.048288,-0.097867,0.108786,1.0,0.98142,0.145365,0.003393,-0.016797,-0.037129
DISTANCE,-0.001093,0.042889,-0.078113,0.078482,0.98142,1.0,0.129571,0.012844,-0.010518,-0.021187
TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,0.03404,0.015683,-0.000625,0.0165,0.145365,0.129571,1.0,0.017463,0.009406,0.088539
ARRIVEE PROGRAMMEE,0.013296,-0.023318,0.667698,-0.025172,0.003393,0.012844,0.017463,1.0,0.002941,0.147682
NOMBRE DE PASSAGERS,0.006257,-0.000765,0.001892,0.008202,-0.016797,-0.010518,0.009406,0.002941,1.0,0.061688
RETARD A L'ARRIVEE,0.015195,-0.010587,0.197419,0.189139,-0.037129,-0.021187,0.088539,0.147682,0.061688,1.0


In [14]:
# %load ../app/data_engineering/data_cleaning.py
import pandas as pd


class DataCleaning:
    def __init__(self, features_columns, label):
        self.features_columns = features_columns
        self.label = label
        self.cie_by_avion = None

    def remove_unused_columns(self, df):
        if 'NIVEAU DE SECURITE' in df.columns:
            df = df.drop(columns=['NIVEAU DE SECURITE'])
        return df

    def drop_na(self, df):
        df = df.dropna(subset=self.features_columns)
        if self.label in df.columns:
            df = df.dropna(subset=[self.label])
        return df

    def fill_na(self, df):
        if self.cie_by_avion is None:
            self.cie_by_avion = df[['CODE AVION', 'COMPAGNIE AERIENNE']].dropna().groupby(
                by=['CODE AVION']).first().to_dict()['COMPAGNIE AERIENNE']

        df.loc[df['COMPAGNIE AERIENNE'].isna(), 'COMPAGNIE AERIENNE'] = df.loc[
            df['COMPAGNIE AERIENNE'].isna(), 'CODE AVION'].apply(
                lambda x: self.cie_by_avion.get(x, "UKN")
            )
        return df
    
    def fit_drop(self, df):
        df = self.fill_na(df)
        df = self.drop_na(df)
        df = self.remove_unused_columns(df)
        return df
    
    def fit(self, df):
        df = df.copy()
        df = df[df['NOMBRE DE PASSAGERS']<1000] 
        df = df[df["RETARD A L'ARRIVEE"]<250]
        df.loc[:, 'DATE'] = pd.to_datetime(df['DATE'])
        return df

    def transform(self, df):
        df = df.copy()
        df.loc[:, 'DATE'] = pd.to_datetime(df['DATE'])
        return df


In [15]:
cleaning = DataCleaning(features_columns=FEATURES, label=label)
cleaned_vol = cleaning.fit_drop(vol_batch1_smaller)

In [16]:
X = cleaned_vol[FEATURES]
y = cleaned_vol[label]#.apply(lambda x: 1 if x>5 else 0)

In [17]:
X_train, X_test, y_train, y_test = train_test_split_80_20(X, y)

In [18]:
train = cleaning.fit(pd.concat([X_train, y_train], axis=1))
X_train = train[FEATURES]
y_train = train[label]
X_test = cleaning.transform(X_test)

In [19]:
# %load ../app/data_engineering/feature_engineering.py
import pickle

import pandas as pd

from app.utils.multi_column_label_encode import MultiColumnLabelEncoder


class FeatureEngineering:
    def __init__(self, training_columns=None, columns_to_dummify=None, df_airport=None):
        self.training_columns = training_columns
        self.columns_to_dummify = columns_to_dummify
        self.label_encoder = MultiColumnLabelEncoder(columns=self.columns_to_dummify)
        self.average_nb_plane_by_day = {}
        self.airport = self.__get_airport_dict(df_airport)
        
    def __get_airport_dict(self, df_airport):
        df_airport.drop_duplicates(inplace = True)

        duplicate_airport = df_airport.loc[
            df_airport.duplicated(subset=['CODE IATA'], keep=False),
            ['CODE IATA', 'PRIX RETARD PREMIERE 20 MINUTES', 'PRIS RETARD POUR CHAQUE MINUTE APRES 10 MINUTES']].groupby(
            by=['CODE IATA']).mean().to_dict(orient='index')

        df_airport.drop_duplicates(inplace = True, subset=['CODE IATA'])
        
        for code_iata in duplicate_airport.keys():
            df_airport.loc[
                df_airport['CODE IATA']==code_iata,
                ['PRIX RETARD PREMIERE 20 MINUTES']
            ] = df_airport.loc[
                df_airport['CODE IATA']==code_iata,
                ['PRIX RETARD PREMIERE 20 MINUTES']
            ].apply(lambda x: duplicate_airport[code_iata]['PRIX RETARD PREMIERE 20 MINUTES'],
                    axis=1)

            df_airport.loc[
                df_airport['CODE IATA']==code_iata,
                ['PRIS RETARD POUR CHAQUE MINUTE APRES 10 MINUTES']
            ] = df_airport.loc[
                df_airport['CODE IATA']==code_iata,
                ['PRIS RETARD POUR CHAQUE MINUTE APRES 10 MINUTES']
            ].apply(lambda x: duplicate_airport[code_iata]['PRIS RETARD POUR CHAQUE MINUTE APRES 10 MINUTES'],
                    axis=1)
            
        df_airport['LONGITUDE'] = df_airport['LONGITUDE'].astype('float')
        df_airport['LATITUDE'] = df_airport['LATITUDE'].astype('float')
        df_airport['LONGITUDE TRONQUEE'] = df_airport['LONGITUDE'].apply(round)
        df_airport['LATITUDE TRONQUEE'] = df_airport['LATITUDE'].apply(round)
        return df_airport.set_index("CODE IATA").to_dict(orient='index')

    def get_month(self, df):
        return df.apply(lambda x: x.month)

    def get_week(self, df):
        return df.apply(lambda x: x.week)

    def get_hour(self, df):
        return df.apply(lambda x: x // 100)

    def __get_dict_of_average_plane_by_day(self, df, airport_type: str):
        min_date = df['DATE'].min()
        max_date = df['DATE'].max()
        number_of_days = (max_date - min_date).days + 1
        return df[
            [airport_type, 'IDENTIFIANT', 'DATE']].groupby([airport_type, 'DATE']).count().reset_index()[
            [airport_type, 'IDENTIFIANT']].groupby([airport_type]).sum().apply(
            lambda x: x / number_of_days
        )['IDENTIFIANT'].to_dict()

    def get_average_plane_take_off_or_landing_by_day(self, df, airport_type):
        self.average_nb_plane_by_day[airport_type] = self.__get_dict_of_average_plane_by_day(df, airport_type)
        return df[airport_type].apply(lambda x: self.average_nb_plane_by_day[airport_type][x])

    def apply_average_plane_take_off_or_landing_by_day(self, df, airport_type):
        return df[airport_type].apply(
            lambda x: self.average_nb_plane_by_day[airport_type][x] if x in self.average_nb_plane_by_day[
                airport_type] else 0)
    
    def add_data_from_airport(self, X):
        X.loc[:, 'PAYS DEPART'] = X['AEROPORT DEPART'].apply(lambda x: self.airport.get(x)['PAYS'])
        X.loc[:, 'PAYS ARRIVEE'] = X['AEROPORT ARRIVEE'].apply(lambda x: self.airport.get(x)['PAYS'])

        X.loc[:, 'HAUTEUR DEPART'] = X['AEROPORT DEPART'].apply(lambda x: self.airport.get(x)['HAUTEUR'])
        X.loc[:, 'HAUTEUR ARRIVEE'] = X['AEROPORT ARRIVEE'].apply(lambda x: self.airport.get(x)['HAUTEUR'])
        X.loc[:, 'LONGITUDE ARRIVEE'] = X['AEROPORT ARRIVEE'].apply(lambda x: self.airport.get(x)['LONGITUDE TRONQUEE'])
        X.loc[:, 'LATITUDE ARRIVEE'] = X['AEROPORT ARRIVEE'].apply(lambda x: self.airport.get(x)['LATITUDE TRONQUEE'])

        X.loc[:, 'PRIX RETARD PREMIERE 20 MINUTES'] = X['AEROPORT ARRIVEE'].apply(
            lambda x: self.airport.get(x)['PRIX RETARD PREMIERE 20 MINUTES'])
        X.loc[:, 'PRIS RETARD POUR CHAQUE MINUTE APRES 10 MINUTES'] = X['AEROPORT ARRIVEE'].apply(
            lambda x: self.airport.get(x)['PRIS RETARD POUR CHAQUE MINUTE APRES 10 MINUTES'])
        
        return X

    def keep_training_columns(self, X):
        if self.training_columns is not None:
            return X[self.training_columns]
        return X

    def fit_transform_dummify_columns(self, X):
        if self.columns_to_dummify is not None:
            return self.label_encoder.fit_transform(X)
        return X

    def transform_dummify_columns(self, X):
        if self.columns_to_dummify is not None:
            return self.label_encoder.transform(X)
        return X

    def fit(self, dataframe: pd.DataFrame):
        X = dataframe.copy()

        X.loc[:, 'DATE'] = pd.to_datetime(X['DATE'])
        X.loc[:, 'MOIS'] = self.get_month(X['DATE'])
        X.loc[:, 'SEMAINE'] = self.get_week(X['DATE'])
        X.loc[:, 'HEURE DEPART PROGRAMME'] = self.get_hour(X['DEPART PROGRAMME'])
        X.loc[:, 'HEURE ARRIVEE PROGRAMMEE'] = self.get_hour(X['ARRIVEE PROGRAMMEE'])

        X.loc[:, 'NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR'] = self.get_average_plane_take_off_or_landing_by_day(
            X, 'AEROPORT DEPART'
        )
        X.loc[:, 'NOMBRE ATTERRISSAGE PAR AEROPORT PAR JOUR'] = self.get_average_plane_take_off_or_landing_by_day(
            X, 'AEROPORT ARRIVEE'
        )
        
        X['TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE'] = X[
            'TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE'].apply(lambda x: math.sqrt(x))
        X["TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE"] = X[
            "TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE"].apply(lambda x: math.sqrt(x))
        
        X = self.add_data_from_airport(X)

        X = self.fit_transform_dummify_columns(X)

        X = self.keep_training_columns(X)

        return X

    def transform(self, dataframe: pd.DataFrame):
        X = dataframe.copy()

        X.loc[:, 'DATE'] = pd.to_datetime(X['DATE'])
        X.loc[:, 'MOIS'] = self.get_month(X['DATE'])
        X.loc[:, 'SEMAINE'] = self.get_week(X['DATE'])
        X.loc[:, 'HEURE DEPART PROGRAMME'] = self.get_hour(X['DEPART PROGRAMME'])
        X.loc[:, 'HEURE ARRIVEE PROGRAMMEE'] = self.get_hour(X['ARRIVEE PROGRAMMEE'])

        X.loc[:, 'NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR'] = self.apply_average_plane_take_off_or_landing_by_day(
            X, 'AEROPORT DEPART'
        )
        X.loc[:, 'NOMBRE ATTERRISSAGE PAR AEROPORT PAR JOUR'] = self.apply_average_plane_take_off_or_landing_by_day(
            X, 'AEROPORT ARRIVEE'
        )
        
        X['TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE'] = X[
            'TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE'].apply(lambda x: math.sqrt(x))
        X["TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE"] = X[
            "TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE"].apply(lambda x: math.sqrt(x))
        
        X = self.add_data_from_airport(X)
        
        X = self.transform_dummify_columns(X)

        X = self.keep_training_columns(X)

        return X

    # TODO: add test
    def save_feature_engineering(self, path=None):
        """
        Save to file in the current working directory
        """
        if path is None:
            path = "../../data/output/feature_engineering.pkl"
        with open(path, 'wb') as file:
            pickle.dump(self, file)

    # TODO: add test
    def load_feature_engineering(self, path=None):
        """
        Load file in an instance
        """
        print(path)
        if path is None:
            path = "../../data/output/feature_engineering.pkl"
        with open(path, 'rb') as file:
            pickle_fe = pickle.load(file)
            self.training_columns = pickle_fe.training_columns
            self.columns_to_dummify = pickle_fe.columns_to_dummify
            self.label_encoder = pickle_fe.label_encoder
            self.average_nb_plane_by_day = pickle_fe.average_nb_plane_by_day
        return self


In [20]:
feature_engineering = FeatureEngineering(
        training_columns=[
            'CODE AVION',
            'AEROPORT DEPART',
            'AEROPORT ARRIVEE', 
            'NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR',
            'NOMBRE ATTERRISSAGE PAR AEROPORT PAR JOUR',
            'TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE',
            "TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE",
            'TEMPS PROGRAMME',
            'DISTANCE',
            'COMPAGNIE AERIENNE',
            'NOMBRE DE PASSAGERS',
            'MOIS', # small perf +
            'SEMAINE',
            'HEURE DEPART PROGRAMME',
            'HEURE ARRIVEE PROGRAMMEE',
            'PAYS DEPART',
            'PAYS ARRIVEE',
            'HAUTEUR DEPART',
            'HAUTEUR ARRIVEE',
            'LONGITUDE ARRIVEE',
            'LATITUDE ARRIVEE',
            'PRIX RETARD PREMIERE 20 MINUTES',
            'PRIS RETARD POUR CHAQUE MINUTE APRES 10 MINUTES',
        ],
        columns_to_dummify=[
            'AEROPORT DEPART',
            'AEROPORT ARRIVEE', 
            'COMPAGNIE AERIENNE', 
            'CODE AVION',
            'PAYS DEPART',
            'PAYS ARRIVEE',
        ],
        df_airport=df_airport,
    )
X_train_engineered = feature_engineering.fit(X_train)
X_test_engineered = feature_engineering.transform(X_test)

In [21]:
display(X_train_engineered.head())
X_train_engineered.dtypes

Unnamed: 0,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR,NOMBRE ATTERRISSAGE PAR AEROPORT PAR JOUR,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,TEMPS PROGRAMME,DISTANCE,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,MOIS,SEMAINE,HEURE DEPART PROGRAMME,HEURE ARRIVEE PROGRAMMEE,PAYS DEPART,PAYS ARRIVEE,HAUTEUR DEPART,HAUTEUR ARRIVEE,LONGITUDE ARRIVEE,LATITUDE ARRIVEE,PRIX RETARD PREMIERE 20 MINUTES,PRIS RETARD POUR CHAQUE MINUTE APRES 10 MINUTES
1282793,80,11,39,1.380474,2.271898,3.741657,2.645751,155.0,991,3,99,2,7,18,21,6,6,14.0,13.0,121,30,43,4
1251207,233,36,17,10.874088,1.531022,3.162278,2.0,65.0,293,3,216,8,33,8,9,27,28,1562.0,13.0,52,25,42,7
1219862,111,34,2,9.416971,9.713504,2.44949,1.414214,85.0,369,3,111,2,9,18,18,43,29,5558.0,89.0,40,43,29,1
1095199,839,33,14,2.757299,15.224453,3.0,3.0,95.0,446,3,214,9,36,22,23,30,2,1646.0,614.0,4,50,41,4
1098477,720,48,14,1.79927,15.224453,3.162278,2.236068,85.0,371,3,17,4,16,8,9,5,2,116.0,614.0,4,50,41,4


CODE AVION                                           int64
AEROPORT DEPART                                      int64
AEROPORT ARRIVEE                                     int64
NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR             float64
NOMBRE ATTERRISSAGE PAR AEROPORT PAR JOUR          float64
TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE          float64
TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE      float64
TEMPS PROGRAMME                                    float64
DISTANCE                                             int64
COMPAGNIE AERIENNE                                   int64
NOMBRE DE PASSAGERS                                  int64
MOIS                                                 int64
SEMAINE                                              int64
HEURE DEPART PROGRAMME                               int64
HEURE ARRIVEE PROGRAMMEE                             int64
PAYS DEPART                                          int64
PAYS ARRIVEE                                         int

In [22]:
X_train_engineered.describe(include='all')

Unnamed: 0,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR,NOMBRE ATTERRISSAGE PAR AEROPORT PAR JOUR,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,TEMPS PROGRAMME,DISTANCE,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,MOIS,SEMAINE,HEURE DEPART PROGRAMME,HEURE ARRIVEE PROGRAMMEE,PAYS DEPART,PAYS ARRIVEE,HAUTEUR DEPART,HAUTEUR ARRIVEE,LONGITUDE ARRIVEE,LATITUDE ARRIVEE,PRIX RETARD PREMIERE 20 MINUTES,PRIS RETARD POUR CHAQUE MINUTE APRES 10 MINUTES
count,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0,196456.0
mean,454.296524,37.233121,25.557061,9.622551,7.876589,3.370305,2.194695,115.701521,677.255696,2.99012,129.269709,6.343812,25.726412,13.68033,15.053656,24.068209,15.402289,710.835775,464.96616,38.31207,36.629652,51.129591,4.544901
std,261.443403,19.349468,15.079124,6.51124,5.683108,0.723067,0.524614,50.384893,409.616482,0.130034,70.241921,3.415822,14.878991,4.712169,5.298551,15.044945,10.144232,1283.992371,761.373899,42.611072,19.050249,20.059383,2.568962
min,0.0,0.0,0.0,0.00365,0.071168,1.0,1.0,45.0,159.0,0.0,-1.0,1.0,1.0,5.0,0.0,0.0,0.0,10.0,10.0,-17.0,-25.0,17.0,1.0
25%,231.0,23.0,14.0,4.391423,2.985401,2.828427,2.0,75.0,358.0,3.0,47.0,3.0,13.0,10.0,11.0,9.0,7.0,89.0,30.0,4.0,25.0,31.0,2.0
50%,447.0,36.0,24.0,9.416971,7.031022,3.162278,2.236068,100.0,516.0,3.0,145.0,6.0,25.0,13.0,15.0,27.0,12.0,141.0,135.0,26.0,42.0,48.0,4.0
75%,681.0,57.0,39.0,12.134124,12.57573,3.605551,2.44949,150.0,928.0,3.0,200.0,9.0,37.0,18.0,19.0,40.0,24.0,885.0,614.0,45.0,52.0,66.0,7.0
max,908.0,63.0,50.0,21.816606,18.588504,13.304135,12.288206,380.0,2447.0,3.0,258.0,12.0,53.0,23.0,23.0,44.0,38.0,5558.0,5763.0,135.0,60.0,94.0,9.0


# Training

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import metrics
import pickle
import numpy as np

from app.utils.logger import Logger

class Training:
    def __init__(self):
        self.rf_regressor = RandomForestRegressor(n_estimators=100, max_depth=11, random_state=42,n_jobs=-1,verbose=1)
        #self.rf_regressor = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1,verbose=1)
        #self.rf_regressor = LinearRegression(normalize=True, n_jobs=-1)
        
    def fit(self, X, y):
        """
        train a random forest regressor with
        X being the training columns and
        y the label to predict
        """
        self.rf_regressor.fit(X, y)
        return self

    def predict(self, X):
        return self.rf_regressor.predict(X)

    def score(self, X, y):
        y_pred = self.predict(X)
        logger = Logger().logger
        logger.info(f'Mean Absolute Error: {metrics.mean_absolute_error(y, y_pred)}')
        logger.info(f'Mean Squared Error: {metrics.mean_squared_error(y, y_pred)}')
        logger.info(f'Root Mean Squared Error: {np.sqrt(metrics.mean_squared_error(y, y_pred))}')
        logger.info(f'R2 score: {metrics.r2_score(y, y_pred)}')

    # Pour un classif algorithm
    def classif_score(self, X, y):
        y_pred = self.predict(X)
        logger = Logger().logger
        logger.info(f'Accuracy: {metrics.accuracy_score(y, y_pred)}')
        logger.info(f'Recall: {metrics.recall_score(y, y_pred)}')
        logger.info(f'Precision: {metrics.precision_score(y, y_pred)}')
        logger.info(f'F1_score: {metrics.f1_score(y, y_pred)}')
        logger.info(f'ROC AUC: {metrics.roc_auc_score(y, y_pred)}')

    def save_model(self, path=None):
        """
        Save to file in the current working directory
        """
        if path is None:
            path = "../models/rf_model.pkl"
        with open(path, 'wb') as file:
            pickle.dump(self.rf_regressor, file)


In [24]:
model = Training().fit(X_train_engineered, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   18.9s finished


# Prédictions

In [25]:
X_test_engineered.head(2)

Unnamed: 0,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR,NOMBRE ATTERRISSAGE PAR AEROPORT PAR JOUR,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,TEMPS PROGRAMME,DISTANCE,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,MOIS,SEMAINE,HEURE DEPART PROGRAMME,HEURE ARRIVEE PROGRAMMEE,PAYS DEPART,PAYS ARRIVEE,HAUTEUR DEPART,HAUTEUR ARRIVEE,LONGITUDE ARRIVEE,LATITUDE ARRIVEE,PRIX RETARD PREMIERE 20 MINUTES,PRIS RETARD POUR CHAQUE MINUTE APRES 10 MINUTES
1272566,445,21,13,5.066606,3.072993,2.828427,2.236068,85.0,483,3,111,5,22,8,11,5,16,107.0,30.0,76,10,57,6
1283362,87,39,39,0.427007,2.271898,5.196152,2.0,110.0,397,3,213,8,34,20,22,38,6,157.0,13.0,121,30,43,4


In [26]:
y_pred = model.predict(X_test_engineered)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.1s finished


In [27]:
y_pred[:3]

array([-4.94118632, 18.77948405, -1.0155356 ])

In [28]:
y_test[:3]

1272566    -5.0
1283362   -17.0
1123552   -17.0
Name: RETARD A L'ARRIVEE, dtype: float64

# Evaluation du model

In [29]:
model.score(X_test_engineered, y_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.1s finished
INFO:MainLogger:Mean Absolute Error: 17.09959000392783
INFO:MainLogger:Mean Squared Error: 950.84520954039
INFO:MainLogger:Root Mean Squared Error: 30.83577807580652
INFO:MainLogger:R2 score: 0.13483199400158885


In [30]:
model.score(X_train_engineered, y_train)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.3s finished
INFO:MainLogger:Mean Absolute Error: 16.151112774061048
INFO:MainLogger:Mean Squared Error: 725.7377338950515
INFO:MainLogger:Root Mean Squared Error: 26.939519926959566
INFO:MainLogger:R2 score: 0.22169035487543542


### Previous model

In [31]:
model.score(X_test_engineered, y_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.1s finished
INFO:MainLogger:Mean Absolute Error: 17.09959000392783
INFO:MainLogger:Mean Squared Error: 950.84520954039
INFO:MainLogger:Root Mean Squared Error: 30.83577807580652
INFO:MainLogger:R2 score: 0.13483199400158885


In [32]:
model.score(X_train_engineered, y_train)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.3s finished
INFO:MainLogger:Mean Absolute Error: 16.151112774061048
INFO:MainLogger:Mean Squared Error: 725.7377338950515
INFO:MainLogger:Root Mean Squared Error: 26.939519926959566
INFO:MainLogger:R2 score: 0.22169035487543542


In [33]:
#model.classif_score(X_test_engineered, y_test)

### test file

In [35]:
small_test = cleaning.fit_drop(pd.read_csv('../data/vol_test.csv'))
small_test = cleaning.transform(small_test)    
X_small_test_engineered = feature_engineering.transform(small_test)
y_pred_small_test = model.predict(X_small_test_engineered)
y_pred_small_test[:10]

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


array([10.57560032, 52.09558714, 11.42283089, 48.08231407, 11.47248881,
       37.4878351 , 47.30952116, 62.71089057, 48.25104988])

# Save the model

In [36]:
feature_engineering.save_feature_engineering('../data/output/feature_engineering.pkl')

In [37]:
model.save_model()

In [38]:
# Load from file
with open("../models/rf_model.pkl", 'rb') as file:
    pickle_model = pickle.load(file)
    
# Calculate the accuracy score and predict target values
score = pickle_model.score(X_test_engineered, y_test)
print("R2 score: {0:.4f}".format(score))
y_pred = pickle_model.predict(X_test_engineered)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

R2 score: 0.1348
Mean Absolute Error: 17.09959000392783
Mean Squared Error: 950.8452095403898
Root Mean Squared Error: 30.835778075806516


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
