In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from certifia.data_engineering.data_access import read_db

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
datasets = read_db()

In [3]:
datasets.keys()

dict_keys(['batch1', 'batch2', 'test'])

In [4]:
df_batch1 = datasets['batch1']
df_batch2 = datasets['batch2']
df_test = datasets['test']

## Définition de nos labels et de nos features

In [5]:
FEATURES = df_test['vols'].columns.tolist()
FEATURES

['IDENTIFIANT',
 'VOL',
 'CODE AVION',
 'AEROPORT DEPART',
 'AEROPORT ARRIVEE',
 'DEPART PROGRAMME',
 'TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE',
 'TEMPS PROGRAMME',
 'DISTANCE',
 "TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE",
 'ARRIVEE PROGRAMMEE',
 'COMPAGNIE AERIENNE',
 'NOMBRE DE PASSAGERS',
 'DATE',
 'NIVEAU DE SECURITE']

In [6]:
LABELS = list(set(df_batch1['vols'].columns.tolist()) - set(df_test['vols'].columns.tolist()))
LABELS

['ANNULATION',
 'HEURE DE DEPART',
 'DECOLLAGE',
 'TEMPS PASSE',
 'RETARD COMPAGNIE',
 "RAISON D'ANNULATION",
 'RETARD METEO',
 'RETARD AVION',
 'ATTERRISSAGE',
 'RETARD SYSTEM',
 'TEMPS DE VOL',
 "RETARD A L'ARRIVEE",
 'RETARD SECURITE',
 'RETART DE DEPART',
 "HEURE D'ARRIVEE",
 'DETOURNEMENT']

In [9]:
df_batch1.keys(), df_batch2.keys(), df_test.keys()

(dict_keys(['vols', 'aeroports', 'compagnies', 'prix_fuel']),
 dict_keys(['vols', 'aeroports', 'compagnies', 'prix_fuel']),
 dict_keys(['vols']))

# Feature Engineering

In [101]:
df_batch1['vols'].head()

Unnamed: 0,IDENTIFIANT,VOL,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,DEPART PROGRAMME,HEURE DE DEPART,RETART DE DEPART,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,DECOLLAGE,TEMPS PROGRAMME,TEMPS PASSE,TEMPS DE VOL,DISTANCE,ATTERRISSAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,HEURE D'ARRIVEE,RETARD A L'ARRIVEE,DETOURNEMENT,ANNULATION,RAISON D'ANNULATION,RETARD SYSTEM,RETARD SECURITE,RETARD COMPAGNIE,RETARD AVION,RETARD METEO,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,DATE,NIVEAU DE SECURITE
0,1259209,4661,a02782cd75,CEB,AAL,1707,1658.0,-9.0,20.0,1718.0,67.0,71.0,45.0,232,1803.0,6.0,1814,1809.0,-5.0,0,0,,,,,,,MAF,379,15/8/2018,10
1,4886177,5026,707f6ea54f,GOI,LTK,600,553.0,-7.0,11.0,604.0,130.0,119.0,91.0,738,835.0,17.0,910,852.0,-18.0,0,0,,,,,,,I6F,9,2/11/2016,10
2,183332,2021,b116987956,DSS,JNB,1749,1747.0,-2.0,9.0,1756.0,248.0,228.0,215.0,1671,1831.0,4.0,1857,1835.0,-22.0,0,0,,,,,,,NVPPA,2491,9/6/2017,10
3,937517,1320,a4b8db63f5,AGP,GOA,2301,2322.0,21.0,19.0,2341.0,65.0,89.0,59.0,214,40.0,11.0,6,51.0,45.0,0,0,,24.0,0.0,0.0,6.0,15.0,NVPPA,1241,26/5/2018,10
4,2157498,508,34604053c0,BRU,BOD,612,603.0,-9.0,13.0,616.0,302.0,259.0,238.0,2288,1314.0,8.0,1414,1322.0,-52.0,0,0,,,,,,,THA,78,10/11/2018,10


In [102]:
df_batch1['vols'].describe(include='all')

Unnamed: 0,IDENTIFIANT,VOL,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,DEPART PROGRAMME,HEURE DE DEPART,RETART DE DEPART,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,DECOLLAGE,TEMPS PROGRAMME,TEMPS PASSE,TEMPS DE VOL,DISTANCE,ATTERRISSAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,HEURE D'ARRIVEE,RETARD A L'ARRIVEE,DETOURNEMENT,ANNULATION,RAISON D'ANNULATION,RETARD SYSTEM,RETARD SECURITE,RETARD COMPAGNIE,RETARD AVION,RETARD METEO,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,DATE,NIVEAU DE SECURITE
count,3000000.0,3000000.0,3000000,3000000,3000000,3000000.0,2950817.0,2950817.0,2948974.0,2948974.0,2999997.0,2940503.0,2940503.0,3000000.0,2947215.0,2947215.0,3000000.0,2947215.0,2940503.0,3000000.0,3000000.0,51528,559953.0,559953.0,559953.0,559953.0,559953.0,3000000,3000000.0,3000000,3000000.0
unique,,,4189,318,318,,,,,,,,,,,,,,,,,4,,,,,,13,,1002,
top,,,786a02f742,AGP,AGP,,,,,,,,,,,,,,,,,B,,,,,,NVPPA,,26/6/2016,
freq,,,9360,221117,221002,,,,,,,,,,,,,,,,,28044,,,,,,575040,,3408,
mean,2616680.0,2272.942,,,,1328.994,1333.179,9.436509,17.26023,1355.807,145.9086,141.7271,116.6868,846.5176,1471.46,7.792886,1493.786,1477.092,4.901089,0.002656333,0.017176,,15.381552,0.085525,19.693153,22.738483,3.173741,,459.0983,,10.0
std,1727235.0,1824.356,,,,485.5726,498.1603,39.21812,9.281802,500.3703,79.11707,78.18913,76.39179,643.9932,520.003,5.988985,505.2654,524.1305,41.48457,0.05147114,0.1299269,,30.300445,2.291357,51.342073,44.17147,21.909615,,672.0171,,0.0
min,2.0,1.0,,,,1.0,1.0,-61.0,1.0,1.0,18.0,15.0,7.0,31.0,1.0,1.0,1.0,1.0,-87.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,-1.0,,10.0
25%,1043596.0,777.0,,,,916.0,920.0,-5.0,12.0,935.0,88.0,85.0,60.0,370.0,1055.0,4.0,1110.0,1100.0,-14.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,77.0,,10.0
50%,2089660.0,1728.0,,,,1325.0,1329.0,-2.0,15.0,1342.0,125.0,122.0,96.0,666.0,1508.0,6.0,1520.0,1512.0,-5.0,0.0,0.0,,4.0,0.0,1.0,0.0,0.0,,242.0,,10.0
75%,4288415.0,3410.0,,,,1730.0,1738.0,7.0,20.0,1752.0,179.0,175.0,148.0,1089.0,1911.0,9.0,1917.0,1916.0,8.0,0.0,0.0,,20.0,0.0,19.0,28.0,0.0,,381.0,,10.0


In [103]:
df_batch1['vols'][FEATURES+["RETARD A L'ARRIVEE"]].corr()

Unnamed: 0,IDENTIFIANT,VOL,DEPART PROGRAMME,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS PROGRAMME,DISTANCE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,NOMBRE DE PASSAGERS,NIVEAU DE SECURITE,RETARD A L'ARRIVEE
IDENTIFIANT,1.0,0.208508,0.000679,-0.028722,-0.165377,-0.165195,0.085273,-0.003014,-0.509965,,0.042344
VOL,0.208508,1.0,0.000844,0.026337,-0.391776,-0.401004,-0.037392,-0.012705,-0.17699,,0.023328
DEPART PROGRAMME,0.000679,0.000844,1.0,0.017896,-0.001466,0.00417,-0.050699,0.689619,0.002116,,0.079363
TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,-0.028722,0.026337,0.017896,1.0,0.08776,0.054074,-0.030561,0.034437,0.030647,,0.24259
TEMPS PROGRAMME,-0.165377,-0.391776,-0.001466,0.08776,1.0,0.985256,0.081706,0.032009,0.033028,,-0.023557
DISTANCE,-0.165195,-0.401004,0.00417,0.054074,0.985256,1.0,0.061389,0.023366,0.029755,,-0.021176
TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,0.085273,-0.037392,-0.050699,-0.030561,0.081706,0.061389,1.0,-0.012797,-0.035866,,0.120962
ARRIVEE PROGRAMMEE,-0.003014,-0.012705,0.689619,0.034437,0.032009,0.023366,-0.012797,1.0,0.012521,,0.073245
NOMBRE DE PASSAGERS,-0.509965,-0.17699,0.002116,0.030647,0.033028,0.029755,-0.035866,0.012521,1.0,,-0.040094
NIVEAU DE SECURITE,,,,,,,,,,,


In [94]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = preprocessing.LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = preprocessing.LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [232]:
class FeatureEngineering:
    def __init__(self, training_columns = None):
        self.training_columns = training_columns
    
    def cleaning(self, df):
        df = df.dropna(subset=["RETARD A L'ARRIVEE"])
        
        df = df.drop(columns=['NIVEAU DE SECURITE'])
        return df
    
    def split_feature_label(self, df):
        X = df[self.training_columns]
        y = df["RETARD A L'ARRIVEE"]
        return X, y
    
    def train_test_split_80_20(self, X, y): 
        return train_test_split(X, y, test_size=0.2, random_state=42)
    
    def dummify_columns(self, X, columns = None):
        return MultiColumnLabelEncoder(columns = columns).fit_transform(X)
        
    def transform(self, dataframe: pd.DataFrame):
        df = dataframe.copy()
        
        df = self.cleaning(df)
        
        X, y = self.split_feature_label(df)
        
        X = self.dummify_columns(X, ['AEROPORT DEPART','AEROPORT ARRIVEE'])
        
        return self.train_test_split_80_20(X, y)

In [233]:
vol_batch1_smaller = df_batch1['vols']#.head(1000000)

In [234]:
vol_batch1_with_retard_arrivee = vol_batch1_smaller.dropna(subset=["RETARD A L'ARRIVEE"])

In [235]:
X_batch1 = vol_batch1_with_retard_arrivee[['AEROPORT DEPART','AEROPORT ARRIVEE']]
y_batch1 = vol_batch1_with_retard_arrivee["RETARD A L'ARRIVEE"]

In [236]:
# dummification
#X_batch1_dummified = pd.concat([pd.get_dummies(X_batch1['AEROPORT DEPART'], prefix='AEROPORT_DEPART'),
#                               pd.get_dummies(X_batch1['AEROPORT ARRIVEE'], prefix='AEROPORT_ARRIVEE')], axis=1)

# Label encoder
X_batch1_dummified = MultiColumnLabelEncoder(
    columns = ['AEROPORT DEPART','AEROPORT ARRIVEE']
).fit_transform(X_batch1)

In [237]:
X_train, X_test, y_train, y_test = train_test_split(X_batch1_dummified, y_batch1, test_size=0.2, random_state=42)

In [238]:
X_train, X_test, y_train, y_test = FeatureEngineering(
        training_columns=['AEROPORT DEPART',
                          'AEROPORT ARRIVEE', 
                          #'TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE',
                          'NOMBRE DE PASSAGERS']
    ).transform(vol_batch1_smaller)

# Training

In [239]:
rf_regressor = RandomForestRegressor(n_estimators=30, max_depth=4, random_state=0, n_jobs=-1)
rf_regressor.fit(X_train, y_train)

RandomForestRegressor(max_depth=6, n_estimators=50, n_jobs=-1, random_state=0)

# Prédictions

In [240]:
X_test.head(2)

Unnamed: 0,AEROPORT DEPART,AEROPORT ARRIVEE,NOMBRE DE PASSAGERS
1531238,7,309,65
2478044,212,7,1742


In [241]:
y_pred = rf_regressor.predict(X_test)

In [242]:
y_pred[:10]

array([7.75639378, 0.04761413, 4.0633513 , 5.0895171 , 5.62902844,
       4.09035192, 2.71076693, 4.0633513 , 4.91997576, 5.62902844])

In [243]:
y_test[:10]

1531238    -2.0
2478044   -29.0
2068905     1.0
919696     -3.0
2022056   -16.0
2451534   -12.0
1666013    -3.0
2114520     3.0
1576245   -10.0
2858069    -5.0
Name: RETARD A L'ARRIVEE, dtype: float64

# Evaluation du model

In [244]:
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 22.099588377937426
Mean Squared Error: 1667.3265462566103
Root Mean Squared Error: 40.83291008802349


# Save the model

In [245]:
import pickle
# Save to file in the current working directory
pkl_filename = "../data/output/rf_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(rf_regressor, file)

In [246]:
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
    
# Calculate the accuracy score and predict target values
score = pickle_model.score(X_test, y_test)
print("R2 score: {0:.2f}".format(score))
y_pred = pickle_model.predict(X_test)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

R2 score: 0.01
Mean Absolute Error: 22.099588377937426
Mean Squared Error: 1667.3265462566103
Root Mean Squared Error: 40.83291008802349
