In [1]:
import pandas as pd
import numpy as np
import pickle
import logging

from sklearn.model_selection import train_test_split
from certifia.data_engineering.data_access import read_db
from certifia.utils.multi_column_label_encode import MultiColumnLabelEncoder
from certifia.feature_engineering import FeatureEngineering
from certifia.training import Training


pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
datasets = read_db()

In [3]:
datasets.keys()

dict_keys(['batch1', 'batch2', 'test'])

In [4]:
df_batch1 = datasets['batch1']
df_batch2 = datasets['batch2']
df_test = datasets['test']

## Définition de nos labels et de nos features

In [5]:
FEATURES = df_test['vols'].columns.tolist()
FEATURES

['IDENTIFIANT',
 'VOL',
 'CODE AVION',
 'AEROPORT DEPART',
 'AEROPORT ARRIVEE',
 'DEPART PROGRAMME',
 'TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE',
 'TEMPS PROGRAMME',
 'DISTANCE',
 "TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE",
 'ARRIVEE PROGRAMMEE',
 'COMPAGNIE AERIENNE',
 'NOMBRE DE PASSAGERS',
 'DATE',
 'NIVEAU DE SECURITE']

In [6]:
LABELS = list(set(df_batch1['vols'].columns.tolist()) - set(df_test['vols'].columns.tolist()))
LABELS

['RETARD COMPAGNIE',
 'RETARD SYSTEM',
 "HEURE D'ARRIVEE",
 'RETARD SECURITE',
 'RETARD AVION',
 'RETARD METEO',
 'ATTERRISSAGE',
 'ANNULATION',
 "RAISON D'ANNULATION",
 "RETARD A L'ARRIVEE",
 'DETOURNEMENT',
 'HEURE DE DEPART',
 'TEMPS DE VOL',
 'TEMPS PASSE',
 'DECOLLAGE',
 'RETART DE DEPART']

In [7]:
df_batch1.keys(), df_batch2.keys(), df_test.keys()

(dict_keys(['vols', 'aeroports', 'compagnies', 'prix_fuel']),
 dict_keys(['vols', 'aeroports', 'compagnies', 'prix_fuel']),
 dict_keys(['vols']))

# Feature Engineering

In [8]:
# la valeur a prédire 
label = 'RETART DE DEPART'

In [9]:
df_batch1['vols'].head()

Unnamed: 0,IDENTIFIANT,VOL,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,DEPART PROGRAMME,HEURE DE DEPART,RETART DE DEPART,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,DECOLLAGE,TEMPS PROGRAMME,TEMPS PASSE,TEMPS DE VOL,DISTANCE,ATTERRISSAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,HEURE D'ARRIVEE,RETARD A L'ARRIVEE,DETOURNEMENT,ANNULATION,RAISON D'ANNULATION,RETARD SYSTEM,RETARD SECURITE,RETARD COMPAGNIE,RETARD AVION,RETARD METEO,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,DATE,NIVEAU DE SECURITE
0,1259209,4661,a02782cd75,CEB,AAL,1707,1658.0,-9.0,20.0,1718.0,67.0,71.0,45.0,232,1803.0,6.0,1814,1809.0,-5.0,0,0,,,,,,,MAF,379,15/8/2018,10
1,4886177,5026,707f6ea54f,GOI,LTK,600,553.0,-7.0,11.0,604.0,130.0,119.0,91.0,738,835.0,17.0,910,852.0,-18.0,0,0,,,,,,,I6F,9,2/11/2016,10
2,183332,2021,b116987956,DSS,JNB,1749,1747.0,-2.0,9.0,1756.0,248.0,228.0,215.0,1671,1831.0,4.0,1857,1835.0,-22.0,0,0,,,,,,,NVPPA,2491,9/6/2017,10
3,937517,1320,a4b8db63f5,AGP,GOA,2301,2322.0,21.0,19.0,2341.0,65.0,89.0,59.0,214,40.0,11.0,6,51.0,45.0,0,0,,24.0,0.0,0.0,6.0,15.0,NVPPA,1241,26/5/2018,10
4,2157498,508,34604053c0,BRU,BOD,612,603.0,-9.0,13.0,616.0,302.0,259.0,238.0,2288,1314.0,8.0,1414,1322.0,-52.0,0,0,,,,,,,THA,78,10/11/2018,10


In [10]:
df_batch1['vols'].describe(include='all')

Unnamed: 0,IDENTIFIANT,VOL,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,DEPART PROGRAMME,HEURE DE DEPART,RETART DE DEPART,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,DECOLLAGE,TEMPS PROGRAMME,TEMPS PASSE,TEMPS DE VOL,DISTANCE,ATTERRISSAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,HEURE D'ARRIVEE,RETARD A L'ARRIVEE,DETOURNEMENT,ANNULATION,RAISON D'ANNULATION,RETARD SYSTEM,RETARD SECURITE,RETARD COMPAGNIE,RETARD AVION,RETARD METEO,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,DATE,NIVEAU DE SECURITE
count,3000000.0,3000000.0,3000000,3000000,3000000,3000000.0,2950817.0,2950817.0,2948974.0,2948974.0,2999997.0,2940503.0,2940503.0,3000000.0,2947215.0,2947215.0,3000000.0,2947215.0,2940503.0,3000000.0,3000000.0,51528,559953.0,559953.0,559953.0,559953.0,559953.0,3000000,3000000.0,3000000,3000000.0
unique,,,4189,318,318,,,,,,,,,,,,,,,,,4,,,,,,13,,1002,
top,,,786a02f742,AGP,AGP,,,,,,,,,,,,,,,,,B,,,,,,NVPPA,,26/6/2016,
freq,,,9360,221117,221002,,,,,,,,,,,,,,,,,28044,,,,,,575040,,3408,
mean,2616680.0,2272.942,,,,1328.994,1333.179,9.436509,17.26023,1355.807,145.9086,141.7271,116.6868,846.5176,1471.46,7.792886,1493.786,1477.092,4.901089,0.002656333,0.017176,,15.381552,0.085525,19.693153,22.738483,3.173741,,459.0983,,10.0
std,1727235.0,1824.356,,,,485.5726,498.1603,39.21812,9.281802,500.3703,79.11707,78.18913,76.39179,643.9932,520.003,5.988985,505.2654,524.1305,41.48457,0.05147114,0.1299269,,30.300445,2.291357,51.342073,44.17147,21.909615,,672.0171,,0.0
min,2.0,1.0,,,,1.0,1.0,-61.0,1.0,1.0,18.0,15.0,7.0,31.0,1.0,1.0,1.0,1.0,-87.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,-1.0,,10.0
25%,1043596.0,777.0,,,,916.0,920.0,-5.0,12.0,935.0,88.0,85.0,60.0,370.0,1055.0,4.0,1110.0,1100.0,-14.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,77.0,,10.0
50%,2089660.0,1728.0,,,,1325.0,1329.0,-2.0,15.0,1342.0,125.0,122.0,96.0,666.0,1508.0,6.0,1520.0,1512.0,-5.0,0.0,0.0,,4.0,0.0,1.0,0.0,0.0,,242.0,,10.0
75%,4288415.0,3410.0,,,,1730.0,1738.0,7.0,20.0,1752.0,179.0,175.0,148.0,1089.0,1911.0,9.0,1917.0,1916.0,8.0,0.0,0.0,,20.0,0.0,19.0,28.0,0.0,,381.0,,10.0


In [11]:
df_batch1['vols'][FEATURES+[label]].corr()

Unnamed: 0,IDENTIFIANT,VOL,DEPART PROGRAMME,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS PROGRAMME,DISTANCE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,NOMBRE DE PASSAGERS,NIVEAU DE SECURITE,RETART DE DEPART
IDENTIFIANT,1.0,0.208508,0.000679,-0.028722,-0.165377,-0.165195,0.085273,-0.003014,-0.509965,,0.015647
VOL,0.208508,1.0,0.000844,0.026337,-0.391776,-0.401004,-0.037392,-0.012705,-0.17699,,-0.00639
DEPART PROGRAMME,0.000679,0.000844,1.0,0.017896,-0.001466,0.00417,-0.050699,0.689619,0.002116,,0.091173
TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,-0.028722,0.026337,0.017896,1.0,0.08776,0.054074,-0.030561,0.034437,0.030647,,0.068613
TEMPS PROGRAMME,-0.165377,-0.391776,-0.001466,0.08776,1.0,0.985256,0.081706,0.032009,0.033028,,0.031897
DISTANCE,-0.165195,-0.401004,0.00417,0.054074,0.985256,1.0,0.061389,0.023366,0.029755,,0.026154
TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,0.085273,-0.037392,-0.050699,-0.030561,0.081706,0.061389,1.0,-0.012797,-0.035866,,0.015411
ARRIVEE PROGRAMMEE,-0.003014,-0.012705,0.689619,0.034437,0.032009,0.023366,-0.012797,1.0,0.012521,,0.083044
NOMBRE DE PASSAGERS,-0.509965,-0.17699,0.002116,0.030647,0.033028,0.029755,-0.035866,0.012521,1.0,,-0.012257
NIVEAU DE SECURITE,,,,,,,,,,,


In [12]:
%load ../certifia/feature_engineering.py

In [12]:
def train_test_split_80_20(X, y): 
    return train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
vol_batch1_smaller = df_batch1['vols'].head(500000)

In [14]:
feature_engineering = FeatureEngineering(
        training_columns=[
            'AEROPORT DEPART',
            'AEROPORT ARRIVEE', 
            'NOMBRE DE PASSAGERS',
            'COMPAGNIE AERIENNE',
            'DATE',
            'DEPART PROGRAMME',
            'TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE',
            "TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE",
            'TEMPS PROGRAMME',
            'DISTANCE',
            'MOIS',
            'SEMAINE',
            'IDENTIFIANT'
        ],
        columns_to_dummify=['AEROPORT DEPART','AEROPORT ARRIVEE', 'COMPAGNIE AERIENNE'],
        label_name=label
    )
X, y = feature_engineering.fit(vol_batch1_smaller)

X.loc[:, 'NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR'] = feature_engineering.get_average_plane_take_off_or_landing_by_day(
    X, 'AEROPORT DEPART'
)
X.loc[:, 'NOMBRE ATTERRISSAGE PAR AEROPORT PAR JOUR'] = feature_engineering.get_average_plane_take_off_or_landing_by_day(
    X, 'AEROPORT ARRIVEE'
)


In [15]:
X.describe(include='all')

  X.describe(include='all')


Unnamed: 0,AEROPORT DEPART,AEROPORT ARRIVEE,NOMBRE DE PASSAGERS,COMPAGNIE AERIENNE,DATE,DEPART PROGRAMME,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,TEMPS PROGRAMME,DISTANCE,MOIS,SEMAINE,IDENTIFIANT,NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR,NOMBRE ATTERRISSAGE PAR AEROPORT PAR JOUR
count,491126.0,491126.0,491126.0,491126.0,491126,491126.0,491126.0,491126.0,491126.0,491126.0,491126.0,491126.0,491126.0,491126.0,491126.0
unique,,,,,1002,,,,,,,,,,
top,,,,,2017-06-26 00:00:00,,,,,,,,,,
freq,,,,,600,,,,,,,,,,
first,,,,,2016-01-01 00:00:00,,,,,,,,,,
last,,,,,2018-12-31 00:00:00,,,,,,,,,,
mean,139.99254,140.1588,464.207275,7.334415,,1328.266675,17.260247,7.788736,146.37411,850.643576,6.34843,25.749608,2601608.0,11.78977,11.6556
std,96.203255,96.26638,677.017257,3.035395,,485.138025,9.277269,6.028671,79.368629,645.999458,3.403731,14.823515,1725764.0,10.502989,10.403484
min,0.0,0.0,-1.0,0.0,,1.0,1.0,1.0,18.0,31.0,1.0,1.0,4.0,0.001825,0.000912
25%,61.0,61.0,77.0,5.0,,916.0,12.0,4.0,88.0,373.0,3.0,13.0,1028745.0,2.160584,2.247263


In [16]:
selected_columns = [
    'AEROPORT DEPART',
    'AEROPORT ARRIVEE', 
    'NOMBRE DE PASSAGERS',
    'COMPAGNIE AERIENNE',
    'DEPART PROGRAMME',
    'TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE',
    "TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE",
    'TEMPS PROGRAMME',
    'DISTANCE',
    'SEMAINE',
    'MOIS',
    'NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR',
    #'NOMBRE ATTERRISSAGE PAR AEROPORT PAR JOUR'
]
X_training = X[selected_columns]
X_training

Unnamed: 0,AEROPORT DEPART,AEROPORT ARRIVEE,NOMBRE DE PASSAGERS,COMPAGNIE AERIENNE,DEPART PROGRAMME,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,TEMPS PROGRAMME,DISTANCE,SEMAINE,MOIS,NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR
0,48,0,379,7,1707,20.0,6.0,67.0,232,33,8,0.780109
1,115,180,9,6,600,11.0,17.0,130.0,738,6,2,1.802920
2,86,142,2491,9,1749,9.0,4.0,248.0,1671,36,9,11.243613
3,7,114,1241,9,2301,19.0,11.0,65.0,214,21,5,33.563869
4,38,31,78,12,612,13.0,8.0,302.0,2288,41,10,16.969891
...,...,...,...,...,...,...,...,...,...,...,...,...
499995,210,0,2488,9,835,13.0,5.0,167.0,1050,33,8,1.434307
499996,80,269,189,12,1226,9.0,7.0,107.0,550,21,5,3.934307
499997,22,67,78,6,600,16.0,6.0,74.0,246,39,9,0.594891
499998,67,38,326,12,1500,14.0,12.0,367.0,2454,36,9,10.312044


In [17]:
y_training = y#.apply(lambda x: 1 if x>0 else 0)

In [18]:
FEATURES

['IDENTIFIANT',
 'VOL',
 'CODE AVION',
 'AEROPORT DEPART',
 'AEROPORT ARRIVEE',
 'DEPART PROGRAMME',
 'TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE',
 'TEMPS PROGRAMME',
 'DISTANCE',
 "TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE",
 'ARRIVEE PROGRAMMEE',
 'COMPAGNIE AERIENNE',
 'NOMBRE DE PASSAGERS',
 'DATE',
 'NIVEAU DE SECURITE']

In [19]:
X_train, X_test, y_train, y_test = train_test_split_80_20(X_training, y_training)

# Training

In [20]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import metrics
import pickle
import numpy as np

from certifia.utils.logger import Logger

class Training:
    def __init__(self):
        self.rf_regressor = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42,n_jobs=-1,verbose=1)

    def fit(self, X, y):
        """
        train a random forest regressor with
        X being the training columns and
        y the label to predict
        """
        self.rf_regressor.fit(X, y)
        return self

    def predict(self, X):
        return self.rf_regressor.predict(X)

    def score(self, X, y):
        y_pred = self.predict(X)
        logger = Logger().logger
        logger.info(f'Mean Absolute Error: {metrics.mean_absolute_error(y, y_pred)}')
        logger.info(f'Mean Squared Error: {metrics.mean_squared_error(y, y_pred)}')
        logger.info(f'Root Mean Squared Error: {np.sqrt(metrics.mean_squared_error(y, y_pred))}')
        logger.info(f'R2 score: {metrics.r2_score(y, y_pred)}')

    # Pour un classif algorithm
    def classif_score(self, X, y):
        y_pred = self.predict(X)
        logger = Logger().logger
        logger.info(f'Accuracy: {metrics.accuracy_score(y, y_pred)}')
        logger.info(f'Recall: {metrics.recall_score(y, y_pred)}')
        logger.info(f'Precision: {metrics.precision_score(y, y_pred)}')
        logger.info(f'F1_score: {metrics.f1_score(y, y_pred)}')
        logger.info(f'ROC AUC: {metrics.roc_auc_score(y, y_pred)}')

    def save_model(self, path=None):
        """
        Save to file in the current working directory
        """
        if path is None:
            path = "../models/rf_model.pkl"
        with open(path, 'wb') as file:
            pickle.dump(self.rf_regressor, file)


In [21]:
model = Training().fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   27.7s finished


# Prédictions

In [22]:
X_test.head(2)

Unnamed: 0,AEROPORT DEPART,AEROPORT ARRIVEE,NOMBRE DE PASSAGERS,COMPAGNIE AERIENNE,DEPART PROGRAMME,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,TEMPS PROGRAMME,DISTANCE,SEMAINE,MOIS,NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR
108220,223,0,371,7,1709,13.0,5.0,55.0,144,49,12,0.168796
242396,250,211,229,11,1755,20.0,8.0,63.0,196,41,10,30.387774


In [23]:
y_pred = model.predict(X_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.1s finished


In [24]:
y_pred[:10]

array([ 7.87554311, 12.03390412, 12.90557007,  4.89259389,  6.59341577,
       15.07426329,  9.00067189,  6.93187105,  8.43485482,  6.32488893])

In [25]:
y_test[:10]

108220    -11.0
242396     -1.0
330161     -1.0
354650    201.0
421206    -11.0
255414     -3.0
391411     24.0
341688      0.0
91900      -3.0
58751      10.0
Name: RETART DE DEPART, dtype: float64

# Evaluation du model

In [26]:
model.score(X_test, y_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.1s finished
INFO:MainLogger:Mean Absolute Error: 18.691401653391825
INFO:MainLogger:Mean Squared Error: 1462.384702676079
INFO:MainLogger:Root Mean Squared Error: 38.24113887786397
INFO:MainLogger:R2 score: 0.0262124236620932


In [47]:
#model.classif_score(X_test, y_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.1s finished
INFO:MainLogger:Accuracy: 0.6690183861706677
INFO:MainLogger:Recall: 0.1657292085174769
INFO:MainLogger:Precision: 0.626695604991861
INFO:MainLogger:F1_score: 0.262136583373051
INFO:MainLogger:ROC AUC: 0.5557267058680789


# Save the model

In [98]:
# Save to file in the current working directory
pkl_filename = "../models/rf_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model.rf_regressor, file)

In [99]:
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
    
# Calculate the accuracy score and predict target values
score = pickle_model.score(X_test, y_test)
print("R2 score: {0:.4f}".format(score))
y_pred = pickle_model.predict(X_test)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

R2 score: 0.0041
Mean Absolute Error: 22.13286070928927
Mean Squared Error: 1723.3259856908633
Root Mean Squared Error: 41.51296165887064


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  30 out of  30 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  30 out of  30 | elapsed:    0.0s finished
