In [1]:
import pandas as pd
import numpy as np
import pickle
import logging

from certifia.data_engineering.data_access import read_db
from certifia.feature_engineering import FeatureEngineering
from certifia.training import Training

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# To plot pretty figures
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
datasets = read_db()

In [3]:
datasets.keys()

dict_keys(['batch1', 'batch2', 'test'])

In [61]:
df_batch1 = datasets['batch1']
df_batch2 = datasets['batch2']
df_test_vols = datasets['test']['vols']

In [62]:
FEATURES = df_test['vols'].columns.tolist()

In [63]:
LABELS = list(set(df_batch1['vols'].columns.tolist()) - set(df_test['vols'].columns.tolist()))

In [64]:
label = "RETARD A L'ARRIVEE"

In [65]:
COLUMNS = FEATURES + [label]

## df_batch1 & df_batch2

In [66]:
df_batch1.keys()

dict_keys(['vols', 'aeroports', 'compagnies', 'prix_fuel'])

In [67]:
#df_batch1['vols'].loc[:, 'DATE'] = pd.to_datetime(df_batch1['vols']['DATE'])
#df_batch2['vols'].loc[:, 'DATE'] = pd.to_datetime(df_batch2['vols']['DATE'])

In [68]:
#df_vol1 = df_batch1['vols']
#df_vol2 = df_batch2['vols']

In [69]:
#df_vol1.to_csv('../data/batch_1_vol.csv')
#df_vol2.to_csv('../data/batch_2_vol.csv')

In [70]:
df_vol1 = pd.read_csv('../data/batch_1_vol.csv')
df_vol2 = pd.read_csv('../data/batch_2_vol.csv')

In [71]:
df_vols = pd.concat([df_vol1, df_vol2])

### Vols

In [149]:
df_vols[df_vols.DETOURNEMENT==1].shape

(11697, 32)

In [150]:
df_vols[df_vols["TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE"].isna()].shape

(72199, 32)

In [157]:
df_vols[(df_vols.DETOURNEMENT==1) & (~df_vols["TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE"].isna())][COLUMNS]

Unnamed: 0,IDENTIFIANT,VOL,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,DEPART PROGRAMME,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS PROGRAMME,DISTANCE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,DATE,NIVEAU DE SECURITE,RETARD A L'ARRIVEE
1284,4772300,4348,a1108cd0d0,MPM,DSS,723,24.0,179.0,1075,9.0,1122,I6F,90,2018-11-07,10,
1358,1084414,804,48bb931849,TRN,DUR,1453,10.0,260.0,1693,7.0,1913,OA,39,2018-09-04,10,
1767,4099549,1893,6b30af3da3,UPG,TNA,1515,38.0,92.0,331,6.0,1647,COA,415,2017-11-08,10,
2243,3617655,808,afc5b053ab,TRN,TNA,1640,10.0,238.0,1520,2.0,2038,,201,2016-09-05,10,
4416,791745,976,c8a3e917c5,AGP,JUB,2220,11.0,95.0,432,6.0,2355,NVPPA,2504,2016-07-13,10,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1332140,5164028,3356,8b401165c5,CDG,DXB,651,13.0,169.0,977,29.0,940,SMITH,244,2018-08-16,10,
1332144,5164032,3356,d3c6f7722a,CDG,DXB,650,13.0,169.0,977,14.0,939,SMITH,257,2017-08-20,10,
1332187,5164075,3356,9057853637,CDG,DXB,635,13.0,174.0,977,80.0,929,SMITH,52,2018-02-11,10,
1332237,5164125,3356,c822986956,CDG,DXB,635,31.0,193.0,977,8.0,948,SMITH,200,2017-12-22,10,


In [134]:
class DataCleaning:
    def __init__(self, features_columns, label):
        self.features_columns = features_columns
        self.label = label

    def remove_unused_columns(self, df):
        if 'NIVEAU DE SECURITE' in df.columns:
            df = df.drop(columns=['NIVEAU DE SECURITE'])
        return df
    
    def fill_temps_de_deplacement_annulation(self, df):
        df.loc[df['ANNULATION']==1, [
            "TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE", 
            "TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE", "RETARD A L'ARRIVEE"]] = 999
        df.loc[(df["DETOURNEMENT"]==1) & (df["TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE"].isna()), [
            "TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE", "RETARD A L'ARRIVEE"]] = 999

    def cleaning(self, df):
        df = df.dropna(subset=self.features_columns)
        if self.label in df.columns:
            df = df.dropna(subset=[self.label])
        return df

    def transform(self, df):
        df = df.copy()
        self.fill_temps_de_deplacement_annulation(df)
        #df = self.cleaning()
        #df.loc[:, 'DATE'] = pd.to_datetime(df['DATE'])
        return df


In [135]:
data_cleaning = DataCleaning(features_columns=FEATURES, label=label)

In [136]:
df_vols_cleaned = data_cleaning.transform(df_vols)
df_vols_cleaned

Unnamed: 0.1,Unnamed: 0,IDENTIFIANT,VOL,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,DEPART PROGRAMME,HEURE DE DEPART,RETART DE DEPART,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,DECOLLAGE,TEMPS PROGRAMME,TEMPS PASSE,TEMPS DE VOL,DISTANCE,ATTERRISSAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,HEURE D'ARRIVEE,RETARD A L'ARRIVEE,DETOURNEMENT,ANNULATION,RAISON D'ANNULATION,RETARD SYSTEM,RETARD SECURITE,RETARD COMPAGNIE,RETARD AVION,RETARD METEO,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,DATE,NIVEAU DE SECURITE
0,0,1259209,4661,a02782cd75,CEB,AAL,1707,1658.0,-9.0,20.0,1718.0,67.0,71.0,45.0,232,1803.0,6.0,1814,1809.0,-5.0,0,0,,,,,,,MAF,379,2018-08-15,10
1,1,4886177,5026,707f6ea54f,GOI,LTK,600,553.0,-7.0,11.0,604.0,130.0,119.0,91.0,738,835.0,17.0,910,852.0,-18.0,0,0,,,,,,,I6F,9,2016-02-11,10
2,2,183332,2021,b116987956,DSS,JNB,1749,1747.0,-2.0,9.0,1756.0,248.0,228.0,215.0,1671,1831.0,4.0,1857,1835.0,-22.0,0,0,,,,,,,NVPPA,2491,2017-09-06,10
3,3,937517,1320,a4b8db63f5,AGP,GOA,2301,2322.0,21.0,19.0,2341.0,65.0,89.0,59.0,214,40.0,11.0,6,51.0,45.0,0,0,,24.0,0.0,0.0,6.0,15.0,NVPPA,1241,2018-05-26,10
4,4,2157498,508,34604053c0,BRU,BOD,612,603.0,-9.0,13.0,616.0,302.0,259.0,238.0,2288,1314.0,8.0,1414,1322.0,-52.0,0,0,,,,,,,THA,78,2018-10-11,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1332909,1332909,5255349,3306,1558121ece,DXB,CDG,1830,1956.0,86.0,28.0,2024.0,171.0,162.0,127.0,977,2231.0,7.0,2121,2238.0,77.0,0,0,,0.0,0.0,0.0,77.0,0.0,SMITH,193,2017-12-27,10
1332910,1332910,5255350,3306,83f05c683c,DXB,CDG,1830,1838.0,8.0,13.0,1851.0,171.0,140.0,123.0,977,2054.0,4.0,2121,2058.0,-23.0,0,0,,,,,,,SMITH,195,2017-12-28,10
1332911,1332911,5255351,3306,83f05c683c,DXB,CDG,1830,1825.0,-5.0,19.0,1844.0,171.0,147.0,118.0,977,2042.0,10.0,2121,2052.0,-29.0,0,0,,,,,,,SMITH,197,2016-12-29,10
1332912,1332912,5255352,3306,89b2dba862,DXB,CDG,1830,1825.0,-5.0,13.0,1838.0,171.0,149.0,128.0,977,2046.0,8.0,2121,2054.0,-27.0,0,0,,,,,,,SMITH,201,2017-12-30,10


In [129]:
display(df_vols_cleaned.head(200))
display(df_vols_cleaned.shape)

Unnamed: 0.1,Unnamed: 0,IDENTIFIANT,VOL,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,DEPART PROGRAMME,HEURE DE DEPART,RETART DE DEPART,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,DECOLLAGE,TEMPS PROGRAMME,TEMPS PASSE,TEMPS DE VOL,DISTANCE,ATTERRISSAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,HEURE D'ARRIVEE,RETARD A L'ARRIVEE,DETOURNEMENT,ANNULATION,RAISON D'ANNULATION,RETARD SYSTEM,RETARD SECURITE,RETARD COMPAGNIE,RETARD AVION,RETARD METEO,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,DATE,NIVEAU DE SECURITE
0,0,1259209,4661,a02782cd75,CEB,AAL,1707,1658.0,-9.0,20.0,1718.0,67.0,71.0,45.0,232,1803.0,6.0,1814,1809.0,-5.0,0,0,,,,,,,MAF,379,2018-08-15,10
1,1,4886177,5026,707f6ea54f,GOI,LTK,600,553.0,-7.0,11.0,604.0,130.0,119.0,91.0,738,835.0,17.0,910,852.0,-18.0,0,0,,,,,,,I6F,9,2016-02-11,10
2,2,183332,2021,b116987956,DSS,JNB,1749,1747.0,-2.0,9.0,1756.0,248.0,228.0,215.0,1671,1831.0,4.0,1857,1835.0,-22.0,0,0,,,,,,,NVPPA,2491,2017-09-06,10
3,3,937517,1320,a4b8db63f5,AGP,GOA,2301,2322.0,21.0,19.0,2341.0,65.0,89.0,59.0,214,40.0,11.0,6,51.0,45.0,0,0,,24.0,0.0,0.0,6.0,15.0,NVPPA,1241,2018-05-26,10
4,4,2157498,508,34604053c0,BRU,BOD,612,603.0,-9.0,13.0,616.0,302.0,259.0,238.0,2288,1314.0,8.0,1414,1322.0,-52.0,0,0,,,,,,,THA,78,2018-10-11,10
5,5,315293,2202,c9bb139bb9,TIA,AGP,1515,1753.0,158.0,14.0,1807.0,225.0,211.0,189.0,1747,16.0,8.0,2200,24.0,144.0,0,0,,0.0,0.0,0.0,144.0,0.0,NVPPA,1249,2016-02-24,10
6,6,4016340,1627,c942b5536a,PEK,DXB,645,643.0,-2.0,17.0,700.0,193.0,168.0,142.0,1205,1122.0,9.0,1158,1131.0,-27.0,0,0,,,,,,,COA,364,2016-08-09,10
7,7,1834318,525,04338f33e4,CPH,SXF,1555,1607.0,12.0,9.0,1616.0,132.0,114.0,100.0,679,1756.0,5.0,1807,1801.0,-6.0,0,0,,,,,,,THA,369,2016-06-26,10
8,8,4011321,2289,35c98dd398,ISB,DXB,800,752.0,-8.0,12.0,804.0,187.0,176.0,151.0,1171,1235.0,13.0,1307,1248.0,-19.0,0,0,,,,,,,COA,422,2016-07-24,10
9,9,2172509,511,d1ca9b36f4,AMM,BOD,1450,1453.0,3.0,9.0,1502.0,83.0,62.0,47.0,224,1549.0,6.0,1613,1555.0,-18.0,0,0,,,,,,,THA,342,2017-04-09,10


(4332914, 32)

In [104]:
df_vols_cleaned[COLUMNS].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4332914 entries, 0 to 1332913
Data columns (total 16 columns):
 #   Column                                         Non-Null Count    Dtype  
---  ------                                         --------------    -----  
 0   IDENTIFIANT                                    4332914 non-null  int64  
 1   VOL                                            4332914 non-null  int64  
 2   CODE AVION                                     4332914 non-null  object 
 3   AEROPORT DEPART                                4332914 non-null  object 
 4   AEROPORT ARRIVEE                               4332914 non-null  object 
 5   DEPART PROGRAMME                               4332914 non-null  int64  
 6   TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE      4332914 non-null  float64
 7   TEMPS PROGRAMME                                4332910 non-null  float64
 8   DISTANCE                                       4332914 non-null  int64  
 9   TEMPS DE DEPLACEMENT A T

In [105]:
df_vols_cleaned[COLUMNS].nunique()

IDENTIFIANT                                      4332914
VOL                                                 6948
CODE AVION                                          4893
AEROPORT DEPART                                      320
AEROPORT ARRIVEE                                     320
DEPART PROGRAMME                                    1316
TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE            183
TEMPS PROGRAMME                                      541
DISTANCE                                            1355
TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE        185
ARRIVEE PROGRAMMEE                                  1431
COMPAGNIE AERIENNE                                    13
NOMBRE DE PASSAGERS                                  487
DATE                                                1002
NIVEAU DE SECURITE                                     1
RETARD A L'ARRIVEE                                  1166
dtype: int64

In [106]:
df_vols_cleaned[df_vols_cleaned["TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE"].isna()].shape

(2068, 32)

In [108]:
df_vols_cleaned[(df_vols_cleaned["DETOURNEMENT"]==1) ].shape

(11697, 32)

In [None]:
df_vols

### Aeroports

In [None]:
df_aeroports1 = df_batch1['aeroports']
display(df_aeroports1.head(20))
display(df_aeroports1.shape)

### Compagnies

In [None]:
df_compagnies1 = pd.concat([df_batch1['compagnies'], df_batch2['compagnies']]).drop_duplicates()
display(df_compagnies1)
display(df_compagnies1.shape)

## df_batch2

In [None]:
df_batch2

## Test

In [None]:
df_test

# DATAVIZ