In [6]:
import pandas as pd
import numpy as np
import pickle
import logging

from sklearn.model_selection import train_test_split
from app.data_engineering.data_access import read_db
from app.utils.multi_column_label_encode import MultiColumnLabelEncoder
from app.data_engineering.feature_engineering import FeatureEngineering
from app.model import Model


# Raise pandas' display limits so long/wide frames are rendered without truncation.
for option_name, option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the project datasets; the result is indexed by dataset name further down
# ('batch1', 'batch2', 'test').
datasets = read_db()

In [None]:
datasets.keys()

In [None]:
# Pull out the two training batches and the test set. Each of them is itself a
# keyed container; its 'vols' entry is the frame used in the rest of the notebook.
df_batch1 = datasets['batch1']
df_batch2 = datasets['batch2']
df_test = datasets['test']

## Définition de nos labels et de nos features

In [None]:
# Feature names = every column of the test 'vols' frame except the security level.
# A filter (rather than list.remove) keeps the cell working when that column is
# absent, mirroring the guard in DataCleaning.remove_unused_columns.
FEATURES = [
    column for column in df_test['vols'].columns.tolist()
    if column != 'NIVEAU DE SECURITE'
]
FEATURES

In [None]:
# Candidate labels = columns present in the training batch but absent from the test set.
# Built by filtering the training columns (instead of a set difference) so the
# result follows the frame's column order and is identical on every kernel run.
_test_columns = set(df_test['vols'].columns.tolist())
LABELS = [
    column for column in df_batch1['vols'].columns.tolist()
    if column not in _test_columns
]
LABELS

In [None]:
df_batch1.keys(), df_batch2.keys(), df_test.keys()

# Feature Engineering

In [None]:
# Target column to predict (the French column name means "arrival delay")
label = "RETARD A L'ARRIVEE"

In [None]:
def train_test_split_80_20(X, y):
    """Split ``X`` and ``y`` into 80 % train / 20 % test with a fixed seed.

    Returns the four objects produced by ``train_test_split`` in its usual
    order: X_train, X_test, y_train, y_test.
    """
    split_options = {'test_size': 0.2, 'random_state': 42}
    return train_test_split(X, y, **split_options)

In [None]:
# Stack the 'vols' frames of both training batches and keep the last 1.5 million rows.
# NOTE(review): despite its name, this frame can also contain batch2 rows.
flight_batches = [df_batch1['vols'], df_batch2['vols']]
vol_batch1_smaller = pd.concat(flight_batches).tail(1_500_000)

In [None]:
vol_batch1_smaller[FEATURES+[label]].head()

In [None]:
vol_batch1_smaller[FEATURES+[label]].describe(include='all')

In [None]:
# Pairwise correlations of the numeric/boolean columns only. Selecting them
# explicitly is needed because pandas >= 2.0 raises on non-numeric columns
# (dates, airport/airline codes) instead of silently dropping them.
vol_batch1_smaller[FEATURES+[label]].select_dtypes(include=['number', 'bool']).corr()

In [None]:
# %load ../certifia/data_cleaning.py
class DataCleaning:
    """Row/column clean-up applied to a flights frame before feature engineering.

    Parameters
    ----------
    features_columns : list of str
        Columns that must all be non-null for a row to be kept.
    label : str
        Name of the target column. Rows with a missing target are dropped
        whenever that column is present in the frame.
    """

    def __init__(self, features_columns, label):
        self.features_columns = features_columns
        self.label = label

    def remove_unused_columns(self, df):
        """Return ``df`` without 'NIVEAU DE SECURITE' (no error when it is absent)."""
        return df.drop(columns=['NIVEAU DE SECURITE'], errors='ignore')

    def cleaning(self, df):
        """Return ``df`` without the rows missing a feature value or, when present, the label."""
        required_columns = list(self.features_columns)
        if self.label in df.columns:
            required_columns.append(self.label)
        # A single pass over the union is equivalent to dropping on the features
        # and then on the label, and avoids one intermediate frame.
        return df.dropna(subset=required_columns)

    def transform(self, df):
        """Clean ``df`` and parse its 'DATE' column; the input frame is left untouched.

        Returns a new frame with incomplete rows removed, the unused column
        dropped and 'DATE' converted to ``datetime64``.
        """
        cleaned = self.remove_unused_columns(self.cleaning(df))
        # assign() replaces the column (and its dtype). ``df.loc[:, 'DATE'] = ...``
        # writes in place under pandas >= 2.0 and would leave an object-dtype column.
        return cleaned.assign(DATE=pd.to_datetime(cleaned['DATE']))


In [None]:
# Drop rows with missing features/label, remove the unused column and parse DATE
# on the sampled flights (see DataCleaning.transform).
cleaning = DataCleaning(features_columns=FEATURES, label=label)
cleaned_vol = cleaning.transform(vol_batch1_smaller)

In [None]:
# Feature matrix and regression target taken from the cleaned flights.
X = cleaned_vol[FEATURES]
y = cleaned_vol[label]  # classification variant (disabled): .apply(lambda x: 1 if x>0 else 0)

In [None]:
X_train, X_test, y_train, y_test = train_test_split_80_20(X, y)

In [None]:
# %load ../certifia/feature_engineering.py
import pickle
import pandas as pd

from certifia.utils.multi_column_label_encode import MultiColumnLabelEncoder


class FeatureEngineering:
    """Derive model features from a cleaned flights frame.

    ``fit`` learns the per-airport traffic averages (and fits the label encoder)
    on a training frame and returns the engineered frame; ``transform`` re-applies
    what was learnt to another frame.

    Parameters
    ----------
    training_columns : list of str, optional
        Columns kept in the engineered output; ``None`` keeps every column.
    columns_to_dummify : list of str, optional
        Columns handed to ``MultiColumnLabelEncoder``; ``None`` disables encoding.
    """

    def __init__(self, training_columns=None, columns_to_dummify=None):
        self.training_columns = training_columns
        self.columns_to_dummify = columns_to_dummify
        self.label_encoder = MultiColumnLabelEncoder(columns=self.columns_to_dummify)
        # {airport column name: {airport value: average flights per day}}, filled by fit()
        self.average_nb_plane_by_day = {}

    def get_month(self, df):
        """Return the month number of every date in the Series ``df``.

        Accepts datetime64 values as well as strings / Timestamp objects.
        """
        return pd.to_datetime(df).dt.month

    def get_week(self, df):
        """Return the ISO week number of every date in the Series ``df``.

        Accepts datetime64 values as well as strings / Timestamp objects.
        """
        weeks = pd.to_datetime(df).dt.isocalendar().week
        # isocalendar() yields a nullable UInt32 column: hand back a plain NumPy
        # dtype (float with NaN when some dates are missing, int64 otherwise).
        return weeks.astype('float64') if weeks.isna().any() else weeks.astype('int64')

    def get_hour(self, df):
        """Return ``df // 100`` element-wise.

        Assumes times are encoded as HHMM numbers (e.g. 1230 -> 12) - TODO confirm.
        """
        return df // 100

    def __get_dict_of_average_plane_by_day(self, df, airport_type: str):
        """Map each value of column ``airport_type`` to its average number of flights per day.

        The average is the number of non-null 'IDENTIFIANT' values for the airport
        divided by the number of calendar days spanned by 'DATE' in ``df``.
        """
        dates = pd.to_datetime(df['DATE'])
        number_of_days = (dates.max() - dates.min()).days + 1
        flights_per_airport_and_day = df.groupby([airport_type, 'DATE'])['IDENTIFIANT'].count()
        flights_per_airport = flights_per_airport_and_day.groupby(level=0).sum()
        return (flights_per_airport / number_of_days).to_dict()

    def get_average_plane_take_off_or_landing_by_day(self, df, airport_type):
        """Learn the per-airport daily averages from ``df`` and return them row by row."""
        averages = self.__get_dict_of_average_plane_by_day(df, airport_type)
        self.average_nb_plane_by_day[airport_type] = averages
        return df[airport_type].map(averages)

    def apply_average_plane_take_off_or_landing_by_day(self, df, airport_type):
        """Return the learnt daily average for each row; airports unseen during fit get 0.

        Raises ``KeyError`` when ``fit`` has not been run for ``airport_type``.
        """
        averages = self.average_nb_plane_by_day[airport_type]
        return df[airport_type].map(lambda airport: averages.get(airport, 0))

    def keep_training_columns(self, X):
        """Restrict ``X`` to ``training_columns`` when they are configured."""
        if self.training_columns is not None:
            return X[self.training_columns]
        return X

    def fit_transform_dummify_columns(self, X):
        """Fit the label encoder on ``X`` and return the encoded frame (no-op when disabled)."""
        if self.columns_to_dummify is not None:
            return self.label_encoder.fit_transform(X)
        return X

    def transform_dummify_columns(self, X):
        """Encode ``X`` with the already fitted label encoder (no-op when disabled)."""
        if self.columns_to_dummify is not None:
            return self.label_encoder.transform(X)
        return X

    def _add_calendar_features(self, X):
        """Add the month / week / scheduled-hour columns to ``X`` in place and return it."""
        X['MOIS'] = self.get_month(X['DATE'])
        X['SEMAINE'] = self.get_week(X['DATE'])
        X['HEURE DEPART PROGRAMME'] = self.get_hour(X['DEPART PROGRAMME'])
        X['HEURE ARRIVEE PROGRAMMEE'] = self.get_hour(X['ARRIVEE PROGRAMMEE'])
        return X

    def fit(self, dataframe: pd.DataFrame):
        """Learn the engineering state from ``dataframe`` and return its engineered copy.

        Note that, unlike scikit-learn's ``fit``, this returns the transformed
        frame rather than ``self``. The input frame is not modified.
        """
        X = self._add_calendar_features(dataframe.copy())

        X['NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR'] = self.get_average_plane_take_off_or_landing_by_day(
            X, 'AEROPORT DEPART'
        )
        X['NOMBRE ATTERRISSAGE PAR AEROPORT PAR JOUR'] = self.get_average_plane_take_off_or_landing_by_day(
            X, 'AEROPORT ARRIVEE'
        )

        X = self.fit_transform_dummify_columns(X)
        return self.keep_training_columns(X)

    def transform(self, dataframe: pd.DataFrame):
        """Return the engineered copy of ``dataframe`` using the state learnt by ``fit``.

        The input frame is not modified.
        """
        X = self._add_calendar_features(dataframe.copy())

        X['NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR'] = self.apply_average_plane_take_off_or_landing_by_day(
            X, 'AEROPORT DEPART'
        )
        X['NOMBRE ATTERRISSAGE PAR AEROPORT PAR JOUR'] = self.apply_average_plane_take_off_or_landing_by_day(
            X, 'AEROPORT ARRIVEE'
        )

        X = self.transform_dummify_columns(X)
        return self.keep_training_columns(X)

    # TODO: add test
    def save_feature_engineering(self, path=None):
        """Pickle this instance to ``path``.

        When ``path`` is ``None`` the file is written to
        "../data/output/feature_engineering.pkl", resolved against the current
        working directory.
        """
        if path is None:
            path = "../data/output/feature_engineering.pkl"
        with open(path, 'wb') as file:
            pickle.dump(self, file)

    # TODO: add test
    def load_feature_engineering(self, path=None):
        """Copy the state of a pickled instance into this one and return ``self``.

        When ``path`` is ``None`` the file is read from
        "../data/output/feature_engineering.pkl", resolved against the current
        working directory.
        """
        if path is None:
            path = "../data/output/feature_engineering.pkl"
        # SECURITY: pickle can execute arbitrary code on load; only open files
        # produced by save_feature_engineering from a trusted location.
        with open(path, 'rb') as file:
            pickle_fe = pickle.load(file)
        self.training_columns = pickle_fe.training_columns
        self.columns_to_dummify = pickle_fe.columns_to_dummify
        self.label_encoder = pickle_fe.label_encoder
        self.average_nb_plane_by_day = pickle_fe.average_nb_plane_by_day
        return self


In [None]:
# Columns handed to the model, in the order the estimator will see them.
# NOTE(review): the two 'NOMBRE ... PAR AEROPORT PAR JOUR' columns computed by
# FeatureEngineering are not listed here, so they are dropped before training.
TRAINING_COLUMNS = [
    'CODE AVION',
    'AEROPORT DEPART',
    'AEROPORT ARRIVEE',
    'TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE',
    "TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE",
    'TEMPS PROGRAMME',
    'DISTANCE',
    'COMPAGNIE AERIENNE',
    'NOMBRE DE PASSAGERS',
    'MOIS',
    'SEMAINE',
    'HEURE DEPART PROGRAMME',
    'HEURE ARRIVEE PROGRAMMEE',
]
# Categorical columns passed to the label encoder.
COLUMNS_TO_ENCODE = ['AEROPORT DEPART', 'AEROPORT ARRIVEE', 'COMPAGNIE AERIENNE', 'CODE AVION']

feature_engineering = FeatureEngineering(
    training_columns=TRAINING_COLUMNS,
    columns_to_dummify=COLUMNS_TO_ENCODE,
)
# fit() learns its state on the training split and returns the engineered frame;
# the test split is only transformed with that learnt state.
X_train_engineered = feature_engineering.fit(X_train)
X_test_engineered = feature_engineering.transform(X_test)

In [None]:
display(X_train_engineered.head())
X_train_engineered.dtypes

In [None]:
X_train_engineered.describe(include='all')

# Training

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import metrics
import pickle
import numpy as np

from certifia.utils.logger import Logger

class Training:
    """Thin wrapper around a scikit-learn random-forest regressor."""

    def __init__(self):
        # Alternatives tried previously (disabled): RandomForestClassifier with the
        # same hyper-parameters, and LinearRegression(normalize=True, n_jobs=-1).
        forest_options = dict(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1, verbose=1)
        self.rf_regressor = RandomForestRegressor(**forest_options)

    def fit(self, X, y):
        """Fit the estimator on features ``X`` and target ``y``; returns ``self``."""
        self.rf_regressor.fit(X, y)
        return self

    def predict(self, X):
        """Return the estimator's predictions for ``X``."""
        return self.rf_regressor.predict(X)

    def score(self, X, y):
        """Log MAE, MSE, RMSE and R2 of the predictions for ``X`` against ``y``.

        Nothing is returned; the metrics are only written to the project logger.
        """
        predictions = self.predict(X)
        log = Logger().logger
        regression_reports = (
            ('Mean Absolute Error', lambda: metrics.mean_absolute_error(y, predictions)),
            ('Mean Squared Error', lambda: metrics.mean_squared_error(y, predictions)),
            ('Root Mean Squared Error', lambda: np.sqrt(metrics.mean_squared_error(y, predictions))),
            ('R2 score', lambda: metrics.r2_score(y, predictions)),
        )
        # each metric is computed right before it is logged, one after the other
        for title, compute in regression_reports:
            log.info(f'{title}: {compute()}')

    # For a classification algorithm
    def classif_score(self, X, y):
        """Log accuracy, recall, precision, F1 and ROC AUC of the predictions for ``X``."""
        predictions = self.predict(X)
        log = Logger().logger
        classification_reports = (
            ('Accuracy', metrics.accuracy_score),
            ('Recall', metrics.recall_score),
            ('Precision', metrics.precision_score),
            ('F1_score', metrics.f1_score),
            ('ROC AUC', metrics.roc_auc_score),
        )
        for title, metric in classification_reports:
            log.info(f'{title}: {metric(y, predictions)}')

    def save_model(self, path=None):
        """Pickle the underlying estimator (not this wrapper) to ``path``.

        When ``path`` is ``None`` the file is written to "../models/rf_model.pkl",
        resolved against the current working directory.
        """
        target = "../models/rf_model.pkl" if path is None else path
        with open(target, 'wb') as handle:
            pickle.dump(self.rf_regressor, handle)


In [None]:
# `model` is the Training wrapper itself (fit returns self), not the bare estimator.
model = Training().fit(X_train_engineered, y_train)

# Prédictions

In [None]:
X_test_engineered.head(2)

In [None]:
y_pred = model.predict(X_test_engineered)

In [None]:
y_pred[:10]

In [None]:
y_test[:10]

# Évaluation du modèle

In [None]:
# Metrics on the held-out split; Training.score only logs them and returns None.
model.score(X_test_engineered, y_test)

In [None]:
# Same metrics on the training split, to compare against the held-out figures.
model.score(X_train_engineered, y_train)

In [29]:
#model.classif_score(X_test_engineered, y_test)

### test file

In [30]:
# Run the fitted pipeline on the small test CSV.
# transform() (rather than cleaning() alone) is needed here: read_csv yields DATE as
# strings, and the date-based features of FeatureEngineering require parsed dates.
small_test = DataCleaning(features_columns=FEATURES, label=label).transform(pd.read_csv('../data/vol_test.csv'))
X_small_test_engineered = feature_engineering.transform(small_test)
y_pred_small_test = model.predict(X_small_test_engineered)
y_pred_small_test[:10]

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


array([-5.18241389,  7.36806039, 10.62997097, 28.9174465 , -7.37209897,
        6.60642149, -2.29465309, 26.33183661,  5.37269167])

# Save the model

In [31]:
# Persist the fitted estimator to the default location used by Training.save_model
# ("../models/rf_model.pkl").
model.save_model()

In [32]:
# Load from file
# Reload the persisted estimator (the bare model saved by Training.save_model).
# NOTE: only unpickle files you produced yourself - pickle can run arbitrary code.
with open("../models/rf_model.pkl", 'rb') as model_file:
    pickle_model = pickle.load(model_file)

# R2 (what the estimator's own score() returns), then the error metrics
score = pickle_model.score(X_test_engineered, y_test)
print(f"R2 score: {score:.4f}")
y_pred = pickle_model.predict(X_test_engineered)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
mean_squared_error = metrics.mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mean_squared_error)
print('Root Mean Squared Error:', np.sqrt(mean_squared_error))

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.2s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.


R2 score: 0.1014


[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.2s


Mean Absolute Error: 19.34668043224163
Mean Squared Error: 1394.8902425946849
Root Mean Squared Error: 37.348229443906504


[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.6s finished
