In [6]:
import pandas as pd
import numpy as np
import pickle
import logging

from sklearn.model_selection import train_test_split
from app.data_engineering.data_access import read_db
from app.utils.multi_column_label_encode import MultiColumnLabelEncoder
from app.data_engineering.feature_engineering import FeatureEngineering
from app.model import Model


pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [7]:
datasets = read_db()

In [8]:
datasets.keys()

dict_keys(['batch1', 'batch2', 'test'])

In [9]:
df_batch1 = datasets['batch1']
df_batch2 = datasets['batch2']
df_test = datasets['test']

## Définition de nos labels et de nos features

In [10]:
FEATURES = df_test['vols'].columns.tolist()
FEATURES.remove('NIVEAU DE SECURITE')
FEATURES

['IDENTIFIANT',
 'VOL',
 'CODE AVION',
 'AEROPORT DEPART',
 'AEROPORT ARRIVEE',
 'DEPART PROGRAMME',
 'TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE',
 'TEMPS PROGRAMME',
 'DISTANCE',
 "TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE",
 'ARRIVEE PROGRAMMEE',
 'COMPAGNIE AERIENNE',
 'NOMBRE DE PASSAGERS',
 'DATE']

In [11]:
LABELS = list(set(df_batch1['vols'].columns.tolist()) - set(df_test['vols'].columns.tolist()))
LABELS

['HEURE DE DEPART',
 'RETARD SECURITE',
 "HEURE D'ARRIVEE",
 'DECOLLAGE',
 'ANNULATION',
 'RETARD METEO',
 'RETARD COMPAGNIE',
 'RETARD AVION',
 "RAISON D'ANNULATION",
 'DETOURNEMENT',
 'RETARD SYSTEM',
 'TEMPS PASSE',
 'ATTERRISSAGE',
 "RETARD A L'ARRIVEE",
 'RETART DE DEPART',
 'TEMPS DE VOL']

In [12]:
df_batch1.keys(), df_batch2.keys(), df_test.keys()

(dict_keys(['vols', 'aeroports', 'compagnies', 'prix_fuel']),
 dict_keys(['vols', 'aeroports', 'compagnies', 'prix_fuel']),
 dict_keys(['vols']))

# Feature Engineering

In [13]:
# la valeur a prédire 
label = "RETARD A L'ARRIVEE"

In [14]:
def train_test_split_80_20(X, y): 
    return train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
vol_batch1_smaller = pd.concat([df_batch1['vols'], df_batch2['vols']]).tail(1500000)#.head(1200000)

In [16]:
vol_batch1_smaller[FEATURES+[label]].head()

Unnamed: 0,IDENTIFIANT,VOL,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,DEPART PROGRAMME,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS PROGRAMME,DISTANCE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,DATE,RETARD A L'ARRIVEE
2832914,3632717,854,26be1ce54e,AGP,BOH,1445,15.0,120.0,666,4.0,1645,,81,27/3/2016,-23.0
2832915,1126475,1654,f73d0e1e01,CNX,WRO,1328,14.0,163.0,1046,4.0,1611,OA,332,30/7/2016,12.0
2832916,4776442,5182,751a068573,SVG,DSS,705,14.0,50.0,95,13.0,755,I6F,75,26/8/2018,4.0
2832917,1118011,1680,766ef903a6,WRO,UPG,2030,24.0,149.0,899,4.0,2259,OA,30,23/4/2016,15.0
2832918,4890169,5246,0dcaff3ebe,SVG,LTK,1834,9.0,99.0,419,8.0,2013,I6F,13,18/11/2017,-20.0


In [17]:
vol_batch1_smaller[FEATURES+[label]].describe(include='all')

Unnamed: 0,IDENTIFIANT,VOL,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,DEPART PROGRAMME,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS PROGRAMME,DISTANCE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,DATE,RETARD A L'ARRIVEE
count,1500000.0,1500000.0,1500000.0,1500000,1500000,1500000.0,1478697.0,1499999.0,1500000.0,1477675.0,1500000.0,1500000,1500000.0,1500000,1474393.0
unique,,,4793.0,320,320,,,,,,,14,,1002,
top,,,618131665.0,SAW,SAW,,,,,,,WKEUW,,22/7/2018,
freq,,,2538.0,76753,76731,,,,,,,1157339,,1730,
mean,2793218.0,1935.755,,,,1333.087,13.16096,131.4324,764.0808,6.538149,1493.64,,204.9144,,4.821849
std,881135.6,1560.426,,,,481.4104,7.459216,63.37414,500.4773,4.72976,515.004,,352.9355,,35.15498
min,3.0,1.0,,,,3.0,1.0,18.0,31.0,1.0,1.0,,-1.0,,-87.0
25%,2456379.0,616.0,,,,920.0,9.0,83.0,377.0,4.0,1106.0,,59.0,,-12.0
50%,2831378.0,1540.0,,,,1330.0,11.0,115.0,630.0,5.0,1520.0,,156.0,,-4.0
75%,3206378.0,2957.0,,,,1735.0,15.0,161.0,997.0,7.0,1920.0,,206.0,,9.0


In [18]:
vol_batch1_smaller[FEATURES+[label]].corr()

Unnamed: 0,IDENTIFIANT,VOL,DEPART PROGRAMME,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS PROGRAMME,DISTANCE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,ARRIVEE PROGRAMMEE,NOMBRE DE PASSAGERS,RETARD A L'ARRIVEE
IDENTIFIANT,1.0,0.086058,0.015514,-0.050347,-0.136917,-0.125614,-0.008089,0.012523,-0.473953,0.026072
VOL,0.086058,1.0,-0.022177,0.048252,-0.092273,-0.104793,0.00847,-0.014989,-0.041242,-0.002379
DEPART PROGRAMME,0.015514,-0.022177,1.0,-0.030956,-0.063343,-0.052365,-0.022721,0.74015,-0.002071,0.175894
TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,-0.050347,0.048252,-0.030956,1.0,0.137515,0.092603,0.039214,-0.00936,0.133131,0.197837
TEMPS PROGRAMME,-0.136917,-0.092273,-0.063343,0.137515,1.0,0.980811,0.119921,0.029714,0.062809,-0.050232
DISTANCE,-0.125614,-0.104793,-0.052365,0.092603,0.980811,1.0,0.095721,0.037164,0.049889,-0.038088
TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,-0.008089,0.00847,-0.022721,0.039214,0.119921,0.095721,1.0,0.010174,0.041298,0.10744
ARRIVEE PROGRAMMEE,0.012523,-0.014989,0.74015,-0.00936,0.029714,0.037164,0.010174,1.0,0.003875,0.143282
NOMBRE DE PASSAGERS,-0.473953,-0.041242,-0.002071,0.133131,0.062809,0.049889,0.041298,0.003875,1.0,-0.013526
RETARD A L'ARRIVEE,0.026072,-0.002379,0.175894,0.197837,-0.050232,-0.038088,0.10744,0.143282,-0.013526,1.0


In [19]:
# %load ../certifia/data_cleaning.py
class DataCleaning:
    def __init__(self, features_columns, label):
        self.features_columns = features_columns
        self.label = label

    def remove_unused_columns(self, df):
        if 'NIVEAU DE SECURITE' in df.columns:
            df = df.drop(columns=['NIVEAU DE SECURITE'])
        return df

    def cleaning(self, df):
        df = df.dropna(subset=self.features_columns)
        if self.label in df.columns:
            df = df.dropna(subset=[self.label])
        return df

    def transform(self, df):
        df = df.copy()
        df = self.cleaning(df)
        df = self.remove_unused_columns(df)
        df.loc[:, 'DATE'] = pd.to_datetime(df['DATE'])
        return df


In [20]:
cleaning = DataCleaning(features_columns=FEATURES, label=label)
cleaned_vol = cleaning.transform(vol_batch1_smaller)

In [21]:
X = cleaned_vol[FEATURES]
y = cleaned_vol[label]#.apply(lambda x: 1 if x>0 else 0)

In [22]:
X_train, X_test, y_train, y_test = train_test_split_80_20(X, y)

In [24]:
# %load ../certifia/feature_engineering.py
import pickle
import pandas as pd

from app.utils.multi_column_label_encode import MultiColumnLabelEncoder


class FeatureEngineering:
    def __init__(self, training_columns=None, columns_to_dummify=None):
        self.training_columns = training_columns
        self.columns_to_dummify = columns_to_dummify
        self.label_encoder = MultiColumnLabelEncoder(columns=self.columns_to_dummify)
        self.average_nb_plane_by_day = {}

    def get_month(self, df):
        return df.apply(lambda x: x.month)

    def get_week(self, df):
        return df.apply(lambda x: x.week)

    def get_hour(self, df):
        return df.apply(lambda x: x // 100)

    def __get_dict_of_average_plane_by_day(self, df, airport_type: str):
        min_date = df['DATE'].min()
        max_date = df['DATE'].max()
        number_of_days = (max_date - min_date).days + 1
        return df[
            [airport_type, 'IDENTIFIANT', 'DATE']].groupby([airport_type, 'DATE']).count().reset_index()[
            [airport_type, 'IDENTIFIANT']].groupby([airport_type]).sum().apply(
            lambda x: x / number_of_days
        )['IDENTIFIANT'].to_dict()

    def get_average_plane_take_off_or_landing_by_day(self, df, airport_type):
        self.average_nb_plane_by_day[airport_type] = self.__get_dict_of_average_plane_by_day(df, airport_type)
        return df[airport_type].apply(lambda x: self.average_nb_plane_by_day[airport_type][x])

    def apply_average_plane_take_off_or_landing_by_day(self, df, airport_type):
        return df[airport_type].apply(
            lambda x: self.average_nb_plane_by_day[airport_type][x] if x in self.average_nb_plane_by_day[
                airport_type] else 0)

    def keep_training_columns(self, X):
        if self.training_columns is not None:
            return X[self.training_columns]
        return X

    def fit_transform_dummify_columns(self, X):
        if self.columns_to_dummify is not None:
            return self.label_encoder.fit_transform(X)
        return X

    def transform_dummify_columns(self, X):
        if self.columns_to_dummify is not None:
            return self.label_encoder.transform(X)
        return X

    def fit(self, dataframe: pd.DataFrame):
        X = dataframe.copy()

        X.loc[:, 'MOIS'] = self.get_month(X['DATE'])
        X.loc[:, 'SEMAINE'] = self.get_week(X['DATE'])
        X.loc[:, 'HEURE DEPART PROGRAMME'] = self.get_hour(X['DEPART PROGRAMME'])
        X.loc[:, 'HEURE ARRIVEE PROGRAMMEE'] = self.get_hour(X['ARRIVEE PROGRAMMEE'])

        X.loc[:, 'NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR'] = self.get_average_plane_take_off_or_landing_by_day(
            X, 'AEROPORT DEPART'
        )
        X.loc[:, 'NOMBRE ATTERRISSAGE PAR AEROPORT PAR JOUR'] = self.get_average_plane_take_off_or_landing_by_day(
            X, 'AEROPORT ARRIVEE'
        )

        X = self.fit_transform_dummify_columns(X)

        X = self.keep_training_columns(X)

        return X

    def transform(self, dataframe: pd.DataFrame):
        X = dataframe.copy()

        X.loc[:, 'MOIS'] = self.get_month(X['DATE'])
        X.loc[:, 'SEMAINE'] = self.get_week(X['DATE'])
        X.loc[:, 'HEURE DEPART PROGRAMME'] = self.get_hour(X['DEPART PROGRAMME'])
        X.loc[:, 'HEURE ARRIVEE PROGRAMMEE'] = self.get_hour(X['ARRIVEE PROGRAMMEE'])

        X.loc[:, 'NOMBRE DECOLLAGE PAR AEROPORT PAR JOUR'] = self.apply_average_plane_take_off_or_landing_by_day(
            X, 'AEROPORT DEPART'
        )
        X.loc[:, 'NOMBRE ATTERRISSAGE PAR AEROPORT PAR JOUR'] = self.apply_average_plane_take_off_or_landing_by_day(
            X, 'AEROPORT ARRIVEE'
        )
        X = self.transform_dummify_columns(X)

        X = self.keep_training_columns(X)

        return X

    # TODO: add test
    def save_feature_engineering(self, path=None):
        """
        Save to file in the current working directory
        """
        if path is None:
            path = "../data/output/feature_engineering.pkl"
        with open(path, 'wb') as file:
            pickle.dump(self, file)

    # TODO: add test
    def load_feature_engineering(self, path=None):
        """
        Load file in an instance
        """
        if path is None:
            path = "../data/output/feature_engineering.pkl"
        with open(path, 'rb') as file:
            pickle_fe = pickle.load(file)
            self.training_columns = pickle_fe.training_columns
            self.columns_to_dummify = pickle_fe.columns_to_dummify
            self.label_encoder = pickle_fe.label_encoder
            self.average_nb_plane_by_day = pickle_fe.average_nb_plane_by_day
        return self


In [25]:
feature_engineering = FeatureEngineering(
        training_columns=[
            'CODE AVION',
            'AEROPORT DEPART',
            'AEROPORT ARRIVEE', 
            'TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE',
            "TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE",
            'TEMPS PROGRAMME',
            'DISTANCE',
            'COMPAGNIE AERIENNE',
            'NOMBRE DE PASSAGERS',
            'MOIS',
            'SEMAINE',
            'HEURE DEPART PROGRAMME',
            'HEURE ARRIVEE PROGRAMMEE'
        ],
        columns_to_dummify=['AEROPORT DEPART','AEROPORT ARRIVEE', 'COMPAGNIE AERIENNE', 'CODE AVION'],
    )
X_train_engineered = feature_engineering.fit(X_train)
X_test_engineered = feature_engineering.transform(X_test)

In [26]:
display(X_train_engineered.head())
X_train_engineered.dtypes

Unnamed: 0,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,TEMPS PROGRAMME,DISTANCE,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,MOIS,SEMAINE,HEURE DEPART PROGRAMME,HEURE ARRIVEE PROGRAMMEE
537429,840,155,214,7.0,20.0,50.0,148,13,202,8,35,9,9
1180901,292,282,126,12.0,3.0,145.0,806,13,109,2,8,16,18
238933,1859,138,143,8.0,4.0,75.0,304,13,153,10,40,18,20
1219428,2790,125,6,13.0,3.0,145.0,850,13,195,9,39,8,10
713190,4354,122,11,7.0,5.0,130.0,834,13,206,6,25,9,12


CODE AVION                                         int64
AEROPORT DEPART                                    int64
AEROPORT ARRIVEE                                   int64
TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE        float64
TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE    float64
TEMPS PROGRAMME                                  float64
DISTANCE                                           int64
COMPAGNIE AERIENNE                                 int64
NOMBRE DE PASSAGERS                                int64
MOIS                                               int64
SEMAINE                                            int64
HEURE DEPART PROGRAMME                             int64
HEURE ARRIVEE PROGRAMMEE                           int64
dtype: object

In [27]:
X_train_engineered.describe(include='all')

Unnamed: 0,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,TEMPS PROGRAMME,DISTANCE,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,MOIS,SEMAINE,HEURE DEPART PROGRAMME,HEURE ARRIVEE PROGRAMMEE
count,1179514.0,1179514.0,1179514.0,1179514.0,1179514.0,1179514.0,1179514.0,1179514.0,1179514.0,1179514.0,1179514.0,1179514.0,1179514.0
mean,2418.241,151.3364,151.2383,13.15633,6.530997,131.5344,765.2247,11.71025,205.8575,6.357693,25.78843,13.06185,14.65464
std,1359.04,96.36523,96.33146,7.441584,4.702155,63.42025,500.8954,2.775185,354.9889,3.408055,14.848,4.806845,5.145037
min,0.0,0.0,0.0,1.0,1.0,20.0,31.0,0.0,-1.0,1.0,1.0,0.0,0.0
25%,1256.0,68.0,68.0,9.0,4.0,84.0,377.0,13.0,58.0,3.0,13.0,9.0,11.0
50%,2412.0,143.0,143.0,11.0,5.0,115.0,632.0,13.0,157.0,6.0,25.0,13.0,15.0
75%,3620.0,248.0,248.0,15.0,7.0,161.0,997.0,13.0,206.0,9.0,37.0,17.0,19.0
max,4759.0,319.0,319.0,177.0,174.0,680.0,4983.0,13.0,2505.0,12.0,53.0,23.0,23.0


# Training

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import metrics
import pickle
import numpy as np

from app.utils.logger import Logger

class Training:
    def __init__(self):
        self.rf_regressor = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42,n_jobs=-1,verbose=1)
        #self.rf_regressor = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1,verbose=1)
        #self.rf_regressor = LinearRegression(normalize=True, n_jobs=-1)
        
    def fit(self, X, y):
        """
        train a random forest regressor with
        X being the training columns and
        y the label to predict
        """
        self.rf_regressor.fit(X, y)
        return self

    def predict(self, X):
        return self.rf_regressor.predict(X)

    def score(self, X, y):
        y_pred = self.predict(X)
        logger = Logger().logger
        logger.info(f'Mean Absolute Error: {metrics.mean_absolute_error(y, y_pred)}')
        logger.info(f'Mean Squared Error: {metrics.mean_squared_error(y, y_pred)}')
        logger.info(f'Root Mean Squared Error: {np.sqrt(metrics.mean_squared_error(y, y_pred))}')
        logger.info(f'R2 score: {metrics.r2_score(y, y_pred)}')

    # Pour un classif algorithm
    def classif_score(self, X, y):
        y_pred = self.predict(X)
        logger = Logger().logger
        logger.info(f'Accuracy: {metrics.accuracy_score(y, y_pred)}')
        logger.info(f'Recall: {metrics.recall_score(y, y_pred)}')
        logger.info(f'Precision: {metrics.precision_score(y, y_pred)}')
        logger.info(f'F1_score: {metrics.f1_score(y, y_pred)}')
        logger.info(f'ROC AUC: {metrics.roc_auc_score(y, y_pred)}')

    def save_model(self, path=None):
        """
        Save to file in the current working directory
        """
        if path is None:
            path = "../models/rf_model.pkl"
        with open(path, 'wb') as file:
            pickle.dump(self.rf_regressor, file)


In [31]:
model = Training().fit(X_train_engineered, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.9min finished


# Prédictions

In [32]:
X_test_engineered.head(2)

Unnamed: 0,CODE AVION,AEROPORT DEPART,AEROPORT ARRIVEE,TEMPS DE DEPLACEMENT A TERRE AU DECOLLAGE,TEMPS DE DEPLACEMENT A TERRE A L'ATTERRISSAGE,TEMPS PROGRAMME,DISTANCE,COMPAGNIE AERIENNE,NOMBRE DE PASSAGERS,MOIS,SEMAINE,HEURE DEPART PROGRAMME,HEURE ARRIVEE PROGRAMMEE
1047877,4264,243,312,8.0,5.0,185.0,1092,13,152,1,3,6,9
877652,3811,125,235,8.0,7.0,80.0,349,13,44,11,46,8,9


In [33]:
y_pred = model.predict(X_test_engineered)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.2s finished


In [34]:
y_pred[:10]

array([-12.1267859 ,  -4.50217202,   6.27618372,   5.9938932 ,
        10.77652554,   0.02674853,   7.9646048 ,  -1.25415766,
         8.42186651,  -2.43233736])

In [35]:
y_test[:10]

1047877   -25.0
877652    -20.0
15983       1.0
865178    -17.0
511268     -2.0
1327708    -2.0
540892     11.0
607566      0.0
202249      2.0
311394     -8.0
Name: RETARD A L'ARRIVEE, dtype: float64

# Evaluation du model

In [36]:
model.score(X_test_engineered, y_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.2s finished
INFO:MainLogger:Mean Absolute Error: 18.14349711577458
INFO:MainLogger:Mean Squared Error: 1098.3288992175042
INFO:MainLogger:Root Mean Squared Error: 33.14104553597403
INFO:MainLogger:R2 score: 0.11588664521194969


In [37]:
model.score(X_train_engineered, y_train)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.4s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    1.0s finished
INFO:MainLogger:Mean Absolute Error: 17.989707122778643
INFO:MainLogger:Mean Squared Error: 1058.4500266539064
INFO:MainLogger:Root Mean Squared Error: 32.53382895777727
INFO:MainLogger:R2 score: 0.14244456247029413


In [38]:
#model.classif_score(X_test_engineered, y_test)

### test file

In [40]:
small_test = DataCleaning(features_columns=FEATURES, label=label).transform(pd.read_csv('../data/vol_test.csv'))
X_small_test_engineered = feature_engineering.transform(small_test)
y_pred_small_test = model.predict(X_small_test_engineered)
y_pred_small_test[:10]

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


array([-3.36182607,  6.44696184,  6.31715811, 34.84560246, -0.69884341,
        6.80516371, -1.64655901, 25.32660966,  3.52064854])

# Save the model

In [44]:
feature_engineering.save_feature_engineering()

In [43]:
model.save_model()

In [42]:
# Load from file
with open("../models/rf_model.pkl", 'rb') as file:
    pickle_model = pickle.load(file)
    
# Calculate the accuracy score and predict target values
score = pickle_model.score(X_test_engineered, y_test)
print("R2 score: {0:.4f}".format(score))
y_pred = pickle_model.predict(X_test_engineered)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s


R2 score: 0.1159
Mean Absolute Error: 18.14349711577458
Mean Squared Error: 1098.3288992175042
Root Mean Squared Error: 33.14104553597403


[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.3s finished
