In [97]:
import pandas as pd

import pandas as pd
import numpy as np

from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, TimeSeriesSplit

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from feature_engine.creation import CyclicalFeatures

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import BayesianRidge, HuberRegressor, PassiveAggressiveRegressor, TheilSenRegressor
from sklearn.multioutput import MultiOutputRegressor

from skforecast.ForecasterAutoreg import ForecasterAutoreg

In [98]:
# Load dataframes
path = "../data/train.csv"
df = pd.read_csv(path, parse_dates=[0])

df_weather1 = pd.read_csv("https://object.files.data.gouv.fr/meteofrance/data/synchro_ftp/BASE/HOR/H_75_previous-2020-2022.csv.gz", sep=';')
df_weather2 = pd.read_csv("https://object.files.data.gouv.fr/meteofrance/data/synchro_ftp/BASE/HOR/H_75_latest-2023-2024.csv.gz", sep=';')
weather = pd.concat([df_weather1, df_weather2])

holidays_path = "../data/fr-en-calendrier-scolaire.csv"
holidays = pd.read_csv(holidays_path, sep=";")

traffic_path = "../data/traffic_data.csv"
traffic = pd.read_csv(traffic_path, sep=";")

final_test = pd.read_csv("../data/test.csv", parse_dates=[0])

In [99]:
class Eng:
    def __init__(self, df, weather, traffic, holidays):
        self.df = df
        self.df = self.df.rename(columns={"id": "date"})
        self.df = self.df.drop_duplicates()

        self.external_database(weather, traffic)
        self.additional_data(holidays)
        self.date_features()
        

    def external_database(self, weather, traffic):
        
        ##### Weather #####
        
        self.weather = weather

        # Dropping empty columns
        self.weather = self.weather.drop(
            columns=['FF2', 'QFF2', 'DD2', 'QDD2', 'FXI2', 'QFXI2', 'DXI2', 'QDXI2', 'HXI2', 'QHXI2', 'DXI3S',
                                      'DHUMEC', 'QDHUMEC', 'GEOP', 'QGEOP', 'N', 'QN', 'NBAS', 'QNBAS', 'CL', 'QCL', 'CM', 'QCM',
                                      'CH', 'QCH', 'N1', 'QN1', 'C1', 'QC1', 'B1', 'QB1', 'N2', 'QN2', 'C2', 'QC2', 'B2', 'QB2',
                                      'N3', 'QN3', 'C3', 'QC3', 'B3', 'QB3', 'N4', 'QN4', 'C4', 'QC4', 'B4', 'QB4', 'W1', 'QW1',
                                      'W2', 'QW2', 'SOL', 'QSOL', 'SOLNG', 'QSOLNG', 'TMER', 'QTMER', 'VVMER', 'QVVMER', 'ETATMER',
                                      'QETATMER', 'DIRHOULE', 'QDIRHOULE', 'HVAGUE', 'QHVAGUE', 'PVAGUE', 'QPVAGUE', 'HNEIGEF', 'QHNEIGEF',
                                      'TSNEIGE', 'QTSNEIGE', 'TUBENEIGE', 'QTUBENEIGE', 'HNEIGEFI3', 'QHNEIGEFI3', 'HNEIGEFI1', 'QHNEIGEFI1',
                                      'ESNEIGE', 'QESNEIGE', 'CHARGENEIGE', 'QCHARGENEIGE', 'DIR', 'QDIR', 'DIR2', 'QDIR2', 'DIF', 'QDIF',
                                      'DIF2', 'QDIF2', 'UV', 'QUV', 'UV2', 'QUV2', 'UV_INDICE', 'QUV_INDICE', 'INFRAR', 'QINFRAR', 'INFRAR2',
                                      'QINFRAR2', 'TLAGON', 'QTLAGON', 'TVEGETAUX', 'QTVEGETAUX', 'ECOULEMENT', 'QECOULEMENT', 'NUM_POSTE',
                                      'NOM_USUEL', 'LAT', 'LON', 'DRR1', 'FF', 'QFF', 'DD', 'QDD', 'FXY', 'QFXY', 'DXY', 'QDXY', 'QHXY', 'FXI',
                                      'QFXI', 'DXI', 'QDXI', 'QHXI', 'FXI3S', 'QFXI3S', 'QDXI3S', 'QHFXI3S', 'TD', 'QTD', 'T10', 'QT10', 'T20',
                                      'QT20', 'T50', 'QT50', 'T100', 'QT100', 'TNSOL', 'QTNSOL', 'TN50', 'QTN50', 'TCHAUSSEE', 'QTCHAUSSEE',
                                      'U', 'QU', 'UN', 'QUN', 'QHUN', 'UX', 'QUX', 'QHUX', 'DHUMI40', 'QDHUMI40', 'DHUMI80', 'QDHUMI80', 'TSV',
                                      'QTSV', 'PMER', 'QPMER', 'PSTAT', 'QPSTAT', 'PMERMIN', 'QPMERMIN', 'VV', 'QVV', 'DVV200', 'QDVV200', 'WW',
                                      'QWW', 'NEIGETOT', 'QNEIGETOT', 'GLO', 'QGLO', 'GLO2', 'QGLO2', 'INS', 'QINS', 'INS2', 'QINS2', 'QDRR1']
                                      )
        
        # Grouping by date to sum and average values across all meteo stations
        self.weather["date"] = pd.to_datetime(self.weather["AAAAMMJJHH"], format="%Y%m%d%H")
        self.weather = self.weather.drop(columns=["AAAAMMJJHH", "HXI", "HXY", "HFXI3S", "HTN", "HTX", "HUN", "HUX"])

        self.avg = self.weather.drop(
            columns=['RR1']
            ).groupby("date").mean().reset_index()

        self.tot = self.weather.drop(
            columns=['TX', 'T', 'QTX', 'DG', 'QT', 'QHTN', 'QHTX', 'TN', 'QDG', 'QTN']
            ).groupby("date").sum().reset_index()

        self.weather = self.weather.drop_duplicates(subset=["date"])

        self.df = self.df.merge(self.weather, on="date", how="left")

        # Filling missing values for weather data with their mean as there are very few
        self.is_na = self.df.isna().sum()

        for col in self.is_na.index:
            if self.is_na[col] > 0:
                self.df[col] = self.df[col].fillna(self.df[col].mean())
        

        ##### Traffic #####
        
        # traffic.drop("Unnamed: 0", axis=1, inplace=True)
        traffic["date"] = pd.to_datetime(traffic["date"])

        traffic = traffic.drop_duplicates(subset=["date"])
        linear_int2 = traffic[["date", 'flow', 'occupation_rate']]
        linear_int2 = linear_int2.set_index("date")
        new_features2 = linear_int2.resample('1h').interpolate("linear")

        self.df = self.df.merge(new_features2, on="date", how="left")

        self.df["flow"] = self.df["flow"].fillna(self.df["flow"].mean())
        self.df["occupation_rate"] = self.df["occupation_rate"].fillna(self.df["occupation_rate"].mean())
        

    def date_features(self):
        # Create new date features
        self.df['day'] = self.df['date'].dt.day
        self.df['month'] = self.df['date'].dt.month
        self.df['year'] = self.df['date'].dt.year
        self.df['hour'] = self.df['date'].dt.hour
        self.df['weekday'] = self.df['date'].dt.weekday 
        
        self.df["weekend"] = self.df['weekday'].isin([5, 6])
        

    def additional_data(self, holidays):
    
        ###### Lockdown variable ######
        
        # Lockdown in Paris
        start_date = pd.to_datetime('2020-10-31')
        end_date = pd.to_datetime('2020-12-14')
        start_date1 = pd.to_datetime('2021-04-04')
        end_date1 = pd.to_datetime('2021-05-02')
        
        # Assign 1 if the date is within the specified range, otherwise 0
        self.df['lockdown'] = self.df['date'].apply(lambda x: 1 if ((start_date <= x <= end_date) or (start_date1 <= x <= end_date1)) else 0)
        

        ###### Curfews variable ######
        
        # Curfew date range (+1 hour because the data is the cumulative of the previous)
        curfew_periods = [
            (pd.to_datetime('2020-10-17 22:00'), pd.to_datetime('2020-10-29 07:00')),
            (pd.to_datetime('2020-12-16 21:00'), pd.to_datetime('2021-01-15 07:00')),
            (pd.to_datetime('2021-01-16 19:00'), pd.to_datetime('2021-03-20 07:00')),
            (pd.to_datetime('2021-03-21 20:00'), pd.to_datetime('2021-04-02 07:00')),
            (pd.to_datetime('2021-05-19 22:00'), pd.to_datetime('2021-06-08 07:00')),
            (pd.to_datetime('2021-06-09 23:00'), pd.to_datetime('2021-06-20 07:00'))
        ]

        # Create a new column 'Curfew' and assign 1 if the date is within the specified range, otherwise 0

        self.df['curfew'] = 0  # Initialize the Curfew column

        # Loop through curfew periods and set Curfew column accordingly
        for start_time, end_time in curfew_periods:
            mask = self.df[self.df['date'].between(start_time, end_time)]
            mask = mask[(mask["date"].dt.hour > start_time.hour) | (mask["date"].dt.hour < end_time.hour)]
            self.df.loc[mask.index, 'curfew'] = 1
            
        
        ###### Holidays variable ######

        # Consider only metropolitan areas
        holidays = holidays[holidays["Zones"].isin(["Zone C"])]

        # Consider only relevant years
        holidays = holidays[holidays["annee_scolaire"].isin(["2020-2021", "2021-2022"])]

        # Distinguish for holidays in Paris or not
        holidays['Holidays in Paris'] = 1

        holidays.drop(["Académies","Population", "Zones"], axis=1, inplace = True)

        # Convert to same date format
        holidays['Date de début'] = holidays['Date de début'].apply(lambda x: x[0:10]+' '+x[11:19])
        holidays['Date de fin'] = holidays['Date de fin'].apply(lambda x: x[0:10]+' '+x[11:19])

        holidays["Date de début"] = pd.to_datetime(holidays["Date de début"], format='%Y-%m-%d %H:%M:%S')
        holidays["Date de fin"] = pd.to_datetime(holidays["Date de fin"], format='%Y-%m-%d %H:%M:%S')

        # Remove holidays starting after final date of dataset
        holidays = holidays[holidays["Date de début"].dt.year != 2022]

        # Remove summer holidays
        holidays = holidays[holidays["Description"] != "Vacances d'Été"]
        holidays.drop(["Description","annee_scolaire"], axis=1, inplace=True)

        # Drop same holidays
        holidays = holidays.drop_duplicates(subset=['Date de début', 'Date de fin'], keep='first')

        # Create holidays date ranges 
        ranges = []
        for x in holidays[["Date de début","Date de fin"]].values:
            ranges.append(pd.date_range(x[0], x[1], freq="h"))
            
            
        def is_date_within_ranges(date, ranges):
            for date_range in ranges:
                if date_range[0] <= date <= date_range[-1]:
                    return 1
            return 0

        # Apply the function to create a new column indicating whether the date is within any holiday range
        self.df['holiday'] = self.df['date'].apply(lambda x: is_date_within_ranges(x, ranges))
        
        
        ###### Rush hour ########
        def is_rush_hour(x):
            if ((7 <= x.hour <= 9) or (16 <= x.hour <= 19)) and x.weekday != 5 and x.weekday != 6:
                return 1
            elif (x.weekday == 5 or x.weekday == 6) and (16 <= x.hour <= 19):
                return 1
            else:
                return 0
            
        self.df['rush_hour'] = self.df['date'].apply(lambda x: is_rush_hour(x))
        

    def output(self, test: bool=False):

        if test == False:

            # Using interpolation to fill in missing values before adding lag features
            linear_int = self.df[["date", "valeur_NO2", "valeur_CO", "valeur_O3", "valeur_PM10", "valeur_PM25"]]
            linear_int = linear_int.set_index("date")
            new_features = linear_int.resample('1h').interpolate("linear")

            self.df = self.df.merge(new_features, on="date", how="left")
            self.df = self.df.rename(columns={'valeur_NO2_y': 'valeur_NO2', 'valeur_CO_y': 'valeur_CO', 'valeur_O3_y': 'valeur_O3', 'valeur_PM10_y': 'valeur_PM10',
                'valeur_PM25_y': 'valeur_PM25'})
            self.df = self.df.drop(['valeur_NO2_x', 'valeur_CO_x', 'valeur_O3_x', 'valeur_PM10_x',
                'valeur_PM25_x'], axis=1)
            
            # Generating lag columns
            self.df["valeur_NO2_lag1"] = self.df['valeur_NO2'].shift(1)
            self.df["valeur_CO_lag1"] = self.df['valeur_CO'].shift(1)
            self.df["valeur_O3_lag1"] = self.df['valeur_O3'].shift(1)
            self.df["valeur_PM10_lag1"] = self.df['valeur_PM10'].shift(1)
            self.df["valeur_PM25_lag1"] = self.df['valeur_PM25'].shift(1)

            self.df["valeur_NO2_lag12"] = self.df['valeur_NO2'].shift(12)
            self.df["valeur_CO_lag12"] = self.df['valeur_CO'].shift(12)
            self.df["valeur_O3_lag12"] = self.df['valeur_O3'].shift(12)
            self.df["valeur_PM10_lag12"] = self.df['valeur_PM10'].shift(12)
            self.df["valeur_PM25_lag12"] = self.df['valeur_PM25'].shift(12)

            self.df["valeur_NO2_lag24"] = self.df['valeur_NO2'].shift(24)
            self.df["valeur_CO_lag24"] = self.df['valeur_CO'].shift(24)
            self.df["valeur_O3_lag24"] = self.df['valeur_O3'].shift(24)
            self.df["valeur_PM10_lag24"] = self.df['valeur_PM10'].shift(24)
            self.df["valeur_PM25_lag24"] = self.df['valeur_PM25'].shift(24)

            # Filling NaNs
            self.df['valeur_NO2_lag1'] = self.df['valeur_NO2_lag1'].fillna(self.df['valeur_NO2'])
            self.df['valeur_CO_lag1'] = self.df['valeur_CO_lag1'].fillna(self.df['valeur_CO'])
            self.df['valeur_O3_lag1'] = self.df['valeur_O3_lag1'].fillna(self.df['valeur_O3'])
            self.df['valeur_PM10_lag1'] = self.df['valeur_PM10_lag1'].fillna(self.df['valeur_PM10'])
            self.df['valeur_PM25_lag1'] = self.df['valeur_PM25_lag1'].fillna(self.df['valeur_PM25'])

            self.df['valeur_NO2_lag12'] = self.df['valeur_NO2_lag12'].fillna(self.df['valeur_NO2'])
            self.df['valeur_CO_lag12'] = self.df['valeur_CO_lag12'].fillna(self.df['valeur_CO'])
            self.df['valeur_O3_lag12'] = self.df['valeur_O3_lag12'].fillna(self.df['valeur_O3'])
            self.df['valeur_PM10_lag12'] = self.df['valeur_PM10_lag12'].fillna(self.df['valeur_PM10'])
            self.df['valeur_PM25_lag12'] = self.df['valeur_PM25_lag12'].fillna(self.df['valeur_PM25'])

            self.df['valeur_NO2_lag24'] = self.df['valeur_NO2_lag24'].fillna(self.df['valeur_NO2'])
            self.df['valeur_CO_lag24'] = self.df['valeur_CO_lag24'].fillna(self.df['valeur_CO'])
            self.df['valeur_O3_lag24'] = self.df['valeur_O3_lag24'].fillna(self.df['valeur_O3'])
            self.df['valeur_PM10_lag24'] = self.df['valeur_PM10_lag24'].fillna(self.df['valeur_PM10'])
            self.df['valeur_PM25_lag24'] = self.df['valeur_PM25_lag24'].fillna(self.df['valeur_PM25'])

            self.df.set_index("date", inplace=True)
        
        else:
            self.df.set_index("date", inplace=True)

        return self.df

In [100]:
class Preprocess:
    def __init__(self, df, test: bool=False):
        self.dfs = []
        for col in ['valeur_NO2', 'valeur_CO', 'valeur_O3', 'valeur_PM10', 'valeur_PM25']:
            if test == False:
                self.df = df
                self.X = self.df.drop(col, axis=1) 
                y = self.df[col]  

                self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, y, test_size=0.12, random_state=42, shuffle=False)

                self.pipe(test=False)
                self.dfs.append([self.X_train, self.X_test, self.y_train, self.y_test])
        
            else:
                self.df = df
                self.pipe(test=True)
                self.dfs.append([self.X_train, self.X_test, self.y_train, self.y_test])
        
        
    def pipe(self, test: bool=False):
        
        if test == False:
            categorical_features = ['year', 'holiday', 'rush_hour', 'lockdown', 'curfew']

            numerical_features = ['weekend', 'valeur_NO2_lag1', 'valeur_CO_lag1', 'valeur_O3_lag1',
                'valeur_PM10_lag1', 'valeur_PM25_lag1', 'valeur_NO2_lag12',
                'valeur_CO_lag12', 'valeur_O3_lag12', 'valeur_PM10_lag12',
                'valeur_PM25_lag12', 'valeur_NO2_lag24', 'valeur_CO_lag24',
                'valeur_O3_lag24', 'valeur_PM10_lag24', 'valeur_PM25_lag24', 'flow',
                'occupation_rate', 'ALTI', 'RR1', 'QRR1', 'T', 'QT', 'TN', 'QTN', 'QHTN', 'TX', 'QTX',
                'QHTX', 'DG', 'QDG']

            dates = ["weekday", "month", "hour", 'day']

            cyclical = CyclicalFeatures(variables=None, drop_original=True)

            full_pipeline = ColumnTransformer([
                    ("cat", OneHotEncoder(sparse_output=False), categorical_features),
                    ("num", MinMaxScaler(), numerical_features), 
                    ("dates", cyclical, dates),
                ], remainder='passthrough').set_output(transform='pandas')
        
            self.data_train_prepared = full_pipeline.fit_transform(self.X_train)
            self.data_test_prepared = full_pipeline.transform(self.X_test)

        else:
            self.data_pred = full_pipeline.fit_transform(self.df)
            self.data_pred['cat__curfew_1'] = 0
            self.data_pred['cat__holiday_1'] = 0
            self.data_pred['cat__lockdown_1'] = 0
            self.data_pred['cat__year_2020'] = 0
            self.data_pred['cat__year_2021'] = 0
            self.data_pred['cat__year_2022'] = 0
            self.data_pred['cat__year_2023'] = 0

In [96]:
eng = Eng(df, weather, traffic, holidays)
df = eng.output(test=False)
eng1 = Eng(final_test, weather, traffic, holidays)
df_test = eng1.output(test=True)
    
data1, data2, data3, data4, data5 = Preprocess(df, test=False)
data_test1, data_test2, data_test3, data_test4, data_test5 = Preprocess(df_test, test=True)

TypeError: cannot unpack non-iterable Preprocess object

In [101]:
eng = Eng(df, weather, traffic, holidays)
df = eng.output(test=False)
eng1 = Eng(final_test, weather, traffic, holidays)
df_test = eng1.output(test=True)
lags = [1, 12, 24]
for lag in lags:
    df_test[f'valeur_NO2_lag{lag}'] = df[f'valeur_NO2_lag{lag}'].iloc[-1]
    df_test[f'valeur_CO_lag{lag}'] = df[f'valeur_CO_lag{lag}'].iloc[-1]
    df_test[f'valeur_O3_lag{lag}'] = df[f'valeur_O3_lag{lag}'].iloc[-1]
    df_test[f'valeur_PM10_lag{lag}'] = df[f'valeur_PM10_lag{lag}'].iloc[-1]
    df_test[f'valeur_PM25_lag{lag}'] = df[f'valeur_PM25_lag{lag}'].iloc[-1]
data_pred = Preprocess(df_test, test=True).data_pred

UnboundLocalError: cannot access local variable 'full_pipeline' where it is not associated with a value

In [None]:
df_test.to_csv("../processed_data/test.csv")

Unnamed: 0_level_0,ALTI,RR1,QRR1,T,QT,TN,QTN,QHTN,TX,QTX,...,valeur_NO2_lag12,valeur_CO_lag12,valeur_O3_lag12,valeur_PM10_lag12,valeur_PM25_lag12,valeur_NO2_lag24,valeur_CO_lag24,valeur_O3_lag24,valeur_PM10_lag24,valeur_PM25_lag24
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-09-03 23:00:00,46,0.0,1.0,19.1,1.0,19.0,1.0,9.0,19.2,1.0,...,21.831528,0.182,49.2,7.1,3.1,21.831528,0.162,56.7,11.6,6.7
2024-09-04 00:00:00,46,0.0,1.0,19.0,1.0,18.9,1.0,9.0,19.1,1.0,...,21.831528,0.182,49.2,7.1,3.1,21.831528,0.162,56.7,11.6,6.7
2024-09-04 01:00:00,46,0.0,1.0,18.7,1.0,18.7,1.0,9.0,18.9,1.0,...,21.831528,0.182,49.2,7.1,3.1,21.831528,0.162,56.7,11.6,6.7
2024-09-04 02:00:00,46,0.0,1.0,18.3,1.0,18.3,1.0,9.0,18.7,1.0,...,21.831528,0.182,49.2,7.1,3.1,21.831528,0.162,56.7,11.6,6.7
2024-09-04 03:00:00,46,0.0,1.0,18.1,1.0,18.1,1.0,9.0,18.3,1.0,...,21.831528,0.182,49.2,7.1,3.1,21.831528,0.162,56.7,11.6,6.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-24 18:00:00,46,1.6,1.0,15.2,1.0,15.2,1.0,9.0,15.9,1.0,...,21.831528,0.182,49.2,7.1,3.1,21.831528,0.162,56.7,11.6,6.7
2024-09-24 19:00:00,46,0.0,1.0,15.2,1.0,15.1,1.0,9.0,15.2,1.0,...,21.831528,0.182,49.2,7.1,3.1,21.831528,0.162,56.7,11.6,6.7
2024-09-24 20:00:00,46,0.0,1.0,15.2,9.0,15.1,9.0,9.0,15.2,9.0,...,21.831528,0.182,49.2,7.1,3.1,21.831528,0.162,56.7,11.6,6.7
2024-09-24 21:00:00,46,0.0,1.0,14.9,1.0,14.9,1.0,9.0,15.2,1.0,...,21.831528,0.182,49.2,7.1,3.1,21.831528,0.162,56.7,11.6,6.7


In [None]:
data_train_prepared  = data.data_train_prepared 
y_train = data.y_train

data_test_prepared = data.data_test_prepared 
y_test = data.y_test

data_pred = data_test.data_pred

In [None]:
final_test = pd.read_csv("../data/test.csv", index_col=0, parse_dates=[0])

for i in [data1, data2, data3, data4, data5]:
    data_train_prepared  = i.data_train_prepared 
    y_train = i.y_train

    data_test_prepared = i.data_test_prepared 
    y_test = i.y_test

    data_pred = data_test.data_pred
    forest = CatBoostRegressor(iterations=1000, l2_leaf_reg=5, 
                            learning_rate=0.15, max_depth=10, subsample=0.6, colsample_bylevel=1, 
                            verbose=False)
    miaou = forest.fit(data_train_prepared, y_train)

    prediction = miaou.predict(data_pred)
    prediction[prediction < 0] = 0

    df_submit = pd.DataFrame(prediction, columns=["valeur_NO2", "valeur_CO", "valeur_O3", "valeur_PM10", "valeur_PM25"])

    final_test = pd.concat([final_test, df_submit], axis=1)

In [None]:
prediction = miaou.predict(data_pred)
prediction[prediction < 0] = 0

df_submit = pd.DataFrame(prediction, columns=["valeur_NO2", "valeur_CO", "valeur_O3", "valeur_PM10", "valeur_PM25"])

final_test = pd.read_csv("../data/test.csv", index_col=0)

df_submit.index = final_test.index
df_submit

In [35]:
df_submit.to_csv("../output/submission.csv")

In [None]:
prediction = miaou.predict(data_pred)
prediction[prediction < 0] = 0

print(f"The MAE is {mean_absolute_error(miaou.predict(data_train_prepared), y_train)} for the train set.")
print(f"The MAE is {mean_absolute_error(prediction, y_test)} for the test set.")

The MAE is 0.3778390200766095 for the train set.
The MAE is 2.1559772487021363 for the test set.


## With Prophet

In [37]:
import pandas as pd
import numpy as np
from prophet import Prophet

y_train = y_train.reset_index()

# Prepare the predictions for each target using Prophet
for target in y_train.columns:
    df_prophet = df.rename(columns={'date': 'ds', target: 'y'})
    
    # Initialize and fit the Prophet model
    model = Prophet()
    model.fit(df_prophet)

    # Create a DataFrame for future predictions (next 5 hours)
    future_dates = model.make_future_dataframe(periods=504, freq='h')

    # Make predictions
    forecast = model.predict(future_dates)

    # Display predictions
    print(f"\nPredicted Values for {target}:")
    print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(5))


  from .autonotebook import tqdm as notebook_tqdm
cmdstanpy  DEBUG cmd: where.exe tbb.dll
cwd: None


AttributeError: 'Prophet' object has no attribute 'stan_backend'

## With Darts

In [None]:
import pandas as pd
import numpy as np
from darts import TimeSeries
from darts.models import CatBoostModel
from darts.metrics import mae

# Convert to Darts TimeSeries for each target
series_1 = TimeSeries.from_dataframe(y_train, 'date', 'valeur_NO2')
series_2 = TimeSeries.from_dataframe(y_train, 'date', 'valeur_CO')
series_3 = TimeSeries.from_dataframe(y_train, 'date', 'valeur_O3')
series_4 = TimeSeries.from_dataframe(y_train, 'date', 'valeur_PM10')
series_5 = TimeSeries.from_dataframe(y_train, 'date', 'valeur_PM25')

# Initialize CatBoost model for each target
model_1 = CatBoostModel(lags=3)
model_2 = CatBoostModel(lags=3)
model_3 = CatBoostModel(lags=3)
model_4 = CatBoostModel(lags=3)
model_5 = CatBoostModel(lags=3)

# Fit the model for each target
model_1.fit(series_1)
model_2.fit(series_2)
model_3.fit(series_3)
model_4.fit(series_4)
model_5.fit(series_5)

# Perform recursive forecasting for each target
predictions_1 = model_1.predict(len(final_test))
predictions_2 = model_2.predict(len(final_test))
predictions_3 = model_3.predict(len(final_test))
predictions_4 = model_4.predict(len(final_test))
predictions_5 = model_5.predict(len(final_test))

In [42]:
pd.DataFrame([[predictions_1, predictions_2, predictions_3, predictions_4, predictions_5]],
             columns=["valeur_NO2", "valeur_CO", "valeur_O3", "valeur_PM10", "valeur_PM25"])

Unnamed: 0,valeur_NO2,valeur_CO,valeur_O3,valeur_PM10,valeur_PM25
0,"(((<TimeSeries (DataArray) (date: 1, component...","(((<TimeSeries (DataArray) (date: 1, component...","(((<TimeSeries (DataArray) (date: 1, component...","(((<TimeSeries (DataArray) (date: 1, component...","(((<TimeSeries (DataArray) (date: 1, component..."


In [62]:
predictions_1.to_csv("../output/pred1.csv")
predictions_2.to_csv("../output/pred2.csv")
predictions_3.to_csv("../output/pred3.csv")
predictions_4.to_csv("../output/pred4.csv")
predictions_5.to_csv("../output/pred5.csv")

In [70]:
pred1 = pd.read_csv("../output/pred1.csv", index_col=0)
pred2 = pd.read_csv("../output/pred2.csv", index_col=0)
pred3 = pd.read_csv("../output/pred3.csv", index_col=0)
pred4 = pd.read_csv("../output/pred4.csv", index_col=0)
pred5 = pd.read_csv("../output/pred5.csv", index_col=0)

In [72]:
final = pd.concat([pred1, pred2, pred3, pred4, pred5], axis=1)
final.index = final_test.index
final

Unnamed: 0_level_0,valeur_NO2,valeur_CO,valeur_O3,valeur_PM10,valeur_PM25
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-09-03 23,10.383597,0.180708,59.052518,12.104946,5.898594
2024-09-04 00,10.594473,0.183766,60.105916,12.525816,6.111461
2024-09-04 01,11.558665,0.187163,59.512192,12.950781,6.502704
2024-09-04 02,13.294573,0.188378,58.091384,13.522361,6.723683
2024-09-04 03,15.342283,0.190749,56.068238,14.022370,6.930514
...,...,...,...,...,...
2024-09-24 18,22.362755,0.198516,43.742501,15.092087,8.371878
2024-09-24 19,22.362755,0.198516,43.742501,15.092087,8.371878
2024-09-24 20,22.362755,0.198516,43.742501,15.092087,8.371878
2024-09-24 21,22.362755,0.198516,43.742501,15.092087,8.371878


In [73]:
final.to_csv("../output/submission.csv")

In [74]:
data_pred

Unnamed: 0_level_0,cat__year_2024,cat__holiday_0,cat__rush_hour_0,cat__rush_hour_1,cat__lockdown_0,cat__curfew_0,num__weekend,num__valeur_NO2_lag1,num__valeur_CO_lag1,num__valeur_O3_lag1,...,dates__hour_cos,dates__day_sin,dates__day_cos,cat__curfew_1,cat__holiday_1,cat__lockdown_1,cat__year_2020,cat__year_2021,cat__year_2022,cat__year_2023
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-09-03 23:00:00,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.000000,7.071068e-01,0.707107,0,0,0,0,0,0,0
2024-09-04 00:00:00,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.000000,8.660254e-01,0.500000,0,0,0,0,0,0,0
2024-09-04 01:00:00,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.962917,8.660254e-01,0.500000,0,0,0,0,0,0,0
2024-09-04 02:00:00,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.854419,8.660254e-01,0.500000,0,0,0,0,0,0,0
2024-09-04 03:00:00,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.682553,8.660254e-01,0.500000,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-24 18:00:00,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.203456,-2.449294e-16,1.000000,0,0,0,0,0,0,0
2024-09-24 19:00:00,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.460065,-2.449294e-16,1.000000,0,0,0,0,0,0,0
2024-09-24 20:00:00,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.682553,-2.449294e-16,1.000000,0,0,0,0,0,0,0
2024-09-24 21:00:00,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.854419,-2.449294e-16,1.000000,0,0,0,0,0,0,0
