# Bonus assignment

**Joris LIMONIER**

---

In this assignment, we try to forecast the monthly number of airline passengers over time. We will use the airline passengers dataset.


## Data Preprocessing

In [1]:
from pathlib import Path

import airline_passengers as ap
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from torch.utils.data import DataLoader, Dataset, TensorDataset
from tqdm import tqdm

# Make "plotly_white" the default template for every plotly figure created below.
pio.templates.default = "plotly_white"

2023-03-08 11:50:18.035266: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# (Re)load IPython's autoreload extension; unlike %load_ext this is safe to re-run.
%reload_ext autoreload
# Mode 2: reload all modules before each cell executes, so edits to local
# modules such as airline_passengers are picked up without a kernel restart.
%autoreload 2

### Load the dataset


In [3]:
# Read the passenger file into a date-indexed frame. The file's own header row
# (header=0) is replaced by the explicit column names given in `names`, and the
# passenger counts are stored as float32.
filepath = Path("airline_passenger.txt")
passengers = pd.read_csv(
    filepath,
    header=0,
    names=["date", "passengers"],
    index_col="date",
    parse_dates=["date"],
    dtype={"passengers": "float32"},
)
passengers

Unnamed: 0_level_0,passengers
date,Unnamed: 1_level_1
1949-01-01,112.0
1949-02-01,118.0
1949-03-01,132.0
1949-04-01,129.0
1949-05-01,121.0
...,...
1960-08-01,606.0
1960-09-01,508.0
1960-10-01,461.0
1960-11-01,390.0


### Split the dataset into train and test
We use 1/3 of the dataset for testing. The remaining 2/3 is further split into training and validation.

We also scale the data with respect to the training data as the validation and test data should not be used except for model evaluation and testing.

In [4]:
# NOTE(review): the sizes printed by this cell (15 validation rows out of 144)
# suggest val_size is applied to the whole dataset rather than to the training
# split only — confirm against ap.ttv_split.
val_size = 0.1  # proportion held out for validation
test_size = 1 / 3  # proportion of the data is used for testing

# Split the passenger frame into train / validation / test parts.
train, val, test = ap.ttv_split(df=passengers, val_size=val_size, test_size=test_size)

# Scale the data
# All three parts are scaled with respect to the training part (wrt=train), so
# no validation or test statistics are used for the scaling.
train, val, test = ap.scale_wrt(train, val, test, wrt=train, feature_range=(0, 1))
print(f"{len(train) = }, {len(val) = }, {len(test) = }")


len(train) = 81, len(val) = 15, len(test) = 48


We plot the train, validation and test sets with different colors.

In [5]:
# Draw the three splits; ap.plot_tts returns the figure, which the notebook
# renders because it is the cell's last expression.
ap.plot_tts(train=train, val=val, test=test)

## Model

We define the model and start training.

We define constants to use for training:
- `seq_length`: the number of time steps fed to the model, *i.e.* the number of previous months used to predict the next month
- `n_epochs`: the maximum number of epochs to train for
- `batch_size`: the batch size to use for training

In [6]:
# Hyper-parameters for the single-feature model.
seq_length = 1  # number of previous months used to predict the next one
n_epochs = 400  # maximum number of training epochs
batch_size = 8
seed = 42  # fixed so that re-running this cell starts from the same random state

# Seed the torch and numpy RNGs before anything random is created (weight
# initialisation, and any shuffling performed while batches are drawn), so the
# loss curves below can be reproduced on a fresh kernel.
torch.manual_seed(seed)
np.random.seed(seed)

# Wrap the three splits so the predictor can draw batches from them.
data_module = ap.PassengerDataModule(
    train=train,
    val=val,
    test=test,
    seq_length=seq_length,
    batch_size=batch_size,
)

# Create the model, optimizer and loss function
# input_size=1: the only feature at this stage is the passenger count itself.
lstm = ap.PassengerLSTM(input_size=1, hidden_size=50, num_layers=3, output_size=1)
optimizer = optim.AdamW(lstm.parameters(), lr=0.0002)
loss_fn = nn.MSELoss()


# Train the model
predictor = ap.PassengerPredictor(
    data_module=data_module, model=lstm, optimizer=optimizer, loss_fn=loss_fn
)
# train() returns the per-epoch training and validation losses.
train_losses, val_losses = predictor.train(
    model=lstm,
    optimizer=optimizer,
    loss_fn=loss_fn,
    n_epochs=n_epochs,
)


# Plot the losses
# Log-scaled y axis: the losses span roughly two orders of magnitude.
losses = pd.DataFrame({"train": train_losses, "val": val_losses})
px.line(losses, y=["train", "val"], title="Losses", log_y=True)


Epoch 0: train loss 0.1529, val loss 0.6783
Epoch 10: train loss 0.0938, val loss 0.5113
Epoch 20: train loss 0.0486, val loss 0.3035
Epoch 30: train loss 0.0446, val loss 0.2366
Epoch 40: train loss 0.0434, val loss 0.2236
Epoch 50: train loss 0.0419, val loss 0.2127
Epoch 60: train loss 0.0403, val loss 0.2006
Epoch 70: train loss 0.0384, val loss 0.1868
Epoch 80: train loss 0.0363, val loss 0.1710
Epoch 90: train loss 0.0337, val loss 0.1528
Epoch 100: train loss 0.0307, val loss 0.1320
Epoch 110: train loss 0.0272, val loss 0.1088
Epoch 120: train loss 0.0233, val loss 0.0839
Epoch 130: train loss 0.0191, val loss 0.0592
Epoch 140: train loss 0.0149, val loss 0.0383
Epoch 150: train loss 0.0115, val loss 0.0254
Epoch 160: train loss 0.0092, val loss 0.0218
Epoch 170: train loss 0.0081, val loss 0.0240
Epoch 180: train loss 0.0077, val loss 0.0273
Epoch 190: train loss 0.0075, val loss 0.0293
Epoch 200: train loss 0.0074, val loss 0.0301
Epoch 210: train loss 0.0074, val loss 0.0300

In [7]:
# Predictions for the test split, stored against the test index from position
# `seq_length` onwards.
y_pred = predictor.predict(model=lstm, dataloader=data_module.test_dataloader)
pred = pd.DataFrame(
    data=y_pred.flatten(),
    index=test.index[seq_length:],
    columns=["passengers"],
)

# Same for the validation split.
y_pred_val = predictor.predict(model=lstm, dataloader=data_module.val_dataloader)
pred_val = pd.DataFrame(
    data=y_pred_val.flatten(),
    index=val.index[seq_length:],
    columns=["passengers"],
)

# Overlay both prediction curves on the train/val/test figure.
test_trace = go.Scatter(x=pred.index, y=pred["passengers"], name="pred", mode="lines")
val_trace = go.Scatter(
    x=pred_val.index, y=pred_val["passengers"], name="pred_val", mode="lines"
)
fig = ap.plot_tts(train=train, val=val, test=test)
fig.add_trace(test_trace)
fig.add_trace(val_trace)

## Feature engineering

In [8]:
# Derive calendar features from the date index on a copy of the raw frame.
# season: 1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov;
# summer flags the rows whose season is 3.
passengers_augmented = passengers.assign(
    month=lambda df: df.index.month,
    year=lambda df: df.index.year,
    season=lambda df: df.index.month % 12 // 3 + 1,
    summer=lambda df: df["season"] == 3,
)

# One line per year, plotted against the month.
monthly_lines = px.line(
    passengers_augmented,
    x="month",
    y="passengers",
    color="year",
    title="Passengers by month",
)
monthly_lines.show()


In [9]:
# Make a df with the month and number of passengers
# Average every column per calendar month, then drop the averaged year column.
monthly_mean = passengers_augmented.groupby("month").mean()
passengers_month = monthly_mean.reset_index().drop(columns="year")

px.bar(
    passengers_month,
    x="month",
    y="passengers",
    title="Average passengers by month",
).show()


Plot the average number of passengers for each season.

In [10]:
# Average every column per season, keeping only the season label and the
# mean passenger count.
season_mean = passengers_augmented.groupby("season").mean().reset_index()
passengers_season = season_mean[["season", "passengers"]]

px.bar(
    passengers_season,
    x="season",
    y="passengers",
    title="Average passengers by season",
).show()


In [11]:
# Plot the correlation between the new features and the target
corr = passengers_augmented.corr().round(4)
# Use blue red color scale
fig = px.imshow(
  corr.values,
  color_continuous_scale="gray_r",
  color_continuous_midpoint=0,
  title="Correlation between features",
)

fig.update_xaxes(
  title="Features", ticktext=corr.columns, tickvals=np.arange(len(corr.columns))
)
fig.update_yaxes(
  title="Features", ticktext=corr.columns, tickvals=np.arange(len(corr.columns))
)
fig.show()


In [12]:
# NOTE(review): the sizes printed by this cell (15 validation rows out of 144)
# suggest val_size is applied to the whole dataset rather than to the training
# split only — confirm against ap.ttv_split.
val_size = 0.1  # proportion held out for validation
test_size = 1 / 3  # proportion of the data is used for testing

# NOTE: this rebinds train / val / test, replacing the single-feature splits
# created earlier with splits of the feature-augmented frame.
train, val, test = ap.ttv_split(df=passengers_augmented, val_size=val_size, test_size=test_size)


# Scale the data
# All three parts are scaled with respect to the training part (wrt=train).
train, val, test = ap.scale_wrt(train, val, test, wrt=train, feature_range=(0, 1))
print(f"{len(train) = }, {len(val) = }, {len(test) = }")


len(train) = 81, len(val) = 15, len(test) = 48


In [26]:
# Hyper-parameters for the multi-feature model.
seq_length = 1  # number of previous months used to predict the next one
n_epochs = 150  # maximum number of training epochs
batch_size = 2
seed = 42  # fixed so that re-running this cell starts from the same random state

# Seed the torch and numpy RNGs before anything random is created (weight
# initialisation, dropout masks, and any shuffling performed while batches are
# drawn), so the loss curves below can be reproduced on a fresh kernel.
torch.manual_seed(seed)
np.random.seed(seed)

# Wrap the augmented splits; the passenger count is the prediction target.
data_module = ap.PassengerDataModule(
    train=train,
    val=val,
    test=test,
    seq_length=seq_length,
    batch_size=batch_size,
    target_col="passengers",
)

# Create the model, optimizer and loss function
# input_size=5 matches the 5 columns of passengers_augmented
# (passengers, month, year, season, summer).
lstm = ap.PassengerLSTM(
    input_size=5, hidden_size=50, num_layers=3, output_size=1, dropout=0.2
)
optimizer = optim.AdamW(lstm.parameters(), lr=0.0001)
loss_fn = nn.MSELoss()


# Train the model
predictor = ap.PassengerPredictor(
    data_module=data_module, model=lstm, optimizer=optimizer, loss_fn=loss_fn
)
# train() returns the per-epoch training and validation losses.
train_losses, val_losses = predictor.train(
    model=lstm,
    optimizer=optimizer,
    loss_fn=loss_fn,
    n_epochs=n_epochs,
)


# Plot the losses
# Log-scaled y axis: the losses span more than an order of magnitude.
losses = pd.DataFrame({"train": train_losses, "val": val_losses})
px.line(losses, y=["train", "val"], title="Losses", log_y=True)


Epoch 0: train loss 0.0892, val loss 0.4780
Epoch 10: train loss 0.0492, val loss 0.2893
Epoch 20: train loss 0.0431, val loss 0.2133
Epoch 30: train loss 0.0380, val loss 0.1702
Epoch 40: train loss 0.0265, val loss 0.1161
Epoch 50: train loss 0.0167, val loss 0.0587
Epoch 60: train loss 0.0088, val loss 0.0253
Epoch 70: train loss 0.0062, val loss 0.0143
Epoch 80: train loss 0.0058, val loss 0.0128
Epoch 90: train loss 0.0055, val loss 0.0111
Epoch 100: train loss 0.0060, val loss 0.0108
Epoch 110: train loss 0.0070, val loss 0.0113
Epoch 120: train loss 0.0059, val loss 0.0113
Epoch 130: train loss 0.0056, val loss 0.0108
Epoch 140: train loss 0.0062, val loss 0.0113


In [27]:
# Predictions for the test split, stored against the test index from position
# `seq_length` onwards.
y_pred = predictor.predict(model=lstm, dataloader=data_module.test_dataloader)
pred = pd.DataFrame(
    data=y_pred.flatten(),
    index=test.index[seq_length:],
    columns=["passengers"],
)

# Same for the validation split.
y_pred_val = predictor.predict(model=lstm, dataloader=data_module.val_dataloader)
pred_val = pd.DataFrame(
    data=y_pred_val.flatten(),
    index=val.index[seq_length:],
    columns=["passengers"],
)

# Overlay both prediction curves on the train/val/test figure.
test_trace = go.Scatter(x=pred.index, y=pred["passengers"], name="pred", mode="lines")
val_trace = go.Scatter(
    x=pred_val.index, y=pred_val["passengers"], name="pred_val", mode="lines"
)
fig = ap.plot_tts(train=train, val=val, test=test)
fig.add_trace(test_trace)
fig.add_trace(val_trace)


## Evaluation
