This notebook visually compares the gap-filling techniques.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import torch

In [2]:
# First import the model
from modules.MLPstuff import MLP
from modules.util import grab_data, EBCDataset

In [3]:
# Read the merged CSV (NaNs included); grab_data hands back the feature frame,
# the label frame and the input/output dimensions used to build the model.
from columns import COLS_FEATURES, COLS_LABELS, COLS_TIME

input, target, dim_in, dim_out = grab_data(
    'data/data_merged_with_nans.csv',
    columns_data=COLS_FEATURES,
    columns_labels=COLS_LABELS,
    return_dataset=False,
)

# Features and labels side by side in one frame
data = pd.concat([input, target], axis="columns")

# (rows, columns) of features, labels and the combined frame, one per line
print(input.shape, target.shape, data.shape, sep="\n")

(11655, 12)
(11655, 2)
(11655, 14)


In [4]:
# Build the MLP and restore the trained weights from disk.
# The hidden-layer settings have to match the saved checkpoint, otherwise
# load_state_dict() rejects the state dict.
model = MLP(dim_in, dim_out, num_hidden_units=30, num_hidden_layers=4)

# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; the prediction cell calls .numpy() on the model output,
# which requires CPU tensors anyway.
model.load_state_dict(torch.load('model_saves/mlp_1.pth', map_location='cpu'))

# Switch to inference mode; as the last expression this also displays the architecture
model.eval()

MLP(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=12, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=30, bias=True)
    (3): ReLU()
    (4): Linear(in_features=30, out_features=30, bias=True)
    (5): ReLU()
    (6): Linear(in_features=30, out_features=30, bias=True)
    (7): ReLU()
    (8): Linear(in_features=30, out_features=30, bias=True)
    (9): ReLU()
    (10): Linear(in_features=30, out_features=2, bias=True)
  )
)

In [316]:
# Count the missing values in every feature column to see whether any input
# rows have to be dropped before prediction
nan_counts = input.isna().sum(axis=0)
print("NaNs per column:", nan_counts, sep="\n")

NaNs per column:
day                           0
waterPressureDeficit          0
month                         0
outgoingShortwaveRadiation    0
waterVaporPressure            0
soilHeatflux                  0
location                      0
airPressure                   0
year                          0
windSpeed                     0
30min                         0
incomingShortwaveRadiation    0
dtype: int64


In [317]:
# Rows to gap-fill: at least one label is missing ...
mask_nan = data[COLS_LABELS].isnull().any(axis="columns")
# ... while none of the features is missing
mask_not_nan = ~data[COLS_FEATURES].isnull().any(axis="columns")

# A row is predicted only when both conditions hold
combined_mask = mask_nan & mask_not_nan

# Feature columns of the selected rows, renumbered from 0, become the model input
input = data.loc[combined_mask, COLS_FEATURES].reset_index(drop=True)

In [318]:
# Rows with any missing label vs. rows actually selected for prediction
print(
    data.loc[data[COLS_LABELS].isnull().any(axis="columns")].shape,
    input.shape,
    sep="\n",
)

(0, 14)
(0, 12)


In [5]:
# Preview the first rows of the feature frame that is fed to the model
input.head()

Unnamed: 0,year,month,day,30min,location,incomingShortwaveRadiation,outgoingShortwaveRadiation,soilHeatflux,airPressure,waterPressureDeficit,waterVaporPressure,windSpeed
0,2023,2,16,23,0,408.582,87.725333,6.283667,996.140667,2.131392,6.857902,1.988667
1,2023,2,16,25,0,256.372,53.086667,11.058,995.548,2.456927,6.827809,1.910667
2,2023,2,16,26,0,371.893667,82.873667,10.838333,995.325667,2.997957,7.012585,1.548333
3,2023,2,16,27,0,342.721,78.387333,13.126333,994.989667,3.479701,6.975607,2.116
4,2023,2,16,28,0,279.880333,62.984667,14.722,994.96,3.488345,7.070665,1.428333


In [6]:
# Convert the feature frame to a float32 tensor for the model
# NOTE(review): the features are passed in exactly as loaded. If the model was
# trained on scaled/normalised inputs, the same scaling has to be applied here -
# TODO confirm against the training code.
input_tensor = torch.tensor(input.values, dtype=torch.float32)

# Inference only: no_grad() skips autograd bookkeeping, so the output can be
# converted straight back to a numpy array
with torch.no_grad():
    pred = model(input_tensor).numpy()

# Wrap the raw array in a frame with the label column names. Reuse the index of
# `input` so every prediction stays attached to the row it was computed from;
# a fresh 0..n-1 index would misalign rows in index-aligned operations
# (concat along columns, pred - target) whenever `input` has another index.
pred = pd.DataFrame(pred, columns=target.columns, index=input.index)

# Features and predicted labels side by side, aligned on the shared index
data_pred = pd.concat([input, pred], axis=1)


In [7]:
# Display the values of the tensor that is passed to the model
input_tensor.data

tensor([[2.0230e+03, 2.0000e+00, 1.6000e+01,  ..., 2.1314e+00, 6.8579e+00,
         1.9887e+00],
        [2.0230e+03, 2.0000e+00, 1.6000e+01,  ..., 2.4569e+00, 6.8278e+00,
         1.9107e+00],
        [2.0230e+03, 2.0000e+00, 1.6000e+01,  ..., 2.9980e+00, 7.0126e+00,
         1.5483e+00],
        ...,
        [2.0240e+03, 6.0000e+00, 3.0000e+01,  ..., 1.6388e+00, 1.5432e+01,
         2.9223e+00],
        [2.0240e+03, 6.0000e+00, 3.0000e+01,  ..., 1.8795e+00, 1.7002e+01,
         5.1367e-01],
        [2.0240e+03, 6.0000e+00, 3.0000e+01,  ..., 1.5369e+00, 1.5290e+01,
         2.8293e+00]])

In [282]:
# Sanity check: (number of rows, number of feature columns) of the model input
input_tensor.shape

torch.Size([11655, 12])

In [283]:
# Preview the first rows of the predicted labels
pred.head()

Unnamed: 0,H_orig,LE_orig
0,-402.012909,1124.145752
1,-405.99707,1138.849487
2,-404.942291,1130.25354
3,-406.366852,1134.293579
4,-407.984894,1140.052002


In [284]:
# Preview the first rows of the label frame loaded from the CSV, for comparison with the predictions
target.head()

Unnamed: 0,H_orig,LE_orig
0,93.7665,112.71
1,51.9651,30.423
2,69.53238,39.713
3,71.31974,39.382
4,48.84439,32.021


In [285]:
# Squared difference between predicted and loaded labels (aligned on index),
# then averaged per label column
df_test = pred.sub(target).pow(2)
df_test.mean(axis=0)

H_orig     1.884849e+05
LE_orig    1.069379e+06
dtype: float64

In [222]:
# Shapes of the merged prediction frame, the model input and the raw predictions
print(data_pred.shape, input.shape, pred.shape, sep="\n")

(15378, 14)
(15378, 12)
(15378, 2)


In [223]:
# Tag every predicted row with the colour it is drawn in
data_pred["pred"] = "red"

# Rows without a missing label are the original data; they are tagged blue
data_orig = data.loc[~mask_nan].reset_index(drop=True).assign(pred="blue")

# Stack original rows on top of predicted rows for plotting
data_plot = pd.concat([data_orig, data_pred], axis=0)

In [224]:
# Shapes of the predicted rows, the original rows and their concatenation
print(data_pred.shape, data_orig.shape, data_plot.shape, sep="\n")

(15378, 15)
(12040, 15)
(27418, 15)


In [225]:
# One frame per location id (0 -> data_plot_bg, 1 -> data_plot_gw), each ordered
# by its date/time columns
data_plot_bg, data_plot_gw = (
    data_plot.loc[data_plot["location"] == location_id].sort_values(
        by=['year', 'month', 'day', '30min']
    )
    for location_id in (0, 1)
)

In [226]:
# Number of predicted (red) and original (blue) rows in the location-0 frame
print(
    data_plot_bg.loc[data_plot_bg["pred"] == 'red'].shape,
    data_plot_bg.loc[data_plot_bg["pred"] == 'blue'].shape,
    sep="\n",
)

(9249, 15)
(5285, 15)


In [8]:
# Scatter plot of H_orig for location 0 in sorted (date/time) order.
# The 'pred' column holds each marker's colour: blue = original rows,
# red = model predictions.
# Requires data_plot_bg from the location-split cell, so the cells above must
# have been run in order on the current kernel.

# Plain row counter used as x-axis
time = np.arange(data_plot_bg.shape[0])

fig, ax = plt.subplots(figsize=(40, 5))
ax.scatter(time, data_plot_bg['H_orig'].to_numpy(), marker='x', c=data_plot_bg['pred'])

# A scatter coloured through `c=` creates no legend entries of its own, so add
# two empty proxy series that only carry the colour and the label
ax.scatter([], [], marker='x', color='blue', label='original')
ax.scatter([], [], marker='x', color='red', label='prediction')

ax.set_xlabel('row number (sorted by year, month, day, 30min)')
ax.set_ylabel('H_orig')
ax.set_title('H_orig at location 0: original vs. gap-filled values')
ax.legend();

NameError: name 'data_plot_bg' is not defined