# SAITS Imputation with PyPOTS

> Notebook adapted in parts from [PyPOTS](https://github.com/WenjieDu/PyPOTS/).

<center>
  <img src="https://raw.githubusercontent.com/WenjieDu/PyPOTS/main/docs/figs/PyPOTS%20logo.svg?sanitize=true" width="150" height="150" />
</center>

<details>
        <summary> Using your own dataset </summary>
<div>   

```python
# Install PyPOTS first: pip install pypots
from pypots.data import mcar, fill_nan_with_mask
from pypots.imputation import SAITS
from pypots.utils.metrics import cal_mae

# ❗️👀 Preprocess your data here to generate X, which should be a NumPy array
# X.shape should be [n_samples, T, D]

# hold out some observed values as ground truth
X_intact, X, missing_mask, indicating_mask = mcar(X, 0.1)
X = fill_nan_with_mask(X, missing_mask)

saits_base = SAITS(n_layers=2, d_model=256, d_inner=128, n_head=4, d_k=64, d_v=64, dropout=0.1, epochs=100)
saits_base.fit(X) # here use the whole set. You can also split X into train/val/test sets.
imputation = saits_base.impute(X)
mae = cal_mae(imputation, X_intact, indicating_mask)  # calculate mean absolute error on imputation
```
</div>
</details>


```
# Install PyPOTS first: pip install pypots  
from pypots.data import mcar, fill_nan_with_mask  
from pypots.imputation import SAITS  
from pypots.utils.metrics import cal_mae  

# ❗️👀 Preprocess your data here to generate X, which should be a NumPy array
# X.shape should be [n_samples, T, D]

# hold out some observed values as ground truth
X_intact, X, missing_mask, indicating_mask = mcar(X, 0.1)
X = fill_nan_with_mask(X, missing_mask)

saits_base = SAITS(n_layers=2, d_model=256, d_inner=128, n_head=4, d_k=64, d_v=64, dropout=0.1, epochs=100)
saits_base.fit(X) # here use the whole set. You can also split X into train/val/test sets.
imputation = saits_base.impute(X)
mae = cal_mae(imputation, X_intact, indicating_mask)  # calculate mean absolute error on imputation
```

In [15]:
# Install PyPOTS first: pip install pypots
import numpy as np
from sklearn.preprocessing import StandardScaler

from pypots.data import load_specific_dataset, mcar, masked_fill
from pypots.imputation import SAITS, BRITS
from pypots.utils.metrics import cal_mae

# PhysioNet Dataset
## Data loading and preprocessing

In [None]:
# Data preprocessing. Tedious, but PyPOTS can help. 🤓
# For datasets in the PyPOTS database, PyPOTS downloads and extracts them automatically.
data = load_specific_dataset('physionet_2012')
X = data['X']

# One sample per distinct RecordID.
record_ids = X['RecordID'].unique()
num_samples = len(record_ids)

print('num samples: ', num_samples)
print(X)

# Drop the identifier column, standardise every remaining column, then fold the
# rows into a 3-D array of shape (num_samples, 48, <inferred feature count>).
feature_values = X.drop(columns='RecordID').to_numpy()
X = StandardScaler().fit_transform(feature_values).reshape(num_samples, 48, -1)

# Hold out 10% of the observed values as ground truth for evaluation.
X_intact, X, missing_mask, indicating_mask = mcar(X, 0.1)

# Presumably writes NaN wherever `1 - missing_mask` is set, i.e. at the missing
# positions — confirm against the PyPOTS `masked_fill` documentation.
X = masked_fill(X, 1 - missing_mask, np.nan)


## Train SAITS model

In [2]:
# Model training. This is PyPOTS showtime. 💪
saits = SAITS(
    n_steps=48,
    n_features=37,
    n_layers=2,
    d_model=256,
    d_inner=128,
    n_head=4,
    d_k=64,
    d_v=64,
    dropout=0.1,
    epochs=10,
)

# Train the model. The whole dataset serves as the training set, because the
# held-out ground truth is not visible to the model.
saits.fit(X)

# Impute both the originally-missing and the artificially-missing values.
imputation = saits.impute(X)

# Mean absolute error on the ground truth (artificially-missing values only).
mae = cal_mae(imputation, X_intact, indicating_mask)

No given device, using default device: cuda:0
Model initialized successfully. Number of the trainable parameters: 1378358


NameError: name 'X' is not defined

# Toy dataset

The dataset used here was generated with a script from the **AFA** repository by *Henrik v. Kleist*.
## Imports

In [2]:
import pandas as pd
import gzip

## Load data and preprocess

In [28]:
# NOTE(review): absolute, user-specific path — the cell only runs on the original
# author's machine; consider a configurable data directory.
path_toydataset = '/home2/joshua.wendland/Documents/sepsis/toy_dataset/synthetic_ts_1/synthetic_ts_test_data_eav.csv.gz'

# compression=None makes pandas read the file without decompression despite the
# `.gz` suffix — presumably the file is stored uncompressed; confirm.
df = pd.read_csv(path_toydataset, compression=None)
df = df.sort_values(by=['id', 'time'], ascending=True, ignore_index=True)  # time was not sorted
#df = df.loc[df['id'] == 'id_90']
num_samples = len(df['id'].unique())
print('num samples: ', num_samples)

X = df.drop('id', axis = 1)
# Look up where the time column sits instead of assuming it is column 0, so the
# un-normalised time values are never written over a different feature.
time_col = X.columns.get_loc('time')
X = StandardScaler().fit_transform(X.to_numpy())
X[:, time_col] = df['time'].to_numpy() # time should not be normalized
# Fold the rows into (num_samples, 50, <inferred feature count>).
X = X.reshape(num_samples, 50, -1)
X_intact, X, missing_mask, indicating_mask = mcar(X, 0.5) # hold out 50% observed values as ground truth
# Presumably writes NaN wherever `1 - missing_mask` is set — confirm against the
# PyPOTS `masked_fill` documentation.
X = masked_fill(X, 1 - missing_mask, np.nan)

num samples:  10


# Model Training
### SAITS

In [14]:
# Model training. This is PyPOTS showtime. 💪
saits = SAITS(
    n_steps=50,
    n_features=6,
    n_layers=2,
    d_model=256,
    d_inner=128,
    n_head=4,
    d_k=64,
    d_v=64,
    dropout=0.0,
    epochs=200,
    patience=30,
)

# Send the training logs to TensorBoard under ./runs/saits/.
saits.save_logs_to_tensorboard(saving_path='./runs/saits/', title='test')

# Train the model. The whole dataset serves as the training set, because the
# held-out ground truth is not visible to the model.
saits.fit(X)

# Impute both the originally-missing and the artificially-missing values.
imputation = saits.impute(X)

# Mean absolute error on the ground truth (artificially-missing values only).
mae = cal_mae(imputation, X_intact, indicating_mask)

No given device, using default device: cuda:0
Model initialized successfully. Number of the trainable parameters: 1326476
epoch 0: training loss 7.7477
epoch 1: training loss 8.0805
epoch 2: training loss 7.5872
epoch 3: training loss 8.1251
epoch 4: training loss 5.4035
epoch 5: training loss 4.5630
epoch 6: training loss 6.8829
epoch 7: training loss 4.7526
epoch 8: training loss 5.6988
epoch 9: training loss 4.8192
epoch 10: training loss 4.7027
epoch 11: training loss 6.8866
epoch 12: training loss 5.9552
epoch 13: training loss 5.7504
epoch 14: training loss 7.0227
epoch 15: training loss 5.7542
epoch 16: training loss 5.0919
epoch 17: training loss 5.5334
epoch 18: training loss 4.2338
epoch 19: training loss 4.1536
epoch 20: training loss 4.4698
epoch 21: training loss 4.1121
epoch 22: training loss 3.9592
epoch 23: training loss 3.4621
epoch 24: training loss 3.8195
epoch 25: training loss 3.7491
epoch 26: training loss 3.6877
epoch 27: training loss 3.6544
epoch 28: training l

In [31]:
# NOTE(review): bare expression that is not the last statement of the cell, so
# its value is evaluated but not displayed — looks like leftover exploration.
saits.logger
# NOTE(review): `tensorboard` is not referenced anywhere else in the visible
# notebook — presumably leftover; if needed, it belongs in the imports cell.
import tensorboard


        n_steps,
        n_features,
        rnn_hidden_size,
        learning_rate=1e-3,
        epochs=100,
        patience=10,
        batch_size=32,
        weight_decay=1e-5,
        device=None,

### BRITS



In [29]:
# BRITS baseline on the toy dataset.
# NOTE(review): learning_rate=10e-3 evaluates to 0.01 — confirm 1e-3 was not intended.
brits = BRITS(
    n_steps=50,
    n_features=6,
    rnn_hidden_size=64,
    learning_rate=10e-3,
    epochs=1000,
    patience=10,
)

# Send the training logs to TensorBoard under ./runs/brits/.
brits.save_logs_to_tensorboard(saving_path='./runs/brits/', title='test')

# Train the model. The whole dataset serves as the training set, because the
# held-out ground truth is not visible to the model.
brits.fit(X)
# imputation = brits.impute(X)  # impute the originally-missing values and artificially-missing values
# mae = cal_mae(imputation, X_intact, indicating_mask)  # calculate mean absolute error on the ground truth (artificially-missing values)


No given device, using default device: cuda:0
Model initialized successfully. Number of the trainable parameters: 41936
epoch 0: training loss 11.1776
epoch 1: training loss 10.9682
epoch 2: training loss 10.7608
epoch 3: training loss 10.5500
epoch 4: training loss 10.3391
epoch 5: training loss 10.1236
epoch 6: training loss 9.9127
epoch 7: training loss 9.7042
epoch 8: training loss 9.5074
epoch 9: training loss 9.3191
epoch 10: training loss 9.1319
epoch 11: training loss 8.9550
epoch 12: training loss 8.7873
epoch 13: training loss 8.6238
epoch 14: training loss 8.4655
epoch 15: training loss 8.3141
epoch 16: training loss 8.1671
epoch 17: training loss 8.0265
epoch 18: training loss 7.8895
epoch 19: training loss 7.7559
epoch 20: training loss 7.6280
epoch 21: training loss 7.5078
epoch 22: training loss 7.3886
epoch 23: training loss 7.2758
epoch 24: training loss 7.1706
epoch 25: training loss 7.0697
epoch 26: training loss 6.9794
epoch 27: training loss 6.8964
epoch 28: traini

<pypots.imputation.brits.BRITS at 0x7f0d00ea5d00>