In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from torchinfo import summary

class Conv1dAutoencoder(nn.Module):
    """1-D convolutional autoencoder with a fully connected bottleneck.

    The encoder applies two stride-2 ``Conv1d`` layers, the result is
    flattened and projected to ``latent_dim`` values by ``fc1``. The decoder
    mirrors this with ``fc2`` followed by two stride-2 ``ConvTranspose1d``
    layers and a final ``Sigmoid``; the output is cropped to ``input_length``.

    Args:
        input_length: Number of time steps in each input window.
        latent_dim: Size of the bottleneck vector.
    """

    def __init__(self, input_length, latent_dim):
        super().__init__()
        self.input_length = input_length
        self.latent_dim = latent_dim

        # Encoder: each stride-2 convolution roughly halves the length.
        self.encoder = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=16, out_channels=8, kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
        )

        # Run a zero-filled probe through the encoder to learn the shape
        # that the bottleneck layers have to flatten and restore.
        with torch.no_grad():
            probe = self.encoder(torch.zeros(1, 1, self.input_length))
        _, channels, steps = probe.shape
        self._flattened_size = channels * steps

        # Bottleneck: flattened features -> latent vector -> flattened features.
        self.fc1 = nn.Linear(self._flattened_size, self.latent_dim)
        self.fc2 = nn.Linear(self.latent_dim, self._flattened_size)

        # Decoder: each stride-2 transposed convolution doubles the length.
        self.decoder = nn.Sequential(
            nn.Unflatten(dim=1, unflattened_size=(channels, steps)),
            nn.ConvTranspose1d(
                in_channels=channels, out_channels=16,
                kernel_size=3, stride=2, padding=1, output_padding=1,
            ),
            nn.ReLU(inplace=True),
            nn.ConvTranspose1d(
                in_channels=16, out_channels=1,
                kernel_size=3, stride=2, padding=1, output_padding=1,
            ),
            nn.Sigmoid(),
        )

    def encode(self, x):
        """Map a ``(batch, 1, input_length)`` tensor to ``(batch, latent_dim)``."""
        features = self.encoder(x)
        flat = features.view(features.size(0), -1)
        return self.fc1(flat)

    def decode(self, z):
        """Map ``(batch, latent_dim)`` codes back to ``(batch, 1, input_length)``."""
        expanded = self.fc2(z)
        upsampled = self.decoder(expanded)
        # The decoder may overshoot the original length; keep the leading part.
        return upsampled[..., :self.input_length]

    def forward(self, x):
        """Return ``(reconstruction, latent_code)`` for the input batch."""
        latent = self.encode(x)
        reconstruction = self.decode(latent)
        return reconstruction, latent

# helper for file size

def get_size(path):
    """Return the size of ``path`` in bytes, or 0 if it cannot be determined.

    Args:
        path: File system path to inspect.

    Returns:
        The size reported by ``os.path.getsize``, or 0 when the path is
        missing, inaccessible, or not a valid path value.
    """
    try:
        return os.path.getsize(path)
    except (OSError, TypeError, ValueError):
        # Best-effort helper: a missing/unreadable file (OSError) or an
        # unusable path value (TypeError/ValueError) counts as size 0.
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        return 0

# sequence creation

def create_sequences(values, labels, window_size, step=1):
    """Cut a series into fixed-length sliding windows with per-window labels.

    A window is labelled 1 when any of the labels it covers is truthy,
    otherwise 0.

    Args:
        values: Sequence of observations (anything ``np.asarray`` accepts).
        labels: Per-observation anomaly flags aligned with ``values``.
        window_size: Number of consecutive observations per window (>= 1).
        step: Distance between the starts of consecutive windows (>= 1).

    Returns:
        A tuple ``(windows, window_labels)``. ``windows`` has shape
        ``(n_windows, window_size, ...)`` and ``window_labels`` is an integer
        array of shape ``(n_windows,)``. When the series is shorter than
        ``window_size`` both arrays are empty but keep those shapes.

    Raises:
        ValueError: If ``window_size`` or ``step`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    if step < 1:
        raise ValueError(f"step must be >= 1, got {step}")

    values = np.asarray(values)
    labels = np.asarray(labels)
    starts = range(0, len(values) - window_size + 1, step)

    if len(starts) == 0:
        # Keep the window axis so callers can index/concatenate uniformly.
        empty_windows = np.empty((0, window_size) + values.shape[1:], dtype=values.dtype)
        return empty_windows, np.empty(0, dtype=int)

    seqs = np.array([values[i:i + window_size] for i in starts])
    seq_labels = np.array([int(labels[i:i + window_size].any()) for i in starts])
    return seqs, seq_labels

if __name__ == '__main__':
    # End-to-end script: load the benchmark CSVs, window them, train the
    # autoencoder on the training windows, then save the latent codes and
    # compare file sizes of the raw and encoded training data.

    # Seed torch so weight initialisation and DataLoader shuffling are
    # repeatable, matching the fixed random_state used by the sklearn steps.
    torch.manual_seed(42)

    # directories
    if os.name == 'nt':
        data_dir = 'C:/Users/jed95/Documents/GitHub/anomaly_detection/dataset/yahoo_s5/A2Benchmark/'
    else:
        data_dir = '/home/adlink3/Downloads/yahoo_s5/A2Benchmark/'
    window_size = 10
    # NOTE(review): latent_dim equals window_size, so every 10-value window is
    # encoded as 10 values and compressed.npy ends up the same size as
    # X_train_t.npy (ratio 1.0). Choose latent_dim < window_size to actually
    # compress; left unchanged here because it is a modelling decision.
    latent_dim = 10

    # load csvs -- sorted because os.listdir returns entries in arbitrary
    # order, which would otherwise make the seeded splits machine-dependent
    files = sorted(os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.csv'))
    if not files:
        raise FileNotFoundError(f'No .csv files found in {data_dir}')
    dfs = [pd.read_csv(f) for f in files]
    df = pd.concat(dfs, ignore_index=True)
    # Remember which file each row came from (before rows are dropped) so the
    # windows below can be built per file.
    series_id = np.repeat(np.arange(len(dfs)), [len(d) for d in dfs])
    series_id = series_id[df.notna().all(axis=1).values]
    df.dropna(inplace=True)
    scaler = MinMaxScaler()
    df['value'] = scaler.fit_transform(df['value'].values.reshape(-1,1))

    # sequences -- built file by file so that no window mixes the end of one
    # series with the start of the next
    all_values = df['value'].values
    all_flags = df['is_anomaly'].values
    boundaries = np.flatnonzero(np.diff(series_id)) + 1
    X_parts, y_parts = [], []
    for part_values, part_flags in zip(np.split(all_values, boundaries),
                                       np.split(all_flags, boundaries)):
        X_part, y_part = create_sequences(part_values, part_flags, window_size)
        if len(X_part):  # skip files shorter than one window
            X_parts.append(X_part)
            y_parts.append(y_part)
    if not X_parts:
        raise ValueError(f'No file in {data_dir} has at least {window_size} usable rows')
    X = np.concatenate(X_parts, axis=0)
    y = np.concatenate(y_parts, axis=0)

    # split normals and anomalies
    normals = np.where(y==0)[0]
    anomalies = np.where(y==1)[0]
    Xn, yn = X[normals], y[normals]
    Xa, ya = X[anomalies], y[anomalies]
    # train/val/test splits (40% / 30% / 30% within each group)
    Xn_train, Xn_tmp, yn_train, yn_tmp = train_test_split(Xn, yn, test_size=0.6, random_state=42, stratify=yn)
    Xn_val, Xn_test, yn_val, yn_test   = train_test_split(Xn_tmp, yn_tmp, test_size=0.5, random_state=42, stratify=yn_tmp)
    Xa_train, Xa_tmp, ya_train, ya_tmp = train_test_split(Xa, ya, test_size=0.6, random_state=42, stratify=ya)
    Xa_val, Xa_test, ya_val, ya_test   = train_test_split(Xa_tmp, ya_tmp, test_size=0.5, random_state=42, stratify=ya_tmp)
    # combine
    X_train = np.concatenate([Xn_train, Xa_train], axis=0)
    y_train = np.concatenate([yn_train, ya_train], axis=0)
    X_val   = np.concatenate([Xn_val,   Xa_val],   axis=0)
    y_val   = np.concatenate([yn_val,   ya_val],   axis=0)
    X_test  = np.concatenate([Xn_test,  Xa_test],  axis=0)
    y_test  = np.concatenate([yn_test,  ya_test],  axis=0)
    print("Training data shape:", X_train.shape, y_train.shape)
    print("Validation data shape:", X_val.shape, y_val.shape)
    print("Test data shape:", X_test.shape, y_test.shape)
    # shuffle
    X_train, y_train = shuffle(X_train, y_train, random_state=42)
    X_val,   y_val   = shuffle(X_val,   y_val,   random_state=42)
    X_test,  y_test  = shuffle(X_test,  y_test,  random_state=42)

    # tensors -- unsqueeze adds the channel axis expected by Conv1d
    X_train_t = torch.tensor(X_train).unsqueeze(1).float()
    y_train_t = torch.tensor(y_train).long()
    X_val_t   = torch.tensor(X_val).unsqueeze(1).float()
    y_val_t   = torch.tensor(y_val).long()
    X_test_t  = torch.tensor(X_test).unsqueeze(1).float()
    y_test_t  = torch.tensor(y_test).long()

    # save uncompressed (explicit conversion instead of relying on np.save
    # coercing the tensor)
    np.save('X_train_t.npy', X_train_t.numpy())
    size_un = get_size('X_train_t.npy')

    # dataloader
    train_ds = torch.utils.data.TensorDataset(X_train_t)
    loader = torch.utils.data.DataLoader(train_ds, batch_size=64, shuffle=True)

    # model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = Conv1dAutoencoder(input_length=window_size, latent_dim=latent_dim).to(device)
    opt = optim.Adam(model.parameters(), lr=1e-4)
    loss_fn = nn.MSELoss()

    # train -- plain reconstruction objective, labels are not used
    model.train()
    for ep in range(5):
        tot = 0
        for bx, in loader:
            bx = bx.to(device)
            recon, _ = model(bx)
            loss = loss_fn(recon, bx)
            opt.zero_grad()
            loss.backward()
            opt.step()
            tot += loss.item()
        print(f'Epoch {ep+1} Loss {tot/len(loader):.4f}')

    # infer -- encode the whole training set in one pass
    model.eval()
    with torch.no_grad():
        recon, z = model(X_train_t.to(device))
        z = z.cpu().numpy()
    # Uncomment for a layer-by-layer overview of the model:
    # print(summary(model, input_size=(32, 1, window_size)))

    # save compressed and append labels
    np.save('compressed.npy', z)
    dfz = pd.DataFrame(z, columns=[f'z{i}' for i in range(z.shape[1])])
    dfz['label'] = y_train
    dfz.to_csv('compressed_with_labels.csv', index=False)

    # sizes
    s_c_np = get_size('compressed.npy')
    s_u = size_un
    ratio = s_c_np / s_u if s_u else float('nan')
    report = f"Uncompressed: {s_u} bytes\nCompressed: {s_c_np} bytes\nRatio: {ratio:.3f}\n"
    print(report)



Training data shape: (56836, 10) (56836,)
Validation data shape: (42627, 10) (42627,)
Test data shape: (42628, 10) (42628,)
Epoch 1 Loss 0.1097
Epoch 2 Loss 0.0046
Epoch 3 Loss 0.0012
Epoch 4 Loss 0.0006
Epoch 5 Loss 0.0005
Uncompressed: 2273568 bytes
Compressed: 2273568 bytes
Ratio: 1.000

