In [None]:
# Mount Google Drive into the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Diffusion Model for Sensor Dataset Imputation
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from tqdm import tqdm
import matplotlib.pyplot as plt
import zipfile
from sklearn.preprocessing import StandardScaler

In [None]:
# Custom Dataset for Sensor Data
class SensorDataset(Dataset):
    """Row-indexed dataset built from per-sensor CSV files stacked side by side.

    Each file ``ProcessVar{i}.csv`` (``i = 1 .. num_sensors``) inside ``data_dir``
    is read without a header row and transposed; the transposed matrices are then
    concatenated column-wise. One dataset item is one row of the combined matrix.

    Args:
        data_dir: Directory that contains the ``ProcessVar{i}.csv`` files.
        transform: Optional callable applied to a row before it is returned
            (for example ``torch.tensor``).
        num_sensors: How many sensor files to load, numbered from 1.
            Defaults to 63, the count the loader was originally hard-coded for.

    Raises:
        ValueError: If ``num_sensors`` is smaller than 1, or if the sensor files
            do not all produce the same number of rows after transposing.
        FileNotFoundError: If an expected sensor file does not exist.
    """

    def __init__(self, data_dir, transform=None, num_sensors=63):
        if num_sensors < 1:
            raise ValueError(f"num_sensors must be at least 1, got {num_sensors}")
        self.data_dir = data_dir
        self.transform = transform
        self.num_sensors = num_sensors

        # Load and process all sensor data
        self.data = self._load_data()

    def _load_data(self):
        """Read every sensor file and return the column-wise concatenation.

        The original author's notes describe each transposed file as 300 x 1200
        and the combined matrix as 300 x 75,600 for 63 sensors; the code itself
        does not enforce those sizes, only that row counts agree across files.
        """
        all_data = []
        for i in range(1, self.num_sensors + 1):
            file_path = os.path.join(self.data_dir, f"ProcessVar{i}.csv")
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File not found: {file_path}")
            # Transpose so that the file's columns become dataset rows.
            sensor_data = pd.read_csv(file_path, header=None).values.T
            # Fail with a message that names the offending file instead of the
            # generic shape error np.hstack would raise later.
            if all_data and sensor_data.shape[0] != all_data[0].shape[0]:
                raise ValueError(
                    f"Row count mismatch in {file_path}: expected "
                    f"{all_data[0].shape[0]}, got {sensor_data.shape[0]}"
                )
            all_data.append(sensor_data)
        return np.hstack(all_data)  # Combine all sensors column-wise

    def __len__(self):
        """Return the number of rows in the combined matrix."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return row ``idx`` of the combined matrix, transformed if requested."""
        sample = self.data[idx, :]
        # Compare against None explicitly: a callable that happens to be falsy
        # (e.g. an empty container-like transform) must still be applied.
        if self.transform is not None:
            sample = self.transform(sample)
        return sample

In [None]:
# Define the 1D UNet for the diffusion process
class UNet1D(nn.Module):
    """Small 1D convolutional network used as the noise predictor.

    The network is two ``Conv1d`` layers followed by two ``ConvTranspose1d``
    layers, all with kernel size 3, stride 1 and padding 1, so the sequence
    length is preserved end to end. There is no down/up-sampling and there are
    no skip connections, and ``forward`` receives no timestep argument.

    Args:
        input_dim: Accepted for interface compatibility; no layer depends on it.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Shared hyper-parameters for every layer (length-preserving convolutions).
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
        # Layers are created in the same order as before so that seeded
        # initialisation and state_dict keys stay identical.
        self.encoder1 = nn.Conv1d(1, 32, **conv_kwargs)
        self.encoder2 = nn.Conv1d(32, 64, **conv_kwargs)
        self.decoder1 = nn.ConvTranspose1d(64, 32, **conv_kwargs)
        self.decoder2 = nn.ConvTranspose1d(32, 1, **conv_kwargs)

    def forward(self, x):
        """Map a ``(batch, length)`` tensor to a tensor of the same shape."""
        hidden = x.unsqueeze(1)  # insert the single channel axis Conv1d expects
        for layer in (self.encoder1, self.encoder2, self.decoder1):
            hidden = F.relu(layer(hidden))
        # The last layer is linear (no activation); drop the channel axis again.
        return self.decoder2(hidden).squeeze(1)

In [None]:
# Noise schedule for diffusion
def linear_beta_schedule(timesteps, beta_start=0.0001, beta_end=0.02):
    """Return a linearly spaced beta (noise variance) schedule.

    Args:
        timesteps: Number of diffusion steps, i.e. the length of the schedule.
        beta_start: Beta value of the first step. Defaults to 0.0001.
        beta_end: Beta value of the last step. Defaults to 0.02.

    Returns:
        A 1-D tensor of length ``timesteps`` running from ``beta_start`` to
        ``beta_end`` inclusive.
    """
    return torch.linspace(beta_start, beta_end, timesteps)


In [None]:
# Forward Diffusion Process
def forward_diffusion(x0, t, noise_schedule):
    """Draw a noised sample ``x_t`` from clean data ``x0`` in closed form.

    Uses the DDPM forward marginal

        x_t = sqrt(alpha_bar_t) * x0 + sqrt(1 - alpha_bar_t) * noise,

    where ``alpha_bar_t`` is the product of ``(1 - beta_s)`` over the first ``t``
    entries of ``noise_schedule``. The previous implementation took the product
    of the betas themselves, which is vanishingly small and therefore left
    ``x_t`` almost identical to ``x0`` at every timestep.

    Args:
        x0: Clean data tensor.
        t: Integer timestep; the first ``t`` betas of the schedule are used.
            ``t = 0`` returns ``x0`` unchanged.
        noise_schedule: 1-D tensor of betas.

    Returns:
        Tuple ``(x_t, noise)`` where ``noise`` is the standard-normal sample that
        was mixed in (the regression target for the noise predictor).
    """
    noise = torch.randn_like(x0)
    # alpha_bar_t = prod_{s<=t} (1 - beta_s): fraction of signal variance kept.
    alpha_cumprod = torch.prod(1.0 - noise_schedule[:t])
    sqrt_alpha_cumprod = torch.sqrt(alpha_cumprod)
    sqrt_one_minus_alpha_cumprod = torch.sqrt(1.0 - alpha_cumprod)
    return sqrt_alpha_cumprod * x0 + sqrt_one_minus_alpha_cumprod * noise, noise


In [None]:
# Loss Function for Reverse Process
class DiffusionLoss(nn.Module):
    """Training objective: mean squared error between predicted and true noise."""

    def __init__(self):
        super().__init__()
        # Default nn.MSELoss, i.e. the mean over all elements.
        self.mse = nn.MSELoss()

    def forward(self, predicted_noise, true_noise):
        """Return the scalar MSE of ``predicted_noise`` against ``true_noise``."""
        loss_value = self.mse(predicted_noise, true_noise)
        return loss_value

In [None]:
# Training Loop for Diffusion Model
def train_diffusion(model, dataloader, optimizer, loss_fn, timesteps, noise_schedule, epochs):
    """Train ``model`` to predict the noise added by the forward diffusion.

    For every batch one timestep ``t`` is drawn uniformly from ``1..timesteps``
    and shared by the whole batch; the batch is noised with
    ``forward_diffusion`` and the model output is regressed onto the noise.

    Args:
        model: Noise predictor; called with the noised batch only.
        dataloader: Iterable yielding batches of clean data.
        optimizer: Optimizer over ``model``'s parameters.
        loss_fn: Callable ``(predicted_noise, true_noise) -> scalar loss``.
        timesteps: Number of diffusion steps (upper bound for the sampled ``t``).
        noise_schedule: 1-D tensor of betas passed to ``forward_diffusion``.
        epochs: Number of passes over ``dataloader``.

    Returns:
        List with the mean batch loss of each epoch (NaN for an epoch in which
        the dataloader yielded no batches).
    """
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        # Count batches while iterating instead of calling len(dataloader):
        # this works for loaders without __len__ and avoids dividing by zero
        # when the loader is empty.
        num_batches = 0
        for x0 in dataloader:
            optimizer.zero_grad()
            t = torch.randint(1, timesteps + 1, (1,)).item()
            noisy_x, noise = forward_diffusion(x0.float(), t, noise_schedule)
            predicted_noise = model(noisy_x.float())
            loss = loss_fn(predicted_noise, noise)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch + 1}, Loss: {mean_loss:.4f}")
    return epoch_losses

In [None]:
# Generate Missing Data with Diffusion Model
def impute_missing_data(model, corrupted_data, timesteps, noise_schedule):
    """Run the reverse (denoising) chain starting from ``corrupted_data``.

    At each step ``t = timesteps .. 1`` the model's noise prediction ``eps`` is
    removed using the DDPM posterior-mean update

        x_{t-1} = (x_t - beta_t / sqrt(1 - alpha_bar_t) * eps) / sqrt(1 - beta_t),

    with ``alpha_bar_t = prod_{s<=t} (1 - beta_s)``. The previous implementation
    scaled ``eps`` by ``beta_t`` alone, omitting the ``1 / sqrt(1 - alpha_bar_t)``
    factor. No sampling noise is added between steps, so the result is
    deterministic for a given model and input.

    NOTE(review): every entry of the input is updated, including ones that were
    never missing; this function receives no mask, so callers that want to keep
    observed values must merge the result with their own mask.

    Args:
        model: Noise predictor; switched to eval mode and called with the
            current estimate only.
        corrupted_data: Tensor to start the reverse chain from. Not modified.
        timesteps: Number of reverse steps to run.
        noise_schedule: 1-D tensor of betas with at least ``timesteps`` entries.

    Returns:
        Float tensor with the same shape as ``corrupted_data``.
    """
    model.eval()
    generated_data = corrupted_data.clone().float()
    # alpha_bar for every step, computed once instead of inside the loop.
    alpha_cumprod = torch.cumprod(1.0 - noise_schedule, dim=0)
    with torch.no_grad():
        for t in range(timesteps, 0, -1):
            predicted_noise = model(generated_data)
            beta_t = noise_schedule[t - 1]
            # Clamp so a zero beta cannot produce 0 / 0 = NaN.
            sqrt_one_minus_alpha_cumprod = torch.sqrt(
                (1.0 - alpha_cumprod[t - 1]).clamp_min(1e-12)
            )
            noise_coeff = beta_t / sqrt_one_minus_alpha_cumprod
            sqrt_one_minus_beta_t = torch.sqrt(1 - beta_t)
            generated_data = (generated_data - noise_coeff * predicted_noise) / sqrt_one_minus_beta_t
    return generated_data

In [None]:
# Main Execution
if __name__ == "__main__":
    # End-to-end run: extract the sensor archive from Drive, train the noise
    # predictor, save it, impute artificially masked values and plot the result.

    # Mount Google Drive and extract the zip file
    from google.colab import drive
    drive.mount('/content/drive')

    # Seed torch's global RNG so the train/test split, batch shuffling, diffusion
    # noise and the artificial missing-value mask are reproducible across runs.
    torch.manual_seed(42)

    zip_path = "/content/drive/MyDrive/Colab Notebooks/Generative_Deep_Learning_2nd_Edition/Generative_Deep_Learning_2nd_Edition/notebooks/08_diffusion/01_ddm/Sensor_data.zip"
    extract_path = "/content/Sensor_data"

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

    # Load Dataset
    data_dir = extract_path
    dataset = SensorDataset(data_dir, transform=torch.tensor)

    # Split into train and test sets
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

    batch_size = 16
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Define Diffusion Model
    input_dim = dataset[0].shape[0]
    timesteps = 100
    noise_schedule = linear_beta_schedule(timesteps)

    model = UNet1D(input_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    loss_fn = DiffusionLoss()

    # Train Diffusion Model
    train_diffusion(model, train_loader, optimizer, loss_fn, timesteps, noise_schedule, epochs=20)

    # Save the trained model
    torch.save(model.state_dict(), "sensor_ddpm_model.pth")
    print("Model saved as 'sensor_ddpm_model.pth'")

    # Impute Missing Data
    corrupted_data = torch.tensor(dataset.data).clone().float()  # Convert to PyTorch tensor
    # Keep the mask so that only the artificially removed entries are replaced.
    missing_mask = torch.rand_like(corrupted_data) < 0.1  # Mask 10% of the data
    corrupted_data[missing_mask] = 0

    # Run the reverse chain in chunks of rows: the network keeps up to 64
    # channels per input element, so a single pass over the whole matrix would
    # allocate activations many times larger than the data itself.
    denoised_data = torch.cat([
        impute_missing_data(model, chunk, timesteps, noise_schedule)
        for chunk in torch.split(corrupted_data, batch_size)
    ])
    # Imputation fills gaps only: observed entries keep their measured value.
    generated_data = torch.where(missing_mask, denoised_data, corrupted_data)

    # Visualize Results
    # NOTE(review): rows of dataset.data are the items returned by the dataset
    # (the loader's comment describes the matrix as 300 x 75,600), so each plot
    # below appears to show one row across all concatenated sensors rather than
    # a single sensor - confirm and adjust the title/axis labels if so.
    for sensor_idx in range(min(5, len(dataset))):
        plt.figure(figsize=(12, 4))
        plt.plot(dataset.data[sensor_idx], label="Real Data", alpha=0.7, color="blue")
        plt.plot(corrupted_data[sensor_idx].numpy(), label="Corrupted Data", alpha=0.7, color="orange")
        plt.plot(generated_data[sensor_idx].numpy(), label="Imputed Data", alpha=0.7, color="green")
        plt.title(f"Sensor {sensor_idx + 1}: Real vs Corrupted vs Imputed")
        plt.xlabel("Time Steps")
        plt.ylabel("Sensor Values")
        plt.legend()
        plt.show()

    # Reload the model (example for future use)
    model.load_state_dict(torch.load("sensor_ddpm_model.pth"))
    model.eval()
    print("Model reloaded for further experiments")