In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Denoising Diffusion Model for Sensor Dataset Imputation
#!pip install scikit-learn
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision.transforms import Compose
from tqdm import tqdm
import matplotlib.pyplot as plt
import zipfile
from sklearn.preprocessing import StandardScaler


In [None]:
# Custom Dataset for Sensor Data
class SensorDataset(Dataset):
    """Dataset that stacks per-sensor CSV files into a single 2-D matrix.

    Files named ``ProcessVar1.csv`` ... ``ProcessVar{num_sensors}.csv`` are read
    from ``data_dir`` without a header row, transposed, and concatenated
    column-wise. Each row of the resulting matrix is one dataset item.

    Args:
        data_dir: Directory that contains the ``ProcessVar{i}.csv`` files.
        transform: Optional callable applied to a row before it is returned
            from ``__getitem__``.
        num_sensors: How many consecutive sensor files to load, starting at
            index 1. Defaults to 63, the count that used to be hard-coded.

    Raises:
        ValueError: If ``num_sensors`` is smaller than 1.
        FileNotFoundError: If any expected sensor file is missing.
    """

    def __init__(self, data_dir, transform=None, num_sensors=63):
        if num_sensors < 1:
            raise ValueError(f"num_sensors must be >= 1, got {num_sensors}")
        self.data_dir = data_dir
        self.transform = transform
        self.num_sensors = num_sensors

        # Load and process all sensor data
        self.data = self._load_data()

    def _load_data(self):
        """Read every sensor file and return them joined along the column axis."""
        # NOTE(review): the original author's comments describe each file as
        # transposing to 300 x 1200 and the full matrix as 300 x 75,600; those
        # sizes depend on the CSV contents and are not checked here.
        all_data = []
        for i in range(1, self.num_sensors + 1):
            file_path = os.path.join(self.data_dir, f"ProcessVar{i}.csv")
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File not found: {file_path}")
            # Transpose so that CSV columns become rows of the dataset.
            sensor_data = pd.read_csv(file_path, header=None).values.T
            all_data.append(sensor_data)
        return np.hstack(all_data)  # Combine all sensors column-wise

    def __len__(self):
        """Return the number of rows in the stacked matrix."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return row ``idx`` of the stacked matrix, transformed if requested."""
        sample = self.data[idx, :]
        if self.transform is not None:
            sample = self.transform(sample)
        return sample

In [None]:
# Diffusion Model Setup
class DiffusionModel(torch.nn.Module):
    """Small fully connected network whose output has the same width as its input.

    Layer sequence: Linear(input_dim -> embed_dim), ReLU, Dropout(0.2),
    Linear(embed_dim -> embed_dim), ReLU, Linear(embed_dim -> input_dim).

    Args:
        input_dim: Size of the last dimension of the input (and of the output).
        embed_dim: Width of the two hidden layers.
    """

    def __init__(self, input_dim, embed_dim=128):
        super().__init__()
        make_linear = torch.nn.Linear
        # Attribute names double as state_dict keys, so they are kept as-is.
        self.fc1 = make_linear(input_dim, embed_dim)
        self.dropout = torch.nn.Dropout(p=0.2)
        self.fc2 = make_linear(embed_dim, embed_dim)
        self.fc3 = make_linear(embed_dim, input_dim)

    def forward(self, x):
        """Run ``x`` through the three linear layers and return the projection."""
        hidden = self.fc1(x).relu()
        hidden = self.dropout(hidden)  # only active while the module is in train mode
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)


In [None]:
# Training and Evaluation Functions
def train_model(model, dataloader, optimizer, criterion, epochs, scheduler=None):
    """Train ``model`` to reconstruct its own input batches.

    Every batch is cast to float32 and used as both the model input and the
    target of ``criterion``. After each epoch the mean batch loss is printed.

    Args:
        model: Module to optimise; it is switched to train mode first.
        dataloader: Sized iterable of tensor batches.
        optimizer: Optimizer bound to the model's parameters.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar loss.
        epochs: Number of full passes over ``dataloader``.
        scheduler: Optional LR scheduler, stepped once at the end of each epoch.
    """
    model.train()
    for epoch_number in range(1, epochs + 1):
        running_loss = 0
        for batch in dataloader:
            inputs = batch.float()
            optimizer.zero_grad()
            loss = criterion(model(inputs), inputs)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        if scheduler:
            scheduler.step()

        mean_loss = running_loss / len(dataloader)
        print(f"Epoch {epoch_number}, Loss: {mean_loss:.4f}")

def evaluate_model(model, dataloader, criterion):
    """Compute the mean reconstruction loss of ``model`` over ``dataloader``.

    The model is put in eval mode and gradients are disabled. Each batch is
    cast to float32 and used as both input and target of ``criterion``.

    Args:
        model: Module to evaluate; it is left in eval mode afterwards.
        dataloader: Sized iterable of tensor batches.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar loss.

    Returns:
        float: Sum of per-batch losses divided by ``len(dataloader)``. The same
        value is also printed, as before.

    Raises:
        ZeroDivisionError: If ``len(dataloader)`` is 0.
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch.float()  # cast once; serves as input and target
            loss = criterion(model(inputs), inputs)
            total_loss += loss.item()
    average_loss = total_loss / len(dataloader)
    print(f"Evaluation Loss: {average_loss:.4f}")
    # Returned so callers can compare runs without parsing printed output.
    return average_loss


In [None]:
# Experiment: Missing Data Imputation
def run_experiment(model, dataset, missing_ratio=0.1, seed=None, num_plots=5):
    """Hide a random subset of entries, let the model fill them in, and score it.

    A copy of ``dataset.data`` has ``missing_ratio`` of its entries replaced by
    zero. The corrupted matrix is passed through ``model`` in one batch, and the
    RMSE between the original and the generated values is computed on the
    hidden entries only. ``dataset.data`` itself is not modified.

    Args:
        model: Trained module mapping a ``(rows, features)`` float tensor to a
            tensor of the same shape.
        dataset: Object exposing a 2-D NumPy array as ``dataset.data``.
        missing_ratio: Fraction of all entries to hide, in ``[0, 1]``.
        seed: Optional integer seed for the mask. ``None`` (default) draws from
            NumPy's global RNG, exactly as before.
        num_plots: Maximum number of leading rows to plot. Clamped to the number
            of rows available; ``0`` disables plotting.

    Returns:
        tuple: ``(real_missing_values, generated_missing_values, rmse)`` where
        the first two are 1-D arrays over the hidden entries (row-major order)
        and ``rmse`` is NaN when no entry was hidden.

    Raises:
        ValueError: If ``missing_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= missing_ratio <= 1.0:
        raise ValueError(f"missing_ratio must be in [0, 1], got {missing_ratio}")

    # Create a copy of the data
    data = dataset.data.copy()

    # Choose which flat positions to hide.
    num_samples, num_features = data.shape
    num_missing = int(missing_ratio * num_samples * num_features)
    rng = np.random if seed is None else np.random.RandomState(seed)
    missing_indices = rng.choice(num_samples * num_features, num_missing, replace=False)

    # Create a mask (True = observed) and zero out the hidden entries.
    mask = np.ones_like(data, dtype=bool)
    mask[np.unravel_index(missing_indices, data.shape)] = False
    corrupted_data = data * mask

    # Run on whatever device the model lives on, then bring the result back to
    # the CPU so it can be converted to NumPy.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    model.eval()
    with torch.no_grad():
        inputs = torch.as_tensor(corrupted_data, dtype=torch.float32, device=device)
        generated_data = model(inputs).cpu().numpy()

    # Compute the discrepancy (RMSE) between real and generated hidden entries.
    hidden = ~mask
    real_missing_values = data[hidden]
    generated_missing_values = generated_data[hidden]
    if num_missing == 0:
        # Nothing was hidden; avoid taking the mean of an empty array.
        rmse = float("nan")
    else:
        rmse = np.sqrt(np.mean((real_missing_values - generated_missing_values) ** 2))

    print(f"RMSE for missing data imputation: {rmse:.4f}")

    # Plot real vs generated values for the leading rows. Each row of ``data``
    # is one dataset sample, so the x-axis runs over that sample's features.
    for row_idx in range(min(num_plots, num_samples)):
        plt.figure(figsize=(12, 4))
        plt.plot(data[row_idx], label="Real Data", alpha=0.7, color="blue")
        plt.plot(generated_data[row_idx], label="Generated Data", linestyle="--", alpha=0.7, color="green")
        masked_points = np.where(hidden[row_idx])[0]
        plt.scatter(masked_points, data[row_idx][masked_points], color="red", label="Masked Points", alpha=0.8)
        plt.title(f"Sample {row_idx + 1}: Real vs Generated")
        plt.xlabel("Feature Index")
        plt.ylabel("Sensor Values")
        plt.legend()
        plt.show()

    return real_missing_values, generated_missing_values, rmse

In [None]:
# Additional Experiment Settings and Techniques
def setup_advanced_experiments(dataset, model, train_loader, test_loader, criterion):
    """Standardise the data, then train/evaluate with a lower LR and a larger batch.

    Side effects: ``dataset.data`` is replaced by its standardised version and
    ``model`` is trained in place across both experiments.

    Args:
        dataset: Object exposing a 2-D NumPy array as ``dataset.data``.
        model: Module to train.
        train_loader: DataLoader over the training split.
        test_loader: DataLoader over the held-out split.
        criterion: Loss passed through to ``train_model`` / ``evaluate_model``.
    """
    train_split = train_loader.dataset
    test_split = test_loader.dataset

    # Normalize Data. When the training split is an index-based view of
    # ``dataset`` (as produced by ``random_split``), fit the scaler on those
    # rows only so held-out statistics do not leak into preprocessing.
    scaler = StandardScaler()
    train_indices = getattr(train_split, "indices", None)
    if train_indices is not None and getattr(train_split, "dataset", None) is dataset:
        scaler.fit(dataset.data[np.asarray(train_indices, dtype=int)])
    else:
        scaler.fit(dataset.data)
    dataset.data = scaler.transform(dataset.data)
    print("Data normalized.")

    # Experiment with Different Learning Rates
    print("Experiment: Reducing Learning Rate")
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
    train_model(model, train_loader, optimizer, criterion, epochs=20, scheduler=scheduler)
    evaluate_model(model, test_loader, criterion)

    # Experiment with Larger Batch Size. The loaders are rebuilt from the
    # existing train/test splits; building them from the full ``dataset`` would
    # train on, and evaluate against, rows that belong to the other split.
    # NOTE(review): the model and optimizer carry over from the first
    # experiment (learning rate already decayed by the scheduler), so this is
    # continued training rather than an independent run - confirm intended.
    print("Experiment: Increasing Batch Size")
    large_train_loader = DataLoader(train_split, batch_size=64, shuffle=True)
    large_test_loader = DataLoader(test_split, batch_size=64, shuffle=False)
    train_model(model, large_train_loader, optimizer, criterion, epochs=20)
    evaluate_model(model, large_test_loader, criterion)

In [None]:
# Compare Models
def compare_models(dataset, baseline_model, advanced_model):
    """Score two models on the same hidden entries and plot their imputations.

    ``run_experiment`` is called once per model. NumPy's global RNG state is
    saved before the first call and restored before the second, so both calls
    draw the same mask and the two RMSE values are directly comparable.

    ``run_experiment`` returns 1-D arrays over the hidden entries only, so the
    comparison figure is drawn over those entries (evenly subsampled) rather
    than by indexing the returned arrays as if they were per-row matrices.

    Args:
        dataset: Object exposing a 2-D NumPy array as ``dataset.data``.
        baseline_model: First trained model.
        advanced_model: Second trained model.
    """
    print("Comparing Baseline and Advanced Models...")
    rng_state = np.random.get_state()
    baseline_real, baseline_generated, baseline_rmse = run_experiment(baseline_model, dataset)
    np.random.set_state(rng_state)  # replay the same mask for the second model
    advanced_real, advanced_generated, advanced_rmse = run_experiment(advanced_model, dataset)

    print(f"Baseline Model RMSE: {baseline_rmse:.4f}")
    print(f"Advanced Model RMSE: {advanced_rmse:.4f}")

    if not np.array_equal(baseline_real, advanced_real):
        # Only possible if run_experiment stopped drawing from the global RNG.
        print("Warning: models were evaluated on different masks; overlay is approximate.")

    num_held_out = min(len(baseline_real), len(baseline_generated), len(advanced_generated))
    if num_held_out == 0:
        return

    # Overlay real and imputed values on an evenly spaced subset of the hidden
    # entries so the figure stays readable for large matrices.
    max_points = 500
    picks = np.linspace(0, num_held_out - 1, num=min(num_held_out, max_points)).astype(int)

    plt.figure(figsize=(12, 4))
    plt.plot(baseline_real[picks], label="Real Data", alpha=0.7, color="blue")
    plt.plot(baseline_generated[picks], label="Baseline Generated", linestyle="--", alpha=0.7, color="orange")
    plt.plot(advanced_generated[picks], label="Advanced Generated", linestyle="-.", alpha=0.7, color="green")
    plt.title("Held-out Entries: Real vs Baseline vs Advanced")
    plt.xlabel("Held-out Entry (evenly subsampled)")
    plt.ylabel("Sensor Values")
    plt.legend()
    plt.show()

In [None]:
# Main Execution
if __name__ == "__main__":
    # Mount Google Drive and extract the zip file
    from google.colab import drive
    drive.mount('/content/drive')

    # Seed the RNGs used below (train/test split, weight initialisation,
    # DataLoader shuffling, dropout, and the NumPy-drawn imputation mask) so a
    # fresh "Restart & Run All" reproduces the same numbers.
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)

    zip_path = "/content/drive/MyDrive/Colab Notebooks/Generative_Deep_Learning_2nd_Edition/Generative_Deep_Learning_2nd_Edition/notebooks/08_diffusion/01_ddm/Sensor_data.zip"
    extract_path = "/content/Sensor_data"

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

    # Load Dataset
    data_dir = extract_path
    dataset = SensorDataset(data_dir, transform=torch.tensor)

    # Split into train and test sets (80/20) with a dedicated seeded generator
    # so the split does not depend on how much global RNG was consumed before.
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    split_generator = torch.Generator().manual_seed(seed)
    train_dataset, test_dataset = random_split(
        dataset, [train_size, test_size], generator=split_generator
    )

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    # Baseline Model
    input_dim = dataset[0].shape[0]
    baseline_model = DiffusionModel(input_dim)
    criterion = torch.nn.MSELoss()
    baseline_optimizer = torch.optim.Adam(baseline_model.parameters(), lr=1e-3)

    train_model(baseline_model, train_loader, baseline_optimizer, criterion, epochs=20)

    # Advanced Model with Scheduler: lower starting LR, weight decay, and a
    # step decay that halves the LR every 10 epochs.
    advanced_model = DiffusionModel(input_dim)
    advanced_optimizer = torch.optim.Adam(advanced_model.parameters(), lr=1e-4, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.StepLR(advanced_optimizer, step_size=10, gamma=0.5)

    train_model(advanced_model, train_loader, advanced_optimizer, criterion, epochs=20, scheduler=scheduler)

    # Compare Models
    compare_models(dataset, baseline_model, advanced_model)
