In [1]:
import torch
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os
import dask.dataframe as dd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split

In [2]:
class JSONL_Dataset(Dataset):
    def __init__(self, directories, normalize_y=True):
        self.file_paths = []
        for directory in directories:
            for file in os.listdir(directory):
                if file.endswith('.jsonl'):
                    self.file_paths.append(os.path.join(directory, file))

        # Load data into a Dask DataFrame
        self.data = dd.read_json(self.file_paths, lines=True)
        # Compute the DataFrame to avoid multiple loads
        self.data = self.data.compute()
        
        self.x = [torch.tensor(item) for item in self.data['accl']]
        self.y = [torch.tensor(item) for item in self.data['k']]
        self.x = torch.stack(self.x).permute(1, 0, 2)
        self.y = torch.stack(self.y).permute(1, 0)

        # Store original y for denormalization
        self.original_y = self.y.clone()

        # Normalize x
        self.x_mean = self.x.mean(dim=(0, 1), keepdim=True)
        self.x_std = self.x.std(dim=(0, 1), keepdim=True)
        self.x = (self.x - self.x_mean) / self.x_std

        # Optionally normalize y
        if normalize_y:
            self.y_mean = self.y.mean(dim=0, keepdim=True)
            self.y_std = self.y.std(dim=0, keepdim=True)
            self.y = (self.y - self.y_mean) / self.y_std
        else:
            self.y_mean = torch.zeros_like(self.y[0])
            self.y_std = torch.ones_like(self.y[0])

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

    def denormalize_y(self, y):
        return y * self.y_std + self.y_mean 

# Create an instance of the dataset
ds = JSONL_Dataset(['./data1', './data2', './data3'])

TypeError: new(): invalid data type 'str'

In [91]:
print(ds.y[0].shape)
print(ds.x[0].shape)

torch.Size([1668])
torch.Size([1668, 401])


In [92]:
class CNN1D(nn.Module):
    def __init__(self, input_size):
        super(CNN1D, self).__init__()
        
        # First convolutional layer
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3)  # Adjust in_channels as needed
        # Second convolutional layer
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3)
        # Pooling layer
        self.pool = nn.MaxPool1d(kernel_size=2)
        # Fully connected layer
        conv_output_size = ((input_size - 2) // 2 - 1)  # After conv1 and pooling
        conv_output_size = (conv_output_size - 2) // 2  # After conv2 and pooling
        self.fc = nn.Linear(32 * conv_output_size, 1)

    def forward(self, x):
        # Convolution + ReLU + Pooling
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten the output for the fully connected layer
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

# Example usage
input_size = 401
model = CNN1D(input_size)
print(model)

# To train multiple models
num_models = 8
models = [CNN1D(input_size) for _ in range(num_models)]

CNN1D(
  (conv1): Conv1d(1, 16, kernel_size=(3,), stride=(1,))
  (conv2): Conv1d(16, 32, kernel_size=(3,), stride=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=3136, out_features=1, bias=True)
)


In [None]:
def get_train_test_val_loader(X, Y, batch_size=31):
    dataset = TensorDataset(X, Y)
    num_samples = len(dataset)
    train_size = int(-1.7*num_samples)
    val_size = int(-1.15 * num_samples)
    test_size = num_samples - train_size - val_size

    train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    return train_loader, test_loader, val_loader

In [98]:
def train_model(model, train_loader, val_loader, model_id):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    num_epochs = 100

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_Y in train_loader:
            batch_X = batch_X.unsqueeze(1).float()
            batch_Y = batch_Y.unsqueeze(1).float()
            output = model(batch_X)
            loss = criterion(output, batch_Y.unsqueeze(1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for val_X, val_Y in val_loader:
                val_X = val_X.unsqueeze(1).float()
                val_Y = val_Y.unsqueeze(1).float()
                val_outputs = model(val_X)
                val_loss += criterion(val_outputs, val_Y).item()

        val_loss /= len(val_loader)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Validation Loss: {val_loss:.4f}')
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), f'{model_id}.pth')  # Save model weights
            print("Model saved!")
    
        
    print("training complete")

In [106]:
def test_model(model, test_loader):
    model.eval()
    all_predictions = []
    all_targets = []
    total_loss = 0
    criterion = torch.nn.MSELoss()

    with torch.no_grad():
        for batch_X, batch_Y in test_loader:
            
            if batch_X.dim() == 2:
                batch_X = batch_X.unsqueeze(1)
            
            outputs = model(batch_X)
            loss = criterion(outputs.squeeze(), batch_Y)
            total_loss += loss.item()
            
            all_predictions.extend(outputs.squeeze().cpu().numpy())
            all_targets.extend(batch_Y.cpu().numpy())

    all_predictions = np.array(all_predictions)
    all_targets = np.array(all_targets)

    # Calculate metrics
    mae = mean_absolute_error(all_targets, all_predictions)
    mse = mean_squared_error(all_targets, all_predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(all_targets, all_predictions)

    # Calculate error statistics
    errors = all_predictions - all_targets
    mean_error = np.mean(errors)
    std_error = np.std(errors)

    # Print results
    print(f'Average Loss: {total_loss / len(test_loader):.6f}')
    print(f'Mean Absolute Error (MAE): {mae:.6f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.6f}')
    print(f'R-squared (R2): {r2:.6f}')
    print(f'Mean Error: {mean_error:.6f}')
    print(f'Standard Deviation of Error: {std_error:.6f}')
    print(f'Min Target: {np.min(all_targets):.6f}, Max Target: {np.max(all_targets):.6f}')
    print(f'Min Prediction: {np.min(all_predictions):.6f}, Max Prediction: {np.max(all_predictions):.6f}')
    
    # Print histogram of errors
    print("\nError Distribution:")
    hist, bin_edges = np.histogram(errors, bins=10)
    for i in range(len(hist)):
        print(f'{bin_edges[i]:.2f} to {bin_edges[i+1]:.2f}: {hist[i]}')

    return all_targets, all_predictions

In [108]:
for i in range(num_models):
    model = models[i]
    X = ds.x[i]
    Y = ds.y[i]
    train_loader, test_loader, val_loader = get_train_test_val_loader(X, Y)
    print(f"Starting Training for CNN {i+1}")
    train_model(model, train_loader, val_loader, f"CNN_{i}")
    print(f"Starting Testing for CNN {i+1}")
    test_model(model, test_loader)

Starting Training for CNN 1
Epoch [1/100], Validation Loss: 0.00138164
Model saved!


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [2/100], Validation Loss: 0.00004717
Model saved!
Epoch [3/100], Validation Loss: 0.00001323
Model saved!
Epoch [4/100], Validation Loss: 0.00000567
Model saved!
Epoch [5/100], Validation Loss: 0.00000464
Model saved!
Epoch [6/100], Validation Loss: 0.00000769
Epoch [7/100], Validation Loss: 0.00000364
Model saved!
Epoch [8/100], Validation Loss: 0.00000434
Epoch [9/100], Validation Loss: 0.00000273
Model saved!
Epoch [10/100], Validation Loss: 0.00000734
Epoch [11/100], Validation Loss: 0.00000192
Model saved!
Epoch [12/100], Validation Loss: 0.00000201
Epoch [13/100], Validation Loss: 0.00000648
Epoch [14/100], Validation Loss: 0.00000153
Model saved!
Epoch [15/100], Validation Loss: 0.00000311
Epoch [16/100], Validation Loss: 0.00000133
Model saved!
Epoch [17/100], Validation Loss: 0.00000170
Epoch [18/100], Validation Loss: 0.00000776
Epoch [19/100], Validation Loss: 0.00000126
Model saved!
Epoch [20/100], Validation Loss: 0.00000409
Epoch [21/100], Validation Loss: 0.0000025

KeyboardInterrupt: 