<a href="https://colab.research.google.com/github/kraszor/mgr_tests/blob/main/basic_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pyarrow.parquet as pq
import torch
from torch.utils.data import IterableDataset, DataLoader
import glob
import random

class ParquetDataset(IterableDataset):
    """Stream pre-batched ``(x, y)`` tensors from a folder of Parquet files.

    Every iteration step yields one whole mini-batch, so the dataset is meant
    to be wrapped in ``DataLoader(..., batch_size=None)``.

    Args:
        folder_path: Directory that is globbed for ``*.parquet`` files.
        features: Names of the columns that are stacked into ``x``.
        target: Name of the column that becomes ``y``.
        batch_size: Upper bound on the number of rows per yielded batch
            (forwarded to ``ParquetFile.iter_batches``).
        shuffle_files: If True, the file order is reshuffled on every pass.
    """

    def __init__(self, folder_path, features, target, batch_size=1024, shuffle_files=True):
        self.folder_path = folder_path
        # glob returns files in filesystem order; sorting makes
        # shuffle_files=False reproducible and guarantees that every
        # DataLoader worker shards the same list.
        self.files = sorted(glob.glob(f"{folder_path}/*.parquet"))
        self.features = features
        self.target = target
        self.batch_size = batch_size
        self.shuffle_files = shuffle_files

    def __iter__(self):
        """Yield ``(x, y)`` float32 tensors, one pair per Parquet record batch."""
        files = self.files.copy()

        # Each DataLoader worker holds its own copy of an IterableDataset.
        # Without sharding, every worker would replay every file and the
        # loader would emit duplicated data. Shard before shuffling because
        # workers do not share a random state.
        worker = torch.utils.data.get_worker_info()
        if worker is not None:
            files = files[worker.id::worker.num_workers]

        if self.shuffle_files:
            random.shuffle(files)

        # Unpacking (instead of ``features + [target]``) also accepts tuples.
        columns = [*self.features, self.target]

        for file_path in files:
            parquet_file = pq.ParquetFile(file_path)
            for batch in parquet_file.iter_batches(batch_size=self.batch_size, columns=columns):
                data = batch.to_pydict()
                # Look every column up once instead of once per cell.
                feature_columns = [data[name] for name in self.features]
                targets = data[self.target]
                # Row i holds the i-th value of every feature column, in the
                # order given by ``features``.
                x = torch.tensor(
                    [[column[i] for column in feature_columns] for i in range(len(targets))],
                    dtype=torch.float32,
                )
                y = torch.tensor(targets, dtype=torch.float32)
                yield x, y

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np

class BinaryClassifier(nn.Module):
    """Feed-forward network that emits one raw logit per sample.

    Each hidden layer is ``Linear -> BatchNorm1d -> ReLU -> Dropout``. The
    final ``Linear`` maps to a single output with no activation applied.
    """

    def __init__(self, input_dim=4112, hidden_dims=(512, 256, 128), dropout=0.3):
        """
        Build the layer stack.

        Args:
            input_dim: Total input dimension (2056 + 2056 = 4112)
            hidden_dims: Sequence of hidden layer dimensions. The default is
                an immutable tuple so it cannot be mutated between
                instantiations; lists are still accepted.
            dropout: Dropout probability used after every hidden activation.
        """
        super().__init__()

        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            layers += [
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            prev_dim = hidden_dim
        # Output head: one logit, no sigmoid here.
        layers.append(nn.Linear(prev_dim, 1))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits produced by the layer stack for input ``x``."""
        return self.network(x)


def train_epoch(model, loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Module whose output is treated as logits (a sigmoid is applied
            before thresholding at 0.5 for the accuracy metric).
        loader: Iterable of ``(x, y)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, y)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to.

    Returns:
        (avg_loss, accuracy): mean of the per-batch loss values and the
        percentage of correctly classified samples.

    Raises:
        ValueError: If ``loader`` yields no samples. The original code failed
            with an opaque ``UnboundLocalError`` in that case.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    for batch_number, (x, y) in enumerate(loader, start=1):
        # Flatten everything after the batch axis. ``reshape`` (unlike
        # ``view``) also works for non-contiguous inputs.
        x = x.reshape(x.size(0), -1).to(device)
        # Add a trailing axis so targets line up with the (batch, 1) outputs.
        y = y.unsqueeze(1).to(device)

        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs, y)

        loss.backward()
        optimizer.step()

        # Metrics are bookkeeping only; keep them out of the autograd graph.
        with torch.no_grad():
            predictions = (torch.sigmoid(outputs) > 0.5).float()
            correct += (predictions == y).sum().item()
        total += y.size(0)
        total_loss += loss.item()
        num_batches = batch_number

        if batch_number % 10 == 0:
            print(f"  Batch {batch_number}, Loss: {loss.item():.4f}, "
                  f"Acc: {100 * correct / total:.2f}%")

    if total == 0:
        raise ValueError("train_epoch received a loader that yielded no samples")

    avg_loss = total_loss / num_batches
    accuracy = 100 * correct / total
    return avg_loss, accuracy



def train_model(train_loader, val_loader=None, epochs=10, lr=0.001, device='cuda',
                input_dim=4112, checkpoint_path='best_model.pth'):
    """
    Main training function.

    Builds a ``BinaryClassifier``, trains it with Adam and
    ``BCEWithLogitsLoss``, halves the learning rate when the monitored loss
    plateaus, and checkpoints the weights whenever the monitored loss improves.
    The monitored loss is the validation loss when ``val_loader`` is given and
    the training loss otherwise, so a checkpoint is written in both modes.

    Args:
        train_loader: DataLoader for training data
        val_loader: DataLoader for validation data (optional)
        epochs: Number of training epochs
        lr: Learning rate
        device: Requested device, e.g. 'cuda' or 'cpu'. A CUDA request falls
            back to CPU when CUDA is unavailable; other devices are used as
            given.
        input_dim: Input dimension passed to ``BinaryClassifier``.
        checkpoint_path: File the best ``state_dict`` is written to.

    Returns:
        The trained model (weights of the last epoch, not necessarily the
        checkpointed ones).
    """
    # Only a CUDA request is downgraded. The previous expression forced every
    # device to CPU whenever CUDA was missing.
    device = torch.device(device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        device = torch.device('cpu')
    print(f"Using device: {device}")

    model = BinaryClassifier(input_dim=input_dim).to(device)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2
    )

    best_loss = float('inf')

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        print("-" * 50)

        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")

        if val_loader is not None:
            # NOTE: ``validate`` must already be defined in the kernel when
            # this branch runs.
            val_loss, val_acc = validate(model, val_loader, criterion, device)
            print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")
            monitored_loss = val_loss
        else:
            monitored_loss = train_loss

        scheduler.step(monitored_loss)

        if monitored_loss < best_loss:
            best_loss = monitored_loss
            torch.save(model.state_dict(), checkpoint_path)
            print("✓ Saved best model")

    return model

In [None]:
# # Example usage:
# if __name__ == "__main__":

#     # Create datasets
#     features = ['POS_vector', 'Patho_Vector']
#     target = 'ClinSigSimple'

#     train_dataset = ParquetDataset(
#         folder_path="data.parquet",
#         features=features,
#         target=target,
#         batch_size=256,
#         shuffle_files=True
#     )

#     # val_dataset = ParquetDataset(
#     #     folder_path="data.parquet/val",
#     #     features=features,
#     #     target=target,
#     #     batch_size=32,
#     #     shuffle_files=False
#     # )

#     train_loader = DataLoader(train_dataset, batch_size=None, num_workers=0)
#     # val_loader = DataLoader(val_dataset, batch_size=None, num_workers=0)

#     # Train model
#     model = train_model(
#         train_loader=train_loader,
#         # val_loader=val_loader,
#         epochs=10,
#         lr=0.001,
#         device='cpu'
#     )

    # # Load best model for inference
    # model.load_state_dict(torch.load('best_model.pth'))
    # model.eval()

Using device: cpu

Epoch 1/10
--------------------------------------------------
  Batch 10, Loss: 0.6391, Acc: 63.79%
  Batch 20, Loss: 0.5278, Acc: 66.70%
  Batch 30, Loss: 0.6045, Acc: 69.11%
  Batch 40, Loss: 0.4937, Acc: 70.20%
  Batch 50, Loss: 0.4574, Acc: 71.28%
  Batch 60, Loss: 0.5453, Acc: 71.75%
  Batch 70, Loss: 0.5270, Acc: 72.24%
  Batch 80, Loss: 0.4755, Acc: 72.69%
  Batch 90, Loss: 0.5213, Acc: 72.92%
  Batch 100, Loss: 0.5458, Acc: 73.06%
  Batch 110, Loss: 0.4956, Acc: 73.31%
  Batch 120, Loss: 0.5041, Acc: 73.60%
  Batch 130, Loss: 0.5054, Acc: 73.86%
  Batch 140, Loss: 0.4440, Acc: 74.03%
  Batch 150, Loss: 0.5092, Acc: 74.13%
  Batch 160, Loss: 0.4849, Acc: 74.30%
  Batch 170, Loss: 0.4707, Acc: 74.36%
  Batch 180, Loss: 0.4959, Acc: 74.47%
  Batch 190, Loss: 0.4861, Acc: 74.60%
Train Loss: 0.5200, Train Acc: 74.62%

Epoch 2/10
--------------------------------------------------
  Batch 10, Loss: 0.4221, Acc: 81.13%
  Batch 20, Loss: 0.4107, Acc: 81.39%
  Batch 30

In [None]:
def validate(model, loader, criterion, device):
    """Compute loss and accuracy of ``model`` on ``loader`` without gradients.

    The model is switched to eval mode and left in it.

    Args:
        model: Module whose output is treated as logits (a sigmoid is applied
            before thresholding at 0.5 for the accuracy metric).
        loader: Iterable of ``(x, y)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, y)``.
        device: Device the batches are moved to.

    Returns:
        (avg_loss, accuracy): mean of the per-batch loss values and the
        percentage of correctly classified samples.

    Raises:
        ValueError: If ``loader`` yields no samples. The original code failed
            with an opaque ``UnboundLocalError`` in that case.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    with torch.no_grad():
        for x, y in loader:
            # Flatten everything after the batch axis. ``reshape`` (unlike
            # ``view``) also works for non-contiguous inputs.
            x = x.reshape(x.size(0), -1).to(device)
            # Add a trailing axis so targets line up with (batch, 1) outputs.
            y = y.unsqueeze(1).to(device)

            outputs = model(x)
            loss = criterion(outputs, y)

            predictions = (torch.sigmoid(outputs) > 0.5).float()
            correct += (predictions == y).sum().item()
            total += y.size(0)
            total_loss += loss.item()
            num_batches += 1

    if total == 0:
        raise ValueError("validate received a loader that yielded no samples")

    avg_loss = total_loss / num_batches
    accuracy = 100 * correct / total
    return avg_loss, accuracy


def evaluate_model(model, loader, device='cuda'):
    """
    Evaluate model on any dataset and report loss and accuracy.

    The model is moved to the resolved device and switched to eval mode.
    The loss is ``BCEWithLogitsLoss``, so the model output is treated as logits.

    Args:
        model: Trained model
        loader: DataLoader (or any iterable of ``(x, y)`` batches) to evaluate
        device: Requested device, e.g. 'cuda' or 'cpu'. A CUDA request falls
            back to CPU when CUDA is unavailable; other devices are used as
            given.

    Returns:
        avg_loss: Mean of the per-batch loss values
        accuracy: Accuracy percentage

    Raises:
        ValueError: If ``loader`` yields no samples. The original code failed
            with an opaque ``UnboundLocalError`` in that case.
    """
    # Only a CUDA request is downgraded. The previous expression forced every
    # device to CPU whenever CUDA was missing.
    device = torch.device(device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        device = torch.device('cpu')
    model.to(device)
    model.eval()

    criterion = nn.BCEWithLogitsLoss()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    print("Evaluating...")
    with torch.no_grad():
        for batch_number, (x, y) in enumerate(loader, start=1):
            # Flatten everything after the batch axis. ``reshape`` (unlike
            # ``view``) also works for non-contiguous inputs.
            x = x.reshape(x.size(0), -1).to(device)
            # Add a trailing axis so targets line up with (batch, 1) outputs.
            y = y.unsqueeze(1).to(device)

            outputs = model(x)
            loss = criterion(outputs, y)

            predictions = (torch.sigmoid(outputs) > 0.5).float()
            correct += (predictions == y).sum().item()
            total += y.size(0)
            total_loss += loss.item()
            num_batches = batch_number

            if batch_number % 50 == 0:
                print(f"  Processed {total} samples...")

    if total == 0:
        raise ValueError("evaluate_model received a loader that yielded no samples")

    avg_loss = total_loss / num_batches
    accuracy = 100 * correct / total

    print(f"  Total samples: {total}")
    print(f"  Correct predictions: {correct}")

    return avg_loss, accuracy

In [None]:
# Report loss and accuracy on the data the model was fitted on.
# NOTE(review): `model` and `train_loader` are only created in the
# commented-out example cell of this notebook, so this cell relies on kernel
# state from an earlier session. Confirm before running on a fresh kernel.
model.eval()

banner = "=" * 50
print("\n" + banner)
print("Evaluating on Training Set:")
print(banner)

train_metrics = evaluate_model(model, train_loader, device='cpu')
train_loss, train_acc = train_metrics
print(f"Training Set - Loss: {train_loss:.4f}, Accuracy: {train_acc:.2f}%")


Evaluating on Training Set:
Evaluating...
  Processed 12711 samples...
  Processed 25353 samples...
  Processed 37938 samples...
  Total samples: 49808
  Correct predictions: 42063
Training Set - Loss: 0.3717, Accuracy: 84.45%


In [None]:
# Columns read from the Parquet files: two feature columns and the label.
features = ['POS_vector', 'Patho_Vector']
target = 'ClinSigSimple'

# Evaluation data is streamed without shuffling the file order.
val_dataset = ParquetDataset(
    "test_data.parquet", features, target, batch_size=256, shuffle_files=False
)

# batch_size=None turns off the loader's own batching, because the dataset
# already yields complete (x, y) batches.
val_loader = DataLoader(val_dataset, batch_size=None, num_workers=0)

In [None]:
# Rebuild the architecture, then restore the checkpointed weights on CPU.
model = BinaryClassifier(input_dim=4112).to('cpu')
# weights_only=True limits unpickling to tensors and plain containers, which
# is all a state_dict needs. It avoids executing arbitrary pickled code if the
# checkpoint file was tampered with.
state_dict = torch.load('best_model.pth', map_location='cpu', weights_only=True)
model.load_state_dict(state_dict)

val_loss, val_acc = evaluate_model(model, val_loader, device='cpu')
print(f"Validation Accuracy: {val_acc:.2f}%")

Evaluating...
  Processed 12703 samples...
  Total samples: 12703
  Correct predictions: 9843
Validation Accuracy: 77.49%
