In [None]:
# Notebook setup: install the third-party packages this notebook relies on.
!pip install torch torchvision transformers
# NOTE(review): none of the visible cells import tensorflow or keras, so these
# two upgrades look unnecessary here -- confirm before removing.
!pip install --upgrade tensorflow
!pip install --upgrade keras
!pip install timm

In [None]:
# Mount Google Drive at /content/drive; the dataset path used later lives
# under that mount point.
# NOTE(review): the following cell repeats this same import and mount call,
# so this cell appears to be redundant.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.models import resnet50 # Import resnet50
from torchvision import datasets, transforms
from transformers import ViTConfig, ViTModel
from timm import create_model
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import time
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Configuration
BATCH_SIZE = 32
NUM_CLASSES = 2
EPOCHS = 30
LEARNING_RATE = 1e-4
DATA_DIR = "/content/drive/MyDrive/Pneumonia/"

# Data Transforms
# The Swin model is instantiated only so timm can report its preprocessing
# recipe (input size, interpolation, mean/std); it is never trained or run.
# Building it with pretrained=False yields the same data config without
# downloading weights that would be thrown away, and the instance is released
# immediately afterwards.
_preprocess_reference = create_model('swin_base_patch4_window7_224', pretrained=False)
config = resolve_data_config({}, model=_preprocess_reference)
del _preprocess_reference

# Both splits use the same pipeline; `is_training` is not passed, so no
# training-time augmentation is requested for the training split.
transform_train = create_transform(**config)
transform_val = create_transform(**config)

# Datasets
# os.path.join avoids the doubled slash that string formatting produced with
# the trailing "/" in DATA_DIR. The evaluation split is read from the "test"
# folder.
train_dataset = datasets.ImageFolder(root=os.path.join(DATA_DIR, 'train'), transform=transform_train)
val_dataset = datasets.ImageFolder(root=os.path.join(DATA_DIR, 'test'), transform=transform_val)

# Data Loaders
# Only the training loader shuffles; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Custom Model: ResNet50 + Vision Transformer
class ResNet50ViT(nn.Module):
    """Hybrid classifier: a ResNet50 feature map fed as tokens to a ViT encoder.

    The convolutional backbone turns an image batch into a spatial feature
    map with 2048 channels. Each spatial location becomes one token, learned
    position embeddings are added, the tokens pass through the transformer
    encoder stack, and the mean over tokens is projected to class logits.

    The ViT configuration assumes a 7x7 feature map (``image_size=7``,
    ``patch_size=1``), which is what ResNet50 produces for 224x224 inputs.

    Args:
        num_classes: Number of output logits produced by the final linear
            layer.
    """

    def __init__(self, num_classes=NUM_CLASSES):
        super().__init__()

        # ImageNet-pretrained ResNet50 with its last two children dropped
        # (the global average pool and the fully connected classifier), so
        # the module outputs the spatial feature map instead of logits.
        # `weights="IMAGENET1K_V1"` is the non-deprecated spelling of the
        # former `pretrained=True`.
        backbone = resnet50(weights="IMAGENET1K_V1")
        self.resnet = nn.Sequential(*list(backbone.children())[:-2])

        # Transformer sized to consume the backbone channels directly.
        self.vit_config = ViTConfig(
            hidden_size=2048,  # Matches the ResNet50 channel count
            num_attention_heads=8,
            num_hidden_layers=6,
            intermediate_size=3072,
            patch_size=1,
            image_size=7,  # Expected side length of the ResNet50 feature map
        )
        # Randomly initialised ViT; forward() only uses its position
        # embeddings and its encoder stack.
        self.transformer = ViTModel(self.vit_config)

        # Output classification head
        self.fc = nn.Linear(self.vit_config.hidden_size, num_classes)

    def forward(self, x):
        """Return class logits of shape ``[batch_size, num_classes]`` for ``x``.

        Args:
            x: Image batch accepted by ResNet50 (3-channel images).
        """
        # Backbone feature map: [batch_size, 2048, 7, 7] for 224x224 inputs.
        feature_map = self.resnet(x)

        # One token per spatial location: [batch_size, H*W, 2048].
        tokens = feature_map.flatten(2).permute(0, 2, 1)
        seq_len = tokens.size(1)

        # The addition is out-of-place, so the parameter can be used directly
        # without cloning it first.
        # NOTE(review): the slice starts at index 0 of the table; in the
        # Hugging Face ViT layout that first slot is presumably the [CLS]
        # position. The embeddings are trained from scratch here, so this
        # only matters if weights are ever shared with a standard ViT.
        position_table = self.transformer.embeddings.position_embeddings
        tokens = tokens + position_table[:, :seq_len, :]

        # Only the encoder stack is invoked; any other submodule of the
        # wrapped ViTModel is bypassed.
        encoded = self.transformer.encoder(tokens).last_hidden_state

        # Global average pooling over tokens, then the linear head.
        pooled_output = encoded.mean(dim=1)
        logits = self.fc(pooled_output)

        return logits

# Model
# Use the GPU when one is present and fall back to the CPU otherwise, so
# constructing the model does not raise on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ResNet50ViT(num_classes=NUM_CLASSES)
model = model.to(device)

# Loss and Optimizer
# Cross-entropy over the raw logits; Adam updates every model parameter with
# a single learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training and Validation Loop
# Per-epoch history, consumed by the plotting code further down.
train_losses, val_losses, train_accuracies, val_accuracies = [], [], [], []

# Send batches to whichever device the model already lives on instead of
# hard-coding CUDA.
run_device = next(model.parameters()).device

for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    running_loss, running_corrects = 0.0, 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(run_device), labels.to(run_device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so
        # the epoch figure is a true per-sample average even when the last
        # batch is smaller.
        running_loss += loss.item() * inputs.size(0)
        # Keep the counter a plain int so it never depends on a tensor
        # having been accumulated.
        running_corrects += (outputs.argmax(dim=1) == labels).sum().item()

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = running_corrects / len(train_loader.dataset)
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_acc)

    # ---- evaluation pass (no gradients, eval-mode layers) ----
    model.eval()
    val_running_loss, val_running_corrects = 0.0, 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(run_device), labels.to(run_device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_running_loss += loss.item() * inputs.size(0)
            val_running_corrects += (outputs.argmax(dim=1) == labels).sum().item()

    val_loss = val_running_loss / len(val_loader.dataset)
    val_acc = val_running_corrects / len(val_loader.dataset)
    val_losses.append(val_loss)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.4f}, Test Loss: {val_loss:.4f}, Test Acc: {val_acc:.4f}")

print("Training complete.")

# Per-epoch line curves: training history on the left panel, test-split
# history on the right. Loss and accuracy share one axis in each panel.
plt.figure(figsize=(12, 5))

curve_panels = (
    (1, train_losses, 'Train Loss', train_accuracies, 'Train Accuracy', 'Train Loss and Accuracy'),
    (2, val_losses, 'Test Loss', val_accuracies, 'Test Accuracy', 'Test Loss and Accuracy'),
)
for panel_index, loss_curve, loss_label, acc_curve, acc_label, panel_title in curve_panels:
    plt.subplot(1, 2, panel_index)
    plt.plot(loss_curve, label=loss_label)
    plt.plot(acc_curve, label=acc_label)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel('Value')
    plt.legend()

plt.tight_layout()
plt.show()

# Confusion Matrix
# Collect ground-truth and predicted class indices over the evaluation split.
y_true, y_pred = [], []

# Run on whichever device the model already lives on.
cm_device = next(model.parameters()).device

model.eval()
with torch.no_grad():
    for inputs, labels in val_loader:
        # Only the images are needed on the model's device; the labels are
        # consumed on the host, so they are not transferred.
        outputs = model(inputs.to(cm_device))
        y_true.extend(labels.tolist())
        y_pred.extend(outputs.argmax(dim=1).tolist())

# Take class names from the dataset that was actually scored, and pin the
# label set so the matrix always has one row/column per class name even if a
# class never appears in y_true or y_pred.
class_names = val_dataset.classes
cm = confusion_matrix(y_true, y_pred, labels=list(range(len(class_names))))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues)
plt.show()

# Test and Inference Time
# Final pass over the "test" folder: reports loss, accuracy and the mean
# forward-pass latency per batch.
test_dataset = datasets.ImageFolder(root=f'{DATA_DIR}/test', transform=transform_val)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Run on whichever device the model already lives on.
eval_device = next(model.parameters()).device
timing_on_gpu = eval_device.type == "cuda"

model.eval()
test_running_loss, test_running_corrects, inference_times = 0.0, 0, []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(eval_device), labels.to(eval_device)

        # CUDA kernels are launched asynchronously, so the clock must only
        # start once pending work (including the host-to-device copy) has
        # finished, and only stop once the forward pass has really completed.
        # Without these barriers the timer mostly measures launch overhead.
        if timing_on_gpu:
            torch.cuda.synchronize(eval_device)
        start_time = time.perf_counter()
        outputs = model(inputs)
        if timing_on_gpu:
            torch.cuda.synchronize(eval_device)
        inference_time = time.perf_counter() - start_time
        inference_times.append(inference_time)

        loss = criterion(outputs, labels)
        # Weight the batch-mean loss by batch size for a per-sample average.
        test_running_loss += loss.item() * inputs.size(0)
        test_running_corrects += (outputs.argmax(dim=1) == labels).sum().item()

test_loss = test_running_loss / len(test_loader.dataset)
test_acc = test_running_corrects / len(test_loader.dataset)
avg_inference_time = sum(inference_times) / len(inference_times)

print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")
print(f"Average Inference Time per Batch: {avg_inference_time:.4f} seconds")
