In [1]:
import os
import random
from pathlib import Path
from PIL import Image
import pandas as pd

# Data Cleaning

### A. Get Paths

In [2]:
# Find Data
data = Path("Data/")
image_path = data / "XRAY_DATA"

# Train/Test Directory
train_dir = image_path / "train"
test_dir = image_path / "test"

In [3]:
# Get all image paths
train_image_paths = (list(train_dir.glob("*/*.png")) + 
                     list(train_dir.glob("*/*.jpeg")) + 
                     list(train_dir.glob("*.jpg")))

test_image_paths = (list(test_dir.glob("*/*.png")) +
                    list(test_dir.glob("*/*.jpeg")) +
                    list(test_dir.glob("*/*.jpg")))

### B. Get Data Labels

In [4]:
# Load Image Meta Data For Labels
df = pd.read_csv(data / "Metadata.csv")

# Remove unneeded columns
df.drop('Unnamed: 0', axis=1, inplace=True)
df.drop('Label_2_Virus_category', axis=1, inplace=True)
df.drop('Label_1_Virus_category', axis=1, inplace=True)

### C. Sort Data into Classes based on labels

In [5]:
# Sort image into folders by label and delete files not in the dataframe
for image_path in train_image_paths:
    image_name = image_path.name
    if image_name not in df['X_ray_image_name'].values:
        os.remove(image_path)
    else:
        label = df[df['X_ray_image_name'] == image_name]['Label'].values[0]
        if label == 'Normal':
            os.rename(image_path, train_dir / "Normal" / image_name)
        elif label == 'Pnemonia':
            os.rename(image_path, train_dir / "Pnemonia" / image_name)

for image_path in test_image_paths:
    image_name = image_path.name
    if image_name not in df['X_ray_image_name'].values:
        os.remove(image_path)
    else:
        label = df[df['X_ray_image_name'] == image_name]['Label'].values[0]
        if label == 'Normal':
            os.rename(image_path, test_dir / "Normal" / image_name)
        elif label == 'Pnemonia':
            os.rename(image_path, test_dir / "Pnemonia" / image_name)

# Transform and Load Data with Torchvision

In [6]:
import torch
from torch import nn
from torchvision import models, transforms, datasets
from torch.utils.data import DataLoader

In [7]:
# Make all of the images the same size
img_transforms = transforms.Compose([
    transforms.Resize([512, 512]),
    transforms.ToTensor()
])

In [8]:
# Load Data
train_data = datasets.ImageFolder(root = train_dir,
                                  transform=img_transforms,
                                  target_transform=None)

test_data = datasets.ImageFolder(root= test_dir,
                                 transform=img_transforms)

In [9]:
# Turn into dataloaders
BATCH_SIZE = 32
NUM_WORKERS = 4

train_dataloader = DataLoader(dataset = train_data,
                              batch_size=BATCH_SIZE,
                              shuffle = True,
                              num_workers= NUM_WORKERS)

test_dataloader = DataLoader(dataset=test_data,
                             batch_size=BATCH_SIZE,
                             shuffle=False,
                             num_workers=NUM_WORKERS)

# Create CNN Model

In [21]:
class CNN_model(nn.Module):
    def __init__(self, input_shape, hidden_dim, output_shape):
        super().__init__()
        self.conv_block_1 = nn.Sequential(
            nn.Conv2d(in_channels=input_shape,
                      out_channels=hidden_dim,
                      kernel_size= 7,
                      stride = 2,
                      padding = 0),
            nn.ReLU(),
            nn.Conv2d(in_channels=hidden_dim,
                      out_channels=hidden_dim,
                      kernel_size= 7,
                      stride = 2,
                      padding = 0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2,
                         stride = 2)
        )
        self.conv_block_2 = nn.Sequential(
            nn.Conv2d(in_channels=hidden_dim,
                      out_channels=hidden_dim,
                      kernel_size= 3,
                      stride = 1,
                      padding = 0),
            nn.ReLU(),
            nn.Conv2d(in_channels=hidden_dim,
                      out_channels=hidden_dim,
                      kernel_size= 3,
                      stride = 1,
                      padding = 0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2,
                         stride = 2)
        )
        self.conv_block_3 = nn.Sequential(
            nn.Conv2d(in_channels=hidden_dim,
                      out_channels=hidden_dim,
                      kernel_size= 3,
                      stride = 1,
                      padding = 0),
            nn.ReLU(),
            nn.Conv2d(in_channels=hidden_dim,
                      out_channels=hidden_dim,
                      kernel_size= 3,
                      stride = 1,
                      padding = 0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2,
                         stride = 2)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=hidden_dim*12*12,
                      out_features=output_shape)
        )

    def forward(self, x):
        x = self.conv_block_1(x)
        # print(x.shape)
        x = self.conv_block_2(x)
        # print(x.shape)
        x = self.conv_block_3(x)
        # print(x.shape)
        x = self.classifier(x)
        return x

In [22]:
# Initialize model
model = CNN_model(input_shape=3,
                  hidden_dim=10,
                  output_shape=len(train_data.classes))

In [23]:
# Create train_step()

def train_step(model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer
               ):
    
    # Put the model in train mode
    model.train()

    # Setup train loss and train accuracy value
    train_loss, train_acc = 0, 0

    # Loop through data loader data batches
    for batch, (X, y) in enumerate(dataloader):

        # 1. Forward Pass
        y_pred = model(X)

        # 2. Calculate the loss
        loss = loss_fn(y_pred, y)
        train_loss += loss.item()

        # 3. Optimizer zero grad
        optimizer.zero_grad()

        # 4. Loss Backward
        loss.backward()

        # 5. optimizer step
        optimizer.step()

        # Calculate accuracy metric
        y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
        train_acc += (y_pred_class==y).sum().item()/len(y_pred) 

    # Adjust metrics to get average loss and accuracy per batch
    train_loss /= len(dataloader)
    train_acc /= len(dataloader)
    return train_loss, train_acc

In [30]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(),
                             lr = 0.001)

In [32]:
for epochs in range(20):
    train_loss, train_acc = train_step(model = model,
                                       dataloader = test_dataloader,
                                       loss_fn= loss_fn,
                                       optimizer= optimizer)
    print(train_loss, train_acc)


0.6349715375237994 0.6822916666666666
0.6347053796052933 0.6822916666666666
0.6344438311126497 0.6822916666666666
0.6341877265108956 0.6822916666666666
0.6339374631643295 0.6822916666666666
0.6336934069792429 0.6822916666666666
0.6334555165635215 0.6822916666666666
0.6332237985399034 0.6822916666666666


KeyboardInterrupt: 