In [9]:
import os
import random
from pathlib import Path
from PIL import Image
import pandas as pd

# Data Cleaning

### A. Get Paths

In [10]:
# Root folder of the project data and the X-ray image collection inside it
data = Path("Data/")
image_path = data.joinpath("XRAY_DATA")

# Each split lives in its own sub-directory of the image collection
train_dir = image_path.joinpath("train")
test_dir = image_path.joinpath("test")

In [11]:
# Glob patterns for the supported image types, one directory level below a
# split folder (i.e. <split>/<sub-folder>/<file>).
IMAGE_PATTERNS = ("*/*.png", "*/*.jpeg", "*/*.jpg")


def list_image_paths(split_dir):
    """Return every image path found in the sub-folders of ``split_dir``.

    Args:
        split_dir: ``pathlib.Path`` of a split directory (train or test).

    Returns:
        A list of ``Path`` objects, grouped by pattern in the order given by
        ``IMAGE_PATTERNS`` (png, then jpeg, then jpg).
    """
    return [path
            for pattern in IMAGE_PATTERNS
            for path in split_dir.glob(pattern)]


# Get all image paths.
# The train split previously used "*.jpg" (no sub-folder level) for its last
# pattern, unlike every other pattern here; both splits now share one list.
train_image_paths = list_image_paths(train_dir)
test_image_paths = list_image_paths(test_dir)

### B. Get Data Labels

In [12]:
# Read the image metadata (source of the labels) and discard, in a single
# chained call, the columns that are not needed
df = pd.read_csv(data / "Metadata.csv").drop(
    columns=['Unnamed: 0', 'Label_2_Virus_category', 'Label_1_Virus_category']
)

### C. Sort Data into Classes based on labels

In [13]:
# Labels that get their own class folder.
# NOTE(review): 'Pnemonia' is kept exactly as written; presumably it matches
# the spelling used in the Label column of Metadata.csv — confirm.
CLASS_LABELS = ('Normal', 'Pnemonia')


def sort_images_by_label(image_paths, split_dir, label_by_name):
    """Move images into ``split_dir/<label>/`` and delete unlisted files.

    Args:
        image_paths: Iterable of ``pathlib.Path`` objects to process.
        split_dir: Split directory (train or test) that holds the class folders.
        label_by_name: Mapping of image file name -> label.

    Behaviour per file:
        * name missing from ``label_by_name`` -> the file is deleted;
        * label in ``CLASS_LABELS`` -> the file is moved to its class folder;
        * any other label -> the file is left where it is.
    """
    for src_path in image_paths:
        # The path list is built before this cell runs, so on a re-run it may
        # point at files that were already moved or deleted; skip those.
        if not src_path.exists():
            continue

        image_name = src_path.name
        if image_name not in label_by_name:
            os.remove(src_path)
            continue

        label = label_by_name[image_name]
        if label not in CLASS_LABELS:
            continue

        dest_dir = split_dir / label
        dest_path = dest_dir / image_name
        if dest_path == src_path:
            continue  # already in the right folder
        # Create the class folder lazily so os.rename cannot fail on a
        # missing directory and no empty class folder is left behind.
        dest_dir.mkdir(parents=True, exist_ok=True)
        os.rename(src_path, dest_path)


# One name -> label lookup instead of scanning the DataFrame for every file.
# drop_duplicates keeps the first row per image name, matching `.values[0]`.
label_by_name = (df.drop_duplicates('X_ray_image_name')
                   .set_index('X_ray_image_name')['Label']
                   .to_dict())

# Sort image into folders by label and delete files not in the dataframe
sort_images_by_label(train_image_paths, train_dir, label_by_name)
sort_images_by_label(test_image_paths, test_dir, label_by_name)

# Transform and Load Data with Torchvision

In [14]:
import torch
from torch import nn
from torchvision import models, transforms, datasets
from torch.utils.data import DataLoader

In [31]:
# Preprocessing pipeline: bring every image to 512x512, then convert to a tensor
img_transforms = transforms.Compose(
    [transforms.Resize([512, 512]), transforms.ToTensor()]
)

In [32]:
# Build one ImageFolder dataset per split; both use the same transform
# pipeline and no target transform
train_data, test_data = (
    datasets.ImageFolder(root=split_root, transform=img_transforms)
    for split_root in (train_dir, test_dir)
)

In [33]:
# Batching configuration shared by both loaders
BATCH_SIZE = 32
NUM_WORKERS = 4

# Training batches are reshuffled; test batches keep the dataset order
train_dataloader = DataLoader(
    train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS
)
test_dataloader = DataLoader(
    test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS
)

# Create CNN Model

In [34]:
from sklearn.svm import SVC

In [37]:
class CNN_model(nn.Module):
    """Convolutional feature extractor paired with an SVM classifier.

    Three blocks of (Conv2d 3x3 -> Conv2d 3x3 -> MaxPool2d 2) turn an image
    batch into feature maps. ``forward`` returns those maps flattened to one
    vector per sample. The scikit-learn ``SVC`` stored in ``self.classifier``
    is not a torch module and cannot be called inside ``forward``; it is
    fitted and queried on the extracted features through ``fit_classifier``
    and ``predict``.

    Args:
        input_shape: Number of channels of the input images.
        hidden_dim: Number of channels used by the intermediate convolutions.
        output_shape: Number of channels produced by the last convolution.
    """

    def __init__(self, input_shape, hidden_dim, output_shape):
        super().__init__()
        # Only the first block sees the raw image channels; the later blocks
        # consume the hidden_dim channels emitted by the block before them.
        self.conv_block_1 = self._conv_block(input_shape, hidden_dim, hidden_dim)
        self.conv_block_2 = self._conv_block(hidden_dim, hidden_dim, hidden_dim)
        self.conv_block_3 = self._conv_block(hidden_dim, hidden_dim, output_shape)
        # NOTE(review): there is no activation between the convolutions, so
        # max-pooling is the only non-linearity; consider adding nn.ReLU.
        self.classifier = SVC(C=1, kernel='rbf', gamma='auto')

    @staticmethod
    def _conv_block(in_channels, mid_channels, out_channels):
        """Build one Conv2d -> Conv2d -> MaxPool2d block (3x3, stride 1, no padding)."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_channels,
                      out_channels=mid_channels,
                      kernel_size=3,
                      stride=1,
                      padding=0),
            nn.Conv2d(in_channels=mid_channels,
                      out_channels=out_channels,
                      kernel_size=3,
                      stride=1,
                      padding=0),
            nn.MaxPool2d(kernel_size=2)
        )

    def forward(self, x):
        """Return the flattened convolutional features, one row per sample."""
        feature_maps = self.conv_block_3(self.conv_block_2(self.conv_block_1(x)))
        return torch.flatten(feature_maps, start_dim=1)

    def _features_as_numpy(self, x):
        """Run the convolutional stack without gradients and return a NumPy array."""
        with torch.no_grad():
            return self.forward(x).cpu().numpy()

    def fit_classifier(self, x, y):
        """Fit the SVM on features extracted from ``x`` with labels ``y``.

        Returns:
            ``self``, so calls can be chained.
        """
        labels = torch.as_tensor(y).detach().cpu().numpy()
        self.classifier.fit(self._features_as_numpy(x), labels)
        return self

    def predict(self, x):
        """Return the SVM's predicted labels for the batch ``x`` as a tensor."""
        return torch.as_tensor(self.classifier.predict(self._features_as_numpy(x)))


In [38]:
# Build the network: 3 input channels, 10 hidden channels, and as many
# output channels as the training dataset has classes
model = CNN_model(
    input_shape=3,
    hidden_dim=10,
    output_shape=len(train_data.classes),
)