# Import the packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, sampler
from datetime import date
from PIL import Image
import imageio
import torch
from torchvision.transforms import RandomHorizontalFlip
from torchvision import transforms
from torch import nn
import torch.nn.functional as F
import time

# Description of the dataset

In [3]:
df = pd.read_csv('clinical_annotation.csv')
df['AGE'] = df['DOB'].apply(lambda x : date.today().year-int(x[-4:]))
df = df.drop(columns = ['Unnamed: 0','DOB'])
df.head(3)

Unnamed: 0,ID,LABEL,GENDER,LYMPH_COUNT,AGE
0,P26,1,M,11.2,88
1,P183,1,M,12.8,79
2,P89,1,M,9.6,86


In [4]:
train_df = df[df['LABEL']!=-1]
test_df = df[df['LABEL']==-1]

In [5]:
def characteristics_table(df):
    """Creates a DataFrame that summarizes the characteristics of the DataFrame df"""
    diagnoses = np.unique(df.LABEL.values)
    population_df = pd.DataFrame(index=diagnoses,
                                columns=['N', 'age', '%sexF', 'LYMPH_COUNT'])

    
    for label in population_df.index.values:
        diagnosis_df = df[df.LABEL == label]
        population_df.loc[label, 'N'] = len(diagnosis_df)
        # Age
        mean_age = np.mean(diagnosis_df.AGE)
        std_age = np.std(diagnosis_df.AGE)
        population_df.loc[label, 'age'] = '%.1f ± %.1f' % (mean_age, std_age)
        # Sex
        population_df.loc[label, '%sexF'] = round((len(diagnosis_df[diagnosis_df.GENDER == 'F']) / len(diagnosis_df)) * 100, 1)
        # Lymph count
        mean_MMS = np.mean(diagnosis_df.LYMPH_COUNT)
        std_MMS = np.std(diagnosis_df.LYMPH_COUNT)
        population_df.loc[label, 'LYMPH_COUNT'] = '%.1f ± %.1f' % (mean_MMS, std_MMS)

    return population_df

print("Train")
population_train_df = characteristics_table(train_df)
print(population_train_df)
print("Test")
population_test_df = characteristics_table(test_df)
print(population_test_df)

Train
     N          age %sexF  LYMPH_COUNT
0   50  55.3 ± 19.6  52.0    5.0 ± 1.0
1  113  76.2 ± 12.0  47.8  35.9 ± 53.3
Test
     N          age %sexF  LYMPH_COUNT
-1  42  66.7 ± 19.5  42.9  24.4 ± 43.6


**The train and test set are balanced. Now we must create validation sets.\
A high lymphocyte count is strongly correlated with cancer.**

In [6]:
split_df = train_df
msk = np.random.rand(len(train_df)) < 0.75
train_df = split_df[msk]
val_df = split_df[~msk]
print("Train")
population_train_df = characteristics_table(train_df)
print(population_train_df)
print("Validation")
population_val_df = characteristics_table(val_df)
print(population_val_df)

Train
    N          age %sexF  LYMPH_COUNT
0  39  53.5 ± 19.4  53.8    5.0 ± 1.0
1  85  75.7 ± 12.4  42.4  34.1 ± 47.7
Validation
    N          age %sexF  LYMPH_COUNT
0  11  61.5 ± 19.1  45.5    5.0 ± 1.0
1  28  77.8 ± 10.7  64.3  41.3 ± 67.3


# Utils

In [18]:
class Lympho_Dataset(Dataset):
    
    def __init__(self, path_images, df, transform = None):
        """
        Args:
            path_images: (str) path to the images origin directory.
            data_df: (DataFrame) list of subjects used.
            transform: Optional, transformations applied to the tensor
        """
        self.path_images = path_images
        self.df = df
        self.transform = transform
        self.list_patients = df['ID'].tolist()
        self.lymph_count = df['LYMPH_COUNT'].tolist()
        self.age = df['AGE'].tolist()
        self.labels = df['LABEL'].tolist()
        self.img_dict = {idx : {'label' : self.labels[idx],
                           'age' : self.age[idx],
                           'lymph_count' : self.lymph_count[idx],
                            'patient' : self.list_patients[idx],
                           'images_path' : [path_images + '/' + patient + '/' + img_path for img_path in os.listdir(path_images + '/' + patient)]} for idx,patient in enumerate(self.list_patients)}
        
        
    def __len__(self):
        return len(self.df)
    
    
    def load_image(self, image_path):
        """Generate an image from the specs of the given image ID.
        Typically this function loads the image from a file, but
        in this case it generates the image on the fly.
        """
#         image = imageio.imread(image_path).astype(np.uint8)[...,None]
        image = Image.open(image_path)
        # Stack image on itself 3 times to simulate RGB image (3 channels required for model's input)  
        return image
    
    
    def __getitem__(self,idx):
        """
        Args:
            idx: (int) the index of the subject whom data is loaded.
        Returns:
            sample: (dict) corresponding data described by the following keys:
                image: (Tensor) Images of the patient's blood cells image in a tensor
                label: (int) the diagnosis code (0 for reactive or 1 for cancerous)
                participant_id: (str) ID of the participant 
                lymph_count : (int) Lymphocyte concentration in patient's blood
                age : (int) Patient's age
        """
        images = [self.transform(self.load_image(image)) for image in self.img_dict[idx]['images_path']]
        images = [transforms.ToTensor()(image).unsqueeze_(0) for image in images]
        images = torch.cat(images,axis=0)
        
        age = torch.Tensor([self.img_dict[idx]['age']])
        lymph_count = torch.Tensor([self.img_dict[idx]['lymph_count']])
        patient = self.img_dict[idx]['patient']
        label = torch.Tensor([self.img_dict[idx]['label']])
        
        sample = {'images' : images,
                  'lymph_count' : lymph_count,
                  'patient' : patient,
                  'label' : label,
                  'age' : age}
        return sample
        
        

In [8]:
path_images = r'C:\Users\Hugo\Desktop\MVA\S2\DLMI\Kaggle\trainset'

In [23]:
train_data = Lympho_Dataset(path_images, train_df, transform = RandomHorizontalFlip(p=0.5))

In [24]:
train_data.__getitem__(0)['images'].size()

torch.Size([100, 3, 224, 224])

# Model

##### All images are of shape (224,224,3)

In [46]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 8, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(8, 16, 3)
        self.conv3 = nn.Conv2d(16, 32, 3)
        self.fc1 = nn.Linear(21632, 200)
        self.fc2 = nn.Linear(200, 100)
        self.fc3 = nn.Linear(100, 1)
        
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(-1, 21632)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x
    
    def get_outputs(self,images):
        outputs = []
        for i in range(images.size()[0]):
            image = images[i][None,:,:,:]
            output = self.forward(image)
            outputs.append(output)
#         print(torch.stack(outputs).size())
        mean = torch.mean(torch.stack(outputs),axis=0)
        return mean
        

# Preprocessing

In [26]:
train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=1, shuffle=True, num_workers=0)

In [27]:
def custom_fn(batch):
    elem = batch[0].keys()
    return {key : torch.cat(gen_list(key, batch), axis=0) if key != 'name' else gen_list(key, batch) for key in elem}

In [15]:
start = time.time()
for idx, batch in enumerate(train_loader):
    test = batch
print(f"Time for loading all the train images : {round(time.time() - start, 4)} seconds")

Time for loading all the train images : 62.9752 seconds


In [42]:
tensor_input = train_data.__getitem__(0)['images'][0]

In [52]:
batch['images']

tensor([[[[[0.9647, 0.9882, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
           [0.9765, 0.9882, 1.0000,  ..., 1.0000, 1.0000, 0.9961],
           [0.9804, 0.9882, 1.0000,  ..., 1.0000, 1.0000, 0.9961],
           ...,
           [0.9882, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9882],
           [0.9882, 0.9882, 0.9882,  ..., 0.9961, 0.9922, 0.9882],
           [0.9804, 0.9882, 0.9882,  ..., 0.9961, 0.9922, 0.9882]],

          [[0.9059, 0.8941, 0.8784,  ..., 0.8980, 0.9020, 0.8980],
           [0.9059, 0.8863, 0.8824,  ..., 0.8980, 0.9020, 0.8941],
           [0.8980, 0.8863, 0.8863,  ..., 0.8980, 0.9020, 0.8941],
           ...,
           [0.8941, 0.8980, 0.8980,  ..., 0.8980, 0.8980, 0.8941],
           [0.8980, 0.8980, 0.8941,  ..., 0.9020, 0.8980, 0.8941],
           [0.8941, 0.8980, 0.8941,  ..., 0.9020, 0.8980, 0.8941]],

          [[0.7529, 0.7922, 0.8118,  ..., 0.7922, 0.7961, 0.7922],
           [0.7647, 0.7804, 0.8039,  ..., 0.7922, 0.7961, 0.7882],
           [0.7765, 0.7804

# Training

In [None]:
def test(model, data_loader, criterion):
    """
    Method used to test a CNN
    
    Args:
        model: (nn.Module) the neural network
        data_loader: (DataLoader) a DataLoader wrapping a MRIDataset
        criterion: (nn.Module) a method to compute the loss of a mini-batch of images
    
    Returns:
        results_df: (DataFrame) the label predicted for every subject
        results_metrics: (dict) a set of metrics
    """
    model.eval()
    data_loader.dataset.eval()
    columns = ["participant_id", "proba0", "proba1",
               "true_label", "predicted_label"]
    results_df = pd.DataFrame(columns=columns)
    total_loss = 0
    
    with torch.no_grad():
        for i, data in enumerate(data_loader, 0):
            images, labels = data['image'].cuda(), data['label'].cuda()
            outputs = model(images)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            probs = nn.Softmax(dim=1)(outputs)
            _, predicted = torch.max(outputs.data, 1)

            for idx, sub in enumerate(data['participant_id']):
                row = [sub,
                       probs[idx, 0].item(), probs[idx, 1].item(),
                       labels[idx].item(), predicted[idx].item()]
                row_df = pd.DataFrame([row], columns=columns)
                results_df = pd.concat([results_df, row_df])

    results_metrics = compute_metrics(results_df.true_label.values, results_df.predicted_label.values)
    results_df.reset_index(inplace=True, drop=True)
    results_metrics['mean_loss'] = total_loss / len(data_loader.dataset)
    
    return results_df, results_metrics


def compute_metrics(ground_truth, prediction):
    """Computes the accuracy, sensitivity, specificity and balanced accuracy"""
    tp = np.sum((prediction == 1) & (ground_truth == 1))
    tn = np.sum((prediction == 0) & (ground_truth == 0))
    fp = np.sum((prediction == 1) & (ground_truth == 0))
    fn = np.sum((prediction == 0) & (ground_truth == 1))
    
    metrics_dict = dict()
    metrics_dict['accuracy'] = (tp + tn) / (tp + tn + fp + fn)
    
    # Sensitivity
    if tp + fn != 0:
        metrics_dict['sensitivity'] = tp / (tp + fn)
    else:
        metrics_dict['sensitivity'] = 0.0
        
    # Specificity
    if fp + tn != 0:
        metrics_dict['specificity'] = tn / (fp + tn)
    else:
        metrics_dict['specificity'] = 0.0
        
    metrics_dict['balanced_accuracy'] = (metrics_dict['sensitivity'] + metrics_dict['specificity']) / 2
    
    return metrics_dict

In [60]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), 1e-4)
net = Net()
num_epochs = 3

In [62]:
loss_track = []

for epoch in range(num_epochs):
    net.train()
    
    for idx, batch in enumerate(train_loader):
        image_tensor = batch['images']
        output = net.get_outputs(tensor_input)
        loss = criterion(output,batch['label'])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loss_track.append(loss.item())
        print('done')
    

done
done


KeyboardInterrupt: 