# Exploratory Analysis - Draft Version 

## Package Import

In [None]:
# Import Packages
from tqdm import tqdm

import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
import torch
import sklearn 
import itertools 
import os 
import random
from PIL import Image

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif as MI, SelectPercentile
from sklearn.metrics import accuracy_score, log_loss, f1_score, roc_auc_score

import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision.transforms as transforms

import medmnist
from medmnist.info import INFO, HOMEPAGE, DEFAULT_ROOT

try:
    from collections.abc import Sequence
except ImportError:
    from collections import Sequence

## Functions for Analysis 

### Setup 

In [None]:
# Module-level registries that the helper functions in this notebook fill in
datasets = dict()       # dataset name -> dataset object (MedMNIST splits, PCA outputs)
class_number = dict()   # "<flag>_<size>" -> number of label classes
features = dict()       # "<dataset name>_features" -> flattened image tensor
labels = dict()         # "<dataset name>_labels" -> flattened label tensor

### Function to name datasets 

In [None]:
def dataset_namer(input_name, suffix, size=''):
    """Build an underscore-joined dataset name.

    Args:
        input_name: Leading part of the name (e.g. a data flag or an
            existing dataset key).
        suffix: Second part of the name (e.g. a split, or a role such as
            "features").
        size: Optional trailing part. It is left out of the result when it
            is the empty string.

    Returns:
        "<input_name>_<suffix>_<size>" when size is given, otherwise
        "<input_name>_<suffix>".
    """
    # Compare against '' explicitly (not truthiness) so that a size of 0 is
    # still appended to the name.
    if size != '':
        return f"{input_name}_{suffix}_{size}"
    return f"{input_name}_{suffix}"

### Function to generate the MedMNIST datasets

In [None]:
def medmnist_generator(data_flag, split, size):
    """Load one MedMNIST split at one resolution and register it.

    Side effects:
        * ``class_number["<data_flag>_<size>"]`` is set to the number of
          label classes listed in ``INFO[data_flag]``.
        * ``datasets["<data_flag>_<split>_<size>"]`` is set to the dataset
          object.
        * A module-level global with that same name is bound to the dataset
          object, so it can be referenced directly in later cells.

    Args:
        data_flag: Key into ``medmnist.info.INFO`` (e.g. 'pathmnist').
        split: Split name forwarded to the dataset class.
        size: Image size; converted with ``int()`` before being forwarded.
    """
    # Taken from MedMNIST v2 GitHub
    info = INFO[data_flag]
    n_classes = len(info['label'])
    DataClass = getattr(medmnist, info['python_class'])

    # Preprocessing: convert to tensor, then normalise with mean 0.5 / std 0.5
    data_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[.5], std=[.5])
        ])

    # Record the number of classes per (dataset, size) pair.
    class_number[dataset_namer(data_flag, size)] = n_classes

    # Use of dataset_namer to build the key the dataset is stored under
    ds_name = dataset_namer(data_flag, split, size)

    # download=True is passed through to the dataset class
    value = DataClass(split=split, size=int(size), transform=data_transform, download=True)
    datasets[ds_name] = value

    # Also expose the dataset as a module-level variable named after its key
    globals()[ds_name] = value

### Function to retrieve variable name as a string

In [None]:
def get_var_name(input_var):
    """Return the name of the first module-level variable bound to *input_var*.

    The match is by identity (``is``), not equality. Returns None when no
    module-level name refers to the object.
    """
    return next(
        (name for name, candidate in globals().items() if candidate is input_var),
        None,
    )

### Function to extract features and labels from MedMNIST image data

In [None]:
# Extracts Features and Labels
def features_labels(key, value):
    """Flatten a dataset's images and labels into tensors and register them.

    ``value.imgs`` is reshaped to one row per image and ``value.labels`` is
    flattened to 1-D; both are converted with ``torch.from_numpy`` (so both
    are expected to be NumPy arrays). The results are stored under
    "<key>_features" / "<key>_labels" both as module-level globals and in the
    ``features`` / ``labels`` dictionaries.

    Args:
        key: Name of the dataset; used as the prefix of the output names.
        value: Dataset object exposing ``imgs`` and ``labels``.
    """
    # Features: keep the first axis, collapse everything else into one axis
    images = value.imgs
    flat_images = torch.from_numpy(images.reshape(images.shape[0], -1))

    # Labels: flatten to a 1-D vector
    targets = torch.from_numpy(np.ravel(value.labels))

    # Names the two tensors are published under
    f_name = dataset_namer(key, "features", '')
    l_name = dataset_namer(key, "labels", '')

    # Publish as module-level variables ...
    globals()[f_name] = flat_images
    globals()[l_name] = targets

    # ... and in the shared registries
    features[f_name] = flat_images
    labels[l_name] = targets

### Function to split dictionaries by specified split

In [None]:
# Function to split dictionaries
def dict_split(dictionary, split):
    """Publish the entries of *dictionary* whose key contains *split*.

    The selected entries are collected into a new dict, which is bound to a
    module-level global named "<split>_<name of dictionary>", where the
    dictionary's name is found with ``get_var_name``. The global is always
    (re)bound, even when nothing matches, so a previous result never lingers
    under that name.

    Args:
        dictionary: Mapping with string keys. It should itself be a
            module-level variable; otherwise ``get_var_name`` returns None
            and the published name ends in "_None".
        split: Substring to look for in each key (e.g. 'train').
    """
    new_dict = {key: value for key, value in dictionary.items() if split in key}

    # The target name does not depend on the entries, so resolve it once
    # rather than re-scanning globals for every matching key.
    name = dataset_namer(split, get_var_name(dictionary))
    globals()[name] = new_dict

### Function to transform data into dataloader form for deep learning 

In [None]:
# Function to transform data into dataloader form for deep learning
def data_loader(name, batch_size):
    """Wrap a registered dataset in a DataLoader published as "<name>_loader".

    The dataset object is looked up by *name* in the ``datasets`` registry,
    falling back to a module-level global of that name. The loader is bound
    to a module-level global named "<name>_loader". Shuffling is enabled only
    when the loader name contains 'train'.

    Args:
        name: Key of the dataset to wrap.
        batch_size: Batch size passed to the DataLoader.

    Raises:
        KeyError: If *name* is neither in ``datasets`` nor a module-level
            global.
    """
    loader_name = dataset_namer(name, "loader", '')

    # Resolve the dataset object itself: the DataLoader must receive the
    # dataset, not the string that names it.
    dataset = datasets[name] if name in datasets else globals()[name]

    globals()[loader_name] = data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle='train' in loader_name,
    )

### Function for principal component analysis (linear and non-linear kernels)

In [None]:
# PCA/Kernel PCA
def pca(data, normalise='Yes', kernel='No', kernel_type='linear'):
    """Run PCA or kernel PCA on *data* and register the projected components.

    The components are stored as a DataFrame with columns pc0, pc1, ... in
    the ``datasets`` registry and as a module-level global. The name is the
    variable name of *data* (found with ``get_var_name``) followed by one of
    "_pca", "_normalised_pca", "_kernel_pca" or "_normalised_kernel_pca".

    Args:
        data: 2-D data to decompose. It should be bound to a module-level
            variable, because the output name is derived from that
            variable's name (otherwise the prefix becomes "None").
        normalise: 'Yes' to standardise the columns with StandardScaler
            before the decomposition, 'No' to use the data as given.
        kernel: 'No' for ``PCA``, 'Yes' for ``KernelPCA``.
        kernel_type: Kernel forwarded to ``KernelPCA``; unused when
            kernel is 'No'.

    Returns:
        None. On an invalid *kernel* or *normalise* value an error message
        is printed and nothing is computed or registered.
    """
    # Validate the options up front so an invalid value never reaches the
    # fitting code with a missing output name.
    if kernel not in ('Yes', 'No'):
        print("ERROR: Invalid input to kernel parameter. Please choose 'Yes' or 'No'.")
        return
    if normalise not in ('Yes', 'No'):
        print("ERROR: Invalid input to normalise parameter. Please choose 'Yes' or 'No'.")
        return

    # Resolve the name before `data` is rebound below: the lookup is by
    # object identity against module-level variables.
    ds_name = get_var_name(data)

    suffix = "kernel_pca" if kernel == 'Yes' else "pca"
    if normalise == 'Yes':
        suffix = f"normalised_{suffix}"
        scaled = StandardScaler().fit_transform(data)
        feature_cols = [f"feature{i}" for i in range(scaled.shape[1])]
        data = pd.DataFrame(scaled, columns=feature_cols)
    name = dataset_namer(ds_name, suffix, '')

    model = KernelPCA(kernel=kernel_type) if kernel == 'Yes' else PCA()
    components = model.fit_transform(data)
    pc_cols = [f"pc{i}" for i in range(components.shape[1])]

    value = pd.DataFrame(data=components, columns=pc_cols)
    datasets[name] = value
    globals()[name] = value

## Generating the Data 

### Specifying Function Inputs

In [None]:
# Inputs for dataset generation: MedMNIST data flags, split names, image sizes
data_flag = (
    'pathmnist',
    'dermamnist',
    'breastmnist',
)
split = ('train', 'test', 'val')
size = (28, 64, 128, 224)

### Generating Datasets

In [None]:
# Generate every (data flag, split, size) combination; flags are visited in
# alphabetical order
for a, b, c in itertools.product(sorted(data_flag), split, size):
    medmnist_generator(data_flag=a, split=b, size=c)

### Show dataset information

In [None]:
# Show information for PathMNIST (training split, every size)
print(
    pathmnist_train_28,
    pathmnist_train_64,
    pathmnist_train_128,
    pathmnist_train_224,
    sep="\n",
)

# Show information for DermaMNIST (training split, every size)
print(
    dermamnist_train_28,
    dermamnist_train_64,
    dermamnist_train_128,
    dermamnist_train_224,
    sep="\n",
)

# Show information for BreastMNIST (training split, every size)
print(
    breastmnist_train_28,
    breastmnist_train_64,
    breastmnist_train_128,
    breastmnist_train_224,
    sep="\n",
)

### Generate samples of data

In [None]:
# Generate 7x7 grid (49 samples) original low resolution images
##breastmnist_train_28.montage(length=7).save("breastmnist_lowres_sample.jpeg")
##pathmnist_train_28.montage(length=7).save("pathmnist_lowres_sample.jpeg")
##dermamnist_train_28.montage(length=7).save("dermamnist_lowres_sample.jpeg")
 
# Generate 7x7 grid (49 samples) highest resolution images
##breastmnist_train_224.montage(length=7).save("breastmnist_highres_sample.jpeg")
##pathmnist_train_224.montage(length=7).save("pathmnist_highres_sample.jpeg")
##dermamnist_train_224.montage(length=7).save("dermamnist_highres_sample.jpeg")

# Resolution Comparison 
##dermamnist_train_28.montage(length=7).save("res_comp1.jpeg")
##dermamnist_train_64.montage(length=7).save("res_comp2.jpeg")
##dermamnist_train_128.montage(length=7).save("res_comp3.jpeg")
##dermamnist_train_224.montage(length=7).save("res_comp4.jpeg")

### Extract Features and Labels from each dataset

In [None]:
# Derive the feature and label tensors for every registered dataset
for key, value in datasets.items():
    features_labels(key=key, value=value)

### Split features and labels by train/test/val

In [None]:
# Group the feature and label registries by split name (train/test/val)
for i in split:
    dict_split(dictionary=features, split=i)
    dict_split(dictionary=labels, split=i)

## Preprocessing 

### Quantities needed

In [None]:
# Quantities used by the pre-processing / deep-learning cells
NUM_EPOCHS = 3      # presumably the number of training epochs; not used in this section
BATCH_SIZE = 128    # batch size handed to data_loader
lr = 1e-3           # presumably the optimiser learning rate; not used in this section

## Deep Learning 

### Transform data into dataloader form for deep learning 

In [None]:
# Create a "<key>_loader" for every registered dataset
for key in datasets:
    data_loader(name=key, batch_size=BATCH_SIZE)