# Capstone Project: Modeling - Transfer Learning Embeddings

For this project, in order to properly leverage convolutional neural networks with a small dataset, we will be using transfer learning. Essentially we will be using multiple models in sequence. First instead of a training a CNN on our dataset, we will use a pretrained CNN (RESNET50) and transfer it's domain knowledge by using it as a feature extractor. We accomplish this by removing the last softmax layer and use the 2048 size vector from RESNET50's convolutions as features for a traditional ML model. This gives us the ability to leverage a CNN trained on millions of images and repurpose it for another task.

In [19]:
import warnings

warnings.filterwarnings('ignore')

In [135]:
import pandas as pd
import numpy as np
import os

import json
import random

import cv2
from PIL import Image

# import tensorflow as tf
import torch
import torch.nn as nn
# pretrained resnet
from torchvision.models import resnet50, ResNet50_Weights


In [2]:
ext_path = os.path.join('data', 'extracted')
save_path = os.path.join('data', 'augmentation')
random.seed(42)

Using a pretrained encoder, in this case RESNET50 may yield better results rather than training the model from scratch. One reason being the small number of samples (approximately 3800 + dataset augmentation), particularly with limited time and compute resources available for this project.

Essentially we will be using transfer learning 

In [3]:
resnet = resnet50(weights=ResNet50_Weights.DEFAULT)

In [4]:
# Get all hidden layers before softmax, this will be the pretrained encoder generating the embeddings
modules = list(resnet.children())[:-1]
encoder = nn.Sequential(*modules)

for param in encoder.parameters():
    # Freeze parameters so gradient is not computed in backward()
    # https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html#sphx-glr-beginner-transfer-learning-tutorial-py
    param.requires_grad = False

In [5]:
encoder

Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


## Embedding Test

In [6]:
# Test
test = cv2.imread('data/extracted/Run11_0Hr_XY02_00001_CH4.png')
im_pil = Image.fromarray(test)

In [7]:
encoder.eval()

# Initialize the Weight Transforms
weights = ResNet50_Weights.DEFAULT
preprocess = weights.transforms()

# Apply it to the input image
img_transformed = preprocess(im_pil)

emb = encoder(img_transformed.unsqueeze(0))

In [8]:
emb[0].shape

torch.Size([2048, 1, 1])

In [20]:
# Test embedding
emb[0].flatten()

tensor([0.0126, 0.0000, 0.1524,  ..., 0.0106, 0.0000, 0.0141])

# Classifier Training Set

Need to combine embeddings from RESNET50 with other features from tabular data such as position, drugs/amounts, and labels for training the binary classifier

## Create Embeddings of Image Extractions

In [10]:
def chunker(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [49]:
# Generate embeddings using feature extractor from RESNET 50

def generate_embeddings(path):

    emb_df = pd.DataFrame()

    for chunk in chunker(os.listdir(path), 50):

        images_input = []
        file_names = []
        results = []

        for file in chunk:
            img = cv2.imread(os.path.join(path, file))

            try:
                im_pil = Image.fromarray(img)
            except AttributeError as e:
                print('Problem with file ' + file)
                continue

            # Initialize the Weight Transforms
            weights = ResNet50_Weights.DEFAULT
            preprocess = weights.transforms()

            # Apply it to the input image
            img_transformed = preprocess(im_pil)
            images_input.append(img_transformed)

            file_names.append(file)

        images_input = torch.stack(images_input)
        encoder.cpu()
        embeddings = encoder(images_input).squeeze().numpy()
        ret_df = pd.DataFrame(embeddings)
        ret_df['file'] = file_names

        emb_df = pd.concat([emb_df, ret_df], axis=0)
    
    return emb_df

In [12]:
emb_df = generate_embeddings(ext_path)
emb_df.shape

(4174, 2049)

In [13]:
emb_df.to_csv('data/embeddings.csv', index=False)

## Merge Tabular Data

In [21]:
tab_df = pd.read_csv('data/merged.csv')

In [24]:
tab_df['OMIT_CALL'].value_counts()

 1    2384
-1     799
 0     228
Name: OMIT_CALL, dtype: int64

In [26]:
2384+228

2612

In [57]:
tab_df['file'] = tab_df['IMAGE_ID'].str.split('/').str[2]

In [58]:
data_df = pd.merge(tab_df, emb_df, how='inner', left_on=['file'], right_on=['file'])

In [60]:
data_df.shape

(3411, 2074)

In [61]:
data_df[data_df.columns.tolist()[:26]]

Unnamed: 0,IMAGE_ID,SUGGEST_OMIT,CIRCULARITY_VALUE,REGRESSION_R2,EC50Y,OMIT_CALL,GROUP_ID,MEASURE_T0_BASE_VALUE,MEASURE_MAG_VALUE,MENISCUS_LENGTH,...,DRUG_CONTROL,DRUG_MISSING,DRUG_BUMETANIDE,DRUG_E2,DRUG_GLYH101,DRUG_INH172,DRUG_MANNITOL,DRUG_P4,DRUG_PBS,file
0,data/extracted/Run15_0Hr_XY01_00001_CH4.png,Possible poor detection of inflection / menisc...,OK,0.999429,20343.270855,-1,,,,,...,,1.0,400uM,100nM,1uM,10uM,300mM,1uM,8uL,Run15_0Hr_XY01_00001_CH4.png
1,data/extracted/Run15_0Hr_XY02_00001_CH4.png,OK: fit 0.909665,OK,0.999262,32229.751964,-1,,,,,...,,1.0,0,0,0,0,0,0,0,Run15_0Hr_XY02_00001_CH4.png
2,data/extracted/Run15_0Hr_XY05_00001_CH4.png,OK: fit 0.935102,OK,0.999214,26054.752969,-1,,,,,...,,1.0,0,1nM,0,0,0,0,0,Run15_0Hr_XY05_00001_CH4.png
3,data/extracted/Run15_0Hr_XY06_00001_CH4.png,OK: fit 0.975717,OK,0.999291,20132.237483,-1,,,,,...,,1.0,0,0,0,0,0,0,0,Run15_0Hr_XY06_00001_CH4.png
4,data/extracted/Run15_0Hr_XY07_00001_CH4.png,OK: fit 0.240573,OK,0.998862,24441.022005,-1,,,,,...,,1.0,0,0,0,0,0,0,0,Run15_0Hr_XY07_00001_CH4.png
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3406,data/extracted/Run80_0Hr_XY87_00001_CH4.png,,OK,0.999691,25581.022992,1,GRP3,1.6872,0.0,1.6872,...,,,0,0,0,0,0,0,0,Run80_0Hr_XY87_00001_CH4.png
3407,data/extracted/Run80_0Hr_XY89_00001_CH4.png,,OK,0.999760,23556.403156,0,GRP4,1.4706,0.0,1.4706,...,1.0,,0,0,0,0,0,0,0,Run80_0Hr_XY89_00001_CH4.png
3408,data/extracted/Run80_0Hr_XY90_00001_CH4.png,,OK,0.999815,22319.134225,0,GRP4,1.5390,0.0,1.5390,...,1.0,,0,0,0,0,0,0,0,Run80_0Hr_XY90_00001_CH4.png
3409,data/extracted/Run80_0Hr_XY91_00001_CH4.png,,OK,0.999826,24803.166235,0,GRP4,1.5732,0.0,1.5732,...,1.0,,0,0,0,0,0,0,0,Run80_0Hr_XY91_00001_CH4.png


## Remove rows where OMIT_CALL is null
Unfortunately we cannot use imputation techniques otherwise we may introduce unintended bias in the dataset. This has the unfortunate consequence of reducing our dataset to 2,612 samples

In [62]:
data_df = data_df[data_df['OMIT_CALL'] != -1]
data_df.shape

(2612, 2074)

## Dataset Augmentation
In order to increase the size of our dataset, we processed additional images using random linear transformations such as rotation and flipping. Since well indexing is important, we cannot use skewing or scaling to transform the images.

In [63]:
# Get embeddings for rotations and flips
rot_df = generate_embeddings(os.path.join('data', 'augmentation', 'rotated'))
flipped_df = generate_embeddings(os.path.join('data', 'augmentation', 'flipped'))

rot_df['transform'] = 'rotation'
flipped_df['transform'] = 'flipped'

# Get row data for each transformed file, do not include the embeddings from data_df though
rot_df = pd.merge(data_df[data_df.columns.tolist()[:26]], rot_df, how='left', on=['file'])
flipped_df = pd.merge(data_df[data_df.columns.tolist()[:26]], flipped_df, how='left', on=['file'])

In [64]:
# Export embeddings for transformed images
rot_df.to_csv('data/rotation_embeddings.csv', index=False)
flipped_df.to_csv('data/flipped_embeddings.csv', index=False)

In [65]:
rot_df.head()

Unnamed: 0,IMAGE_ID,SUGGEST_OMIT,CIRCULARITY_VALUE,REGRESSION_R2,EC50Y,OMIT_CALL,GROUP_ID,MEASURE_T0_BASE_VALUE,MEASURE_MAG_VALUE,MENISCUS_LENGTH,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,transform
0,data/extracted/Run31_0Hr_XY01_00001_CH4.png,OK: fit 0.821293,OK,0.998637,33969.7216,1,GRP1,1.8924,0.0,1.8924,...,0.156123,0.0,0.0,0.0,0.0,0.12284,0.0,0.0,0.0,rotation
1,data/extracted/Run31_0Hr_XY02_00001_CH4.png,OK: fit 0.923463,OK,0.999266,30643.443651,1,GRP1,2.052,0.0,2.052,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rotation
2,data/extracted/Run31_0Hr_XY03_00001_CH4.png,OK: fit 0.799446,OK,0.997888,33822.248568,1,GRP1,1.7784,0.0,1.7784,...,0.966884,0.0,0.0,0.0,0.0,0.035339,0.0,0.0,0.0,rotation
3,data/extracted/Run31_0Hr_XY04_00001_CH4.png,OK: fit 0.818367,OK,0.99821,33984.693894,1,GRP1,1.9152,0.0,1.9152,...,0.807247,0.0,0.0,0.0,0.0,0.063473,0.0,0.0,0.0,rotation
4,data/extracted/Run31_0Hr_XY05_00001_CH4.png,OK: fit 0.698901,OK,0.999372,32531.729709,0,GRP1,2.166,0.0,2.166,...,0.056762,0.0,0.0,0.026642,0.0,0.090039,0.0,0.0,0.0,rotation


In [66]:
flipped_df.head()

Unnamed: 0,IMAGE_ID,SUGGEST_OMIT,CIRCULARITY_VALUE,REGRESSION_R2,EC50Y,OMIT_CALL,GROUP_ID,MEASURE_T0_BASE_VALUE,MEASURE_MAG_VALUE,MENISCUS_LENGTH,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,transform
0,data/extracted/Run31_0Hr_XY01_00001_CH4.png,OK: fit 0.821293,OK,0.998637,33969.7216,1,GRP1,1.8924,0.0,1.8924,...,0.299295,0.0,0.0,0.000245,0.0,0.062064,0.0,0.0,0.0,flipped
1,data/extracted/Run31_0Hr_XY02_00001_CH4.png,OK: fit 0.923463,OK,0.999266,30643.443651,1,GRP1,2.052,0.0,2.052,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,flipped
2,data/extracted/Run31_0Hr_XY03_00001_CH4.png,OK: fit 0.799446,OK,0.997888,33822.248568,1,GRP1,1.7784,0.0,1.7784,...,1.198271,0.0,0.0,0.0,0.0,0.034073,0.0,0.0,0.0,flipped
3,data/extracted/Run31_0Hr_XY04_00001_CH4.png,OK: fit 0.818367,OK,0.99821,33984.693894,1,GRP1,1.9152,0.0,1.9152,...,0.983211,0.0,0.0,0.0,0.0,0.009318,0.0,0.0,0.0,flipped
4,data/extracted/Run31_0Hr_XY05_00001_CH4.png,OK: fit 0.698901,OK,0.999372,32531.729709,0,GRP1,2.166,0.0,2.166,...,0.0,0.0,0.0,0.33973,0.0,0.061403,0.0,0.0,0.0,flipped


In [67]:
data_df['transform'] = 'untransformed'

In [68]:
# Concatenate augmented data files
augmented_df = pd.concat([data_df, rot_df, flipped_df], axis=0)

In [69]:
augmented_df.shape

(7836, 2075)

In [70]:
augmented_df['OMIT_CALL'].value_counts()

1    7152
0     684
Name: OMIT_CALL, dtype: int64

# Prepare Augmented Dataset for training on traditional ML models

Need to prepare dataset for model ingestion, dropping unnecessary columns, columns with high null values. Then need to scale variables, one-hot encode.

In [98]:
# Donut Area is missing 34% of values, will drop
ml_df[ml_df['DONUT_AREA'].isna()].shape[0] / ml_df.shape[0]

0.33843797856049007

In [126]:
ml_df = augmented_df.copy()

# Drop unused columns
ml_df.drop(columns=['IMAGE_ID', 'SUGGEST_OMIT', 'CIRCULARITY_VALUE', 
                    'MEASURE_MAG_VALUE', 'file', 'transform', 
                    'DONUT_AREA', 'DRUG_MISSING'], inplace=True)

# Drug_dry and Drug_control are binary valued. Convert null to 0
ml_df['DRUG_DRY'] = ml_df['DRUG_DRY'].fillna(0).astype('int')
ml_df['DRUG_CONTROL'] = ml_df['DRUG_CONTROL'].fillna(0).astype('int')

# DRY and CONTROL are zero variance and need to drop
ml_df.drop(columns=['DRY', 'CONTROL'], inplace=True)

# DRUG_ are all near zero variance and need to drop
ml_df.drop(columns=['DRUG_BUMETANIDE', 'DRUG_E2', 'DRUG_GLYH101', 
                    'DRUG_INH172', 'DRUG_MANNITOL', 
                    'DRUG_P4', 'DRUG_PBS'], inplace=True)

In [127]:
# Min-Max scale EC50Y and ESTIMATE_AUC
ml_df['EC50Y'] = (ml_df['EC50Y'] - ml_df['EC50Y'].min()) / (ml_df['EC50Y'].max() - ml_df['EC50Y'].min())
ml_df['ESTIMATE_AUC'] = (ml_df['ESTIMATE_AUC'] - ml_df['ESTIMATE_AUC'].min()) / (ml_df['ESTIMATE_AUC'].max() - ml_df['ESTIMATE_AUC'].min())

In [130]:
# Dummy Encode GROUP_ID
ml_df = pd.concat([pd.get_dummies(ml_df['GROUP_ID']), ml_df], axis=1)
ml_df.drop(columns=['GROUP_ID'], inplace=True)

In [133]:
ml_df.head()

Unnamed: 0,GRP1,GRP2,GRP3,GRP4,REGRESSION_R2,EC50Y,OMIT_CALL,MEASURE_T0_BASE_VALUE,MENISCUS_LENGTH,EC50X,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
671,1,0,0,0,0.998637,0.880752,1,1.8924,1.8924,1.7594,...,0.0,0.155963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
672,1,0,0,0,0.999266,0.778462,1,2.052,2.052,1.8737,...,0.0,0.009089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
673,1,0,0,0,0.997888,0.876217,1,1.7784,1.7784,1.7339,...,0.0,1.111396,0.0,0.0,0.0,0.0,0.036014,0.0,0.0,0.0
674,1,0,0,0,0.99821,0.881212,1,1.9152,1.9152,1.7677,...,0.0,0.731969,0.0,0.0,0.0,0.0,0.009445,0.0,0.0,0.0
675,1,0,0,0,0.999372,0.836531,0,2.166,2.166,1.8894,...,0.0,0.0,0.0,0.0,0.05862,0.0,0.0,0.0,0.0,0.0


In [134]:
# Save ML Dataset
ml_df.to_csv('data/ML_Dataset.csv', index=False)