In [3]:
from PIL import Image
import pandas as pd
import numpy as np
import os
import pickle
import re

In [4]:
input_csv = pd.read_csv("../data.csv")

In [3]:
IMAGE_DIM = 384 # most images are already 384x384
def pad_square(image):
    old_size = image.size  # old_size[0] is in (width, height) format
    ratio = float(IMAGE_DIM)/max(old_size)
    new_size = tuple([int(x*ratio) for x in old_size])
    im = image.resize(new_size, Image.ANTIALIAS)
    # create a new image and paste the resized on it
    new_im = Image.new("RGB", (IMAGE_DIM, IMAGE_DIM))
    new_im.paste(im, ((IMAGE_DIM-new_size[0])//2,
                        (IMAGE_DIM-new_size[1])//2))
    return new_im

### Attempt with Tensorflow - This actually works
### Inception-v3 model

In [5]:
import tensorflow as tf
import tensorflow.python.platform
from tensorflow.python.platform import gfile

In [4]:
from PIL import Image

# This cell REPLACES tif images with padded jpg images
IMAGE_DIM = 384 # most images are already 384x384
def create_padded_jpg(file_path):
    image = Image.open(file_path)
    old_size = image.size  # old_size[0] is in (width, height) format
    ratio = float(IMAGE_DIM)/max(old_size)
    new_size = tuple([int(x*ratio) for x in old_size])
    im = image.resize(new_size, Image.ANTIALIAS)
    # create a new image and paste the resized on it
    new_im = Image.new("RGB", (IMAGE_DIM, IMAGE_DIM))
    new_im.paste(im, ((IMAGE_DIM-new_size[0])//2,
                        (IMAGE_DIM-new_size[1])//2))
    new_im.save(file_path.replace("tif", "jpg"), "jpeg")

for directory, subdirectories, files in os.walk("legs_folder_jpg"):
    for file in files:
        path = os.path.join(directory, file)
        if file.endswith(".tif"):
            create_padded_jpg(path)
            os.remove(path)

### Feature Vector extraction (produces list with vector per image)

In [5]:
def create_graph():
    with gfile.FastGFile('inception-2015-12-05/classify_image_graph_def.pb', 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    _ = tf.import_graph_def(graph_def, name='')

def extract_features(image):
    nb_features = 2048
    features = np.empty(nb_features)
    labels = []

    create_graph()

    with tf.Session() as sess:
        next_to_last_tensor = sess.graph.get_tensor_by_name('pool_3:0')

        image_data = gfile.FastGFile(image, 'rb').read()

        predictions = sess.run(next_to_last_tensor, {'DecodeJpeg/contents:0': image_data})
        return np.squeeze(predictions)

In [None]:
patient_ids = []
dates = []
feature_vects = []
for directory, subdirectories, files in os.walk("legs_folder_jpg"):
    for file in files:
        path = os.path.join(directory, file)
        if path.endswith(".jpg"):
            patient_ids.append(directory.split("/")[1] + "/" + file.split(".")[0])
            feature_vects.append(extract_features(path))
            
df = pd.DataFrame(feature_vects, index=patient_ids)
pickle.dump(df, open("inception_cnn_features.pkl", "wb") )

In [19]:
from sklearn import preprocessing

test_patient_ids = ['2','32','24','24b','6','7', '41']
# df contains extracted features from a CNN (with index <patient_id>/<scan_date>)
# data filename contains scan pairing and class
# feature combine type specifies how to combine features ('c' = concat, 'd' = difference)
def produce_data(df, data_filename, feature_combine_type, augment = False, normalize=False):
    input_csv = pd.read_csv(data_filename)

    features = []
    row_ids = []
    Y = []
    
    if normalize:
        x = df.values #returns a numpy array
        min_max_scaler = preprocessing.MinMaxScaler()
        x_scaled = min_max_scaler.fit_transform(x)
        df = pd.DataFrame(x_scaled)

    for index, row in input_csv.iterrows():
        patient_id = str(row["patient_id"])
        scan_1 = os.path.join(patient_id, row["scan_1"])
        scan_2 = os.path.join(patient_id, row["scan_2"])
        row_id = patient_id + "/" + row["scan_1"] + "/" + row["scan_2"]
        
        if scan_1 not in df.index or scan_2 not in df.index:
            continue

        # Skip if either scan is not found
        v1 = df.loc[scan_1]
        v2 = df.loc[scan_2]

        Y.append(row["y"])
        
        augment_sample = (augment and patient_id not in test_patient_ids and not row["y"] == 'S')
        row_ids.append(row_id)
        if augment_sample:
            row_ids.append(patient_id + "/" + row["scan_2"] + "/" + row["scan_1"])
            if row["y"] == 'I':
                Y.append('R')
            elif row["y"] == 'R':
                Y.append('I')

        if feature_combine_type == 'c':
            features.append(v1 + v2)
            if augment_sample:
                features.append(v2 + v1)
        elif feature_combine_type == 'd':
            features.append(np.subtract(v1,v2))
            if augment_sample:
                features.append(np.subtract(v2,v1))

    df = pd.DataFrame(features, index=row_ids)
    df["y"] = Y

    print(df.shape)
    return df.loc[:, (df != 0).any(axis=0)]

In [20]:
# THIS PRODUCES A FEATURE SET BY CONCATENATING FEATURE VECTORS
inception_df = pd.read_pickle("inception_cnn_features.pkl")
data2 = produce_data(inception_df, "../data.csv", 'c')
pickle.dump(data2, open("data_inception_cnn.pkl", "wb") )

(56, 2049)


In [23]:
# THIS PRODUCES A FEATURE SET BY DIFFERENCING FEATURE VECTORS
inception_df = pd.read_pickle("inception_cnn_features.pkl")
data3 = produce_data(inception_df, "../data.csv", 'd', normalize=False)
pickle.dump(data3, open("data_inception_cnn_diff.pkl", "wb") )

(56, 2049)


In [24]:
# Produce reduced dimension feature set
vectors = pd.read_pickle("inception_cnn_features.pkl")
data4 = produce_data(vectors.loc[:, (vectors.std()**2) > .1], "../data.csv", 'd')
pickle.dump(data4, open("data_inception_cnn_diff_reduced_dim.pkl", "wb") )

(56, 396)


In [25]:
# Produce reduced dimension feature set
vectors = pd.read_pickle("inception_cnn_features.pkl")
data4 = produce_data(vectors.loc[:, (vectors.std()**2) > .1], "../data.csv", 'd', normalize=False, augment=True)
pickle.dump(data4, open("data_inception_cnn_diff_reduced_dim_augmented.pkl", "wb") )

(89, 396)
