In [1]:
import os
import random
import sys
from contextlib import redirect_stdout
from os.path import join

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import optimizers
from keras.applications.vgg16 import VGG16
from keras.layers import GlobalAveragePooling2D, Dense
from keras.models import Model, Sequential
from keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import classification_report, accuracy_score

from trypo_backend import precision, recall


# Root directory that create_lookup_df() walks to collect image paths.
IMG_PATH = '/mnt/mydata'

# Split/class sub-directory names (presumably relative to IMG_PATH -- they are
# not referenced by the visible cells, so confirm before relying on them).
TRAIN_NORM, TRAIN_TRYPO = 'train/norm', 'train/trypo'
VALID_NORM, VALID_TRYPO = 'valid/norm', 'valid/trypo'

# Restrict CUDA-based libraries to the first GPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'


Using TensorFlow backend.


In [2]:
#lookup_df is a DataFrame holding each image path plus two flags: train/validation split and normal/trypo class
def create_lookup_df():
    """Walk IMG_PATH and build a lookup table with one row per file found.

    Labels are derived from the names of the directories between IMG_PATH and
    the file, not from a substring search over the whole path. A substring
    search mislabels files whose own name (or whose parent path above
    IMG_PATH) happens to contain 'train' or 'norm', e.g.
    ``train/trypo/enormous.jpg``.

    Directories and files are visited in sorted order so that the row index
    assigned to each image is the same on every run.

    Returns:
        pandas.DataFrame with columns, in this order:
            index      -- copy of the default integer row index
            path       -- full path of the file
            validation -- 0 if a directory named 'train' is on the path below
                          IMG_PATH, otherwise 1
            trypo      -- 0 if a directory named 'norm' is on the path below
                          IMG_PATH, otherwise 1
    """
    fpaths, vals, trypos = [], [], []

    for root, dirs, files in os.walk(IMG_PATH):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()

        # Only directory names below IMG_PATH take part in the labelling.
        rel_parts = os.path.relpath(root, IMG_PATH).split(os.sep)
        validation_flag = 0 if 'train' in rel_parts else 1
        trypo_flag = 0 if 'norm' in rel_parts else 1

        for filename in sorted(files):
            fpaths.append(join(root, filename))
            vals.append(validation_flag)
            trypos.append(trypo_flag)

    lookup_df = pd.DataFrame({'path': fpaths, 'validation': vals, 'trypo': trypos})
    # Keep the extra 'index' column the original produced, without mutating in place.
    return lookup_df.reset_index(level=0)

def res_to_VGG(image):
    """Resize every element of ``image`` to 224x224 in place and return it.

    Args:
        image: mutable, indexable collection of images accepted by
            ``cv2.resize`` (e.g. a list of arrays). Each element is replaced
            by its resized version.

    Returns:
        The same ``image`` object that was passed in.
    """
    # OpenCV is not imported by the notebook's import cell, so the original
    # function raised NameError when called; import it where it is needed.
    import cv2

    for i in range(len(image)):
        image[i] = cv2.resize(image[i], (224, 224))
    return image

def make_vgg():
    """Build a binary classifier on top of a VGG16 convolutional base.

    The base is created without its fully connected top and with the
    'imagenet' weights; its output is globally average-pooled and fed to a
    single sigmoid unit.

    Returns:
        keras.models.Model mapping the VGG16 input to one sigmoid output.
    """
    backbone = VGG16(include_top=False, weights='imagenet')
    pooled = GlobalAveragePooling2D()(backbone.output)
    probability = Dense(1, activation='sigmoid')(pooled)

    # No layer is frozen here: every layer is left at its default trainable state.
    return Model(backbone.input, probability)

In [6]:
#divide the training-image indices into the 5 folds used for cross-validation
def divide_for_5CV(lookup_df, n_folds=5, seed=None):
    """Split the training rows of ``lookup_df`` into class-stratified folds.

    Only rows with ``validation == 0`` are used. The 'norm' rows
    (``trypo == 0``) and the 'trypo' rows (``trypo == 1``) are shuffled
    separately and dealt round-robin into the folds, so every fold receives a
    near-equal share of each class.

    Args:
        lookup_df: DataFrame with ``validation`` and ``trypo`` columns.
        n_folds: number of folds to create (default 5).
        seed: optional seed for a private random generator, giving a
            reproducible split. With ``None`` the module-level ``random``
            state is used.

    Returns:
        dict mapping fold number (0 .. n_folds-1) to a list of index labels
        of ``lookup_df``.

    Raises:
        ValueError: if ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError('n_folds must be a positive integer')

    rng = random.Random(seed) if seed is not None else random
    fold_ind = {fold: [] for fold in range(n_folds)}

    is_train = lookup_df.validation == 0
    for label in (0, 1):
        # One combined mask instead of chained boolean indexing, which makes
        # pandas reindex the second mask and emit a UserWarning.
        members = lookup_df.loc[is_train & (lookup_df.trypo == label)].index.tolist()
        shuffled = rng.sample(members, len(members))
        for position, img_index in enumerate(shuffled):
            fold_ind[position % n_folds].append(img_index)

    return fold_ind

#for generator to work with paths stored in dataframe
def flow_from_dataframe(img_data_gen, in_df, path_col, y_col, **dflow_args):
    """Create a directory iterator and repoint it at the files listed in ``in_df``.

    ``flow_from_directory`` is called on the directory of the first path only
    to obtain an iterator object; its file list, labels and sample counts are
    then overwritten with the DataFrame contents.

    Args:
        img_data_gen: object exposing ``flow_from_directory``.
        in_df: DataFrame holding the image paths and labels.
        path_col: name of the column with the full file paths.
        y_col: name of the column with the labels.
        **dflow_args: extra keyword arguments forwarded to
            ``flow_from_directory`` (``class_mode`` is always 'sparse').

    Returns:
        The patched iterator.
    """
    image_paths = in_df[path_col].values
    n_images = in_df.shape[0]

    first_dir = os.path.dirname(image_paths[0])
    print(first_dir)
    print('## Ignore next message from keras, values are replaced anyways')
    iterator = img_data_gen.flow_from_directory(first_dir, class_mode='sparse', **dflow_args)

    # Overwrite what the directory scan found with the DataFrame contents.
    iterator.filenames = image_paths
    iterator.classes = np.stack(in_df[y_col].values)
    iterator.samples = n_images
    iterator.n = n_images
    # Called after ``n`` is replaced, as in the original statement order.
    iterator._set_index_array()
    # Paths in the DataFrame are already complete, so no directory prefix is wanted.
    iterator.directory = ''
    print('Reinserting dataframe: {} images'.format(n_images))
    return iterator

def get_generators(train_df, valid_df):
    """Build the training and validation iterators for one CV split.

    Both iterators read paths from the 'path' column and labels from the
    'trypo' column, and yield 224x224 RGB batches of 32 images. The training
    iterator uses ``core_train_generator`` with seed 8; the validation
    iterator uses ``core_valid_generator`` with shuffling disabled.

    Returns:
        Tuple ``(train_gen, valid_gen)``.
    """
    shared_args = dict(path_col='path',
                       y_col='trypo',
                       target_size=(224, 224),
                       color_mode='rgb',
                       batch_size=32)

    train_gen = flow_from_dataframe(core_train_generator, train_df, seed=8, **shared_args)
    valid_gen = flow_from_dataframe(core_valid_generator, valid_df, shuffle=False, **shared_args)
    return train_gen, valid_gen

#evaluation
def get_predictions(model, valid_df, valid_ind):
    """Evaluate ``model`` on ``valid_df`` and report accuracy plus a class report.

    An unshuffled, batch-size-1 iterator over ``valid_df`` is built from
    ``core_valid_generator``. The result of ``evaluate_generator`` is printed,
    the first output of each prediction is rounded to 0/1 and compared against
    the 'trypo' column.

    Args:
        model: object exposing ``evaluate_generator`` and ``predict_generator``.
        valid_df: DataFrame with 'path' and 'trypo' columns.
        valid_ind: not used by this function; kept so existing calls still work.

    Returns:
        Tuple ``(acc, rep)``: the accuracy score and the classification report.
    """
    eval_gen = flow_from_dataframe(core_valid_generator,
                                   valid_df,
                                   path_col='path',
                                   y_col='trypo',
                                   target_size=(224, 224),
                                   color_mode='rgb',
                                   batch_size=1,
                                   shuffle=False)
    print(model.evaluate_generator(eval_gen))

    raw_scores = model.predict_generator(eval_gen)
    predicted = np.round([score[0] for score in raw_scores])
    truth = valid_df['trypo'].values

    acc = accuracy_score(truth, predicted)
    rep = classification_report(truth, predicted)
    print(acc, rep)
    return acc, rep

In [None]:
# Training batches are augmented (flips, rotation, shear, zoom); validation
# batches are only rescaled by 1/255.
core_train_generator = ImageDataGenerator(rescale=1./255, horizontal_flip=True, vertical_flip=True,
                                          rotation_range=20, shear_range=0.2, zoom_range=0.2)
core_valid_generator = ImageDataGenerator(rescale=1./255)

# Epochs per training stage. The model is evaluated after every stage, i.e.
# after 15, 20, 25 and 30 cumulative epochs.
epochs = [15, 5, 5, 5]

lookup_df = create_lookup_df()
fold_ind = divide_for_5CV(lookup_df)

# NOTE(review): range(1, 5) never uses fold 0 as the validation fold, so only
# 4 of the 5 cross-validation splits are trained here -- confirm this is intended.
for i in range(1, 5):
    # A new model is built and compiled for every split.
    model = make_vgg()
    model.compile(optimizer=optimizers.Adam(lr=1e-05, decay=0.01), loss="binary_crossentropy", metrics=["accuracy", precision, recall])

    # Fold i is held out; the other four folds form the training set.
    train_df = lookup_df.loc[np.r_[fold_ind[(i+1)%5], fold_ind[(i+2)%5], fold_ind[(i+3)%5], fold_ind[(i+4)%5]]]
    valid_df = lookup_df.loc[np.r_[fold_ind[i]]]

    valid_ind = fold_ind[i]
    train_gen, valid_gen = get_generators(train_df, valid_df)

    total_epochs = 0
    for epoch_num in epochs:
        total_epochs += epoch_num
        log_path = 'trypo_nofreeze_'+str(total_epochs)+'_full__CV_'+str(i)+'.txt'

        # Redirect stdout only for this stage. The original assigned
        # sys.stdout = open(...) and never restored it, so every later print
        # (including output of other cells) ended up in the last log file.
        # The context managers restore stdout and close the file, even if
        # training raises.
        with open(log_path, 'w') as log_file, redirect_stdout(log_file):
            model.fit_generator(train_gen,
                                steps_per_epoch=400,
                                validation_data=valid_gen,
                                validation_steps=5,
                                epochs=epoch_num,
                                verbose=1)
            # The trained model is not saved; only the logged metrics are kept.
            get_predictions(model, valid_df, valid_ind)

  
  import sys
