In [1]:
# %%capture
import IPython
import os
import numpy as np
import pandas as pd
from pandas import HDFStore
import spacy
from keras.utils import to_categorical
import cv2

from vqa_logger import logger
from common.os_utils import File

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from common.constatns import train_data, validation_data, data_location, fn_meta, vqa_specs_location, raw_data_location
from common.settings import input_length, embedding_dim, image_size, seq_length, get_nlp
from common.classes import VqaSpecs
from common.functions import get_highlited_function_code, get_image, get_text_features, pre_process_raw_data, get_size
from common.utils import VerboseTimer

In [3]:
# Load the dataset meta data from the JSON file at `fn_meta`.
# Later cells read its 'ans_to_ix' entry (get_categorial_labels) and pass it on to VqaSpecs.
meta_data = File.load_json(fn_meta)

### Preparing the data for training

#### Getting the nlp engine

In [4]:
# Obtain the nlp engine provided by common.settings.get_nlp.
# NOTE(review): `nlp` is not referenced again in the visible cells - presumably the
# helper functions obtain the engine themselves via get_nlp(); confirm before removing.
nlp = get_nlp()

[10:07:49][DEBUG] using embedding vector: en_core_web_sm
[10:07:49][DEBUG] Got embedding


#### Where get_nlp is defined as:

In [5]:
# Display the source of get_nlp inline (requested with remove_comments=True) so the
# notebook documents the helper without the reader opening common/settings.
code = get_highlited_function_code(get_nlp,remove_comments=True)
IPython.display.display(code)

In [6]:
# Load the raw image-name/question/answer frame stored under the 'data' key.
# The store is opened read-only: this cell never writes, and HDFStore's default
# mode ('a') would create an empty file when `raw_data_location` is wrong
# instead of failing on the missing path.
with HDFStore(raw_data_location, mode='r') as store:
    image_name_question = store['data']
# df_train = image_name_question[image_name_question.group == 'train']
# df_val = image_name_question[image_name_question.group == 'validation']

# from parsers.VQA18 import Vqa18Base
# df_train = Vqa18Base.get_instance(train_data.processed_xls).data            
# df_val = Vqa18Base.get_instance(validation_data.processed_xls).data

##### This is just for performance and quick debug cycles! Remove before actual training:

In [7]:
# image_name_question = image_name_question.head(5)
# image_name_question_val = image_name_question_val.head(5)

### Additional functions we will use:

#### get_text_features:

In [8]:
# Display the source of get_text_features inline (requested with remove_comments=True).
code = get_highlited_function_code(get_text_features,remove_comments=True)
IPython.display.display(code)

#### get_image:

In [9]:
# Display the source of get_image inline (requested with remove_comments=True).
code = get_highlited_function_code(get_image,remove_comments=True)
IPython.display.display(code)

#### pre_process_raw_data:

In [10]:
# Display the source of pre_process_raw_data inline (requested with remove_comments=True).
code = get_highlited_function_code(pre_process_raw_data,remove_comments=True)
IPython.display.display(code)


#### This is in case we want to classify by categorical labels (TBD):
(i.e. turn the task into one large multiple-choice test)

In [11]:
def get_categorial_labels(df, meta):
    """Build one-hot label rows for the answers found in ``df``.

    Every value of ``df['answer']`` is looked up in ``meta['ans_to_ix']`` and
    the resulting indices are encoded with ``to_categorical``; the number of
    classes is the number of entries in ``meta['ans_to_ix']``.

    :param df: object whose ``'answer'`` item iterates over the answers
    :param meta: mapping holding the ``'ans_to_ix'`` answer-to-index table
    :return: the array produced by ``to_categorical`` (one row per answer)
    :raises KeyError: if an answer is missing from ``meta['ans_to_ix']``
    :raises AssertionError: if an encoded row's argmax is not its label index
    """
    answer_index = meta['ans_to_ix']
    label_indices = [answer_index[answer] for answer in df['answer']]
    one_hot = to_categorical(label_indices, num_classes=len(answer_index))

    # Sanity check: the hot position of each row must be the index it encodes.
    for row, expected_index in zip(one_hot, label_indices):
        assert np.argmax(row) == expected_index, 'Expected to get argmax at index of label'

    return one_hot

with VerboseTimer("Getting categorial training labels"):
    # Keep only the rows whose `group` column marks them as training samples,
    # then one-hot encode their answers.
    df_train = image_name_question.loc[image_name_question['group'] == 'train']
    categorial_labels_train = get_categorial_labels(df=df_train, meta=meta_data)

# with VerboseTimer("Getting categorial validation labels"):
#     categorial_labels_val = get_categorial_labels(df_val, meta_data)
# categorial_labels_train.shape, categorial_labels_val.shape
# del df_train
# del df_val

Getting categorial training labels: 0:00:00.103868


In [12]:
# NOTE(review): this import sits mid-notebook; the other project imports are in the first cells.
from common.functions import enrich_data, clean_data
# Both steps rebind `image_name_question`, so re-running this cell feeds already
# cleaned/enriched data back into clean_data and enrich_data.
# NOTE(review): categorial_labels_train was computed in the previous cell from the
# frame as it was *before* clean_data ran - confirm that this ordering is intended.
image_name_question = clean_data(image_name_question)
image_name_question = enrich_data(image_name_question)
image_name_question.head()

Unnamed: 0,image_name,question,answer,group,path,ct,mri,tumor,hematoma,brain,abdomen,neck,liver
0,rjv03401,what does MRI show?,tumor at tail pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,False,True,True,False,False,False,False,False
1,AIAN-14-313-g002,where does axial seCTion MRI abdomen show hypo...,distal pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,False,True,False,False,False,True,False,False
2,wjem-11-76f3,what do arrows denote noncontrast CT pelvis?,complex fluid colleCTion with layerg consisten...,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,True,False,False,True,False,False,False,False
3,ccr30002-0045-f3,what was normal?,blood supply to bra,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,False,False,False,False,False,False,False,False
4,rjt01904,what shows evidence a contaed rupture?,repeat CT abdomen,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,True,False,False,False,False,True,False,False


In [24]:
# Summary statistics of every column, split by the `group` column.
# NOTE(review): this cell carries execution count 24 while its neighbours are 12 and 13,
# i.e. it was executed out of order - re-check with Restart Kernel -> Run All.
image_name_question.groupby('group').describe()

Unnamed: 0_level_0,abdomen,abdomen,abdomen,abdomen,answer,answer,answer,answer,brain,brain,...,path,path,question,question,question,question,tumor,tumor,tumor,tumor
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,...,top,freq,count,unique,top,freq,count,unique,top,freq
group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
test,500,2,False,483,0,0,,,500,2,...,C:\Users\Public\Documents\Data\2018\VQAMed2018...,4,500,307,where is tumor located?,37,500,2,False,410
train,5413,2,False,5008,5413,4337,MRI,139.0,5413,2,...,C:\Users\Public\Documents\Data\2018\VQAMed2018...,7,5413,4534,what does CT show?,149,5413,2,False,4649
validation,500,2,False,479,500,454,no,18.0,500,2,...,C:\Users\Public\Documents\Data\2018\VQAMed2018...,4,500,353,what does CT show?,51,500,2,False,437


### Do the actual pre-processing
Note:  
This might take a while...

In [13]:
# # # # RRR
# # # logger.debug('Getting answers embedding')
# df = image_name_question
# df['l'] = df.answer.apply(lambda a: len(str(a)))
# df[df.l > 2].sort_values('l')
# # print(len(df[(df.answer == np.nan) | (df.question == np.nan)]))


# # df['answer'].apply(lambda q: get_text_features(q))
# # # a= df['answer'].apply(lambda q: 0 if q == np.nan else 1)
# # # sum(a), len(a), len(image_name_question)

# import json
# # json.load(open)
# a = df[df.group == 'test']['answer'].values[0]
# type(a)




In [14]:
logger.debug('----===== Preproceccing train data =====----')
# NOTE(review): `image_locations` is assigned but not passed to pre_process_raw_data
# below - looks like a leftover from an earlier signature; confirm before removing.
image_locations = train_data.images_path
# The whole frame is passed in (no filter on `group` here); a later cell splits the
# result into train/validation and test rows, so the "train"/"training" wording in the
# log and timer labels appears to be narrower than what is actually processed.
with VerboseTimer("Pre processing training data"):
    image_name_question_processed = pre_process_raw_data(image_name_question)

[10:07:50][DEBUG] ----===== Preproceccing train data =====----
[10:07:51][DEBUG] Getting answers embedding
[10:08:41][DEBUG] Getting questions embedding
[10:09:48][DEBUG] Getting image features
[10:10:22][DEBUG] Done
Pre processing training data: 0:02:31.168283


In [15]:
# logger.debug('----===== Preproceccing validation data =====----')
# image_locations = validation_data.images_path
# with VerboseTimer("Pre processing validation data"):
#     image_name_question_val = pre_process_raw_data(image_name_question_val, image_locations)

#### Saving the data, so later on we don't need to compute it again

In [16]:
def get_vqa_specs(meta_data):
    """Bundle the notebook-level settings and ``meta_data`` into a ``VqaSpecs``.

    ``embedding_dim``, ``seq_length`` and ``data_location`` are taken from the
    module-level names of the same spelling; ``meta_data`` is passed through.

    :param meta_data: the meta data object to store on the specs
    :return: the ``VqaSpecs`` instance built from those four values
    """
    return VqaSpecs(
        embedding_dim=embedding_dim,
        seq_length=seq_length,
        data_location=data_location,
        meta_data=meta_data,
    )

vqa_specs = get_vqa_specs(meta_data)

# Show what we got: the textual form of the specs, cut off right after
# 'meta_data=' so the meta data itself is not displayed.
s = str(vqa_specs)
s[:s.index('meta_data=') + len('meta_data=')]

"VqaSpecs(embedding_dim=384, seq_length=26, data_location='C:\\\\Users\\\\avitu\\\\Documents\\\\GitHub\\\\VQA-MED\\\\VQA-MED\\\\Cognitive-LUIS-Windows-master\\\\Sample\\\\VQA.Python\\\\data\\\\model_input.h5', meta_data="

In [17]:
logger.debug("Saving the data")

# Single switch for what gets written: point this at a small slice for quick
# debug runs. Both store writes below read from it.
item_to_save = image_name_question_processed
# item_to_save = image_name_question.head(10)

# Start from a fresh file so keys from a previous run do not linger in the store.
try:
    os.remove(data_location)
except FileNotFoundError:
    # Nothing to clean up (e.g. first run).
    pass
except OSError as e:
    # Best effort: keep going, but leave a trace instead of hiding the failure.
    logger.debug(f"Could not remove existing file {data_location}: {e}")


with VerboseTimer("Saving model training data"):
    with HDFStore(data_location) as store:
        # Train and validation rows share the 'data' key; test rows get their own key.
        store['data'] = item_to_save[item_to_save.group.isin(['train', 'validation'])]
        store['test'] = item_to_save[item_to_save.group == 'test']

size = get_size(data_location)
logger.debug(f"training data's file size was: {size}")


# item_to_save.to_hdf(vqa_specs.data_location, key='df')    
# logger.debug(f"Saved to {vqa_specs.data_location}")

[10:10:22][DEBUG] Saving the data
Saving model training data: 0:00:14.066018
[10:10:36][DEBUG] training data's file size was: 2.03 GB


In [18]:
# Persist the specs to their own pickle file. The target must be
# `vqa_specs_location`: `data_location` is the HDF5 file written by the previous
# cell, and pickling onto that path would overwrite the saved model input.
File.dump_pickle(vqa_specs, vqa_specs_location)
logger.debug(f"VQA Specs saved to:\n{vqa_specs_location}")

[10:10:36][DEBUG] VQA Specs saved to:
C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\Cognitive-LUIS-Windows-master\Sample\VQA.Python\data\vqa_specs.pkl


In [19]:
# Print a ready-to-paste assignment of the specs path; every backslash is doubled
# so the path can be pasted as a regular Python string literal.
escaped_assignment = f"vqa_specs_location = '{vqa_specs_location}'".replace('\\', '\\\\')
print(escaped_assignment)

vqa_specs_location = 'C:\\Users\\avitu\\Documents\\GitHub\\VQA-MED\\VQA-MED\\Cognitive-LUIS-Windows-master\\Sample\\VQA.Python\\data\\vqa_specs.pkl'
