In [15]:
import json
import os
import re
import pickle
from keras.callbacks import ModelCheckpoint,EarlyStopping
from lib.data_handler import trim_cases_by_class
from lib.data_handler import balanced_split_list
from lib.data_handler import get_data_token_count
from lib.data_handler import wv_initialize
from lib.data_handler import cnn_tokensToIdx
from lib.data_handler import get_list_unique
from lib.data_handler import balancedCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from lib import basic_cnn
from keras.models import load_model
import numpy as np
import wandb
from wandb.keras import WandbMetricsLogger, WandbModelCheckpoint
'''
valid task names:
    gs_behavior_label
    gs_organ_label
    gs_icd_label
    gs_hist_grade_label
    gs_lat_label
'''
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# Label field read from every record; must be one of the valid task names.
task = 'gs_icd_label'
# test_prop = 0.1  (disabled)
num_cv = 1
val_prop = 0.25
# No pre-trained word vectors are supplied; this value is passed to
# wv_initialize as-is.
preloadedWV = None
# Minimum document frequency handed to wv_initialize. Presumably a term has
# to occur in at least this many documents to enter the vocabulary, which
# drops very rare terms -- TODO confirm against lib.data_handler.
min_df = 2
pretrained_cnn_name = 'pretrained.h5'
rand_seed = 3545
# Sequence length handed to cnn_tokensToIdx. Presumably every document is
# padded or truncated to exactly this many tokens so the CNN receives
# fixed-size inputs -- TODO confirm against lib.data_handler.
cnn_seq_len = 1500
# Also handed to cnn_tokensToIdx; presumably reverses the token order.
reverse_seq = True
train_epochs = 50

def read_json(path='raw_last_150_items.json'):
    """Load a JSON dataset file and return its parsed contents.

    Args:
        path: Location of the JSON file. Defaults to
            'raw_last_150_items.json', resolved against the current
            working directory.

    Returns:
        Whatever ``json.load`` produces for the file. The callers in this
        file iterate the result as a sequence of dict records.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; decode explicitly instead of relying on the
    # platform default encoding (which is not UTF-8 on every system).
    with open(path, encoding='utf-8') as data_file:
        return json.load(data_file)


def get_valid_label(task_name,in_data):
    """
    Pair each record's cleaned tokens with its label for one task.

    Every record in ``in_data`` is used; no filtering on match status is
    applied. Each record must provide ``record['doc_raw_text']`` and
    ``record[task_name]['match_label']``.

    Returns a list of ``(tokens, label)`` tuples in input order, where
    ``tokens`` is the output of ``cleanText`` for the record's raw text.
    """
    raw_texts = [record['doc_raw_text'] for record in in_data]
    token_lists = [cleanText(raw) for raw in raw_texts]
    labels = [record[task_name]['match_label'] for record in in_data]

    pairs = zip(token_lists, labels)
    return list(pairs)

def get_task_labels(in_task):
    """
    Read the JSON dataset and return its (tokens, label) pairs for in_task.
    """
    return get_valid_label(in_task, read_json())

# Ordered rewrite rules applied by cleanText. The order matters: later rules
# operate on the output of earlier ones. Patterns are raw strings so that
# regex escapes such as \* or \w are not treated as (invalid) Python string
# escapes, and they are compiled once instead of on every call.
_CLEAN_TEXT_RULES = tuple(
    (re.compile(pattern, flags), replacement)
    for pattern, replacement, flags in (
        # line breaks become plain spaces
        (r'\n|\r', ' ', 0),
        # normalise time / title abbreviations
        (r'o clock', 'oclock', re.IGNORECASE),
        (r'(p\.?m\.?)', 'pm', re.IGNORECASE),
        # NOTE(review): this rule also fires inside words. It swallows the
        # period after any word ending in "am" ("gram." -> "gram") and it
        # rewrites the "AM" of "**NAME" to lowercase, so the case-sensitive
        # NAME rule below can no longer match. Left unchanged on purpose:
        # presumably the saved vocabulary was built with exactly this
        # tokenisation -- confirm before fixing.
        (r'(a\.?m\.?)', 'am', re.IGNORECASE),
        (r'(dr\.)', 'dr', re.IGNORECASE),
        # de-identification placeholders
        # NOTE(review): ".*" is greedy, so one match runs from the first
        # marker to the last closing bracket in the text.
        (r'\*\*NAME.*[^\]]\]', 'nametoken', 0),
        (r'\*\*DATE.*[^\]]\]', 'datetoken', 0),
        # drop question marks and apostrophes entirely
        (r"\?|'", '', 0),
        # everything except word characters and . ; : becomes a space,
        # as do underscores and hyphens
        (r'[^\w.;:]|_|-', ' ', 0),
        # numeric placeholders
        (r'[0-9]+\.[0-9]+', 'floattoken', 0),
        (r'floattokencm', 'floattoken cm', 0),
        (r' [0-9][0-9][0-9]+ ', ' largeint ', 0),
        # make the remaining punctuation stand-alone tokens
        (r'\.', ' . ', 0),
        (r':', ' : ', 0),
        (r';', ' ; ', 0),
    )
)


def cleanText(text):
    '''
    Normalise a raw document string and split it into lowercase tokens.

    The rewrite rules in _CLEAN_TEXT_RULES are applied in order (symbol and
    abbreviation normalisation, placeholder tokens for dates, decimal
    numbers and integers of three or more digits, punctuation spacing),
    then the text is lowercased and split on whitespace.

    text: the raw document as a str.

    Returns the list of tokens; an empty string yields an empty list.
    '''
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = pattern.sub(replacement, text)

    # lowercase, then tokenize on whitespace
    return text.lower().split()


In [18]:
import pickle

def algo():
    """
    Load the labelled documents for ``task`` and convert them to CNN input.

    Cross-validation splitting is currently disabled: every record returned
    by get_task_labels is treated as test data. The function prints the
    number of records and the first (tokens, label) pair, loads the
    vocabulary counter from 'vocab.pkl', builds the word-vector index with
    wv_initialize, and maps every token list through cnn_tokensToIdx.

    Nothing is returned; the index sequences are only computed.
    """
    rng = np.random.RandomState(rand_seed)
    samples = get_task_labels(task)

    print(len(samples))
    print(samples[0])

    # NOTE: pickle executes arbitrary code on load -- only use a trusted
    # vocab.pkl.
    with open('vocab.pkl', 'rb') as vocab_file:
        vocab_counter = pickle.load(vocab_file)

    wv_mat, wv_to_idx = wv_initialize(preloadedWV, min_df, vocab_counter, rng)

    # Split the (tokens, label) pairs into two parallel tuples.
    test_tokens, test_y = list(zip(*samples))
    test_x = [
        cnn_tokensToIdx(tokens, wv_to_idx, cnn_seq_len, 0, reverse_seq)
        for tokens in test_tokens
    ]
        
    # print(test_x[0])
    
        
        

In [19]:
algo()

153
(['clinical', 'history', ':', 'atypical', 'lymph', 'node', 'suspicious', 'for', 'adenocarcinoma', '.', 'ap', 'speci', 'gross', 'and', 'microscopic', 'pathologic', 'diagnosis', ':', 'a', '.', 'lung', 'right', 'upper', 'lobe', 'wedge', ':', 'invasive', 'non', 'small', 'cell', 'carcinoma', '1', '.', 'histologic', 'type', ':', 'invasive', 'adenocarcinoma', 'with', 'a', 'lepidic', 'pattern', '.', '2', '.', 'grade', ':', 'well', 'differentiated', '.', '3', '.', 'tumor', 'size', ':', 'tumor', 'measures', 'floattoken', 'x', 'floattoken', 'x', 'floattoken', 'cm', '.', '4', '.', 'margins', 'of', 'resection', 'and', 'extent', ':', 'a', '.', 'the', 'specimen', 'is', 'intact', '.', 'b', '.', 'the', 'tumor', 'is', 'unifocal', '.', 'c', '.', 'no', 'visceral', 'pleural', 'invasion', 'is', 'identified', '.', 'd', '.', 'the', 'tumor', 'is', 'confined', 'to', 'the', 'lung', '.', 'e', '.', 'the', 'wedge', 'margin', 'is', 'free', 'of', 'tumor', '.', 'f', '.', 'the', 'tumor', 'comes', 'within', 'floatto