In [31]:
import json
import os
import re
from keras.callbacks import ModelCheckpoint,EarlyStopping
from lib.data_handler import trim_cases_by_class
from lib.data_handler import balanced_split_list
from lib.data_handler import get_data_token_count
from lib.data_handler import wv_initialize
from lib.data_handler import cnn_tokensToIdx
from lib.data_handler import get_list_unique
from lib.data_handler import balancedCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from lib import basic_cnn
from keras.models import load_model
import numpy as np
import wandb
from wandb.keras import WandbMetricsLogger, WandbModelCheckpoint
'''
valid task names:
    gs_behavior_label
    gs_organ_label
    gs_icd_label
    gs_hist_grade_label
    gs_lat_label
'''
# parameters ---------------------------------------------------------------------------
# Label field to model; must be one of the task names listed in the string above.
task = 'gs_icd_label'
# Fraction handed to balanced_split_list for the held-out test split.
test_prop = 0.1
# Number of folds requested from balancedCV and iterated over in algo().
num_cv = 1
# Fraction handed to balanced_split_list for the validation split of the training data.
val_prop = 0.25
# Passed straight to wv_initialize; presumably None means "no pretrained word vectors" - confirm.
preloadedWV = None
# Vocabulary cut-off forwarded to wv_initialize together with the training token counts.
# Presumably a token must reach this count to be kept (whether that is document
# frequency or raw frequency depends on get_data_token_count - confirm).
min_df = 2
# File name of a pretrained CNN; not referenced in the visible part of the notebook.
pretrained_cnn_name = 'pretrained.h5'
# Seed for the np.random.RandomState created in algo().
rand_seed = 3545
# Sequence length forwarded to cnn_tokensToIdx; presumably every document is padded
# or truncated to exactly this many tokens - confirm in lib.data_handler.
cnn_seq_len = 1500
# Forwarded to cnn_tokensToIdx; presumably reverses the token order - confirm.
reverse_seq = True
# Training epoch budget; not referenced in the visible part of the notebook.
train_epochs = 50


def read_json():
    """
    Load matched_fd.json from the current working directory.

    Returns the object produced by json.load for that file (the rest of the
    notebook treats it as a list of report records).
    """
    with open('matched_fd.json') as handle:
        return json.load(handle)

def get_valid_label(task_name,in_data):
    """
    Build (tokens, label) pairs for every record matched on `task_name`.

    task_name: key of the per-task dict inside each record.
    in_data:   iterable of records; each must provide record[task_name] with
               'match_status' and 'match_label', plus 'doc_raw_text'.

    Only records whose match_status is exactly "matched" are kept. The raw
    text of each kept record is tokenised with cleanText.
    (Author's note: this yielded 951 pairs on the original data set.)
    """
    matched = []
    for record in in_data:
        if record[task_name]['match_status'] == "matched":
            matched.append(record)

    raw_texts = [record['doc_raw_text'] for record in matched]
    labels = [record[task_name]['match_label'] for record in matched]
    return [(cleanText(raw), label) for raw, label in zip(raw_texts, labels)]

def get_task_labels(in_task):
    """
    Read matched_fd.json and return the (tokens, label) pairs for `in_task`.
    """
    return get_valid_label(in_task, read_json())

# Ordered (compiled pattern, replacement) rules applied by cleanText before
# lowercasing. Order matters: later rules rely on the output of earlier ones.
_CLEAN_TEXT_RULES = (
    # flatten line breaks
    (re.compile(r'\n|\r'), ' '),
    (re.compile(r'o clock', re.IGNORECASE), 'oclock'),
    # am / pm / dr. abbreviations. The letter look-arounds keep these rules from
    # firing inside ordinary words ("exam." used to lose its period, and the
    # "AM" inside "**NAME[" used to be rewritten, which disabled the name rule).
    (re.compile(r'(?<![a-z])p\.?m\.?(?![a-z])', re.IGNORECASE), 'pm'),
    (re.compile(r'(?<![a-z])a\.?m\.?(?![a-z])', re.IGNORECASE), 'am'),
    (re.compile(r'(?<![a-z])dr\.', re.IGNORECASE), 'dr'),
    # de-identification placeholders such as **NAME[AAA BBB] / **DATE[Jan 1 2010].
    # Each match stops at the first closing bracket; the previous greedy ".*"
    # ran to the last "]" in the document and swallowed the text in between.
    (re.compile(r'\*\*NAME[^\]]+\]'), 'nametoken'),
    (re.compile(r'\*\*DATE[^\]]+\]'), 'datetoken'),
    # drop question marks and apostrophes outright
    (re.compile(r"\?|'"), ''),
    # every other symbol except . ; : becomes a space (underscore and hyphen too)
    (re.compile(r'[^\w.;:]|_|-'), ' '),
    (re.compile(r'[0-9]+\.[0-9]+'), 'floattoken'),
    (re.compile(r'floattokencm'), 'floattoken cm'),
    # whitespace-delimited integers with 3+ digits. Look-arounds do not consume
    # the delimiters, so adjacent numbers and numbers at the very start or end
    # of the text are replaced as well.
    (re.compile(r'(?<!\S)[0-9]{3,}(?!\S)'), 'largeint'),
    # make the remaining punctuation stand-alone tokens
    (re.compile(r'\.'), ' . '),
    (re.compile(r':'), ' : '),
    (re.compile(r';'), ' ; '),
)


def cleanText(text):
    '''
    Normalise a raw report string and split it into lowercase tokens.

    text: the raw document as a str.

    Returns a list of str tokens. Line breaks are flattened, common
    abbreviations (o clock, a.m., p.m., dr.) are collapsed, **NAME[...] and
    **DATE[...] placeholders become 'nametoken' / 'datetoken', decimals become
    'floattoken', standalone integers of three or more digits become
    'largeint', and '.', ':' and ';' are emitted as separate tokens. All other
    punctuation is removed.
    '''
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = pattern.sub(replacement, text)

    # lowercase, then tokenize on whitespace
    return text.lower().split()


In [40]:
import pickle

def algo():
    """
    Prepare the data for one task: load, filter, split and index the test set.

    Reads the module-level parameters (task, rand_seed, num_cv, test_prop,
    val_prop, preloadedWV, min_df, cnn_seq_len, reverse_seq). For each fold it
    prints the size of the test split and its first and last items.

    Returns None. The encoded test set, validation split and word-vector
    matrix are local variables only; no model is trained or evaluated here.
    """
    # A RandomState with a fixed seed, so the helpers that receive it behave the same on every run.
    rand_state = np.random.RandomState(rand_seed)
    # Load the (tokens, label) pairs for the selected task.
    # Author's note: this gives 951 documents with their labels.
    data_label_pairs = get_task_labels(task)
    
    # Author's note: only labels with a frequency of 10 or more are kept here.
    # NOTE(review): the threshold lives in lib.data_handler.trim_cases_by_class - confirm.
    data_label_pairs = trim_cases_by_class(data_label_pairs)
    label_list = [x[1] for x in data_label_pairs] # take only the label of each pair
    label_encoder = LabelEncoder()
    label_encoder.fit(label_list)
    # NOTE(review): cv_list is only used by the commented-out index-based split below.
    cv_list = balancedCV(label_list,num_cv,rand_state)
    # NOTE(review): y_actual / y_pred are never appended to in this function.
    y_actual,y_pred = [],[]
    for this_cv in range(num_cv):
        
        # train/test split (author's note: "I did not understand this part")
        # Based on this_cv, work out which indices go to train and which go to test.
        # train_idx = [i for i,cv in enumerate(cv_list) if cv != this_cv]
        # test_idx = [i for i,cv in enumerate(cv_list) if cv == this_cv]
        # # take the data items at the indices found in train_idx / test_idx
        # train = [x for i,x in enumerate(data_label_pairs) if i in train_idx]
        # test = [x for i,x in enumerate(data_label_pairs) if i in test_idx]
        
        # NOTE(review): this split uses neither this_cv nor rand_state, so every
        # fold receives the same arguments (harmless while num_cv == 1).
        train,test = balanced_split_list(data_label_pairs,label_list,test_prop)
        
        # with open('test_data_2.pkl', 'wb') as file:
        #     pickle.dump(test, file)
        
        # Debug output: test-set size and its first and last items.
        print(len(test))
        print(test[0])
        print(test[-1])
        
        # Carve the validation split out of the remaining training data.
        train_label_list = [x[1] for x in train]
        train,val = balanced_split_list(train,train_label_list,val_prop)
        #get train vocab, initialize train wv matrix, token to wv_idx mappings
        vocab_counter = get_data_token_count(train)
        
        wv_mat, wv_to_idx = wv_initialize(preloadedWV,min_df,vocab_counter,rand_state)
        
        # Separate test tokens from labels, then map each document to index form.
        # NOTE(review): the literal 0 is presumably the padding index - confirm in cnn_tokensToIdx.
        test_tokens,test_y = list(zip(*test))
        test_x = [cnn_tokensToIdx(x,wv_to_idx,cnn_seq_len,0,reverse_seq) \
                 for x in test_tokens]
        
        # print(test_x[0])
        
        # Encode the string labels with the encoder fitted on all trimmed labels.
        test_y = label_encoder.transform(test_y)
        
        

In [41]:
# Run the data-preparation pipeline; it prints the test-set size and the first and last test items.
algo()


90
(['clinical', 'info', 'age', 'in', '70s', 'year', 'old', 'female', 'with', 'mediastinal', 'lung', 'mass', '.', 'bronchoscopy', 'shows', 'tumor', 'along', 'right', 'bronchus', '.', 'gross', 'a', '.', 'received', 'appropriately', 'labeled', 'with', 'the', 'patients', 'name', 'and', 'designated', 'as', 'brush', 'biopsy', 'of', 'right', 'mainstem', 'bronchus', 'is', 'fluid', 'submitted', 'in', 'cytolyt', 'fixative', 'which', 'is', 'used', 'to', 'make', 'one', 'monolayer', 'preparation', 'slide', 'labeled', 'as', 'a1', '.', 'also', 'received', 'are', 'six', 'slides', 'in', 'alcohol', 'which', 'are', 'subsequently', 'pap', 'stained', 'and', 'labeled', 'as', 'a2', 'a7', '.', 'b', '.', 'received', 'appropriately', 'labeled', 'with', 'the', 'patients', 'name', 'and', 'designated', 'as', 'bronchial', 'washing', '1', 'is', 'fluid', 'submitted', 'in', 'cytolyt', 'fixative', 'which', 'is', 'used', 'to', 'make', 'one', 'monolayer', 'preparation', 'slide', 'labeled', 'as', 'b1', '.', 'c', '.', 're