In [1]:
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite import CRF
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from spacy.lang.en import English
from spacy import displacy
from pathlib import Path
import joblib
from spacy.tokens import Doc
from typing import List
import random
from statistics import mean, stdev
from loguru import logger
import sys
import spacy
from medspacy_io.reader.brat_reader import BratDocReader, BratDirReader
import medspacy

## initialize brat reader

In [2]:
# Locations of the cleaned n2c2 corpora, relative to the notebook's working directory.
# The paths are assembled from components so the separator matches the host OS
# (hard-coded backslashes only resolve on Windows). They are kept as plain strings,
# which is what the rest of the notebook passes around.
cleaned_train_dir = str(Path('..', 'data', 'n2c2', 'cleaned_training'))
cleaned_test_dir = str(Path('..', 'data', 'n2c2', 'cleaned_test'))
# Sanity check displayed as the cell output: both directories are expected to exist.
Path(cleaned_train_dir).exists(), Path(cleaned_test_dir).exists()

(True, True)

In [3]:
# Load spaCy's small English pipeline with the 'ner' component disabled at load time.
nlp=spacy.load('en_core_web_sm', disable=['ner'])

In [4]:
# Directory-level brat reader that uses the pipeline loaded above. The annotation schema
# is taken from annotation.conf inside the training directory.
# support_overlap=True is passed; presumably this keeps overlapping annotations — confirm against medspacy_io.
dir_reader = BratDirReader(nlp=nlp, schema_file=str(Path(cleaned_train_dir, 'annotation.conf')), support_overlap=True)

## Read brat annotations | load from pickle

In [5]:
# Cache file for the parsed (train_docs, test_docs) pair. Built from path components so
# it resolves on any OS; on Windows the resulting string is identical to the old literal.
pickle_file = str(Path('..', 'data', 'n2c2', 'spacy_docs.joblib'))

In [6]:
# Parse the brat corpora once and cache the result; later executions reuse the cache.
if Path(pickle_file).exists():
    print(f'{pickle_file} already exists, load them directly')
    # The reader has to be instantiated (see the earlier cell) before loading: per the
    # original author's note, some Doc extensions that store metadata are not recreated
    # automatically when the docs are unpickled.
    # NOTE(review): joblib.load unpickles arbitrary objects; only load files you trust.
    train_docs, test_docs=joblib.load(pickle_file)
else:
    # No cache yet: read both corpora, report their sizes, then write the cache.
    train_docs=dir_reader.read(txt_dir=cleaned_train_dir)
    test_docs=dir_reader.read(txt_dir=cleaned_test_dir)
    print(len(train_docs), len(test_docs))
    joblib.dump((train_docs, test_docs), pickle_file)

..\data\n2c2\spacy_docs.joblib already exists, load them directly


## CRF Wrapper (only used for evaluation)

In [7]:
from CRFWrapper_Sentence import spans_to_bio, convert_docs, word2features, sent2features,compute_metrics_and_averages,  CRFModel


In [8]:
## Get all annotation types:
# Every span-group key present on any training doc counts as one annotation type.
annos = {span_key for doc in train_docs for span_key in doc.spans.keys()}
print(annos)

{'Form', 'Reason', 'Strength', 'Drug', 'Duration', 'Route', 'ADE', 'Frequency', 'Dosage'}


## Convert docs into a sentence-level dataframe

In [9]:
from ALLSampler_Sentence import SamplingSimulator, ModelSamplingSimulator, VBSamplingSimulator, convert_docs_medspacyIOvec

In [10]:
# Label frame for the training docs, produced by the project helper
# convert_docs_medspacyIOvec. It is joined with the sentence embeddings on the
# 'sentence' column in a later cell.
sdf_labels_train=convert_docs_medspacyIOvec(train_docs)

In [11]:
# convert_docs returns a pair; only the second element is kept (as train_df).
# The annotation types collected above are passed as anno_types.
_, train_df=convert_docs(train_docs, anno_types=annos)

In [12]:
# Same conversion for the held-out docs; the second element of the returned pair is
# kept as test_df and later handed to the simulator as eval_sents.
_, test_df=convert_docs(test_docs, anno_types=annos)

In [13]:
# embedding for unique sentence
# Pre-computed sentence embeddings, loaded from a joblib cache. The path is built from
# components so it resolves on any OS (same string as the old literal on Windows).
pickle_embedding_file = str(Path('..', 'data', 'n2c2', 'embedding_df_uniqueSentID.joblib'))
if not Path(pickle_embedding_file).exists():
    # Fail here with an actionable message; silently skipping the load would leave
    # embedding_df undefined and surface later as an unrelated NameError.
    raise FileNotFoundError(f'Embedding cache not found: {pickle_embedding_file}')
# NOTE(review): joblib.load unpickles arbitrary objects; only load files you trust.
embedding_df=joblib.load(pickle_embedding_file)

In [14]:
# Attach the embedding columns to the label rows; the inner join keeps only rows whose
# 'sentence' value appears in both frames.
# NOTE(review): if 'sentence' is not unique in either frame, this join multiplies rows —
# confirm the resulting row count is the intended one.
sdf_labels_sid = sdf_labels_train.merge(embedding_df, how='inner', on='sentence') 

In [15]:
# NOTE(review): in the notebook as saved, vb_simulator is first assigned inside the
# bootstrap loop of a later cell, so this cell raises NameError under
# Restart Kernel -> Run All (see the recorded output). Consider moving it below that
# loop or removing it; the two summary cells that follow read `scores` from here.
scores=vb_simulator.simulate_rounds(boostrap_times=3)

NameError: name 'vb_simulator' is not defined

In [None]:
def compute_mean_ci(scores):
    """Summarise a sample of scores by its mean and a 95% percentile interval.

    Args:
        scores: array-like of numeric values accepted by ``np.mean`` and
            ``np.percentile``.

    Returns:
        A ``(mean, bounds)`` pair, where ``bounds`` is the array returned by
        ``np.percentile`` for the 2.5th and 97.5th percentiles (lower first).
    """
    percentile_bounds = [2.5, 97.5]
    return np.mean(scores), np.percentile(scores, percentile_bounds)

# One column per metric mean plus its lower/upper bound. The bound columns are named
# by the metric's first letter followed by 'l' or 'u' (e.g. 'pl'/'pu' for precision).
summary = {}
for metric_name in ('precision', 'recall', 'f1'):
    summary[metric_name] = []
    summary[metric_name[0] + 'l'] = []
    summary[metric_name[0] + 'u'] = []

# Each element of `scores` maps a metric name to its sampled values; reduce every
# metric to mean / lower / upper and append one entry per element.
for round_scores in scores:
    for metric_name, values in round_scores.items():
        mean_value, bounds = compute_mean_ci(values)
        lower, upper = bounds
        prefix = metric_name[0]
        summary[metric_name].append(mean_value)
        summary[f'{prefix}l'].append(lower)
        summary[f'{prefix}u'].append(upper)

In [None]:
# Render floats with thousands separators and five decimals, then show the summary table.
pd.set_option('display.float_format', '{:,.5f}'.format)
pd.DataFrame(summary)

In [16]:
# Size check of the merged label/embedding frame, displayed as (rows, columns).
sdf_labels_sid.shape

(638692, 6)

## bootstrap 3 runs

In [17]:
# Drop loguru's existing handlers and log to stderr at INFO level and above.
logger.remove()
# The id of the newly added handler is the value displayed as the cell output.
logger.add(sys.stderr, level='INFO')

1

In [18]:
# Experiment size. The "boostrap" spelling is kept because another cell references this name.
boostrap_runs=3  # number of independent runs; one random seed is drawn per run
total_round=10  # presumably the number of sampling rounds per run — confirm it is consumed downstream

In [19]:
# Seed Python's global RNG with a fixed value so the per-run seeds are reproducible,
# then draw one integer seed in [1, 10000000] for each bootstrap run.
random.seed(14)
seeds=[random.randint(1,10000000) for  _ in range(boostrap_runs)]
# Display the drawn seeds as the cell output.
seeds

[1792286, 8843471, 4142887]

In [20]:
# Run the VB sampling simulation once per seed and collect the scores of every run.
NUM_PER_ROUND = 200    # assigned to vb_simulator.num_per_round; the run log shows 200 sentences added per round
BOOTSTRAP_TIMES = 500  # passed to simulate_rounds(boostrap_times=...)
SDF_LABEL_COLUMNS = ['sentence', 'concept', 'y', 'doc_name', 'sentence_id']

all_scores=[]
# Paths are built from components so they resolve on any OS (same strings as before on Windows).
# Alternative embedding cache noted by the original author: embedding_df.joblib
pickle_embedding_file = str(Path('..', 'data', 'n2c2', 'embedding_df_uniqueSentID.joblib'))
faiss_index_path = str(Path('..', 'data', 'n2c2', 'faiss_index_st768'))
# NOTE(review): joblib.load unpickles arbitrary objects; only load files you trust.
embedding_df=joblib.load(pickle_embedding_file)
for si, seed in enumerate(seeds):
    logger.info(f'start run {si}.')
    # A fresh model wrapper and simulator per run, so no state carries over between seeds.
    crf_model=CRFModel(anno_types=annos, topNUncertainToken=2)
    vb_simulator=VBSamplingSimulator(total_sents=train_df,
                                     # taken from the config cell (value 10) instead of a duplicated literal
                                     total_round=total_round,
                                     modelWrapper=crf_model,
                                     eval_sents=test_df,
                                     init_seed=seed,
                                     sample_all_on_last_round=False,
                                     faiss_index_path=faiss_index_path,
                                     embedding_df=embedding_df,
                                     # sliced inside the loop so every run receives its own frame, as before
                                     sdf_labels=sdf_labels_sid[SDF_LABEL_COLUMNS],
                                     min_dist_diff=True
                                    )
    vb_simulator.num_per_round=NUM_PER_ROUND
    scores=vb_simulator.simulate_rounds(boostrap_times=BOOTSTRAP_TIMES)
    all_scores.append(scores)

[32m2024-07-16 01:12:24.502[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mstart run 0.[0m
[32m2024-07-16 01:12:25.281[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m99[0m - [1msimulate round 0.[0m
[32m2024-07-16 01:12:25.687[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m89[0m - [1mcurrent sampled sentences: 200, remaining sentences: 51598[0m
[32m2024-07-16 01:21:31.058[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m99[0m - [1msimulate round 1.[0m
[32m2024-07-16 01:21:37.870[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36mkeep_sample[0m:[36m89[0m - [1mcurrent sampled sentences: 400, remaining sentences: 51398[0m
[32m2024-07-16 01:30:52.776[0m | [1mINFO    [0m | [36mALLSampler_Sentence[0m:[36msimulate_rounds[0m:[36m99[0m - [1msimulate round 2.[0m
[32m2024-07-16 01:30:59.635[0m | [1mINFO    [0m | [36mALL

In [21]:
# Persist the scores of all runs. The output folder is created first so that a missing
# directory cannot discard the results of the long-running simulation above.
scores_file = r'../data/n2c2/scores_sentence_sampling/ner_VBmin_scores_sentenceSampling_500bootstrap.joblib'
Path(scores_file).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(all_scores, scores_file)


['../data/n2c2/scores_sentence_sampling/ner_VBmin_scores_sentenceSampling_500bootstrap.joblib']