# TREC

In [2]:
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
import json

In [3]:
data = datasets.load_dataset("trec")

Downloading builder script: 100%|██████████| 5.09k/5.09k [00:00<00:00, 9.54MB/s]
Downloading metadata: 100%|██████████| 3.34k/3.34k [00:00<00:00, 5.83MB/s]
Downloading readme: 100%|██████████| 10.6k/10.6k [00:00<00:00, 14.8MB/s]
Downloading data: 100%|██████████| 336k/336k [00:00<00:00, 393kB/s]
Downloading data: 100%|██████████| 23.4k/23.4k [00:00<00:00, 140kB/s] 
Downloading data files: 100%|██████████| 2/2 [00:05<00:00,  2.72s/it]
Generating train split: 100%|██████████| 5452/5452 [00:00<00:00, 29770.81 examples/s]
Generating test split: 100%|██████████| 500/500 [00:00<00:00, 24882.56 examples/s]


In [9]:
data["train"].to_json("./preprocess/trec_train.jsonl")

Creating json from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 269.63ba/s]


520661

In [16]:
data_dict = {}

for split in data:
    split_dict = {}
    for i in range(len(data[split])):
        split_dict[i] = data[split][i]
    data_dict[split] = split_dict
        
with open("./preprocess/trec_split.json", "w") as f:
    json.dump(data_dict,f)

In [13]:
len(data["train"])

5452

# Penn Treebank

In [3]:
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /home/lee-j/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [7]:

ruleset = set(rule for tree in nltk.corpus.treebank.parsed_sents() 
           for rule in tree.productions())
with open("preprocess/cfg.txt", "w") as f:
    for rule in ruleset:
        f.write(str(rule) + "\n")

# coreference

In [7]:
import coreferee, spacy, spacy_transformers
# !python -m spacy download en_core_web_trf
# !python -m spacy download en_core_web_lg
# python -m coreferee install en
nlp = spacy.load('en_core_web_trf')
nlp.add_pipe('coreferee')


<coreferee.manager.CorefereeBroker at 0x7f30d121cfa0>

In [None]:
<coreferee.manager.CorefereeBroker object at 0x000002DE8E9256D0>
>>>


In [8]:
doc = nlp("Although he was very busy with his work, Peter had had enough of it. He and his wife decided they needed a holiday. They travelled to Spain because they loved the country very much.")

doc._.coref_chains.print()


0: he(1), his(6), Peter(9), He(16), his(18)
1: work(7), it(14)
2: [He(16); wife(19)], they(21), They(26), they(31)
3: Spain(29), country(34)


In [12]:
doc[16]._.coref_chains.print()


0: he(1), his(6), Peter(9), He(16), his(18)
2: [He(16); wife(19)], they(21), They(26), they(31)


In [11]:
doc._.coref_chains.resolve(doc[31])


[Peter, wife]

# POS tagging

In [14]:
import nltk

In [15]:
tags = [one[1].lower() for one in nltk.pos_tag(nltk.word_tokenize("Although he was very busy with his work, Peter had had enough of it. He and his wife decided they needed a holiday. They travelled to Spain because they loved the country very much."))]

In [16]:
tags

['in',
 'prp',
 'vbd',
 'rb',
 'jj',
 'in',
 'prp$',
 'nn',
 ',',
 'nnp',
 'vbd',
 'vbn',
 'rb',
 'in',
 'prp',
 '.',
 'prp',
 'cc',
 'prp$',
 'nn',
 'vbd',
 'prp',
 'vbd',
 'dt',
 'nn',
 '.',
 'prp',
 'vbd',
 'to',
 'nnp',
 'in',
 'prp',
 'vbd',
 'dt',
 'nn',
 'rb',
 'rb',
 '.']

# semantic similarity

In [20]:
import spacy 
  
nlp = spacy.load('en_core_web_lg') 
  
print("Enter two space-separated words") 
words = input() 
  
tokens = nlp(words) 
  
for token in tokens: 
    # Printing the following attributes of each token. 
    # text: the word string, has_vector: if it contains 
    # a vector representation in the model,  
    # vector_norm: the algebraic norm of the vector, 
    # is_oov: if the word is out of vocabulary. 
    print(token.text, token.has_vector, token.vector_norm, token.is_oov) 
  
token1, token2 = tokens[0], tokens[1] 
  
print("Similarity:", token1.similarity(token2)) 

Enter two space-separated words
hot True 74.552574 False
warm True 59.266777 False
Similarity: 0.5941726565361023


In [21]:
print("Enter two space-separated words") 
words = input() 
  
tokens = nlp(words) 
  
for token in tokens: 
    # Printing the following attributes of each token. 
    # text: the word string, has_vector: if it contains 
    # a vector representation in the model,  
    # vector_norm: the algebraic norm of the vector, 
    # is_oov: if the word is out of vocabulary. 
    print(token.text, token.has_vector, token.vector_norm, token.is_oov) 
  
token1, token2 = tokens[0], tokens[1] 
  
print("Similarity:", token1.similarity(token2)) 

Enter two space-separated words
warm True 59.266777 False
cool True 46.042625 False
Similarity: 0.6782691478729248


# Word level embedding

In [9]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

docs = ['how did serfdom develop in and then leave russia ?', 'what films featured the character popeye doyle ?', 'how can I find a list of celebrities \' real names ?']
docs_embeddings = embedding_model.encode(docs)
word_embeddings = embedding_model.encode(docs, output_value="token_embeddings")

token_ids = []
token_strings = []
tokenizer = embedding_model._first_module().tokenizer

word_emb_set = set()
word_emb_idx_set = set()

for doc in docs:
    ids = tokenizer.encode(doc)
    strings = tokenizer.convert_ids_to_tokens(ids)
    token_ids.append(ids)
    token_strings.append(strings)
    
    for id in ids[1:-1]:
        
        
        
        




In [10]:
ids

[101,
 2129,
 2064,
 1045,
 2424,
 1037,
 2862,
 1997,
 12330,
 1005,
 2613,
 3415,
 1029,
 102]

In [2]:
token_ids

[[101, 7632, 102], [101, 7592, 102], [101, 2129, 2024, 2017, 102]]

In [7]:
token_strings

[['[CLS]', 'hi', '[SEP]'],
 ['[CLS]', 'hello', '[SEP]'],
 ['[CLS]', 'how', 'are', 'you', '[SEP]']]

# MedQuad

In [7]:
import os
import xml.etree.ElementTree as ET

In [16]:

folders = os.listdir('./MedQuAD')
folders.remove('.git')
folders.remove('LICENSE.txt')
folders.remove('readme.txt')
folders.remove('__MACOSX')
folders.remove('QA-TestSet-LiveQA-Med-Qrels-2479-Answers.zip')
folders.remove('QA-TestSet-LiveQA-Med-Qrels-2479-Answers')
folders

['5_NIDDK_QA',
 '12_MPlusHerbsSupplements_QA',
 '1_CancerGov_QA',
 '6_NINDS_QA',
 '10_MPlus_ADAM_QA',
 '8_NHLBI_QA_XML',
 '4_MPlus_Health_Topics_QA',
 '2_GARD_QA',
 '7_SeniorHealth_QA',
 '3_GHR_QA',
 '11_MPlusDrugs_QA',
 '9_CDC_QA']

In [30]:
qns = []
labels = []

for folder in folders:
    files = os.listdir(f'./MedQuAD/{folder}')
    for file in files:
        try:
            tree = ET.parse(f'./MedQuAD/{folder}/{file}')
        except:
            print(folder, file)
        root = tree.getroot()
        qapairs = root.find('./QAPairs')
        try:
            for qapair in qapairs:
                for item in qapair:
                    if item.tag == "Question":
                        qns.append(item.text)
                        labels.append(item.attrib['qtype'])
        except:
            print(folder, file)
            
assert len(qns) == len(labels)
print(len(qns))
        
for i in range(200,500,7):
    print(qns[i], ':', labels[i])

47441
What are the treatments for Kidney Failure: Choosing a Treatment That's Right for You ? : treatment
What are the complications of Vesicoureteral Reflux ? : complications
What is (are) Childhood Nephrotic Syndrome ? : information
How to prevent Childhood Nephrotic Syndrome ? : prevention
What are the symptoms of Anemia in Chronic Kidney Disease ? : symptoms
What is (are) Diabetic Neuropathies: The Nerve Damage of Diabetes ? : information
What is (are) Diabetic Neuropathies: The Nerve Damage of Diabetes ? : information
What are the symptoms of Prevent diabetes problems: Keep your kidneys healthy ? : symptoms
Who is at risk for What I need to know about Hepatitis C? ? : susceptibility
What to do for What I need to know about Hepatitis C ? : considerations
What to do for What I need to know about Gas ? : considerations
What to do for Proteinuria ? : considerations
What are the treatments for Foodborne Illnesses ? : treatment
What are the treatments for Nonalcoholic Steatohepatitis ? 

In [37]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(labels)
labels_int = le.transform(labels)
max(labels_int)

38

In [38]:
le.classes_

array(['brand names', 'brand names of combination products', 'causes',
       'complications', 'considerations', 'contraindication', 'dietary',
       'dose', 'emergency or overdose', 'exams and tests',
       'forget a dose', 'frequency', 'genetic changes',
       'how can i learn more', 'how does it work', 'how effective is it',
       'interactions with foods',
       'interactions with herbs and supplements',
       'interactions with medications', 'other information', 'outlook',
       'precautions', 'prevention', 'research', 'severe reaction',
       'side effects', 'stages', 'storage and disposal', 'support groups',
       'susceptibility', 'symptoms', 'treatment', 'usage',
       'when to contact a medical professional', 'why get vaccinated'],
      dtype='<U39')

In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(qns, labels_int, test_size=0.2, random_state=42) 

In [33]:
import json

In [41]:
train = {}
test = {}
with open('medquad_split_json', 'w') as f:
    for i in range(len(X_train)):
        sample_dict = {}
        sample_dict['text'] = X_train[i]
        sample_dict['label'] = str(y_train[i])
        train[i] = sample_dict
    for i in range(len(X_test)):
        sample_dict = {}
        sample_dict['text'] = X_test[i]
        sample_dict['label'] = str(y_test[i])
        test[i] = sample_dict
    medquad = {'train': train, 'test': test}
    json.dump(medquad, f, indent=2)
        
        