In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import numpy as np
import re
from numpy import dot
from numpy.linalg import norm
import math
import random

stop_words = set(stopwords.words('english')) 

In [3]:
# Extracting word vectors from biowordvec

biword2vec_text_file = "/efs/CONSORT2/BioWordVec/BioWordVec_PubMed_MIMICIII_d200.txt"

embeddings_dict = {}

N= 16545452

with open(biword2vec_text_file,buffering=100000) as infile:   
    next(infile)
    for i in range(N):
        line = next(infile).strip()
        items = line.split()
        word = items[0]
        vector = np.asarray(items[1:], "float32")
        embeddings_dict[word] = vector

In [4]:
# Functions to get sentence embeddings
def avg_feature_vector(sentence,embedding_dict): 
    sentence_embedding = np.zeros((200, ), dtype='float32')
    
    words = preprocess(sentence)
    
    for word in words:
        if word in embedding_dict.keys():
            word_embedding = embedding_dict[word]
            sentence_embedding = np.vstack((sentence_embedding,word_embedding))

    avg_vector = np.mean(sentence_embedding, axis=0)
    return avg_vector

def preprocess(string):
    string = str(string)
    string = re.sub(r'[^\w\s]','',string)
    if string != "":
        string = string.lower()
        tokenized = word_tokenize(string)
        tokens_without_sw = [word for word in tokenized if not word in stop_words]
    else:
        tokens_without_sw = []
    return tokens_without_sw


In [5]:
# Functions to get consine similarities and top 5

def get_cosine(vec1, vec2):
    return dot(vec1, vec2)/(norm(vec1)*norm(vec2))

import operator
def get_top_n_similarities(N,a):
    # Indices of N largest elements in list 
    # using sorted() + lambda + list slicing 
    res = sorted(range(len(a)), key=lambda i: a[i])[-N:]
    scores = []
    for index in res:
        scores.append(a[index])
    return res,scores

In [6]:
core_sentences = "/efs/CONSORT/skr-consort/datasets/TrainingDatasets/95.txt"
core_sentences_label_0 = "/efs/CONSORT/HeuristicsBasedAnnotations/label0_200.txt"

core_sentences_lst = []
core_labels_lst = []
f = open(core_sentences, "r")
for line in f.readlines():
    sentence_items = line.split("|")
    CONSORT_Item = sentence_items[0]
    sentence_text = sentence_items[2]
    sentence_text = sentence_text.replace("\n","")
    sentence_text = sentence_text.lower()
    
    core_sentences_lst.append(sentence_text)
    core_labels_lst.append(CONSORT_Item)
    
f_0 = open(core_sentences_label_0,"r")
for line in f_0.readlines():
    sentence_items = line.split("|")
    CONSORT_Item = "0"
    sentence_text = sentence_items[3]
    sentence_text = sentence_text.replace("\n","")
    sentence_text = sentence_text.lower()
    
    core_sentences_lst.append(sentence_text)
    core_labels_lst.append(CONSORT_Item)
    
    
core_sentences_df = pd.DataFrame(
    {'CONSORT_Item': core_labels_lst,
     'text': core_sentences_lst
    })

core_sentences_df['bio_word2vec'] = ""
for index, row in core_sentences_df.iterrows():
    core_sentence_text = row["text"]
    core_sentence_avg_feature_vector = avg_feature_vector(core_sentence_text, embeddings_dict)
    core_sentences_df.at[index, "bio_word2vec"] = core_sentence_avg_feature_vector
    
display(core_sentences_df)

Unnamed: 0,CONSORT_Item,text,bio_word2vec
0,3a,"this was a multicenter, stratified (6 to 11 ye...","[-0.06170522, 0.060617864, 0.056086227, -0.003..."
1,3a,"in 6 tertiary neonatal intensive care units, w...","[-0.17332593, -0.0594163, -0.09075071, 0.14803..."
2,3b,"during the trial, this committee recommended t...","[-0.065421075, -0.013819709, -0.12178633, 0.14..."
3,3b,the protocol was immediately amended in accord...,"[-0.025238823, 0.04458754, -0.04113689, 0.0440..."
4,4a,eligible participants were all adults aged 18 ...,"[0.18882585, 0.35402963, -0.0257231, -0.073712..."
...,...,...,...
260,0,"shoe widths are measured 3a, 2a, a.","[0.110955335, 0.09091401, 0.08594134, 0.414183..."
261,0,"psqi scores range from 0 to 21, with higher sc...","[0.17635523, 0.45296505, 0.16676855, 0.0235239..."
262,0,this is followed by heating on a boiling water...,"[-0.10503286, -0.03416787, -0.09765086, 0.2744..."
263,0,the application uses bluetooth to connect with...,"[-0.16153307, 0.36132634, -0.00096354017, 0.04..."


In [10]:
# get sentence embeddings for snorkel data
data_file = "/efs/CONSORT2/data/snorkel_data/all_data_SNORKEL_RESULTS_keywords_sectionheaders_threshold08.csv"
all_data_df = pd.read_csv(data_file)
print (len(all_data_df))

all_data_df['bio_word2vec'] = ""
all_data_df['core_sentence_index'] = ""
all_data_df['core_sentence_similarity_score'] = ""

count = 0
for index_snorkel, row_snorkel in all_data_df.iterrows():
    if (count%500 == 0):
        print (count)
    sentence_text = row_snorkel["text"]
    sentence_avg_feature_vector = avg_feature_vector(sentence_text, embeddings_dict)
    all_data_df.at[index_snorkel, "bio_word2vec"] = sentence_avg_feature_vector
    
    similarity_scores_list = []
    
    for index, row in core_sentences_df.iterrows():
        core_sentence_avg_feature_vector = row["bio_word2vec"]
        similarity_score = get_cosine(sentence_avg_feature_vector,core_sentence_avg_feature_vector)
        if isinstance(np.float32(similarity_score), np.floating) :
            similarity_scores_list.append(similarity_score)
        else:
            similarity_scores_list.append(0)
            
    top_5_similar_indexes, top_5_similar_scores = get_top_n_similarities(95,similarity_scores_list)
    
    all_data_df.at[index_snorkel,'core_sentence_index'] = top_5_similar_indexes
    all_data_df.at[index_snorkel,'core_sentence_similarity_score'] = top_5_similar_scores
        
    count += 1
all_data_df.to_csv("/efs/CONSORT2/data/snorkel_data/all_data_with_similar_sentences_indentified.csv")

1003604
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500


  after removing the cwd from sys.path.


12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
39000
39500
40000
40500
41000
41500
42000
42500
43000
43500
44000
44500
45000
45500
46000
46500
47000
47500
48000
48500
49000
49500
50000
50500
51000
51500
52000
52500
53000
53500
54000
54500
55000
55500
56000
56500
57000
57500
58000
58500
59000
59500
60000
60500
61000
61500
62000
62500
63000
63500
64000
64500
65000
65500
66000
66500
67000
67500
68000
68500
69000
69500
70000
70500
71000
71500
72000
72500
73000
73500
74000
74500
75000
75500
76000
76500
77000
77500
78000
78500
79000
79500
80000
80500
81000
81500
82000
82500
83000
83500
84000
84500
85000
85500
86000
86500
87000
87500
88000
88500
89000
89500
90000
90500
91000
91500
92000
92500
93000
93500
94000
94500
9500

610000
610500
611000
611500
612000
612500
613000
613500
614000
614500
615000
615500
616000
616500
617000
617500
618000
618500
619000
619500
620000
620500
621000
621500
622000
622500
623000
623500
624000
624500
625000
625500
626000
626500
627000
627500
628000
628500
629000
629500
630000
630500
631000
631500
632000
632500
633000
633500
634000
634500
635000
635500
636000
636500
637000
637500
638000
638500
639000
639500
640000
640500
641000
641500
642000
642500
643000
643500
644000
644500
645000
645500
646000
646500
647000
647500
648000
648500
649000
649500
650000
650500
651000
651500
652000
652500
653000
653500
654000
654500
655000
655500
656000
656500
657000
657500
658000
658500
659000
659500
660000
660500
661000
661500
662000
662500
663000
663500
664000
664500
665000
665500
666000
666500
667000
667500
668000
668500
669000
669500
670000
670500
671000
671500
672000
672500
673000
673500
674000
674500
675000
675500
676000
676500
677000
677500
678000
678500
679000
679500
680000
680500
681000

In [3]:
data_file = "/efs/CONSORT2/data/snorkel_data/all_data_with_similar_sentences_indentified.csv"
data_df = pd.read_csv(data_file)
print (len(data_df))

1003604


In [4]:
display(data_df.head(100))

Unnamed: 0.1,Unnamed: 0,PMID,sid,text,section_hierarchy,Snorkel_LF,label_model_probability,prediction_by_threshold_0.8,bio_word2vec,core_sentence_index,core_sentence_similarity_score
0,0,26330983,0,Competitive Protein-binding assay-based Enzyme...,['title'],"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0.058823529411764705, 0.058823529411764705, 0...",0,[ 0.11882991 0.05238667 -0.12661609 0.018826...,"[18, 113, 97, 6, 89, 166, 162, 247, 174, 150, ...","[0.76758045, 0.767598, 0.7677218, 0.7680465, 0..."
1,1,26330983,3,The most reliable indicator of Vitamin D statu...,"['Background:', 'abstract']","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0.058823529411764705, 0.058823529411764705, 0...",['0'],[ 0.10210992 -0.02056662 -0.06992093 -0.006068...,"[233, 4, 6, 72, 163, 125, 30, 143, 31, 190, 47...","[0.71284723, 0.7128486, 0.71321136, 0.713344, ..."
2,2,26330983,4,This study was performed to compare commonly u...,"['Background:', 'abstract']","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0.058823529411764705, 0.058823529411764705, 0...",['0'],[ 0.00582925 0.03739018 -0.09997098 -0.062455...,"[140, 65, 150, 162, 125, 171, 233, 61, 97, 259...","[0.7059613, 0.7062894, 0.70657057, 0.70758903,..."
3,3,26330983,6,Concentrations of 25(OH) D in sera from 257 ra...,"['Methods:', 'abstract']","[-1, -1, -1, -1, -1, -1, -1, 7, -1, -1, -1, -1...","[0.036039944851563165, 0.03585632650051867, 0....",['0'],[ 0.23099278 0.14131418 0.11131693 0.069183...,"[159, 76, 27, 242, 162, 83, 248, 110, 209, 89,...","[0.75662655, 0.75722045, 0.75734884, 0.7573662..."
4,4,26330983,8,Mean 25(OH) D concentration was 22 ± 18.8 and ...,"['Results:', 'abstract']","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0.058823529411764705, 0.058823529411764705, 0...",['0'],[ 4.64382261e-01 1.05110429e-01 1.75463334e-...,"[100, 2, 173, 164, 24, 68, 168, 52, 224, 123, ...","[0.6220976, 0.62360024, 0.6236373, 0.62492293,..."
...,...,...,...,...,...,...,...,...,...,...,...
95,95,32053942,11,This supplementation also induced a rearrangem...,['abstract'],"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0.058823529411764705, 0.058823529411764705, 0...",['0'],[ 2.52154022e-02 3.92540358e-02 8.30058455e-...,"[85, 20, 161, 141, 208, 47, 56, 61, 118, 6, 90...","[0.7011961, 0.70127064, 0.701628, 0.7022391, 0..."
96,96,32053942,12,"In the clinical study, melon concentrate was a...",['abstract'],"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0.058823529411764705, 0.058823529411764705, 0...",['0'],[ 1.4819278e-01 1.6439028e-01 1.7522444e-01 ...,"[93, 38, 48, 147, 49, 18, 83, 173, 181, 133, 1...","[0.75518334, 0.7562241, 0.75633436, 0.75770056..."
97,97,32053942,13,"Besides, magnesium (Mg) plasma level was highe...",['abstract'],"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 9, -1, -1...","[0.03635834963330945, 0.03627712989887287, 0.0...",['0'],[ 4.5112379e-02 1.5270595e-01 -4.4477526e-03 ...,"[124, 77, 29, 0, 78, 243, 162, 16, 197, 80, 23...","[0.79743904, 0.798242, 0.7982564, 0.79865235, ..."
98,98,32053942,14,"Therefore, the melon concentrate allowed a bet...",['abstract'],"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0.058823529411764705, 0.058823529411764705, 0...",['0'],[ 0.05415653 -0.0597183 0.08344594 0.088950...,"[60, 1, 39, 162, 184, 214, 56, 109, 10, 132, 1...","[0.7495179, 0.74965525, 0.74974644, 0.74987906..."


In [13]:
display(all_data_df.head(100))

Unnamed: 0,PMID,sid,text,section_hierarchy,Snorkel_LF,label_model_probability,prediction_by_threshold_0.8,bio_word2vec,core_sentence_index,core_sentence_similarity_score
0,26330983,0,Competitive Protein-binding assay-based Enzyme...,['title'],"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0.058823529411764705, 0.058823529411764705, 0...",0,"[0.11882991, 0.05238667, -0.12661609, 0.018826...","[18, 113, 97, 6, 89, 166, 162, 247, 174, 150, ...","[0.76758045, 0.767598, 0.7677218, 0.7680465, 0..."
1,26330983,3,The most reliable indicator of Vitamin D statu...,"['Background:', 'abstract']","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0.058823529411764705, 0.058823529411764705, 0...",['0'],"[0.102109924, -0.020566616, -0.06992093, -0.00...","[233, 4, 6, 72, 163, 125, 30, 143, 31, 190, 47...","[0.71284723, 0.7128486, 0.71321136, 0.713344, ..."
2,26330983,4,This study was performed to compare commonly u...,"['Background:', 'abstract']","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0.058823529411764705, 0.058823529411764705, 0...",['0'],"[0.0058292467, 0.037390184, -0.09997098, -0.06...","[140, 65, 150, 162, 125, 171, 233, 61, 97, 259...","[0.7059613, 0.7062894, 0.70657057, 0.70758903,..."
3,26330983,6,Concentrations of 25(OH) D in sera from 257 ra...,"['Methods:', 'abstract']","[-1, -1, -1, -1, -1, -1, -1, 7, -1, -1, -1, -1...","[0.036039944851563165, 0.03585632650051867, 0....",['0'],"[0.23099278, 0.14131418, 0.111316934, 0.069183...","[159, 76, 27, 242, 162, 83, 248, 110, 209, 89,...","[0.75662655, 0.75722045, 0.75734884, 0.7573662..."
4,26330983,8,Mean 25(OH) D concentration was 22 ± 18.8 and ...,"['Results:', 'abstract']","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0.058823529411764705, 0.058823529411764705, 0...",['0'],"[0.46438226, 0.10511043, 0.17546333, 0.1659763...","[100, 2, 173, 164, 24, 68, 168, 52, 224, 123, ...","[0.6220976, 0.62360024, 0.6236373, 0.62492293,..."
...,...,...,...,...,...,...,...,...,...,...
95,32053942,11,This supplementation also induced a rearrangem...,['abstract'],"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0.058823529411764705, 0.058823529411764705, 0...",['0'],"[0.025215402, 0.039254036, 0.083005846, 0.1309...","[85, 20, 161, 141, 208, 47, 56, 61, 118, 6, 90...","[0.7011961, 0.70127064, 0.701628, 0.7022391, 0..."
96,32053942,12,"In the clinical study, melon concentrate was a...",['abstract'],"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0.058823529411764705, 0.058823529411764705, 0...",['0'],"[0.14819278, 0.16439028, 0.17522444, 0.1479417...","[93, 38, 48, 147, 49, 18, 83, 173, 181, 133, 1...","[0.75518334, 0.7562241, 0.75633436, 0.75770056..."
97,32053942,13,"Besides, magnesium (Mg) plasma level was highe...",['abstract'],"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 9, -1, -1...","[0.03635834963330945, 0.03627712989887287, 0.0...",['0'],"[0.04511238, 0.15270595, -0.0044477526, 0.1763...","[124, 77, 29, 0, 78, 243, 162, 16, 197, 80, 23...","[0.79743904, 0.798242, 0.7982564, 0.79865235, ..."
98,32053942,14,"Therefore, the melon concentrate allowed a bet...",['abstract'],"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0.058823529411764705, 0.058823529411764705, 0...",['0'],"[0.054156534, -0.059718303, 0.08344594, 0.0889...","[60, 1, 39, 162, 184, 214, 56, 109, 10, 132, 1...","[0.7495179, 0.74965525, 0.74974644, 0.74987906..."
