In [29]:
import pandas as pd
import numpy as np
import re
import nltk
import math
import string
from scipy import spatial
from gensim.models import Word2Vec
import pickle

## COVID Hackerthon: Word2Vec 
**Purpose of this notebook:** Identify specific characteristics of patients within a population cohort that may be the best candidates of a COVID-19 clinical trial using Word2Vec.

Resources used:
- https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92
- https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
- https://stackabuse.com/implementing-word2vec-with-gensim-library-in-python/
- https://machinelearningmastery.com/develop-word-embeddings-python-gensim/

## Functions

In [4]:
def tokenization(string):
    """
    Takes a string (document) and tokanizes it.
    """
    
    # convert into sentences
    all_sentences = nltk.sent_tokenize(string)

    # convert sentences into words
    all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

    # remove punctuation
    all_words_no_punc = [[word for word in all_word if word.isalnum()] for all_word in all_words]
    
    # return 
    return all_words_no_punc

def cosine_sim(lst_1, lst_2):
    """
    Takes two lists and calculates their cosine similarity.
    """
        
    return 1 - spatial.distance.cosine(lst_1, lst_2)

## 1. Data Exploration

In [5]:
# load data
dat = pd.read_excel("Final_Hackathon_1082Patients.xlsx", index_col="Unnamed: 0")

# head dat
dat.head()

Unnamed: 0,CLINICAL NOTES
1081,new COVID-19 patient confirmed in Switzerland:...
1080,new COVID-19 patient confirmed in Croatia: mal...
1079,new COVID-19 patient confirmed in Algeria: mal...
1078,new COVID-19 patient confirmed in Afghanistan:...
1077,"new COVID-19 patient confirmed in Austria: 24,..."


In [6]:
# find len of the docs
len_doc = []
for i in range(len(dat)):
    len_doc.append(len(dat.iloc[i,0].split()))

# average len
print("Average number of tokens in the doc is {:.0f}.".format(np.mean(len_doc)))

# total tokes in the 
print("The total number of tokens in the doc is {:.0f}.".format(np.sum(len_doc)))

Average number of tokens in the doc is 1215.
The total number of tokens in the doc is 1312946.


## 2. Data Preprocessing

In [7]:
# convert to array
arr = dat.values

# all lower cases
lst_prepro = []
for i in arr:
    lst_prepro.append(i[0].lower().replace("-", ""))
    
# split() to find index on all words
lst_prepro_split = []
for i in lst_prepro:
    lst_prepro_split.append(i.split())

## 3. Word2Vec (own training)

In [8]:
# create one document to fit the model on
doc_train = " ".join(lst_prepro)

In [9]:
# tokenize entire document
all_words_no_punc = tokenization(doc_train)

In [10]:
# fit model with default parameters
embedding = Word2Vec(all_words_no_punc, min_count=1)

In [11]:
# find vocabulary
vocab = list(embedding.wv.vocab)
print("Length of the vocabulary is:", len(vocab))

Length of the vocabulary is: 25211


In [12]:
# find similar words
embedding.wv.most_similar('bmi')

[('56yearold', 0.7957502603530884),
 ('gentleman', 0.7912894487380981),
 ('50yearold', 0.7822447419166565),
 ('62yearold', 0.7819993495941162),
 ('76yo', 0.7772549390792847),
 ('22f', 0.7675936222076416),
 ('woman', 0.7593116760253906),
 ('man', 0.7561774253845215),
 ('yearold', 0.7482136487960815),
 ('mmp', 0.7461156249046326)]

In [13]:
# embedding for a word
tst = embedding.wv.__getitem__("covid19")
print("Length of the embedding is:", len(tst))
print(tst)

Length of the embedding is: 100
[ 3.6991994e+00  2.9168355e-01 -2.7359939e+00 -2.5249883e-03
  6.8504447e-01 -3.5621521e-01 -2.1205173e+00  2.8659716e+00
 -2.4683878e+00 -1.2167168e+00  1.0319304e+00 -2.7930064e+00
 -1.2197591e+00  2.3394346e+00  2.8453608e+00  1.5508924e-01
  1.8451068e+00  5.2849402e+00  2.0767226e+00  4.3833299e+00
  2.4888189e+00  2.6452329e+00 -2.2902064e+00  2.6005089e+00
 -8.7425131e-01 -4.4344461e-01  1.2826571e+00  4.6215571e-02
 -2.5944769e-01 -9.0982574e-01  2.3621571e+00  1.5400938e+00
  2.4593534e+00 -2.8693128e+00  1.3186111e+00 -1.2558672e+00
  1.4995192e+00 -1.6397364e-01  6.6479486e-01 -1.7692365e-02
  3.4675066e+00  1.5277199e+00  7.5546846e-02  6.0425329e-01
  3.8559443e-01 -1.3586041e+00 -2.3652210e+00 -2.4005497e+00
 -3.2230418e+00 -7.5790799e-01  4.0161591e+00  4.5248141e+00
  3.3418289e-01 -3.3568916e-01 -1.5802268e+00 -3.7338775e-02
 -2.1363802e+00 -2.9060760e+00 -4.0148187e-01 -3.1622691e+00
  4.1173381e-01  4.3294840e+00 -1.0095180e-01  8.3966

In [14]:
# create dictionary with embeddings
embedding_dct = {}
for i in vocab:
    embedding_dct[i]=list(embedding.wv.__getitem__(i))

## 4. Find relevant section in original documents
- The relevnat section will be the sentence that includes the word with the highest cosine similarity to the defined search word

### - find the cosine similarity for each word in the document to the search terms

In [15]:
# strip punctuation for all documents
lst_prepro_split_clean = []

for i in range(len(lst_prepro_split)):
    lst_prepro_split_clean.append([i.replace(".", "").replace(",", "").replace(":", "") for i in lst_prepro_split[i]])

In [16]:
# embedded representation for all documents

doc_embedded = []

# loop over all documents
for i in range(len(lst_prepro_split_clean)):
    # initialize temp lst for each document 
    temp = []
    # loop over each token in the document
    for token in lst_prepro_split_clean[i]:
        # try to find the respective embedding
        try:
            temp.append(embedding_dct[token])
        # cant find embedding, this happens because data preprocessing differs, append embedding of 0 (cosine similarity to that will be nan)
        except:
            temp.append(0)
    # append temp to doc_embedded
    doc_embedded.append(temp)

# error handling
if len(lst_prepro_split_clean) != len(doc_embedded):
    print("Error, lists have different lengths")

In [17]:
# search list
search_list = ['positive', 'covid19', 'age', 'year', 'female','male', 'man', 'woman', 'bmi','kg', 'consent','agree', 'allergies', 'hiv', 'immunodeficiency', 
               'virus', 'hepatitis', 'chronic', 'hepatic', 'drug', 'opioids', "alt", "tall", "feet", "height"]

In [None]:
# iterate over all search terms to find the cosine similarity between each word in the document and the search term
for search_term in search_list:
    
    #pnitor process
    print(search_term)
    
    # define search vector
    search = embedding_dct[search_term]

    # find cosine similarities of all words to the search term
    cosine_similarity = []

    # loop over all documents
    for i in range(len(doc_embedded)):
        # monitor process
        if i%100==0:
            print(i)
        # loop over each token in the document
        temp = [cosine_sim(search, token) for token in doc_embedded[i]]
        # replace nan with 0
        temp = [0 if math.isnan(i) else i for i in temp]
        # append temp to cosine_similarity
        cosine_similarity.append(temp)   

    # error handling
    if len(cosine_similarity) != len(doc_embedded):
        print("Error, something is wrong", search_term)

    # save dataframe (row - patient, colum - word index)
    pd.DataFrame(cosine_similarity).to_csv("data/" + str(search_term) + ".csv")

### - find the phrase (+/- n words around the keyword) for a search criterion

In [18]:
# sentence tokenizer (format we used across the different approaches)

# load data
raw = pd.read_excel("Final_Hackathon_1082Patients.xlsx")

# name columns
raw.columns = ['id','note']

sentences = []
for i in range(len(raw['note'])):
    sentences.append(nltk.tokenize.sent_tokenize(raw['note'][i]))

# add to df 
raw['sent'] = sentences

In [19]:
# define threshold for the cosine similarity
thres = 0.9

# loop over all search terms: create column with index of the sentence that includes keywords
for search_term in search_list:
    
    # load data
    df = pd.read_csv("data/" + str(search_term) + ".csv", index_col = "Unnamed: 0")

    # fill na with 0
    df = df.fillna(0).T

    # find indixes of the words most similar to the seach word
    search_idx = []

    # for each document (columns of the df)
    for document in range(df.shape[1]):

        # find words with a cosine similarity to the search word higher than the thres
        temp = df[document] > thres

        # save indices of the words (index of the series)
        temp = [float(i) for i in list(temp[temp == True].index)]

        # append indixes of search word to list
        search_idx.append(temp)  

    # define range
    word_range = 4

    # find phrases for each keyword
    phrases = []
    # loop over each patient
    for patient in range(len(lst_prepro_split_clean)):
        temp = []
        # for each keyword in the document
        for keyword_idx in search_idx[patient]:
            # append the phrase with the offset of the range around the keyword
            temp.append(" ".join(lst_prepro_split_clean[patient][max(0, int(keyword_idx-word_range)):min(int(keyword_idx+word_range), len(lst_prepro_split_clean[patient]))]))
        # append temp
        phrases.append(temp)

    # create column for criteria 
    raw[str(search_term)] = phrases

In [20]:
# criteria
covid_criteria = ['positive','covid19']
age_criteria = ['age', 'year']
sex_criteria = ['female','male', 'man', 'woman']
consent_criteria = ['consent','agree']
bmi_criteria = ['bmi']
weight_criteria = ['kg']
height_criteria = ["tall", "feet", "height"]
allergies_criteria = ['allergies', 'drug']
hiv_criteria = ['hiv', 'immunodeficiency', 'virus']
hepatitis_criteria = ['hepatitis', 'chronic', 'hepatic']
alt_asn_criteria = ["alt"]
opioids_criteria = ['opioids']

In [42]:
# create final df
temp = raw[["id", "note", "sent"]].copy()
temp["c1_word"] = raw["positive"] + raw["covid19"]
temp["c2_word"] = raw["age"] + raw["year"]
temp["c3_word"] = raw["female"] + raw["male"] + raw["man"] + raw["woman"]
temp["c4_word"] = raw["consent"] + raw["agree"]
temp["c5_word"] = raw["bmi"]
temp["c6_word"] = raw["kg"]
temp["c7_word"] = raw["tall"] + raw["feet"] + raw["height"]
temp["c8_word"] = raw["allergies"] + raw["allergies"]
temp["c9_word"] = raw["hiv"] + raw["immunodeficiency"] + raw["virus"]
temp["c10_word"] = raw["hepatitis"] + raw["hepatic"]
temp["c11_word"] = raw["alt"] 
temp["c12_word"] = raw["opioids"]

temp.drop(columns=["note", "sent"], inplace=True)

In [43]:
# display
temp.head()

Unnamed: 0,id,c1_word,c2_word,c3_word,c4_word,c5_word,c6_word,c7_word,c8_word,c9_word,c10_word,c11_word,c12_word
0,1081,[biopsy came back as positive for adenocarcino...,[],[patient confirmed in switzerland male 70 infe...,[],[],[],[],[],[],[vena cava and the hepatic artery in addition],[],[]
1,1080,"[new covid19 patient confirmed in, new covid19...",[],[patient confirmed in croatia male recently re...,[],[],[],[train and dragged ~200 feet on he],[from 2/192/21 service surgery allergies clind...,[],[],[],[]
2,1079,"[new covid19 patient confirmed in, new covid19...","[at age 25 s/p tah at age 32, age 32 5mm nodul...",[patient confirmed in algeria male italian who...,[],[],[],[],[on 2/17 service medicine allergies codeine / ...,[],[],[],[]
3,1078,"[new covid19 patient confirmed in, new covid19...","[age 25 s/p tah at age 32 social, 32 social hi...","[a 65 year old female with copd on, secondary ...",[],[],[],[],[from qom service medicine allergies codeine /...,[],[],[fev1 6 (35%) fev1/fvc 29 (41%); on advair],[]
4,1077,[magnesium hbsab is positive the patient was o...,[],[patient is a 57yearold female with hepatitis ...,[],[],[],[],[lopressor ranitidine aldactone interferon all...,[],[innsbruck service admission diagnosis hepatit...,"[bun 21 and creatinine alt is 349 ast, creatin...",[]


In [44]:
with open('w2v_results.pkl', 'wb') as f:
    pickle.dump(temp, f)

In [41]:
# save to csv
temp.to_csv("w2v_results.csv")

### Ideas for future work: 
- Use pre-trained word embeddings such as Word2Vec by Google or GloVe by Stanford