# word2vec demo

In [None]:
# gensim
import gensim
import numpy as np
from tqdm import tqdm

In [None]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [None]:
for index, word in enumerate(wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(wv.index_to_key)} is {word}")

In [None]:
wv.most_similar('corona', topn=10)

In [None]:
wv.doesnt_match("bilirubin dysmorphism influenca covid-19".split())

In [None]:
distance = wv.distance("media", "facebook")
print(f"{distance:.1f}")

In [None]:
# sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
# sentence_president = 'The president greets the press in Chicago'.lower().split()

# similarity = wv.wmdistance(sentence_obama, sentence_president)
# print(f"{similarity:.4f}")

distance = wv.distance("phone", "telephone")
print(f"{distance:.1f}")

similarity = wv.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
print(f"{similarity:.4f}")

vector = wv['computer']  # numpy vector of a word
vector.shape

vector = wv.get_vector('office', norm=True)
vector.shape


In [None]:
#wv.get_vector('to')
def get_embedding(word):
    """Return the ``wv`` vector for ``word``, falling back to the vector of ``'##'``.

    A word is treated as unknown when ``wv.key_to_index`` has no entry for it
    (or maps it to a negative index); unknown words share the ``'##'`` vector.
    """
    is_known = wv.key_to_index.get(word, -1) >= 0
    lookup_key = word if is_known else '##'
    return wv.get_vector(lookup_key)
    
get_embedding('to')

In [None]:
documents = ['Obama speaks to the media in Illinois'.lower().split(),
             'The president greets the press in Chicago'.lower().split()]

encoded_docs = [[get_embedding(word) for word in post] for post in documents]

In [None]:
len(encoded_docs[1])

# word2vec biobert

In [None]:
#model = gensim.models.KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False)
# using gzipped/bz2 input works too, no need to unzip
#model = gensim.models.KeyedVectors.load_word2vec_format('/tmp/vectors.bin.gz', binary=True)

In [None]:
# https://radimrehurek.com/gensim/models/keyedvectors.html

from gensim.models import KeyedVectors

model_path = "/Users/jplasser/gensim-data/wikipedia-pubmed-and-PMC-w2v.bin"
wv_from_bin = KeyedVectors.load_word2vec_format(model_path, binary=True)

# -> these become the embeddings, which still have to be computed for the vocabulary
# Steps
# 0. clean texts
# 1. compute vocabulary
# 2. compute embeddings
# 3. use all this information in the transformer/or classifier head

In [None]:
# model_path_out = "/Users/jplasser/gensim-data/wikipedia-pubmed-and-PMC-w2v.txt"
# wv_from_bin.save_word2vec_format(model_path_out, binary=False)

In [None]:
wv_from_bin.most_similar('dysmorphism', topn=10)

In [None]:
wv_from_bin.most_similar('bilirubin', topn=10)

In [None]:
wv_from_bin.doesnt_match("vascular bilirubin dysmorphism influenca".split())

In [None]:
wv_from_bin.doesnt_match("microcephaly cerebrum dysmorphic".split())

In [None]:
def get_embed(word):
    """Return the ``wv_from_bin`` vector for ``word``, or a random vector if unknown.

    A word is unknown when ``wv_from_bin.key_to_index`` has no entry for it.
    Unknown words get a fresh ``np.random.rand`` vector on every call, so two
    calls with the same unknown word return different vectors.

    The fallback length follows ``wv_from_bin.vector_size`` rather than a
    hard-coded 200, so it always matches the dimensionality of the loaded model.
    """
    if wv_from_bin.key_to_index.get(word, -1) < 0:
        return np.random.rand(wv_from_bin.vector_size)
    return wv_from_bin.get_vector(word)
    
get_embed('snup')

In [None]:
wv_from_bin.get_vecattr("cerebrum", "count")  # returns count of "my-word"

In [None]:
len(wv_from_bin)  # returns size of the vocabulary

In [None]:
(len(wv_from_bin)*200*4)/(1024*1024*1024)

In [None]:
documents = ['Obama speaks to the media in Illinois',
             'The president greets the press in Chicago']

#encoded_docs = [[get_embed(word) for word in post] for post in documents]

In [None]:
def clean(sentence):
    """Return ``sentence`` unchanged; no cleaning steps are implemented here."""
    cleaned = sentence
    return cleaned

cleaned_documents = [clean(post) for post in documents]

In [None]:
cleaned_documents

In [None]:
from string import punctuation
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from nltk.corpus import gutenberg, stopwords

def preprocessing():
    """Tokenize Hamlet and store the filtered tokens in the global ``words``.

    Steps: read the raw Gutenberg text, tokenize it with ``word_tokenize``,
    lower-case each token, strip punctuation characters, keep only purely
    alphabetic tokens and drop stop words (NLTK's English list plus the two
    extra lists below).

    Returns nothing; the result is published through the module-level
    ``words`` list, which the other functions in this cell read.
    """
    global words

    raw_data = gutenberg.raw('shakespeare-hamlet.txt')
    # The file imports `punctuation` directly (`from string import punctuation`);
    # the module name `string` is never bound, so `string.punctuation` would fail.
    table = str.maketrans('', '', punctuation)
    stripped = [w.lower().translate(table) for w in word_tokenize(raw_data)]

    sw = stopwords.words('english')
    sw1 = ['.', ',', '"', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']
    sw2 = ['for', 'on', 'ed', 'es', 'ing', 'of', 'd', 'is', 'has', 'have', 'been', 'had', 'was', 'are', 'were', 'a', 'an', 'the', 't', 's', 'than', 'that', 'it', '&', 'and', 'where', 'there', 'he', 'she', 'i', 'and', 'with', 'it', 'to', 'shall', 'why', 'ham']
    # A set gives constant-time membership tests; the combined list has
    # a few hundred entries and is checked once per token.
    stop = set(sw + sw1 + sw2)

    words = [w for w in stripped if w.isalpha() and w not in stop]
    
preprocessing()

def freq_count():
    """Print every distinct token of the global ``words`` list with its count.

    The printed value is the list returned by ``FreqDist.most_common()``.
    """
    # Imported locally: the file only uses `from nltk.<submodule> import ...`,
    # which never binds the bare name `nltk`.
    from nltk import FreqDist

    fd = FreqDist(words)
    print(fd.most_common())
    # The previous body ended by calling freq_count() from inside itself, which
    # recursed without bound; call the function from the cell instead.
    
def word_embedding():
    """Train a skip-gram Word2Vec model on ``words`` and print neighbours of 'hamlet'.

    The model is trained once, saved to ``'word2vec_model'`` in the current
    directory, reloaded from that file, and the result of
    ``most_similar('hamlet')`` is printed as ``word score`` lines.

    Reads the module-level ``words`` list produced by ``preprocessing()``.
    """
    # Word2Vec expects an iterable of token lists.  Passing the flat `words`
    # list would make it treat every word as a "sentence" of single characters.
    # Sentence boundaries were discarded during preprocessing, so the stream is
    # cut into fixed-size pseudo-sentences.
    # NOTE(review): chunk size chosen to stay well below gensim's per-sentence
    # training limit (10,000 tokens per the gensim docs) - confirm.
    chunk_size = 1000
    sentences = [words[start:start + chunk_size]
                 for start in range(0, len(words), chunk_size)]

    # `vector_size` / `epochs` are the gensim 4 names of the former `size` /
    # `iter` keywords; the rest of this file already uses the gensim 4 API.
    model = Word2Vec(sentences, vector_size=100, sg=1, window=3,
                     min_count=1, epochs=10, workers=4)
    # The deprecated `init_sims(replace=True)` call is dropped: similarity
    # queries normalise vectors on demand.
    model.save('word2vec_model')
    model = Word2Vec.load('word2vec_model')

    for word, score in model.wv.most_similar('hamlet'):
        print(word, score)
word_embedding()

In [None]:
# use spacy with the given word2vec model
# a good procedure can be found here:
# https://stackoverflow.com/questions/50466643/in-spacy-how-to-use-your-own-word2vec-model-created-in-gensim



# Spacy

In [None]:
import spacy

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
nlp = spacy.load("en_core_web_sm")
#nlp = spacy.load("/Users/jplasser/gensim-data/spacy.word2vec.model/")

In [None]:
nlp("My father was a rolling stone")

In [None]:
import re

# Vocab class to generate a vocabulary
class Vocab:
    """Bidirectional mapping between tokens and integer indices.

    Index 0 always belongs to the unknown token ``'<unk>'``; reserved tokens
    follow, then corpus tokens ordered by descending frequency.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary.

        tokens: flat list of tokens or list of token lists (default: empty).
        min_freq: corpus tokens occurring fewer times than this are left out.
        reserved_tokens: tokens placed right after ``'<unk>'`` regardless of
            their frequency (default: none).
        """
        tokens = [] if tokens is None else tokens
        reserved_tokens = [] if reserved_tokens is None else reserved_tokens

        # (token, count) pairs, most frequent first.
        frequencies = count_corpus(tokens)
        self.token_freqs = sorted(
            frequencies.items(), key=lambda pair: pair[1], reverse=True)

        # The unknown token owns index 0.
        self.unk = 0
        special_tokens = ['<unk>'] + reserved_tokens
        corpus_tokens = [
            tok for tok, count in self.token_freqs
            if count >= min_freq and tok not in special_tokens]

        ordered = special_tokens + corpus_tokens
        self.idx_to_token = list(ordered)
        self.token_to_idx = {tok: idx for idx, tok in enumerate(ordered)}

    def __len__(self):
        """Number of entries, including ``'<unk>'`` and reserved tokens."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to the unknown index.
        """
        if isinstance(tokens, (list, tuple)):
            return [self[token] for token in tokens]
        return self.token_to_idx.get(tokens, self.unk)

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to tokens."""
        if isinstance(indices, (list, tuple)):
            return [self.idx_to_token[position] for position in indices]
        return self.idx_to_token[indices]


def count_corpus(tokens):
    """Return a ``collections.Counter`` of token occurrences.

    ``tokens`` may be a flat list of tokens or a list of token lists.  An
    empty input, or one whose first element is a list, is flattened one level
    before counting.
    """
    is_nested = len(tokens) == 0 or isinstance(tokens[0], list)
    if is_nested:
        # Count straight from the flattened stream of inner tokens.
        return collections.Counter(
            token for line in tokens for token in line)
    return collections.Counter(tokens)

def preprocess(text):
    """Turn raw text into a list of lemmas with stop words removed.

    The text is lower-cased, reduced to runs of three or more letters, digits
    or apostrophes (joined by single spaces), passed through the
    ``text_replacements`` substitutions, and parsed with the module-level
    spaCy pipeline ``nlp``.
    """
    lowered = text.lower()
    fragments = re.findall("[A-Za-z0-9\']{3,}", lowered)
    normalized = " ".join(fragments)
    for old, new in text_replacements:
        normalized = normalized.replace(old, new)

    lemmas = []
    for token in nlp(normalized):
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return lemmas

def tokenize(lines, type):
    """Run ``preprocess`` on every line and return the list of token lists.

    ``type`` is only used in the progress message printed before the work
    starts; progress itself is shown through ``tqdm``.
    """
    print(f"Tokenizing {type} data set...")
    tokenized = []
    for line in tqdm(lines):
        tokenized.append(preprocess(line))
    return tokenized


def truncate_pad(line, max_document_length, padding_token):
    """Force ``line`` to exactly ``max_document_length`` items.

    Returns a ``(sequence, padding_len)`` tuple: longer lines are cut and
    reported with ``padding_len`` 0, shorter ones are extended with
    ``padding_token`` and report how many padding items were appended.
    """
    missing = max_document_length - len(line)
    if missing < 0:
        # Too long: keep only the leading part.
        return line[:max_document_length], 0
    # Short enough: append as many padding tokens as are missing.
    return line + [padding_token] * missing, missing

# load spaCy vocabulary, needed for tokenization
# nlp = spacy.load("/Users/jplasser/gensim-data/spacy.word2vec.model/")
text_replacements = [("n't", "not")]

In [None]:
tokens = tokenize(["bilirubin dysmorphism influenza covid-19",
                  "My father was a Rolling stone.",
                  "There is my father and my mother standing in line."], type='train')

tokens

# Embeddings and vocabulary of the original mimic-iii train data

In [None]:
import pickle

# Load the pickled training split.  The context manager closes the file handle
# once loading is done.  Unpickling can execute arbitrary code, so only point
# this at files you trust.
with open('/Users/jplasser/Documents/AI Master/WS2021/MastersThesis/code.nosync/CNEP/src/data/mimic3/full_train_data_unique.pickle', 'rb') as train_file:
    train_data = pickle.load(train_file)
#val_data = pickle.load(open('/home/thetaphipsi/MasterAI/src/CNEP/src/data/mimic3/full_val_data_unique.pickle', 'rb'))
#test_data = pickle.load(open('/home/thetaphipsi/MasterAI/src/CNEP/src/data/mimic3/full_test_data_unique.pickle', 'rb'))

In [None]:
train_tokens = tokenize(train_data['notes'], type='train')
#val_tokens = tokenize(val_data['notes'], type='val')
#test_tokens = tokenize(test_data['notes'], type='test')

In [None]:
import collections

vocab = Vocab(train_tokens, min_freq=5, reserved_tokens=['<pad>'])

In [None]:
len(vocab)

In [None]:
len(train_tokens[0]), len(train_data['notes'][0])

In [None]:
max_sequence_len = max([len(token) for token in train_tokens])

In [None]:
train_data['notes'][0]

In [None]:
" ".join(train_tokens[0])

In [None]:
vocab['influenza'], vocab.idx_to_token[556]

In [None]:
max_document_length = 10
features = [truncate_pad(vocab[line], max_document_length, vocab['<pad>']) for line in tokens]
features

In [None]:
vocab.idx_to_token[:20]

In [None]:
import os
from pathlib import Path
import torch

# Pretrained word-vector text files, keyed by a size label.
# The names below say "glove", but the path points to a word2vec text export (…-w2v.txt).
glove_word_vectors = {
    'large':  '/Users/jplasser/gensim-data/wikipedia-pubmed-and-PMC-w2v.txt'
    }

glove_type = 'large'
glove_embedding_file = 'wikipedia-pubmed-and-PMC-w2v'
glove_dir = Path('/Users/jplasser/gensim-data/')


def bar_progress(current, total, width=80):
    """Write a one-line download progress message to stdout.

    current: number of bytes transferred so far.
    total: expected total number of bytes; a total of 0 is reported as 0 %
        instead of raising ZeroDivisionError.
    width: accepted for signature compatibility; not used by this function.

    The message starts with a carriage return so successive calls overwrite
    the same terminal line.
    """
    # Local import: `sys` is not imported in the visible part of this file.
    import sys

    percent = current / total * 100 if total else 0
    progress_message = "Downloading: %d%% [%d / %d] bytes" % (percent, current, total)
    # Don't use print() as it will print in new line every time.
    sys.stdout.write("\r" + progress_message)
    sys.stdout.flush()

# Embeddings
class TokenEmbedding:
    """Pretrained token embeddings read from a space-separated text file.

    Index 0 is reserved for the unknown token ``'<unk>'``, whose vector is all
    zeros; tokens from the file follow in file order.
    """

    def __init__(self, embedding_name):
        """Load ``<glove_dir>/<embedding_name>.txt`` and build the lookup tables."""
        self.embedding_name = embedding_name
        self.idx_to_token, self.idx_to_vec = self._load_embedding(embedding_name)
        self.unknown_idx = 0
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}

    def _load_embedding(self, embedding_name):
        """Parse the embedding file into ``(idx_to_token, idx_to_vec)``.

        Each line is split on single spaces into a token followed by its
        vector components.  Lines with fewer than two components (for example
        a ``<count> <dim>`` header line, or blank lines) are skipped.

        Returns the token list (starting with ``'<unk>'``) and a
        ``torch.Tensor`` with one row per token.

        Raises ValueError if the file contains no vector rows.
        """
        idx_to_token, idx_to_vec = ['<unk>'], []
        #download_glove(glove_type = glove_type , download = False)
        glovefile = glove_dir / (embedding_name + '.txt')
        with tqdm(total=os.path.getsize(glovefile)) as pbar:
            # Explicit UTF-8: the progress bar below counts UTF-8 bytes, and the
            # platform default encoding is not guaranteed to match.
            with open(glovefile, 'r', encoding='utf-8') as f:
                for line in f:
                    pbar.update(len(line.encode('utf-8')))
                    token, *values = line.rstrip().split(' ')
                    if len(values) > 1:
                        idx_to_token.append(token)
                        idx_to_vec.append([float(value) for value in values])
        if not idx_to_vec:
            raise ValueError(f"no embedding vectors found in {glovefile}")
        # Row 0 is the all-zero vector for '<unk>'; inserting in place avoids
        # building a second copy of the (potentially very large) row list.
        idx_to_vec.insert(0, [0] * len(idx_to_vec[0]))
        return idx_to_token, torch.tensor(idx_to_vec)

    def __getitem__(self, tokens):
        """Return the stacked vectors for an iterable of tokens.

        Tokens missing from the table are mapped to the unknown index (row 0).
        """
        indices = [
            self.token_to_idx.get(token, self.unknown_idx)
            for token in tokens]
        vecs = self.idx_to_vec[torch.tensor(indices)]
        return vecs

    def __len__(self):
        """Number of tokens in the table, including ``'<unk>'``."""
        return len(self.idx_to_token)

In [None]:
print('Start creating embeddings...')
glove_embedding = TokenEmbedding(glove_embedding_file)
embeds = glove_embedding[vocab.idx_to_token] # we can re-use this object (embeds) later with the other models
print(f'Finished creating embeddings.')

In [None]:
messages = [
    # Smartphones
    "I like my phone",
    "My phone is not good.",
    "Your cellphone looks great.",

    # Weather
    "Will it snow tomorrow?",
    "Recently a lot of hurricanes have hit the US",
    "Global warming is real",

    # Food and health
    "An apple a day, keeps the doctors away",
    "Eating strawberries is healthy",
    "Is paleo better than keto?",

    # Asking about age
    "How old are you?",
    "what is your age?",
    
]

run_and_plot(messages)

In [None]:
import seaborn as sns
import numpy as np

def plot_similarity(labels, features, rotation, print_labels=True):
    """Draw a heatmap of pairwise inner products between feature rows.

    labels: one text per feature row; each tick label is the first 25
        characters of the text followed by ``/`` and the full text length.
    features: 2-D array-like; ``np.inner(features, features)`` is plotted.
    rotation: rotation applied to the x tick labels.
    print_labels: passed to seaborn's ``annot`` to print the cell values.
    """
    print(f"{features.shape=}")
    similarity = np.inner(features, features)

    tick_labels = []
    for message in labels:
        tick_labels.append(message[:25] + '/' + str(len(message)))

    sns.set(rc={'figure.figsize': (20, 12)})
    sns.set(font_scale=1.2)

    heatmap_options = {
        'xticklabels': tick_labels,
        'yticklabels': tick_labels,
        'vmin': 0,
        'vmax': 1,
        'annot': print_labels,
        'fmt': '.1f',
        'cmap': "YlOrRd",
    }
    axes = sns.heatmap(similarity, **heatmap_options)
    axes.set_xticklabels(tick_labels, rotation=rotation)
    axes.set_title("Semantic Textual Similarity")

def run_and_plot(messages_):
    """Embed each message and plot the pairwise similarity heatmap.

    messages_: list of raw text strings.

    Pipeline: tokenize the messages, build a per-call vocabulary, look up the
    vocabulary's vectors in the module-level ``glove_embedding``, truncate/pad
    every message to a common length, pool the token vectors into one
    L2-normalised vector per message, and pass the result to
    ``plot_similarity``.

    Reads the module-level ``max_sequence_len`` as an upper bound for the
    document length.
    """
    # Tokenize the argument, not the global `messages` list.
    tokens = tokenize(messages_, type='train')
    vocab = Vocab(tokens, min_freq=0, reserved_tokens=['<pad>'])
    emb = glove_embedding[vocab.idx_to_token]

    # One twentieth of the longest message (capped by max_sequence_len), but
    # at least 1: a length of 0 would leave every message without tokens.
    longest = max(len(line) for line in tokens)
    max_document_length = max(1, min(max_sequence_len, longest) // 20)
    print(f"{max_document_length=}")

    # truncate_pad returns (sequence, padding_len); only the sequence is needed.
    features = [truncate_pad(vocab[line], max_document_length, vocab['<pad>'])[0] for line in tokens]
    #message_embeddings_ = torch.nn.functional.normalize(torch.stack([torch.max(emb[f], dim=0)[0] for f in features]), dim=1)
    # Per message: avg_pool1d with kernel size 3 over the looked-up rows, then
    # the mean over dim 0; finally each message vector is normalised (dim=1).
    message_embeddings_ = torch.nn.functional.normalize(torch.stack([torch.mean(torch.nn.functional.avg_pool1d(emb[f], 3), dim=0) for f in features]), dim=1)
    plot_similarity(messages_, message_embeddings_, 90)

In [None]:
embeds.shape

In [None]:
# Sample the notes at an even stride of about one twentieth of the data set
# (the `:-1` stop leaves out the last note).  The stride is clamped to 1 so
# that fewer than 20 notes does not produce a zero slice step.
messages = train_data['notes'][:-1:max(1, len(train_data['notes'])//20)]
run_and_plot(messages)