<a href="https://colab.research.google.com/github/hanhluukim/replication-topic-modelling-in-embedding-space/blob/main/notebook_bert_sentence_embeddings_to_word_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Bert-Sentence/Subwords-Embeddings zu Wort-Embeddings**

1. Das Notebook wurde hier nur genutzt, um die Richtigkeit der Implementierung zu kontrollieren
2. Die Implementierung mit Bert, die für ETM dann benutzt wurde, ist in den folgenden Dateien:

    - `src\bert_embedding.py`
    - `src\bert_preparing.py`
    - `src\covert_embeddings.py`
3. Verwendete Modelle: base-BERT und BertTokenizerFast

# **Transformation von Subwords-Embeddings zu Wort-Embeddings**

1. Die originalen Dokumente werden mittels "." in Sätze aufgeteilt
2. Es gibt zwei Mengen von Sätzen: kurze und lange
3. Sätze, die kürzer als 128 Tokens sind, bleiben unverändert
4. Sätze, die länger als 128 Tokens sind, werden in Teilsätze der Länge 128 mit einem überlappenden Fenster von 10 Tokens aufgeteilt

In [None]:
!pip install transformers

You should consider upgrading via the '/home/miss-luu/anaconda3/bin/python -m pip install --upgrade pip' command.

In [None]:
!pip install stop-words



In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.datasets import fetch_20newsgroups
import re
from nltk.tokenize import word_tokenize
import nltk
import string
# Download the NLTK 'punkt' resource (presumably required by word_tokenize — TODO confirm)
nltk.download('punkt')
from stop_words import get_stop_words
# English stop-word list; read as a module-level global by simple_preprocess
stop_words = get_stop_words('en')
# Training split of the 20newsgroups corpus; read as a module-level global by read_raw_documents
newsgroups_train = fetch_20newsgroups(subset='train')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **Datenvorbearbeitung für BERT-Modell**

Hinweis: Dieses Notebook dient nur zur Kontrolle der Richtigkeit der Implementierung, damit der richtige Code in `src` eingebaut wird



In [None]:
# read train_data from 20newsgroups
def read_raw_documents():
    """Collect the raw training texts and their labels from 20newsgroups.

    Reads the module-level ``newsgroups_train`` object.

    Returns:
        A ``(raw_documents, raw_labels)`` pair of equally long lists; entry
        ``k`` of each list is taken from position ``k`` of
        ``newsgroups_train.data`` and ``newsgroups_train.target``.
    """
    n_docs = len(newsgroups_train.data)
    raw_documents = [newsgroups_train.data[pos] for pos in range(n_docs)]
    raw_labels = [newsgroups_train.target[pos] for pos in range(n_docs)]
    return raw_documents, raw_labels

def simple_preprocess(raw_documents):
    """Clean raw documents before they are split into BERT input sentences.

    For every document all ``>`` characters are removed, the text is
    lower-cased and tokenized with ``word_tokenize``. A token is kept when it
    is not contained in the module-level ``stop_words`` and is either

    * a substring of ``string.punctuation`` (so punctuation survives and can
      later be used to split sentences), or
    * made up only of the letters ``a``-``z`` and contains at least two
      distinct characters.

    Args:
        raw_documents: iterable of raw document strings.

    Returns:
        list of str: one cleaned document per input document, with the kept
        tokens joined by a single space.
    """
    # Build the lookup sets once instead of scanning the stop-word list and
    # the alphabet string for every single token.
    stop_set = set(stop_words)
    letters = set(string.ascii_lowercase)

    def only_letters(tested_string):
        """Return True when every character is a lowercase ASCII letter."""
        return all(letter in letters for letter in tested_string)

    def keep_token(w):
        """Decide whether a single token survives the cleaning step."""
        if w in stop_set:
            return False
        # substring test on purpose: mirrors the punctuation rule of the pipeline
        if w in string.punctuation:
            return True
        # alphabetic words need more than one distinct character
        return only_letters(w) and len(set(w)) > 1

    def clean_doc_for_bert(doc):
        """Return the cleaned document as a single space-joined string."""
        lowered = doc.replace(">", "").lower()
        return " ".join(w for w in word_tokenize(lowered) if keep_token(w))

    return [clean_doc_for_bert(doc) for doc in raw_documents]

def transform_to_sentences_with_labels():
    """Placeholder for a labelled sentence split.

    Labels are not used in this notebook, so a fresh empty list is returned.
    """
    return []

def fine_tune_bert():
    """Placeholder that documents the decision not to fine-tune BERT.

    Topic modelling is an unsupervised problem without targets: topics are
    only discovered for the documents, so no training step is performed here.

    Returns:
        Always ``True``.
    """
    return True

def transform_to_sentences(docs): #no labels
    """Split cleaned documents into sentences (labels are ignored).

    Every document is split on ``". "``. Inside each piece, tokens with at
    most one character are dropped and the rest is re-joined with single
    spaces. Sentences with two or more remaining tokens are always kept,
    duplicates included. Sentences with at most one remaining token (this
    includes the empty string) are kept only the first time they are seen.

    Args:
        docs: iterable of document strings.

    Returns:
        list of str: the sentences in order of appearance.
    """
    data_as_sentences = []
    # Short sentences already emitted. A set makes the duplicate check O(1)
    # instead of scanning the whole, ever-growing result list.
    seen_short = set()
    for doc in docs:
        for sent in doc.split(". "):
            tokens = [t for t in sent.split(" ") if len(t) > 1]
            updated_sent = " ".join(tokens)
            if len(tokens) > 1:
                data_as_sentences.append(updated_sent)
            elif updated_sent not in seen_short:
                seen_short.add(updated_sent)
                data_as_sentences.append(updated_sent)
    return data_as_sentences

def split_long_sentence(splitted_sent, given_len, overlap=5):
    """Cut a tokenized sentence into overlapping windows.

    Consecutive windows hold at most ``given_len`` tokens and share
    ``overlap`` tokens, so every token of the input ends up in at least one
    window. The previous implementation jumped ahead by ``given_len - 5``
    tokens after every window and silently lost those tokens.

    Args:
        splitted_sent: list of tokens (the sentence split on spaces).
        given_len: maximum number of tokens per window; must be positive.
        overlap: number of tokens shared by neighbouring windows. Values
            that would leave no forward progress are clamped so the window
            always advances by at least one token.

    Returns:
        list of str: the windows, each joined with single spaces. Empty for
        an empty input.

    Raises:
        ValueError: if ``given_len`` is not positive.
    """
    if given_len <= 0:
        raise ValueError("given_len must be a positive integer")
    # distance between the start positions of two neighbouring windows
    stride = max(1, given_len - max(0, overlap))
    total = len(splitted_sent)
    subsents = []
    start = 0
    while start < total:
        subsents.append(" ".join(splitted_sent[start:start + given_len]))
        # this window already reaches the end: a further one would only repeat tokens
        if start + given_len >= total:
            break
        start += stride
    return subsents

def handle_long_sentences(sentences, given_len):
    """Replace over-long sentences by their overlapping sub-sentences.

    A sentence counts as long when splitting it on single spaces yields more
    than ``given_len`` tokens. Long sentences are removed and the pieces
    produced by ``split_long_sentence`` are appended after all remaining
    sentences, which keep their relative order.

    The list is updated in place and also returned, as before. The work is
    done in a single pass instead of calling ``list.remove`` once per long
    sentence, which was quadratic in the number of sentences.

    Args:
        sentences: list of sentence strings; modified in place.
        given_len: maximum number of tokens a sentence may have.

    Returns:
        The same ``sentences`` list object after the update.
    """
    kept = []
    subsents = []
    for sent in sentences:
        splitted_sent = sent.split(" ")
        if len(splitted_sent) > given_len:
            subsents.extend(split_long_sentence(splitted_sent, given_len))
        else:
            kept.append(sent)
    # slice assignment keeps the caller's list object identity
    sentences[:] = kept + subsents
    return sentences

def create_marked_senteces(sentences):
    """Wrap every sentence in the BERT ``[CLS]`` / ``[SEP]`` markers.

    Leading and trailing whitespace of each sentence is stripped first.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        list of str: one marked sentence per input sentence.
    """
    marked = []
    for sent in sentences:
        marked.append('[CLS] ' + sent.strip() + ' [SEP]')
    return marked
def save_sents_to_txt(shorted_sentences):
    """Write the sentences to ``./bert_sentences.txt``, one per line.

    The file is overwritten. Each line is the sentence followed by a space
    and a newline.

    Args:
        shorted_sentences: iterable of sentence strings.

    Returns:
        Always ``True``.
    """
    with open(r'./bert_sentences.txt', 'w') as out_file:
        out_file.writelines(f'{sent} \n' for sent in shorted_sentences)
        print('saving sentences from bert-processing')
    return True

In [None]:
# Preprocessing pipeline: raw documents -> cleaned documents -> sentences
# -> sentences of at most 128 tokens -> sentences wrapped in [CLS]/[SEP].
print("reading data:...")
# the labels returned as second value are not needed here
raw_documents, _ = read_raw_documents()
print(len(raw_documents))
print("preprocess data:...")
preprocessed_docs = simple_preprocess(raw_documents)
print(len(preprocessed_docs))
print("transform to sentences:...")
sentences = transform_to_sentences(preprocessed_docs)
print("split sentences to 128 tokens:...")
# handle_long_sentences updates `sentences` in place and returns that same list
shorted_sentences =  handle_long_sentences(sentences, 128)
marked_shorted_sentences = create_marked_senteces(shorted_sentences)
# write the sentences without [CLS]/[SEP] markers to a text file
with open(r'bert_sentences.txt', 'w') as fp:
    for sent in shorted_sentences:
        # one sentence per line, followed by a space and a newline
        fp.write(f'{sent} \n')
    print('saving sentences from bert-processing')
print("finished: ...")



reading data:...
11314
preprocess data:...
11314
transform to sentences:...
split sentences to 128 tokens:...
saving sentences from bert-processing
finished: ...


# **Bert-Modell und Bert-TokenizerFast**

In [None]:
from transformers import BertTokenizerFast, BertTokenizer
from transformers import BertTokenizerFast

# Fast tokenizer: its encodings provide word_ids(), which tokenizerfast_for_a_sent
# uses to map sub-word tokens back to the words they belong to
tokenizerfast = BertTokenizerFast.from_pretrained('bert-base-uncased')
# Regular tokenizer from the same checkpoint (not used by the cells visible here)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# output_hidden_states=True: the model output also carries the per-layer hidden
# states, which reform_token_embeddings_of_sentence reads at index 2
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
# Put the model in evaluation mode; as the last expression of the cell this
# also displays the model structure
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

# **Funktionen für Bert-Embeddings**

In [None]:
def tokenizerfast_for_a_sent(sent, tokenizer):
    """Tokenize one sentence and prepare the tensors for the BERT model.

    Args:
        sent: the sentence string to tokenize.
        tokenizer: callable returning an encoding object that offers
            ``input_ids`` and ``word_ids()`` (the notebook passes a
            ``BertTokenizerFast``).

    Returns:
        A triple ``(tokens_tensor, segments_tensors, word_ids)``:
        ``tokens_tensor`` holds the vocabulary ids with a leading batch axis
        of size one, ``segments_tensors`` is a tensor of ones of the same
        shape, and ``word_ids`` is whatever ``word_ids()`` returns for the
        encoding (one entry per token telling which word it belongs to).
    """
    encoding = tokenizer(sent)
    # vocabulary index of every token
    input_ids = encoding.input_ids
    # wrap in an outer list to obtain a batch of exactly one sentence
    tokens_tensor = torch.tensor([input_ids])
    # a single segment: every token gets segment id 1
    segments_tensors = torch.tensor([[1] * len(input_ids)])
    return tokens_tensor, segments_tensors, encoding.word_ids()
  
def reform_token_embeddings_of_sentence(full_outputs):
    """Rearrange the per-layer hidden states into a per-token layout.

    The hidden states are read from ``full_outputs[2]``. The previous
    version read a global variable named ``outputs`` instead of its own
    argument, so it only worked when such a global happened to exist.

    Args:
        full_outputs: model output whose element at index 2 is a sequence of
            hidden-state tensors, one per layer, each for a batch of size one
            (assumed shape ``(1, n_tokens, hidden_size)`` — TODO confirm).

    Returns:
        torch.Tensor of shape ``(n_tokens, n_layers, hidden_size)``.
    """
    hidden_states = full_outputs[2]
    # (n_layers, 1, n_tokens, hidden_size)
    token_embeddings = torch.stack(hidden_states, dim=0)
    # drop the batch axis -> (n_layers, n_tokens, hidden_size)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # put the token axis first -> (n_tokens, n_layers, hidden_size)
    return token_embeddings.permute(1, 0, 2)

def get_token_embeddings(reformed_token_embeddings):
    """Build one vector per token by summing its last four layers.

    Args:
        reformed_token_embeddings: iterable of per-token tensors whose first
            axis enumerates the layers (as produced by
            ``reform_token_embeddings_of_sentence``).

    Returns:
        list of torch.Tensor: one summed vector per token, in input order.
    """
    return [
        torch.sum(layers_of_token[-4:], dim=0)
        for layers_of_token in reformed_token_embeddings
    ]

def get_subwords_embeddings_of_word(bert_unique_token_id, tokenized_indices, tokens_embeddings):
    """Gather the embeddings of all tokens that belong to one word.

    Args:
        bert_unique_token_id: the id of the word to look for.
        tokenized_indices: sequence with one id per token; position ``k``
            describes the token whose embedding is ``tokens_embeddings[k]``.
        tokens_embeddings: indexable collection of per-token tensors.

    Returns:
        torch.Tensor: the matching embeddings stacked along a new first axis,
        in token order.

    Raises:
        RuntimeError: raised by ``torch.stack`` when no token matches.
    """
    matching = [
        tokens_embeddings[position]
        for position, word_id in enumerate(tokenized_indices)
        if word_id == bert_unique_token_id
    ]
    return torch.stack(matching, dim=0)

def get_unique_embedding(embeddings=None, methode="mean"):
    """Reduce a stack of embeddings to a single vector.

    Args:
        embeddings: tensor whose first axis enumerates the vectors to
            combine. Must be given when ``methode`` is ``"mean"``.
        methode: reduction to apply; only ``"mean"`` is implemented.

    Returns:
        For ``"mean"``: the single row itself when there is exactly one row,
        otherwise the mean over the first axis. ``None`` for any other
        ``methode``.
    """
    if methode != "mean":
        # no other reduction is implemented
        return None
    if embeddings.shape[0] != 1:
        return torch.mean(embeddings, dim=0)
    # exactly one vector: just drop the leading axis
    return torch.squeeze(embeddings, dim=0)

def need_to_update(sent_tokens_ids):
    """Return the ids without the entries 101 and 102 (``[CLS]`` / ``[SEP]``).

    The previous version called ``list.remove`` while iterating over the same
    list. That skips the element following every removal, so some special ids
    survived, and it also modified the caller's list. A new list is built
    instead and the argument is left untouched.

    NOTE(review): filtering by value only makes sense for vocabulary ids. On
    a list of word positions it would drop the real words 101 and 102 —
    verify what callers pass.

    Args:
        sent_tokens_ids: iterable of ids.

    Returns:
        list: all entries that are neither 101 nor 102, in original order.
    """
    special_ids = (101, 102)  # of CLS and SEP
    return [e for e in sent_tokens_ids if e not in special_ids]
  
def get_multiple_embeddings_for_words_in_sent(sent_tokens_ids, sent_outputs_tokens_embeddings):
    """Compute one embedding per word occurrence of a sentence.

    A word may occur once or several times in a sentence; each occurrence has
    its own word id and therefore its own row in the result.

    Fixes compared with the previous version:

    * ``sent_tokens_ids`` is no longer filtered for the values 101/102. The
      list holds word ids, so that filter removed real words from sentences
      with more than 101 words and shifted every later id against its
      embedding.
    * ``None`` entries (tokens that belong to no word) are skipped instead of
      being averaged into an extra row at an arbitrary position.
    * Word ids are processed in ascending order, so row ``k`` of the result
      belongs to word id ``k`` when the ids run from 0 without gaps. The
      iteration order of a ``set`` does not guarantee that.

    Args:
        sent_tokens_ids: one word id (or ``None``) per token, aligned with
            ``sent_outputs_tokens_embeddings``.
        sent_outputs_tokens_embeddings: per-token embedding tensors.

    Returns:
        torch.Tensor: the word embeddings stacked along the first axis, in
        ascending word-id order.

    Raises:
        RuntimeError: raised by ``torch.stack`` when the sentence has no word id.
    """
    unique_words_ids = sorted({word_id for word_id in sent_tokens_ids if word_id is not None})
    multiple_words_embeddings = []
    for unique_id in unique_words_ids:
        belong_embeddings = get_subwords_embeddings_of_word(unique_id, sent_tokens_ids, sent_outputs_tokens_embeddings)
        print(f'word-id: {unique_id} - belong-embeddings shape: {belong_embeddings.shape}')
        # mean of the sub-word embeddings gives the embedding of the whole word
        word_embedding = get_unique_embedding(belong_embeddings, "mean")
        multiple_words_embeddings.append(word_embedding)
    return torch.stack(multiple_words_embeddings, dim=0)

def get_indices_of_word_in_original_sent(word, splitted_original_sent):
    """Find every position at which ``word`` occurs in the split sentence.

    Args:
        word: the word to look for.
        splitted_original_sent: list of the words of the sentence.

    Returns:
        list of int: all matching positions in ascending order; empty when
        the word does not occur.
    """
    return [
        position
        for position, candidate in enumerate(splitted_original_sent)
        if candidate == word
    ]

def get_final_words_embeddings_in_sent(original_sent, sent_tokens_ids, sent_outputs_tokens_embeddings):
    """Compute one embedding per distinct word of a sentence.

    The sentence is split on single spaces. For every distinct word except
    the markers ``[CLS]`` and ``[SEP]``, the embeddings of all its
    occurrences are averaged. The rows are looked up by the word's positions
    in the split sentence.

    Changes compared with the previous version: the rows selected for a word
    are already a tensor, so the extra ``torch.tensor(...)`` wrapper (a
    needless copy that makes PyTorch emit a UserWarning) is gone, and the
    distinct words are collected with ``dict.fromkeys``, which keeps
    first-occurrence order without a quadratic list search.

    NOTE(review): indexing by position assumes the tokenizer's word ids line
    up with ``original_sent.split(" ")`` — confirm for words the tokenizer
    may split further (e.g. multi-character punctuation).

    Args:
        original_sent: the marked sentence that was passed to the tokenizer.
        sent_tokens_ids: one word id (or ``None``) per token.
        sent_outputs_tokens_embeddings: per-token embedding tensors.

    Returns:
        dict mapping each distinct word to its embedding tensor.
    """
    print(f'sentence-tokenizerfast-word-ids: {sent_tokens_ids}')
    not_unique_words_embeddings = get_multiple_embeddings_for_words_in_sent(sent_tokens_ids, sent_outputs_tokens_embeddings)
    print(f'total found embeddings in sent: {not_unique_words_embeddings.shape}')
    original_words_list = original_sent.split(" ")
    print(f'original-splitted: {original_words_list}')
    words_embeddings_in_sent_dict = {}
    # dict.fromkeys removes duplicates while keeping first-occurrence order
    for word in dict.fromkeys(original_words_list):
        if word in ('[CLS]', '[SEP]'):
            continue
        word_indices = get_indices_of_word_in_original_sent(word, original_words_list)
        print(f'word---- {word} ---- indices in original sent: {word_indices}')
        # a word can occur several times in the sentence and every occurrence
        # has its own embedding; select all of them by position
        different_occurrences_embeddings_of_word = not_unique_words_embeddings[word_indices]
        print(f'test: {different_occurrences_embeddings_of_word.shape}')
        words_embeddings_in_sent_dict[word] = get_unique_embedding(different_occurrences_embeddings_of_word, "mean")
    return words_embeddings_in_sent_dict

def save_embeddings_in_sent_to_text(sent_id, words_embeddings_in_sent_dict):
    """Write the word embeddings of one sentence to its own text file.

    The file ``./sent_<sent_id>_words_embeddings.txt`` is overwritten. Each
    line holds a word, a tab, and then every vector component followed by a
    space.

    Args:
        sent_id: identifier used in the file name.
        words_embeddings_in_sent_dict: mapping from word to a vector object
            that offers ``tolist()``.

    Returns:
        Always ``True``.
    """
    target_path = f'./sent_{str(sent_id)}_words_embeddings.txt'
    with open(target_path, 'w') as out_file:
        for word, vector in words_embeddings_in_sent_dict.items():
            components = "".join(f'{e} ' for e in vector.tolist())
            out_file.write(f'{word}\t' + components + "\n")
        print('saving embeddings')
    return True

def save_embeddings_to_text(words_embeddings_in_sent_dict):
    """Append word embeddings to ``./bert_words_embeddings.txt``.

    The file is opened in append mode, so repeated calls accumulate lines.
    Each line holds a word, a tab, and then every vector component followed
    by a space.

    Args:
        words_embeddings_in_sent_dict: mapping from word to a vector object
            that offers ``tolist()``.

    Returns:
        Always ``True``.
    """
    with open(r'./bert_words_embeddings.txt', 'a') as out_file:
        for word in words_embeddings_in_sent_dict:
            vector = words_embeddings_in_sent_dict[word]
            line_parts = [f'{word}\t']
            line_parts.extend(f'{e} ' for e in vector.tolist())
            line_parts.append("\n")
            out_file.write("".join(line_parts))
        print('saving embeddings')
    return True

def vocabulary_embeddings_to_text(vocab_embeddings):
    """Write the vocabulary embeddings to ``./bert_vocab_embeddings.txt``.

    The file is overwritten. Each line holds a word, a tab, and then every
    vector component followed by a space.

    Args:
        vocab_embeddings: mapping from word to a vector object that offers
            ``tolist()``.

    Returns:
        Always ``True``.
    """
    def format_line(word, vector):
        """Render one vocabulary entry as a single output line."""
        return f'{word}\t' + "".join(f'{e} ' for e in vector.tolist()) + "\n"

    with open(r'./bert_vocab_embeddings.txt', 'w') as out_file:
        for word, vector in vocab_embeddings.items():
            out_file.write(format_line(word, vector))
        print('saving embeddings')
    return True

# **BertTokenizerFast**

In [None]:
# Run the whole sub-word -> word embedding pipeline for the first marked sentence
ex_sent = marked_shorted_sentences[0]
tokens_tensor, segments_tensors, tokens_ids_with_belonging_information = tokenizerfast_for_a_sent(ex_sent, tokenizerfast)

# inference only: no gradients are needed
with torch.no_grad():
    outputs = model(tokens_tensor, segments_tensors)
    # (n_tokens, n_layers, hidden_size) layout of the hidden states
    reformed = reform_token_embeddings_of_sentence(outputs)
    # one vector per token: sum of the last four layers
    sent_tokens_embeddings = get_token_embeddings(reformed)
    print(f'number of found embeddings: {len(sent_tokens_embeddings)}')
    # one vector per distinct word of the sentence
    words_embeddings_in_sent_dict = get_final_words_embeddings_in_sent(ex_sent, tokens_ids_with_belonging_information, sent_tokens_embeddings)
    # written to ./sent_0_words_embeddings.txt
    save_embeddings_in_sent_to_text(0, words_embeddings_in_sent_dict)
  

number of found embeddings: 25
sentence-tokenizerfast-word-ids: [None, 0, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13, 14, 15, 16, 17, None]
word-id: 0 - beling-embeddings shape: torch.Size([1, 768])
word-id: 1 - beling-embeddings shape: torch.Size([4, 768])
word-id: 2 - beling-embeddings shape: torch.Size([1, 768])
word-id: 3 - beling-embeddings shape: torch.Size([1, 768])
word-id: 4 - beling-embeddings shape: torch.Size([1, 768])
word-id: 5 - beling-embeddings shape: torch.Size([1, 768])
word-id: 6 - beling-embeddings shape: torch.Size([1, 768])
word-id: 7 - beling-embeddings shape: torch.Size([1, 768])
word-id: 8 - beling-embeddings shape: torch.Size([1, 768])
word-id: 9 - beling-embeddings shape: torch.Size([1, 768])
word-id: 10 - beling-embeddings shape: torch.Size([1, 768])
word-id: 11 - beling-embeddings shape: torch.Size([1, 768])
word-id: 12 - beling-embeddings shape: torch.Size([1, 768])
word-id: 13 - beling-embeddings shape: torch.Size([3, 768])
word-id: 14 - 



In [None]:
# Aggregate word embeddings over several sentences into vocabulary embeddings.
# vocab maps word -> (number of sentences containing the word,
#                     sum of the word's per-sentence embeddings)
vocab = {}

for marked_sent in marked_shorted_sentences[:2]:
    print(marked_sent)
    # fixed: the helper is defined as `tokenizerfast_for_a_sent`; the former
    # spelling `tokenizerFast_for_a_sent` is not defined in this notebook
    tokens_tensor, segments_tensors, tokens_ids_with_belonging_information = tokenizerfast_for_a_sent(marked_sent, tokenizerfast)
    # inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        reformed = reform_token_embeddings_of_sentence(outputs)
        sent_tokens_embeddings = get_token_embeddings(reformed)
        print(f'number of found embeddings: {len(sent_tokens_embeddings)}')
        words_embeddings_in_sent_dict = get_final_words_embeddings_in_sent(marked_sent, tokens_ids_with_belonging_information, sent_tokens_embeddings)
        save_embeddings_to_text(words_embeddings_in_sent_dict)
        for word, vector in words_embeddings_in_sent_dict.items():
            if word in vocab:
                # word seen in an earlier sentence: extend count and running sum
                count, sum_vector = vocab[word]
                vocab[word] = (count + 1, sum_vector + vector)
            else:
                vocab[word] = (1, vector)

        # release the per-sentence tensors before the next sentence is processed
        del tokens_tensor
        del segments_tensors
        del outputs
        del reformed
        del sent_tokens_embeddings
        del words_embeddings_in_sent_dict
    print("---------------------------------------------------------------------------------------")

# vocabulary embedding of a word = mean of its per-sentence embeddings
updated_vocab = {}
for word, (count, sum_vector) in vocab.items():
    updated_vocab[word] = (sum_vector/count)
vocabulary_embeddings_to_text(updated_vocab)

[CLS] lerxst thing subject car organization university maryland college park lines wondering anyone enlighten car saw day [SEP]
number of found embeddings: 25
sentence-tokenizerfast-word-ids: [None, 0, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13, 14, 15, 16, 17, None]
word-id: 0 - beling-embeddings shape: torch.Size([1, 768])
word-id: 1 - beling-embeddings shape: torch.Size([4, 768])
word-id: 2 - beling-embeddings shape: torch.Size([1, 768])
word-id: 3 - beling-embeddings shape: torch.Size([1, 768])
word-id: 4 - beling-embeddings shape: torch.Size([1, 768])
word-id: 5 - beling-embeddings shape: torch.Size([1, 768])
word-id: 6 - beling-embeddings shape: torch.Size([1, 768])
word-id: 7 - beling-embeddings shape: torch.Size([1, 768])
word-id: 8 - beling-embeddings shape: torch.Size([1, 768])
word-id: 9 - beling-embeddings shape: torch.Size([1, 768])
word-id: 10 - beling-embeddings shape: torch.Size([1, 768])
word-id: 11 - beling-embeddings shape: torch.Size([1, 768])
word-i



True