# Creating BERT Embeddings with bert-as-service
See: https://bert-as-service.readthedocs.io/en/latest/  
See: https://github.com/hanxiao/bert-as-service

In [1]:
import os

from bert_serving.client import BertClient

In [2]:
import warnings
# Install a blanket "ignore" filter: from here on, warnings of every category
# and from every module are suppressed, which keeps the cell outputs clean.
# NOTE(review): this also hides deprecation/runtime warnings that may matter
# when debugging - consider narrowing the filter to a category or module.
warnings.filterwarnings("ignore")

##### Build connection to service

In [3]:
# Address of the bert-as-service server. Set the BERT_SERVER_IP environment
# variable to point the notebook at a different host without editing this cell;
# the default is the remote server the notebook was originally run against.
BERT_SERVER_IP = os.environ.get('BERT_SERVER_IP', '136.199.93.84')

# Client handle used by every later cell to talk to the server.
bc = BertClient(ip=BERT_SERVER_IP)

##### Query the model directory from the server

In [4]:
# Read the 'model_dir' entry from the status reported by the server:
# the folder path of the pre-trained BERT model.
bc.server_status['model_dir']

'/media/data/models/bert/tf/uncased_L-12_H-768_A-12'

### Creating the sentence embeddings<br>
##### *Code snippet 1*: Read corpus file 1, whose lines are in "tense: sentence" format

In [5]:
dir_path1 = '../data/Sentences_GPT3.txt'

# Read the corpus, dropping blank lines and surrounding whitespace.
with open(dir_path1) as in_file:
    s_list = [line.strip() for line in in_file if line.strip()]

# Every entry has the form "tense: sentence". Split on the FIRST colon only,
# so a colon that occurs inside the sentence stays part of the sentence
# instead of truncating it or breaking the two-way unpacking.
tense_list, sent_list = [], []
for entry_no, s in enumerate(s_list, start=1):
    tense, separator, sentence = s.partition(':')
    if not separator:
        # Fail with a message that names the offending entry rather than
        # an opaque unpacking error.
        raise ValueError(
            f"Entry {entry_no} of {dir_path1} is not in 'tense: sentence' format: {s!r}"
        )
    # Strip both parts so labels and sentences carry no stray whitespace.
    tense_list.append(tense.strip())
    sent_list.append(sentence.strip())

# Send the raw (untokenized) sentences to the server. show_tokens=True is
# requested, so the result is indexed below: element 0 is used as the
# embeddings (presumably element 1 holds the server-side tokens - confirm
# against the bert-as-service documentation).
all_embs1 = bc.encode(sent_list, show_tokens=True, is_tokenized=False)

In [6]:
# Display element 0 of the encode() result for corpus 1 - the array printed
# below (presumably the embeddings, since show_tokens=True was requested;
# TODO confirm what element 1 contains).
all_embs1[0]

array([[[ 0.21367194,  0.08240603, -0.06714838, ..., -0.53888416,
          0.19486403,  0.7237248 ],
        [ 0.6012578 , -0.32839608,  0.52842516, ..., -0.13718466,
          0.4736324 , -0.43120793],
        [ 0.18514141,  0.22976695,  0.22050108, ..., -0.3623354 ,
         -0.3686014 , -0.06571554],
        ...,
        [-0.        , -0.        ,  0.        , ..., -0.        ,
          0.        ,  0.        ],
        [ 0.        , -0.        ,  0.        , ..., -0.        ,
          0.        ,  0.        ],
        [ 0.        , -0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[ 0.4665227 ,  0.28267437,  0.25747856, ..., -0.32307202,
          0.17133777,  0.29996264],
        [ 0.75914913, -0.1126898 ,  0.71379143, ..., -0.05284319,
          0.7888655 , -0.52549255],
        [ 0.16767925, -0.1699537 ,  0.09485049, ..., -0.30874577,
          0.17867729,  0.61325   ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  

##### *Code snippet 2*: Read corpus file 2, which contains one plain sentence per line

In [7]:
dir_path2 = '../data/adverbSentences.txt'

# Load corpus 2: one sentence per line, whitespace-trimmed, blank lines skipped.
with open(dir_path2, 'r') as in_file:
    sentence_list = [text for text in map(str.strip, in_file) if text]

# Encode the raw (untokenized) sentences on the server; show_tokens=True is
# requested, and later cells read the embeddings from element 0 of the result.
all_embs2 = bc.encode(sentence_list, show_tokens=True, is_tokenized=False)

In [8]:
# Display element 0 of the encode() result for corpus 2 - the array printed
# below (presumably the embeddings, since show_tokens=True was requested;
# TODO confirm what element 1 contains).
all_embs2[0]

array([[[ 0.35714322, -0.04401983,  0.14798854, ..., -0.4364102 ,
          0.2483384 ,  0.3197378 ],
        [ 0.74051636,  0.36328253,  0.38493878, ..., -0.7083721 ,
          0.51678586, -0.90824944],
        [ 0.6775574 , -0.306359  ,  0.87465936, ..., -0.17938663,
          0.85302895, -0.59072936],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[ 0.22477609, -0.28845   , -0.39357919, ..., -0.23550169,
          0.14343336,  0.13852412],
        [ 0.10404697,  0.24688396, -0.6048023 , ..., -0.75887746,
          0.37205458, -1.2569041 ],
        [ 0.2544256 , -0.40743682, -0.08173469, ..., -0.06801052,
          0.98727196, -0.7952623 ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  

### Visualization in the TensorBoard Embedding Projector
See: https://www.tensorflow.org/tensorboard/tensorboard_projector_plugin <br>
The embeddings must be uploaded in the form of TSV files (metadata & vectors). <br>
*Code snippet 1*: Creating TSV files for corpus without adverbs

In [9]:
import csv
import numpy as np

# tensors: embeddings
# One row per sentence, holding the vector at position 0 of that sentence's
# embedding matrix (presumably the [CLS] token of a token-level BERT output -
# TODO confirm against the server's pooling configuration).
# newline='' is what the csv module requires of files it writes to; without it
# every row ends in "\r\r\n" on Windows.
with open('../data/tensors.tsv', 'w', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    for sentence_emb in all_embs1[0]:
        writer.writerow(sentence_emb[0])

# metadata: sentences
# Header row plus one row per sentence: a 1-based id, the tense label and the
# sentence text, in the same order as the rows of the tensor file.
with open('../data/metadata.tsv', 'w', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    writer.writerow(["Id", "Tense", "Sentence"])
    for count, (tense, sentence) in enumerate(zip(tense_list, sent_list), start=1):
        writer.writerow([count, tense, sentence])

*Code snippet 2*: Creating TSV files for corpus with adverbs

In [10]:
# Embeddings of the adverb corpus: one row per sentence, holding the vector at
# position 0 of that sentence's embedding matrix (presumably the [CLS] token -
# TODO confirm against the server's pooling configuration).
# newline='' is what the csv module requires of files it writes to; without it
# every row ends in "\r\r\n" on Windows.
with open('tensorsAdverb.tsv', 'w', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    for sentence_emb in all_embs2[0]:
        writer.writerow(sentence_emb[0])

# The adverb corpus file carries no tense labels of its own, so the labels read
# from corpus 1 (tense_list) are reused. That is only meaningful when both
# corpora are aligned line by line, so check the lengths before writing: a
# mismatch would otherwise end in an IndexError or in a metadata file whose
# rows do not line up with the tensor file.
if len(tense_list) != len(sentence_list):
    raise ValueError(
        f"Cannot pair tense labels with adverb sentences: {len(tense_list)} labels "
        f"vs. {len(sentence_list)} sentences"
    )

# Metadata: header row plus one row per sentence, with the tense label and the
# sentence written side by side, in the same order as the tensor rows.
with open('metadataAdverb.tsv', 'w', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    writer.writerow(["Id", "Tense", "Sentence"])
    for count, (tense, sentence) in enumerate(zip(tense_list, sentence_list), start=1):
        writer.writerow([count, tense, sentence])