In [1]:
import torch 
import torch.nn as nn
import nltk
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertTokenizer, BertModel,AutoTokenizer, AutoModel
from tqdm import tqdm
from flair.embeddings import WordEmbeddings
from sentence_transformers import SentenceTransformer
import tensorflow as tf 
import os
import json
import pyarrow
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
import pandas as pd

data = pd.read_table('fra.txt', header= None)

In [11]:
data.rename(columns= {0: 'English', 1: 'French', 2: 'Citation'}, inplace= True)
data = data[:1000]
data

Unnamed: 0,English,French,Citation
0,Go.,Va !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Marche.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,En route !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Go.,Bouge !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4,Hi.,Salut !,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
...,...,...,...
995,Get a saw.,Va chercher une scie.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
996,Get going.,Vas-y.,CC-BY 2.0 (France) Attribution: tatoeba.org #7...
997,Get going.,Allez-y.,CC-BY 2.0 (France) Attribution: tatoeba.org #7...
998,Get going.,Marche.,CC-BY 2.0 (France) Attribution: tatoeba.org #7...


In [13]:
pattern = r"[!'#$%&()*+,-./:;<=>?@[\]^`{|}~“”‘’«»‹›„‚–—…·•¡¿’\"\']"

eng_sent, french_sent = [], []

for e in range(len(data['English'])):
    eng_sent.append(re.sub(pattern, "", data['English'][e]))
    french_sent.append(re.sub(pattern, "", data['French'][e]))
#eng_sent[229801]

In [14]:
print(len(eng_sent))
print(len(french_sent))

1000
1000


In [15]:
print(100 - (len(set(eng_sent))/len(eng_sent))*100)
print(100 - (len(set(french_sent))/len(french_sent))*100)

63.9
18.60000000000001


In [16]:
eng_sent_unique = list(set(eng_sent))
french_sent_unique = list(set(french_sent))

In [17]:
sent_model = SentenceTransformer('all-MiniLM-L6-v2')
bert_small_tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-small-uncased")
bert_small_model = AutoModel.from_pretrained("nlpaueb/legal-bert-small-uncased")
bert_tiny_tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
Bert_tiny_model = AutoModel.from_pretrained("prajjwal1/bert-tiny")

In [18]:
def text_embedding(batch_tokens, max_len,model,tokenizer):
    batch_padded_tokens = [tokens + [tokenizer.pad_token_id 
                                 for i in range(max_len - len(tokens))]
                      for tokens in batch_tokens]
    tokens_tensor = torch.tensor(batch_padded_tokens)
    with torch.no_grad():
        output = model(tokens_tensor)
        embeddings = output.last_hidden_state
    return embeddings
def get_embeddings(max_length,batch_size,tokens,model,tokenizer):

    embedding_trans = []
    for i in tqdm(range(0, len(tokens), batch_size), "Embedding", colour= "green"):
        batch_token = tokens[i : i+batch_size]
        embedding_trans.extend(text_embedding(batch_token,max_length,model,tokenizer))

    return embedding_trans

In [19]:
english_tokens = [bert_tiny_tokenizer.encode(text,add_special_tokens = True,padding='max_length',max_length=104) for text in eng_sent]
french_token = [bert_tiny_tokenizer.encode(text,add_special_tokens = True,padding='max_length',max_length=104) for text in french_sent]
English_embeddings = get_embeddings(max_length=104,batch_size=32,tokens=english_tokens,model=Bert_tiny_model,tokenizer=bert_tiny_tokenizer)
print('----------------------English embededding done -------------------')
French_embeddings = get_embeddings(max_length=104,batch_size=32,tokens=french_token,model=Bert_tiny_model,tokenizer=bert_tiny_tokenizer)
print('----------------------French embededding done -------------------')

Embedding: 100%|[32m██████████[0m| 32/32 [00:02<00:00, 15.18it/s]


----------------------English embededding done -------------------


Embedding: 100%|[32m██████████[0m| 32/32 [00:01<00:00, 22.58it/s]

----------------------French embededding done -------------------





In [23]:
english_tensor_stacked = torch.stack(English_embeddings)
french_tensor_stacked = torch.stack(French_embeddings)

In [32]:
x = torch.randn(16,4)
x = torch.tile(x,(32,1,1))
x.size()

torch.Size([32, 16, 4])

In [36]:
import numpy as np

def positional_encoding(embedding_vectors, max_length,batch_size, positions=None,):
    """
    Add positional encoding to a batch of embedding vectors.

    Parameters:
    - embedding_vectors: The input batch of embedding vectors. Shape: (batch_size, sequence_length, embedding_dim).
    - positions: The positions of the elements in the sequence. If None, positional encoding will be added for all positions.
    - max_length: The maximum length of the sequence.

    Returns:
    - The batch of embedding vectors with added positional encoding.
    """

    records,sequence_length, d_model = embedding_vectors.size()
    
    even_i = torch.arange(0 , d_model , 2).float()
    even_denominator = torch.pow(10000, even_i/d_model)
    odd_i = torch.arange(1 , d_model , 2).float()
    odd_denominator = torch.pow(10000, (odd_i -1)/d_model)

    if positions is None:
        positions = torch.arange(max_length,dtype=torch.float).reshape(max_length,1)

    even_pe = torch.sin(positions/even_denominator)
    odd_pe = torch.sin(positions/even_denominator)
    stacked = torch.stack([even_pe , odd_pe] , dim  = 2)
    PE = torch.flatten(stacked,start_dim=1,end_dim=2)
    PE = torch.tile(PE,(batch_size,1,1))
    test_list=[]

    for i in range(len(embedding_vectors)//batch_size):
        test_list.append(embedding_vectors[(i*batch_size):(i*batch_size)+batch_size] + PE)
    return test_list

# Example usage:

l = positional_encoding(english_tensor_stacked,max_length=104,batch_size=32)
l

[tensor([[[-0.5336, -0.4289, -3.7353,  ..., -2.6731,  0.3956,  1.9470],
          [-0.4657,  1.7298, -0.7106,  ..., -2.6964,  0.2487,  1.7947],
          [-1.2109,  1.3027, -0.2843,  ..., -2.5654,  0.6906,  2.5321],
          ...,
          [-0.3143,  0.2684, -1.5411,  ..., -2.4333,  0.6550,  1.9330],
          [ 0.1154,  0.7722, -0.6843,  ..., -2.4907,  0.7890,  2.1468],
          [-0.2468,  0.4867, -0.0922,  ..., -2.4460,  0.8392,  2.2594]],
 
         [[-0.5336, -0.4289, -3.7353,  ..., -2.6731,  0.3956,  1.9470],
          [-0.4657,  1.7298, -0.7106,  ..., -2.6964,  0.2487,  1.7947],
          [-1.2109,  1.3027, -0.2843,  ..., -2.5654,  0.6906,  2.5321],
          ...,
          [-0.3143,  0.2684, -1.5411,  ..., -2.4333,  0.6550,  1.9330],
          [ 0.1154,  0.7722, -0.6843,  ..., -2.4907,  0.7890,  2.1468],
          [-0.2468,  0.4867, -0.0922,  ..., -2.4460,  0.8392,  2.2594]],
 
         [[-0.5336, -0.4289, -3.7353,  ..., -2.6731,  0.3956,  1.9470],
          [-0.4657,  1.7298,

In [31]:
sample = english_tokens[0]
len(sample)

104

In [34]:
batch_padded_tokens = [sample + [bert_tiny_tokenizer.pad_token_id 
                                 for i in range(104 - len(sample))]
                      ]
batch_padded_tokens
tokens_tensor = torch.tensor(batch_padded_tokens)
with torch.no_grad():
    output = Bert_tiny_model(tokens_tensor)
    embeddings = output.last_hidden_state
embeddings.size()

torch.Size([1, 104, 128])

In [13]:
path = "embedding_files"
if not os.path.exists(path):
   os.makedirs(path)
   print("The new directory is created!")
json_path = os.path.join(os.getcwd(),path)
json_path

'd:\\projects\\Machine-Translation\\embedding_files'

# writing the english embeddings in a json file and uploading it for time saving

In [17]:
eng_embedding_dict = {}
eng_embeddings_parquet_path = 'eng_embeds.parquet'
for i in range(len(eng_embeddings)):
    eng_embedding_dict[i] = eng_embeddings[i].tolist()
#eng_embedding_dict[1]
#eng_embedding_dict_json_object = json.dumps(eng_embedding_dict, indent = 4)
eng_embedding_df = pd.DataFrame(eng_embedding_dict)
eng_embedding_df.to_parquet(os.path.join(json_path,eng_embeddings_parquet_path))
#with open(os.path.join(json_path,eng_embeddings_json_path), "w") as outfile:
    #outfile.write(eng_embedding_dict_json_object)

# writing the french embeddings for time saving

In [18]:
french_embedding_dict = {}
french_embeddings_parquet_path = 'french_embeds.parquet'
for i in range(len(eng_embeddings)):
    french_embedding_dict[i] = fr_embeddings[i].tolist()
#eng_embedding_dict[1]
#french_embedding_dict_json_object = json.dumps(french_embedding_dict, indent = 4) 
fr_embedding_df = pd.DataFrame(french_embedding_dict)
fr_embedding_df.to_parquet(os.path.join(json_path,french_embeddings_parquet_path))
#with open(os.path.join(json_path,french_embeddings_json_path), "w") as outfile:
    #outfile.write(french_embedding_dict_json_object)