In [11]:
import torch 
import torch.nn as nn
import nltk
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertTokenizer, BertModel,T5Tokenizer, T5Model
from tqdm import tqdm
from flair.embeddings import WordEmbeddings
from sentence_transformers import SentenceTransformer


In [3]:
import pandas as pd

# Load the tab-separated corpus; the file has no header row, so the columns
# get integer labels.
data = pd.read_table(filepath_or_buffer='fra.txt', header=None)

In [4]:
# Give the integer-labelled columns 0, 1, 2 readable names (renamed in place).
data.rename(columns=dict(enumerate(['English', 'French', 'Citation'])), inplace=True)

In [5]:
# Character class listing every punctuation mark / symbol to delete.
pattern = r"[!'#$%&()*+,-./:;<=>?@[\]^`{|}~“”‘’«»‹›„‚–—…·•¡¿’\"\']"

# Remove all matched characters from each sentence, keeping the two lists
# in the same row order as the DataFrame.
eng_sent = [re.sub(pattern, "", sentence) for sentence in data['English']]
french_sent = [re.sub(pattern, "", sentence) for sentence in data['French']]

In [6]:
# Number of cleaned sentences per language, one count per line.
print(len(eng_sent), len(french_sent), sep="\n")

229803
229803


In [7]:
# Percentage of sentences that are duplicates, English first, then French.
for sentences in (eng_sent, french_sent):
    unique_fraction = len(set(sentences)) / len(sentences)
    print(100 - unique_fraction * 100)

28.653237773223154
11.28488313903648


In [8]:
# De-duplicate while keeping first-occurrence order. list(set(...)) yields an
# arbitrary order that changes between kernel runs (string hashes are
# randomised), so position i would refer to a different sentence each time;
# dict.fromkeys gives the same unique sentences in a reproducible order.
# NOTE(review): the two languages are de-duplicated independently, so
# eng_sent_unique[i] and french_sent_unique[i] are not translation pairs.
eng_sent_unique = list(dict.fromkeys(eng_sent))
french_sent_unique = list(dict.fromkeys(french_sent))

In [13]:
# Sentence-level encoder used for the sentence embeddings below.
sent_model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')


In [39]:
# One embedding per unique English sentence.
eng_embeddings = sent_model.encode(sentences=eng_sent_unique)


384

In [None]:
# Dimensionality of a single English sentence embedding
# (assumes each row exposes .shape, as array/tensor rows do).
eng_embeddings[0].shape[0]

In [36]:
# One embedding per unique French sentence.
fr_embeddings = sent_model.encode(sentences=french_sent_unique)

In [38]:
# Dimensionality of a single French sentence embedding
# (assumes each row exposes .shape, as array/tensor rows do).
fr_embeddings[0].shape[0]

384

In [22]:
# WordPiece tokenizer matching the BERT checkpoint used for token embeddings.
# NOTE(review): this is the English uncased vocabulary but it is also applied
# to the French sentences below; confirm that is intended.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')

In [23]:
# Convert every unique sentence to a list of token ids (special tokens
# included); English is processed first, then French.
eng_token, fren_token = (
    [tokenizer.encode(sentence, add_special_tokens=True) for sentence in corpus]
    for corpus in (eng_sent_unique, french_sent_unique)
)

In [8]:
# Pretrained BERT encoder that text_embedding() calls to get hidden states.
model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')


model.safetensors: 100%|██████████| 440M/440M [00:34<00:00, 12.7MB/s] 


In [23]:
def text_embedding(batch_tokens, max_len):
    """Embed a batch of token-id sequences with the module-level `model`.

    Every sequence is brought to exactly `max_len` ids: shorter ones are
    right-padded with `tokenizer.pad_token_id`, longer ones are truncated
    (previously they produced ragged rows that `torch.tensor` rejects).
    An attention mask (1 = real token, 0 = padding) is passed to the model
    so that padding positions are not attended to.

    Args:
        batch_tokens: iterable of lists of integer token ids.
        max_len: number of ids each sequence is padded/truncated to.

    Returns:
        The model output's `last_hidden_state` for the padded batch, computed
        without gradient tracking. Presumably shaped
        (len(batch_tokens), max_len, hidden_size) - confirm against the model.
    """
    pad_id = tokenizer.pad_token_id
    padded_ids, attention_mask = [], []
    for tokens in batch_tokens:
        kept = list(tokens[:max_len])
        n_pad = max_len - len(kept)
        padded_ids.append(kept + [pad_id] * n_pad)
        attention_mask.append([1] * len(kept) + [0] * n_pad)

    tokens_tensor = torch.tensor(padded_ids, dtype=torch.long)
    mask_tensor = torch.tensor(attention_mask, dtype=torch.long)
    with torch.no_grad():
        output = model(tokens_tensor, attention_mask=mask_tensor)
        embeddings = output.last_hidden_state
    return embeddings

In [11]:
batch_size = 1000

# Pad to at least 30 ids (the original fixed length) but never less than the
# longest tokenised sentence, so no sequence exceeds the padded width.
eng_max_len = max(30, max((len(tokens) for tokens in eng_token), default=0))

# Embed the English token lists batch by batch; iterating the returned batch
# tensor via extend() appends one per-sentence tensor per input sequence.
# NOTE(review): every token-level embedding is kept in memory at once, which
# is large for the full corpus - consider saving per batch instead.
eng_embedding = []
for i in tqdm(range(0, len(eng_token), batch_size), "Embedding", colour= "green"):
    batch_token = eng_token[i : i+batch_size]
    eng_embedding.extend(text_embedding(batch_token, eng_max_len))

len(eng_embedding)

Embedding:   3%|[32m▎         [0m| 7/230 [03:08<1:40:02, 26.92s/it]


KeyboardInterrupt: 

In [12]:
batch_size = 100

# Pad to at least 104 ids (the original fixed length) but never less than the
# longest tokenised sentence, so no sequence exceeds the padded width.
fren_max_len = max(104, max((len(tokens) for tokens in fren_token), default=0))

# Embed the French token lists batch by batch; iterating the returned batch
# tensor via extend() appends one per-sentence tensor per input sequence.
# NOTE(review): every token-level embedding is kept in memory at once, which
# is large for the full corpus - consider saving per batch instead.
fren_embedding = []
for i in tqdm(range(0, len(fren_token), batch_size), "Embedding", colour= "green"):
    batch_token = fren_token[i : i+batch_size]
    fren_embedding.extend(text_embedding(batch_token, fren_max_len))

len(fren_embedding)

Embedding:   7%|[32m▋         [0m| 153/2299 [12:32<3:49:40,  6.42s/it]

: 

# Write each English embedding to its own file, English_embedding_<i>.pt.
for i in tqdm(range(len(eng_embedding)), "Saving"):
    file_name = "English_embedding_" + str(i) + ".pt"
    # The list entries come from iterating a batch tensor, so each one is a
    # view sharing that batch's storage; torch.save serialises the whole
    # storage of a view. clone() makes the file hold only this one embedding.
    torch.save(eng_embedding[i].clone(), file_name)