In [1]:
import torch 
import torch.nn as nn
import nltk
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

In [2]:
import pandas as pd

data = pd.read_table('fra.txt', header= None)

In [3]:
data.rename(columns= {0: 'English', 1: 'French', 2: 'Citation'}, inplace= True)

In [4]:
pattern = r"[!'#$%&()*+,-./:;<=>?@[\]^`{|}~“”‘’«»‹›„‚–—…·•¡¿’\"\']"

eng_sent, french_sent = [], []

for e in range(len(data['English'])):
    eng_sent.append(re.sub(pattern, "", data['English'][e]))
    french_sent.append(re.sub(pattern, "", data['French'][e]))

In [5]:
print(len(eng_sent))
print(len(french_sent))

229803
229803


In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [7]:
eng_token = [tokenizer.encode(text, add_special_tokens= True) for text in eng_sent]
fren_token = [tokenizer.encode(text, add_special_tokens= True) for text in french_sent]

In [8]:
model = BertModel.from_pretrained('bert-base-uncased')


In [9]:
max_len = 61

def text_embedding(batch_tokens):
    batch_padded_tokens = [tokens + [tokenizer.pad_token_id 
                                 for i in range(max_len - len(tokens))]
                      for tokens in batch_tokens]
    tokens_tensor = torch.tensor(batch_padded_tokens)
    with torch.no_grad():
        output = model(tokens_tensor)
        embeddings = output.last_hidden_state
    return embeddings

In [10]:
batch_size = 10000

eng_embedding = []
for i in tqdm(range(0, len(eng_token), batch_size), "Embedding", colour= "green"):
    batch_token = eng_token[i : i+batch_size]
    eng_embedding.extend(text_embedding(batch_token))

len(eng_embedding)

Embedding:   0%|[32m          [0m| 0/23 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Embedding:  61%|[32m██████    [0m| 14/23 [1:32:38<1:04:20, 428.95s/it]

: 

In [None]:
for batch in eng_embedding:
    print(batch.shape)

In [None]:
fren_embedding = []
for i in tqdm(range(0, len(fren_token), batch_size), "Embedding", colour= "green"):
    batch_token = fren_token[i : i+batch_size]
    batch_embedding = [text_embedding(token) for token in batch_token]
    fren_embedding.extend(batch_embedding)

len(fren_embedding)

In [None]:
sum = 0
for row in eng_embedding:
    sum += 1

print(sum)

In [None]:

for i in tqdm(range(0, len(fren_embedding), batch_size), "Saving"):
    tensors = ["French_Embedding_{}.pt".format(i) for i in range(i, i + batch_size) if i < len(fren_embedding)]
    print(tensors)
    #torch.save(fren_embedding[i], file_name)

for i in tqdm(range(len(eng_embedding)), "Saving"):
    file_name = "English_embedding_" + str(i) + ".pt"
    torch.save(eng_embedding[i], file_name)