In [21]:
import matplotlib.pyplot as plt
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
import random
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE

# Hugging Face checkpoint id for BETO, shared by the tokenizer and the model below
BETO_CHECKPOINT = 'dccuchile/bert-base-spanish-wwm-cased'

# Load the tokenizer, then the TensorFlow BERT model configured to return its hidden states
tokenizer = BertTokenizer.from_pretrained(BETO_CHECKPOINT)
model = TFBertModel.from_pretrained(BETO_CHECKPOINT, output_hidden_states=True)

Some layers from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
def extract_embeddings(file_path, word, num_samples, seed=None):
    """Collect contextual embeddings of ``word`` from randomly sampled lines of a file.

    Up to ``num_samples`` lines are drawn at random from ``file_path``. Lines that
    are 512 characters or longer, or that do not contain ``word`` as a substring,
    are discarded; the remaining lines are lower-cased and de-duplicated (keeping
    first-seen order). Each surviving line is encoded with the module-level
    ``tokenizer``; if the token id of ``word`` occurs in the encoded line, the line
    is fed to the module-level ``model`` and the vector of the last hidden state at
    the first matching position is collected.

    Args:
        file_path: Path of a UTF-8 text file, one sentence per line.
        word: Target word. It must be a single entry of ``tokenizer.vocab``.
        num_samples: Maximum number of lines to sample. When the file holds fewer
            lines, every line is used instead of raising.
        seed: Optional seed for the sampling. ``None`` (the default) uses the
            global ``random`` state, exactly as before.

    Returns:
        A tuple ``(embeddings, sentences)`` of two equally long lists; the i-th
        embedding (a NumPy array) belongs to the i-th (lower-cased) sentence.

    Raises:
        KeyError: If ``word`` is not a key of ``tokenizer.vocab``.
    """
    # Resolve the target token id once, up front, so an out-of-vocabulary word
    # fails with a clear message instead of a bare KeyError in the middle of the loop.
    try:
        word_id = tokenizer.vocab[word]
    except KeyError:
        raise KeyError(
            f'The word {word!r} is not a single token in the tokenizer vocabulary.'
        ) from None

    # A private generator keeps seeded runs from disturbing the global random state.
    rng = random if seed is None else random.Random(seed)

    with open(file_path, encoding='utf-8') as file:
        all_lines = file.read().splitlines()

    # random.sample raises ValueError when k exceeds the population size, so clamp it.
    sample_size = min(num_samples, len(all_lines))
    sampled = rng.sample(all_lines, k=sample_size)
    print(f"Extracted {sample_size} lines")

    print('Cleaning samples...')
    # dict.fromkeys de-duplicates while keeping a deterministic (first-seen) order;
    # a set would make the order of the saved sentences depend on string hashing.
    lines = list(dict.fromkeys(
        line.lower() for line in sampled if len(line) < 512 and word in line
    ))
    print(f'Total samples after cleaning: {len(lines)}')

    print('Extracting embeddings...')

    embeddings = []
    sentences = []
    for processed, line in enumerate(lines):
        # Report progress by lines visited (not by successes), so a skipped line
        # can never cause the same message to be printed twice.
        if processed and processed % 100 == 0:
            print(f'Processed {processed} lines...')

        # Tokenize the sentence and convert it to BERT input format
        inputs = tokenizer.encode_plus(line, add_special_tokens=True, return_tensors='tf')
        inputs['token_type_ids'] = tf.zeros_like(inputs['input_ids'])

        # Locate the first occurrence of the target token before running the model:
        # ``word in line`` is only a substring test, so the word may have been split
        # into other tokens (or be part of a longer word) and be absent here.
        token_ids = inputs['input_ids'][0].numpy().tolist()
        try:
            position = token_ids.index(word_id)
        except ValueError:
            print('The token {} is not present in the input text.'.format(word))
            continue

        # Feed the inputs to BERT and take the target token's vector from the
        # final hidden state
        outputs = model(**inputs)
        embedding = outputs.hidden_states[-1][0][position].numpy()

        sentences.append(line)
        embeddings.append(embedding)
    return embeddings, sentences


In [23]:
# Extract the embeddings of 'realmente' from 5600 randomly sampled lines of realmente.txt
realmente_embeddings, realmente_sentences = extract_embeddings("realmente.txt", 'realmente', 5600)

# Save the embeddings to a file
embeddings_array = np.array(realmente_embeddings)
np.save('realmente_embeddings.npy', embeddings_array)

# Save the sentences to a file, one per line, in the same order as the embeddings.
# write() is the right call here: the joined text is a single string, and
# writelines() would iterate over it one character at a time.
with open('realmente_embeddings_sentences.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(realmente_sentences))

Extracted 5600 lines
Cleaning samples...
Total samples after cleaning: 5339
Extracting embeddings...
Processed 100 lines...
Processed 200 lines...
Processed 300 lines...
Processed 400 lines...
Processed 500 lines...
Processed 600 lines...
Processed 700 lines...
Processed 800 lines...
Processed 900 lines...
Processed 1000 lines...
Processed 1100 lines...
Processed 1200 lines...
Processed 1300 lines...
Processed 1400 lines...
Processed 1500 lines...
Processed 1600 lines...
Processed 1700 lines...
Processed 1800 lines...
Processed 1900 lines...
Processed 2000 lines...
Processed 2100 lines...
Processed 2200 lines...
Processed 2300 lines...
Processed 2400 lines...
Processed 2500 lines...
Processed 2600 lines...
Processed 2700 lines...
Processed 2800 lines...
Processed 2900 lines...
Processed 3000 lines...
Processed 3100 lines...
Processed 3200 lines...
Processed 3300 lines...
Processed 3400 lines...
Processed 3500 lines...
Processed 3600 lines...
Processed 3700 lines...
Processed 3800 lines

In [24]:
# Extract the embeddings of 'muy' from 1000 randomly sampled lines of muy.txt
muy_embeddings, muy_sentences = extract_embeddings("muy.txt", 'muy', 1000)

# Save the embeddings to a file
muy_embeddings_array = np.array(muy_embeddings)
np.save('muy_embeddings.npy', muy_embeddings_array)

# Save the sentences to a file, one per line, in the same order as the embeddings.
# write() is the right call here: the joined text is a single string, and
# writelines() would iterate over it one character at a time.
with open('muy_embeddings_sentences.txt', 'w', encoding='utf-8') as muy_file:
    muy_file.write('\n'.join(muy_sentences))

Extracted 1000 lines
Cleaning samples...
Total samples after cleaning: 974
Extracting embeddings...
Processed 100 lines...
Processed 200 lines...
The token muy is not present in the input text.
Processed 300 lines...
Processed 400 lines...
Processed 500 lines...
Processed 600 lines...
Processed 700 lines...
Processed 800 lines...
Processed 900 lines...


In [25]:
# Extract the embeddings of 'rápidamente' from 1000 randomly sampled lines of rapidamente.txt
rapidamente_embeddings, rapidamente_sentences = extract_embeddings("rapidamente.txt", 'rápidamente', 1000)

# Save the embeddings to a file
rapidamente_embeddings_array = np.array(rapidamente_embeddings)
np.save('rapidamente_embeddings.npy', rapidamente_embeddings_array)

# Save the sentences to a file, one per line, in the same order as the embeddings.
# write() is the right call here: the joined text is a single string, and
# writelines() would iterate over it one character at a time.
with open('rapidamente_embeddings_sentences.txt', 'w', encoding='utf-8') as rapid_file:
    rapid_file.write('\n'.join(rapidamente_sentences))

Extracted 1000 lines
Cleaning samples...
Total samples after cleaning: 948
Extracting embeddings...
Processed 100 lines...
Processed 200 lines...
Processed 300 lines...
Processed 400 lines...
Processed 500 lines...
Processed 600 lines...
Processed 700 lines...
Processed 800 lines...
Processed 900 lines...
