In [1]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import math

In [3]:
target_sentences_df = pd.read_csv('/content/drive/MyDrive/PhD/target_sentences.csv')
target_sentences_df

Unnamed: 0,Text
0,još malo pa će izbori
1,naoblačilo se kao da će padati kiša
2,naša kola su stara 10 godina
3,otišao je u inostranstvo prošle godine i još s...
4,telefoni zvone po ceo dan i tebe traže
...,...
62,pojavio se u poslednjem trenutku
63,sanjao sam te pre neku noć
64,premijer je obećao viši standard
65,sutra je doček Nove Godine


In [4]:
pip install transformers



In [5]:
from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
import torch.nn.functional as F
import torch

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
config.is_decoder = True
model = RobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
from transformers import AutoTokenizer, AlbertForMaskedLM, AutoConfig
import torch.nn.functional as F
import torch

# Load ALBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
config = AutoConfig.from_pretrained("albert-base-v2")

# ALBERT does not have a causal LM head, so we use it for Masked LM instead
model = AlbertForMaskedLM.from_pretrained("albert-base-v2", config=config)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForMaskedLM: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
def mask_each_word(sentence):
    # Tokenize the sentence
    tokenized_sentence = tokenizer.tokenize(sentence)

    # Lists to store masked sentences and masked words
    masked_sentences = []
    masked_words = []

    # Iterate through each word and replace it with [MASK]
    for i in range(len(tokenized_sentence)):
        masked_sentence = list(tokenized_sentence)  # Create a copy of the tokenized sentence
        masked_sentence[i] = tokenizer.mask_token  # Replace the i-th word with [MASK]

        # Add the masked sentence to the list
        masked_sentences.append(tokenizer.convert_tokens_to_string(masked_sentence))

        # Add the masked word to the list
        masked_words.append(tokenized_sentence[i])

    return masked_sentences, masked_words

In [8]:
def estimate_masked_probability(sentence, candidate_word, model = model, tokenizer = tokenizer):

    # Tokenize the input sentence
    tokenized_sentence = tokenizer.encode(sentence, add_special_tokens=True)

    # Find the index of the [MASK] token
    mask_index = tokenized_sentence.index(tokenizer.mask_token_id)

    # Replace [MASK] with the candidate word
    tokenized_sentence[mask_index] = tokenizer.convert_tokens_to_ids(candidate_word)

    # Convert tokenized sequence to PyTorch tensor
    input_ids = torch.tensor([tokenized_sentence])

    # Get model predictions
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits

    # Apply softmax to obtain probabilities
    probabilities = F.softmax(logits[0, mask_index], dim=0)

    # Find the probability of the candidate word
    candidate_index = tokenizer.convert_tokens_to_ids(candidate_word)
    candidate_probability = probabilities[candidate_index].item()

    return candidate_probability

In [9]:
def concatenate_words_and_probabilities(words, probabilities):
    new_words = []
    new_probabilities = []

    i = 0
    while i < len(words):
        current_word = words[i]
        current_probability = probabilities[i]

        while i + 1 < len(words) and words[i + 1].startswith('##'):
            next_word = words[i + 1][2:]
            current_word += next_word
            current_probability *= probabilities[i + 1]
            i += 1  # Move to the next word in the sequence

        new_words.append(current_word)
        new_probabilities.append(current_probability)

        i += 1

    return new_words, new_probabilities

# Example usage:
words = ['jos', 'malo', 'pa', 'ce', 'iz', '##bor', '##i']
probabilities = [0.1421, 0.9990, 0.9895, 0.9996, 0.9877, 0.9985, 0.9919, 0.95, 0.98]

new_words, new_probabilities = concatenate_words_and_probabilities(words, probabilities)

# Print the result
for word, probability in zip(new_words, new_probabilities):
    print(f"Probability of '{word}': {probability:.4f}")


Probability of 'jos': 0.1421
Probability of 'malo': 0.9990
Probability of 'pa': 0.9895
Probability of 'ce': 0.9996
Probability of 'izbori': 0.9782


In [10]:
def calculate_word_probabilities(sentence):

  masked_sentences, masked_words = mask_each_word(sentence)

  list_probabilities = []
  total = 1
  for candidate, masked_sentence in zip(masked_words, masked_sentences):
    probability = estimate_masked_probability(masked_sentence, candidate)
    list_probabilities.append(probability)
    total = total * probability

  words, probabilities = concatenate_words_and_probabilities(masked_words, list_probabilities)

  return words, probabilities, total

In [11]:
words_list = []
probabilities_list = []
target_sentence_list = []

for i in range(0,len(target_sentences_df)):
  sentence = target_sentences_df['Text'][i].lower()
  words = sentence.split(' ')
  _, probabilities, total  = calculate_word_probabilities(sentence)

  for word, prob in zip(words, probabilities):
    words_list.append(word)
    probabilities_list.append(-math.log2(prob))
    target_sentence_list.append(i)

# Create a DataFrame
df = pd.DataFrame({'Sentence': target_sentence_list, 'Word': words_list, 'Surprisal RoBERTa': probabilities_list})

# Display the DataFrame
print(df)

     Sentence    Word  Surprisal RoBERTa
0           0     još           2.187878
1           0    malo           1.643408
2           0      pa           0.887693
3           0      će           1.016810
4           0  izbori           0.179800
..        ...     ...                ...
453        65   doček           0.232650
454        65    nove           4.137129
455        65  godine           0.140202
456        66   nadam           0.090170
457        66      se           3.941138

[458 rows x 3 columns]


In [12]:
# Save the DataFrame to a CSV file
csv_file_path = "/content/drive/MyDrive/PhD/word_surprisals_albert.csv"
df.to_csv(csv_file_path, index=False)