In [1]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch.nn.functional as F
import pandas as pd
import math

In [3]:
target_sentences_df = pd.read_csv('/content/drive/MyDrive/PhD/target_sentences.csv')
target_sentences_df

Unnamed: 0,Text
0,još malo pa će izbori
1,naoblačilo se kao da će padati kiša
2,naša kola su stara 10 godina
3,otišao je u inostranstvo prošle godine i još s...
4,telefoni zvone po ceo dan i tebe traže
...,...
62,pojavio se u poslednjem trenutku
63,sanjao sam te pre neku noć
64,premijer je obećao viši standard
65,sutra je doček Nove Godine


In [4]:
vocabulary_df = pd.read_csv('/content/drive/MyDrive/PhD/wordlist_classlawiki_sr_cleaned.csv')
vocabulary_df

Unnamed: 0,word,freq
0,biti,5835707
1,u,4289454
2,i,2848044
3,na,1602609
4,sebe,1544095
...,...,...
336153,177.,5
336154,0038,5
336155,0036,5
336156,0032,5


In [5]:
pip install transformers



In [6]:
#from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
#import torch.nn.functional as F
#import torch

#tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
#config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
#config.is_decoder = True
#model = RobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)

In [7]:
from transformers import AutoTokenizer, AlbertForMaskedLM, AutoConfig
import torch.nn.functional as F
import torch

# Load ALBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
config = AutoConfig.from_pretrained("albert-base-v2")

# ALBERT does not have a causal LM head, so we use it for Masked LM instead
model = AlbertForMaskedLM.from_pretrained("albert-base-v2", config=config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForMaskedLM: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


**Information Value**

In [8]:
def mask_each_word(sentence):
    # Tokenize the sentence
    tokenized_sentence = tokenizer.tokenize(sentence)

    # Lists to store masked sentences and masked words
    masked_sentences = []
    masked_words = []

    # Iterate through each word and replace it with [MASK]
    for i in range(len(tokenized_sentence)):
        masked_sentence = list(tokenized_sentence)  # Create a copy of the tokenized sentence
        masked_sentence[i] = tokenizer.mask_token  # Replace the i-th word with [MASK]

        # Add the masked sentence to the list
        masked_sentences.append(tokenizer.convert_tokens_to_string(masked_sentence))

        # Add the masked word to the list
        masked_words.append(tokenized_sentence[i])

    return masked_sentences, masked_words

In [9]:
def estimate_masked_probability(sentence, candidate_word, model = model, tokenizer = tokenizer):

    # Tokenize the input sentence
    tokenized_sentence = tokenizer.encode(sentence, add_special_tokens=True)

    # Find the index of the [MASK] token
    mask_index = tokenized_sentence.index(tokenizer.mask_token_id)

    # Replace [MASK] with the candidate word
    tokenized_sentence[mask_index] = tokenizer.convert_tokens_to_ids(candidate_word)

    # Convert tokenized sequence to PyTorch tensor
    input_ids = torch.tensor([tokenized_sentence])

    # Get model predictions
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits

    # Apply softmax to obtain probabilities
    probabilities = F.softmax(logits[0, mask_index], dim=0)

    # Find the probability of the candidate word
    candidate_index = tokenizer.convert_tokens_to_ids(candidate_word)
    candidate_probability = probabilities[candidate_index].item()

    return candidate_probability

In [10]:
def extract_words_and_probabilities(words, probabilities):
    new_words = []
    new_probabilities = []

    i = 0
    while i < len(words):
        current_word = words[i]
        current_probability = probabilities[i]

        while i + 1 < len(words) and words[i + 1].startswith('##'):
            next_word = words[i + 1][2:]
            current_word += next_word
            current_probability *= probabilities[i + 1]
            i += 1  # Move to the next word in the sequence

        new_words.append(current_word)
        new_probabilities.append(current_probability)

        i += 1

    return new_words, new_probabilities

In [11]:
def extract_words_and_embeddings(subwords, subword_embeddings):
    words = []
    word_embeddings = []

    current_word = ""
    current_subword_embeddings = []

    for subword, embedding in zip(subwords, subword_embeddings):
        # Check if the subword is a continuation (starts with '##')
        if subword.startswith('##'):
            # Remove '##' and concatenate
            current_word += subword[2:]
            current_subword_embeddings.append(embedding)
        else:
            # If we have a current word, save it before starting new one
            if current_word:
                words.append(current_word)
                # Average all subword embeddings for the word
                word_embeddings.append(
                    sum(current_subword_embeddings) / len(current_subword_embeddings)
                )

            # Start new word
            current_word = subword
            current_subword_embeddings = [embedding]

    # Add the last word if exists
    if current_word:
        words.append(current_word)
        word_embeddings.append(
            sum(current_subword_embeddings) / len(current_subword_embeddings)
        )

    return words, word_embeddings

In [12]:
def calculate_word_information_values(sentence, vocabulary_df, model = model, tokenizer = tokenizer):

    # calulate information value for one sentence
    words_list = sentence.split(' ')

    ce_iv = [[],[],[],[],[],[],[],[],[],[],[],[],[]]

    words = sentence.split(' ')
    input_ids = tokenizer.encode(" ".join(words), return_tensors='pt')

    # Forward pass to get hidden states
    with torch.no_grad():
      outputs = model(input_ids, output_hidden_states=True)

    decoded_subwords = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    for j in range(1,13):
      last_hidden_state = outputs.hidden_states[-j]
      decoded_words1, embeddings1 = extract_words_and_embeddings(decoded_subwords, last_hidden_state[0,:])

      # loop through all words in sentence
      for i in range(0, len(sentence.split(' '))):

        words = sentence.split(' ')
        word = words[i]
        ce_iv[j].append(0)

        embedding_word1 = embeddings1[i]

        vocab_df = vocabulary_df.sample(n=50, random_state=42).reset_index(drop=True)
        # loop through all vocabulary words
        for vord in vocab_df['word'].tolist():

          words[i] = vord
          # Tokenize the input sentence
          input_ids = tokenizer.encode(" ".join(words), return_tensors='pt')

          # Generate word probabilities using GPT-2 model
          with torch.no_grad():
            outputs = model(input_ids, output_hidden_states=True)
            logits = outputs.logits
            last_hidden_state = outputs.hidden_states[-j]  # This is the final layer's hidden state for each token

          # Extract probabilities for each word
          subword_probabilities = torch.softmax(logits, dim=-1).mean(dim=2)

          # Decode the tokens back to words
          decoded_subwords = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
          decoded_words2, probabilities = extract_words_and_probabilities(decoded_subwords, subword_probabilities[0])

          decoded_words2, embeddings2 = extract_words_and_embeddings(decoded_subwords, last_hidden_state[0,:])
          embedding_word2 = embeddings2[i]

          context_probability = probabilities[decoded_words2==vord].item()

          # distances
          contextual_embedding_distance = 1 - 0.5 * (F.cosine_similarity(embedding_word1, embedding_word2, dim=0).item() + 1)
          ce_iv[j][i] += contextual_embedding_distance * context_probability

    return words_list, ce_iv

In [13]:
words_list = []
target_sentence_list = []
ce_iv_list = [[],[],[],[],[],[],[],[],[],[],[],[],[]]

# Save the DataFrame to a CSV file
csv_file_path = "/content/drive/MyDrive/PhD/information_value_2.csv"

for i in range(46,len(target_sentences_df)):
  sentence = target_sentences_df['Text'][i].lower()
  print(i)
  print(sentence)
  words, ce_ivs = calculate_word_information_values(sentence.strip(), vocabulary_df)

  for ind in range(0,len(words)):
    words_list.append(words[ind])
    target_sentence_list.append(i)
    for j in range(1,13):
      ce_iv_list[j].append(ce_ivs[j][ind])

  # Create a DataFrame
  df = pd.DataFrame({
      'Sentence': target_sentence_list,
      'Word': words_list,
      **{f'CE {j}': ce_iv_list[j] for j in range(1, 13)}}
      )
  df.to_csv(csv_file_path, index=False)

# Display the DataFrame
print(df)

46
prelazio je ulicu naspram one tek renovirane buregdzinice
47
dok se saginjao umalo ga ne zgazi ljubin ford
48
vratiću se brzo 
49
ne idem danas u školu
50
prespavaću ceo dan
51
napisaću mu pismo
52
krećemo za pet minuta
53
doćiće nam mirko na ručak
54
još malo pa će izbori
55
sutra će biti 27 u hladu
56
doveče je žurka kod komšija
57
za dva dana počinje škola
58
od sutra smo sami u kući
59
petar će doći u ponedeljak
60
moj sin ide prvi put sam na more
61
demonstracije su opet počele
62
pojavio se u poslednjem trenutku
63
sanjao sam te pre neku noć
64
premijer je obećao viši standard
65
sutra je doček nove godine
66
nadam se
     Sentence      Word      CE 1      CE 2      CE 3      CE 4      CE 5  \
0          46  prelazio  0.000089  0.000044  0.000043  0.000038  0.000031   
1          46        je  0.000031  0.000059  0.000064  0.000055  0.000040   
2          46     ulicu  0.000025  0.000029  0.000025  0.000033  0.000056   
3          46   naspram  0.000031  0.000027  0.000022  0.

In [14]:
import pandas as pd

df_0 = pd.read_csv("/content/drive/MyDrive/PhD/information_value_0.csv")
df_1 = pd.read_csv("/content/drive/MyDrive/PhD/information_value_1.csv")
df_2 = pd.read_csv("/content/drive/MyDrive/PhD/information_value_2.csv")

merged_df = pd.concat([df_0, df_1, df_2], axis=0)

In [15]:
merged_df.to_csv("/content/drive/MyDrive/PhD/information_value_albert.csv", index=False)