In [1]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch.nn.functional as F
import pandas as pd
import math

In [3]:
target_sentences_df = pd.read_csv('/content/drive/MyDrive/PhD/target_sentences.csv')
target_sentences_df

Unnamed: 0,Text
0,još malo pa će izbori
1,naoblačilo se kao da će padati kiša
2,naša kola su stara 10 godina
3,otišao je u inostranstvo prošle godine i još s...
4,telefoni zvone po ceo dan i tebe traže
...,...
62,pojavio se u poslednjem trenutku
63,sanjao sam te pre neku noć
64,premijer je obećao viši standard
65,sutra je doček Nove Godine


In [4]:
vocabulary_df = pd.read_csv('/content/drive/MyDrive/PhD/wordlist_classlawiki_sr_cleaned.csv')
vocabulary_df

Unnamed: 0,word,freq
0,biti,5835707
1,u,4289454
2,i,2848044
3,na,1602609
4,sebe,1544095
...,...,...
336153,177.,5
336154,0038,5
336155,0036,5
336156,0032,5


In [5]:
pip install transformers



In [6]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

model_name = 'gpt2'
# Load pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

**Similarity Functions**

In [7]:
def non_context_embedding(word, vord, model, tokenizer, j):

  # Tokenize and get embedding for the target word
  word_input_ids = tokenizer.encode(word, return_tensors='pt')

  with torch.no_grad():
    outputs = model(word_input_ids, output_hidden_states=True)
    # Access hidden_states and get the last layer
    word_embedding = outputs.hidden_states[-j].mean(dim=1)

  vocab_input_ids = tokenizer.encode(vord, return_tensors='pt')
  with torch.no_grad():
    outputs = model(vocab_input_ids, output_hidden_states=True)
    # Access hidden_states and get the last layer
    vocab_embedding = outputs.hidden_states[-j].mean(dim=1)

  # Compute cosine similarity and normalize
  similarity = 0.5 * (F.cosine_similarity(word_embedding, vocab_embedding).item() + 1)

  return similarity

**Information Value**

In [8]:
def extract_words_and_embeddings(subwords, subword_embeddings):
    words = []
    word_embedding = []

    current_word = ""
    current_embedding = []

    for subword, embedding in zip(subwords, subword_embeddings):
        # Check if the subword starts with the special token 'Ġ'
        if subword.startswith('Ġ'):
            if current_word:
                words.append(current_word)
                word_embedding.append(sum(current_embedding) / len(current_embedding))

            # Reset current word and probability for the new word
            current_word = subword[1:]  # Remove 'Ġ' from the start
            current_embedding= [embedding]
        else:
            # Concatenate subwords to form the current word
            current_word += subword
            # Multiply probabilities for subwords within the same word
            current_embedding.append(embedding)

    # Add the last word and its probability
    if current_word:
        words.append(current_word)
        word_embedding.append(sum(current_embedding) / len(current_embedding))

    return words, word_embedding

In [9]:
def extract_words_and_probabilities(subwords, subword_probabilities):
    words = []
    word_probabilities = []

    current_word = ""
    current_probability = 1.0  # Initialize to 1.0 as we will multiply probabilities

    # Iterate through the subword probabilities
    for subword, probability in zip(subwords, subword_probabilities):
        # Check if the subword starts with the special token 'Ġ'
        if subword.startswith('Ġ'):
            # If we have a current word, add it to the list with its probability
            if current_word:
                words.append(current_word)
                word_probabilities.append(current_probability)

            # Reset current word and probability for the new word
            current_word = subword[1:]  # Remove 'Ġ' from the start
            current_probability = probability
        else:
            # Concatenate subwords to form the current word
            current_word += subword
            # Multiply probabilities for subwords within the same word
            current_probability *= probability

    # Add the last word and its probability
    if current_word:
        words.append(current_word)
        word_probabilities.append(current_probability)

    return words, word_probabilities

In [10]:
def calculate_word_information_values(sentence, vocabulary_df, model = model, tokenizer = tokenizer):

    # calulate information value for one sentence
    words_list = sentence.split(' ')

    ce_iv = [[],[],[],[],[],[],[],[],[],[],[],[],[]]
    nce_iv = [[],[],[],[],[],[],[],[],[],[],[],[],[]]

    words = sentence.split(' ')
    input_ids = tokenizer.encode(" ".join(words), return_tensors='pt')

    # Forward pass to get hidden states
    with torch.no_grad():
      outputs = model(input_ids, output_hidden_states=True)

    decoded_subwords = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    for j in range(1,13):
      last_hidden_state = outputs.hidden_states[-j]
      decoded_words1, embeddings1 = extract_words_and_embeddings(decoded_subwords, last_hidden_state[0,:])

      # loop through all words in sentence
      for i in range(0, len(sentence.split(' '))):

        words = sentence.split(' ')
        word = words[i]
        ce_iv[j].append(0)
        nce_iv[j].append(0)

        embedding_word1 = embeddings1[i]

        vocab_df = vocabulary_df.sample(n=50, random_state=42).reset_index(drop=True)
        # loop through all vocabulary words
        for vord in vocab_df['word'].tolist():

          words[i] = vord
          # Tokenize the input sentence
          input_ids = tokenizer.encode(" ".join(words), return_tensors='pt')

          # Generate word probabilities using GPT-2 model
          with torch.no_grad():
            outputs = model(input_ids, output_hidden_states=True)
            logits = outputs.logits
            last_hidden_state = outputs.hidden_states[-j]  # This is the final layer's hidden state for each token

          # Extract probabilities for each word
          subword_probabilities = torch.softmax(logits, dim=-1).mean(dim=2)

          # Decode the tokens back to words
          decoded_subwords = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
          decoded_words2, probabilities = extract_words_and_probabilities(decoded_subwords, subword_probabilities[0])

          decoded_words2, embeddings2 = extract_words_and_embeddings(decoded_subwords, last_hidden_state[0,:])
          embedding_word2 = embeddings2[i]

          context_probability = probabilities[decoded_words2==vord].item()

          # distances
          contextual_embedding_distance = 1 - 0.5 * (F.cosine_similarity(embedding_word1, embedding_word2, dim=0).item() + 1)
          non_contextual_embedding_distance = 1 - non_context_embedding(word, vord, model, tokenizer, j)

          ce_iv[j][i] += contextual_embedding_distance * context_probability
          nce_iv[j][i] += non_contextual_embedding_distance * context_probability

    return words_list, ce_iv, nce_iv

In [11]:
words_list = []
target_sentence_list = []
ce_iv_list = [[],[],[],[],[],[],[],[],[],[],[],[],[]]
nce_iv_list = [[],[],[],[],[],[],[],[],[],[],[],[],[]]

# Save the DataFrame to a CSV file
csv_file_path = "/content/drive/MyDrive/PhD/information_value2.csv"

for i in range(47,50):
  sentence = target_sentences_df['Text'][i].lower()
  print(i)
  words, ce_ivs, nce_ivs = calculate_word_information_values(sentence.strip(), vocabulary_df)

  for ind in range(0,len(words)):
    words_list.append(words[ind])
    target_sentence_list.append(i)
    for j in range(1,13):
      ce_iv_list[j].append(ce_ivs[j][ind])
      nce_iv_list[j].append(nce_ivs[j][ind])

  # Create a DataFrame
  df = pd.DataFrame({
      'Sentence': target_sentence_list,
      'Word': words_list,
      **{f'CE {j}': ce_iv_list[j] for j in range(1, 13)},
      **{f'NCE {j}': nce_iv_list[j] for j in range(1, 13)}
                     })
  df.to_csv(csv_file_path, index=False)

# Display the DataFrame
print(df)

46
47
48
49
    Sentence          Word          CE 1          CE 2          CE 3  \
0         46      prelazio  1.989773e-07  1.229415e-07  4.483096e-08   
1         46            je  2.932871e-20  6.777237e-19  1.355427e-18   
2         46         ulicu  1.142676e-20  3.679082e-19  7.879443e-19   
3         46       naspram  1.292616e-20  3.808103e-19  7.531640e-19   
4         46           one  1.120038e-20  5.428158e-19  1.013974e-18   
5         46           tek  8.890555e-21  3.673506e-19  7.252831e-19   
6         46    renovirane  5.593610e-21  2.545139e-19  4.980110e-19   
7         46  buregdzinice  5.714367e-21  4.654171e-19  9.043368e-19   
8         47           dok  1.073253e-07  1.707259e-08  7.044255e-09   
9         47            se  4.357108e-11  1.080145e-09  2.557508e-09   
10        47      saginjao  1.886843e-11  6.271039e-10  1.294875e-09   
11        47         umalo  1.707904e-11  8.019957e-10  1.519002e-09   
12        47            ga  3.628440e-11  1.482417e-