In [10]:
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV paths read and written by
# this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import pandas as pd
import torch
import torch.nn.functional as F
import math

In [12]:
# Load the target sentences (one sentence per row in the 'Text' column)
# from the CSV stored on the mounted Drive.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks
# like a saved index; consider index_col=0 once confirmed nothing relies on it.
target_sentences_df = pd.read_csv('/content/drive/MyDrive/PhD/target_sentences.csv')
# Bare last expression so the notebook renders the frame as the cell output.
target_sentences_df

Unnamed: 0,Text
0,još malo pa će izbori
1,naoblačilo se kao da će padati kiša
2,naša kola su stara 10 godina
3,otišao je u inostranstvo prošle godine i još s...
4,telefoni zvone po ceo dan i tebe traže
...,...
62,pojavio se u poslednjem trenutku
63,sanjao sam te pre neku noć
64,premijer je obećao viši standard
65,sutra je doček Nove Godine


In [13]:
# Load the vocabulary list ('word' and 'freq' columns) from the CSV stored
# on the mounted Drive.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks
# like a saved index; consider index_col=0 once confirmed nothing relies on it.
vocabulary_df = pd.read_csv('/content/drive/MyDrive/PhD/wordlist_classlawiki_sr_cleaned.csv')
# Bare last expression so the notebook renders the frame as the cell output.
vocabulary_df

Unnamed: 0,word,freq
0,biti,5835707
1,u,4289454
2,i,2848044
3,na,1602609
4,sebe,1544095
...,...,...
336153,177.,5
336154,0038,5
336155,0036,5
336156,0032,5


In [None]:
pip install transformers



In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Checkpoint name passed to both from_pretrained calls below.
# NOTE(review): 'gpt2' is presumably the base English-trained checkpoint while
# the target sentences are Serbian — confirm this is the intended model.
model_name = 'gpt2'
# Load pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [None]:
def extract_words_and_probabilities(subwords, subword_probabilities):
    """Merge subword tokens into words and combine their probabilities.

    A subword that starts with the marker 'Ġ' opens a new word (the marker is
    stripped from the word text); any other subword is appended to the word
    currently being built. The probability of a word is the product of the
    probabilities of the subwords it is made of.

    Args:
        subwords: Iterable of subword strings.
        subword_probabilities: Iterable of probabilities paired positionally
            with ``subwords`` via ``zip`` (surplus items in the longer
            iterable are ignored). Items may be plain numbers or tensor
            elements; the input is never modified.

    Returns:
        A tuple ``(words, word_probabilities)`` of two parallel lists.
    """
    words = []
    word_probabilities = []

    current_word = ""
    current_probability = 1.0  # Neutral element for the running product

    for subword, probability in zip(subwords, subword_probabilities):
        if subword.startswith('Ġ'):
            # A new word starts: flush the word collected so far, if any.
            if current_word:
                words.append(current_word)
                word_probabilities.append(current_probability)

            current_word = subword[1:]  # Drop the leading 'Ġ'
            current_probability = probability
        else:
            current_word += subword
            # Out-of-place multiply on purpose: when the probabilities are
            # tensor elements, `*=` would run in place and overwrite the
            # caller's tensor through the element taken from it.
            current_probability = current_probability * probability

    # Flush the trailing word.
    if current_word:
        words.append(current_word)
        word_probabilities.append(current_probability)

    return words, word_probabilities

In [None]:
def _candidate_log_probability(words, position, tokenizer, model, bos_id):
    """Return the log-probability the model assigns to ``words[position]``.

    The words are joined with single spaces and tokenized; ``bos_id`` is
    prepended so that the very first token is also predicted from a context.
    The log-probability of a word is the sum of the log-probabilities of the
    tokens that belong to it, where a token opens a new word when it is the
    first token or starts with the 'Ġ' marker.
    """
    token_ids = list(tokenizer.encode(" ".join(words)))
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    input_ids = torch.tensor([[bos_id] + token_ids])

    with torch.no_grad():
        logits = model(input_ids).logits

    # logits[0, t] scores the token at t + 1, so drop the last row and read,
    # for every real token, the score the model gave to that exact token.
    log_probs = torch.log_softmax(logits[0, :-1], dim=-1)
    targets = input_ids[0, 1:]
    token_log_probs = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)

    word_index = -1
    total = 0.0
    matched = False
    for offset, token in enumerate(tokens):
        if offset == 0 or token.startswith('Ġ'):
            word_index += 1
        if word_index == position:
            total += token_log_probs[offset].item()
            matched = True
        elif word_index > position:
            break

    if not matched:
        raise ValueError(
            "could not locate word %d in the tokenized sentence" % position
        )
    return total


def calculate_contextual_entropy(sentence, tokenizer, model, vocab_df,
                                 n_candidates=5, random_state=42):
    """Estimate a contextual entropy value for every word of a sentence.

    For each word position, the word is replaced in turn by each candidate
    sampled from ``vocab_df['word']``. With ``p`` the probability the model
    assigns to the candidate at that position, the position's value is the
    sum over candidates of ``-p * log2(p + 1e-40)``. The probabilities are
    not renormalized over the candidate sample.

    Args:
        sentence: Text whose whitespace-separated words are scored.
        tokenizer: Object providing ``encode``, ``convert_ids_to_tokens`` and
            ``bos_token_id``; word starts are recognised by the 'Ġ' marker.
        model: Callable taking a ``(1, n_tokens)`` id tensor and returning an
            object with a ``logits`` attribute indexed as
            ``[batch, position, vocabulary]``.
        vocab_df: DataFrame with a ``'word'`` column to draw candidates from.
            Candidates are expected to contain no whitespace.
        n_candidates: Number of vocabulary words sampled as substitutes
            (capped at ``len(vocab_df)``).
        random_state: Seed passed to ``DataFrame.sample``.

    Returns:
        A tuple ``(words_list, entropy_list)`` of two parallel lists: the
        original words and the value computed for each of them.

    Raises:
        ValueError: If the tokenizer has no ``bos_token_id`` or a substituted
            word cannot be located in the tokenized sentence.
    """
    original_words = sentence.split()
    if not original_words:
        return [], []

    bos_id = getattr(tokenizer, 'bos_token_id', None)
    if bos_id is None:
        raise ValueError("tokenizer must define bos_token_id")

    # The sample is the same for every position, so draw it once up front,
    # and draw it from the argument rather than from a notebook global.
    sample_size = min(n_candidates, len(vocab_df))
    sampled = vocab_df.sample(n=sample_size, random_state=random_state)
    # str() so that a non-string cell cannot break the " ".join below.
    candidates = [str(word) for word in sampled['word'].tolist()]

    entropy_list = []
    for position in range(len(original_words)):
        trial_words = list(original_words)
        entropy = 0.0
        for candidate in candidates:
            trial_words[position] = candidate
            log_p = _candidate_log_probability(
                trial_words, position, tokenizer, model, bos_id
            )
            p = math.exp(log_p)
            # The tiny epsilon keeps log2 defined when p underflows to 0.
            entropy -= p * math.log(p + 1e-40, 2)
        entropy_list.append(entropy)

    return list(original_words), entropy_list

In [None]:
# Index of the first sentence to process.
# NOTE(review): 14 presumably resumes an earlier, interrupted run — confirm.
# The CSV below is rewritten from scratch and will only contain sentences from
# this index onward.
START_SENTENCE_INDEX = 14

# Destination of the results CSV.
csv_file_path = "/content/drive/MyDrive/PhD/contextual_entropy1.csv"

# Parallel accumulators: one entry per scored word.
words_list = []
target_sentence_list = []
entropy_list = []

# Defined up front so the final print still works when the loop below has
# nothing to process.
df = pd.DataFrame({
    'Sentence': target_sentence_list,
    'Word': words_list,
    'Contextual Entropy': entropy_list
})

for i in range(START_SENTENCE_INDEX, len(target_sentences_df)):
    sentence = target_sentences_df.loc[i, 'Text'].lower()
    print(i)
    words, entropies = calculate_contextual_entropy(sentence.strip(), tokenizer, model, vocabulary_df)

    for word, entropy in zip(words, entropies):
        words_list.append(word)
        target_sentence_list.append(i)
        entropy_list.append(entropy)

    # Rebuild the results frame and rewrite the CSV after every sentence, so
    # the file on Drive always holds everything computed so far.
    df = pd.DataFrame({
        'Sentence': target_sentence_list,
        'Word': words_list,
        'Contextual Entropy': entropy_list
    })
    df.to_csv(csv_file_path, index=False)

# Display the DataFrame
print(df)