<a href="https://colab.research.google.com/github/jiveshj/SeniorThesis/blob/main/ThesisFunc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy
print(numpy.__version__)
from concurrent.futures import ProcessPoolExecutor

1.26.4


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
import torch
num_gpus = torch.cuda.is_available()
print("Number of GPUs available:", num_gpus)

Number of GPUs available: True


In [None]:
def clear_cache():
    torch.cuda.empty_cache()  # Frees unused memory from GPU cache


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer




class LMHeadModel:
    def __init__(self, model_name, device="cuda" if torch.cuda.is_available() else "cpu"):
        # Initialize the model and tokenizer
        self.device = device
        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model.eval()

        # Ensure the tokenizer has a padding token
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token  # Use EOS token as padding
            self.tokenizer.padding_side = "right"

        self.batch_prediction_count = 0


    def batch_encode(self, sentences):
        """
        Encodes a batch of sentences into input tensors.
        Args:
            sentences (list of str): The input sentences to encode.
        Returns:
            inputs (dict): A dictionary of tokenized inputs ready for the model.
        """
        return self.tokenizer(
            sentences,
            return_tensors="pt",
            padding=True,  # Pad to the longest sequence in the batch
            truncation=True,  # Truncate sequences longer than the model's max length
        ).to(self.device)

    def batch_decode(self, token_ids):
        """
        Decodes a batch of token IDs back to sentences.
        Args:
            token_ids (torch.Tensor): A tensor of token IDs to decode.
        Returns:
            decoded_sentences (list of str): The decoded sentences.
        """
        return self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)
    def batch_decode_top_k(self, token_ids_batch, tokenizer):
        """
        Decodes token IDs to meaningful text while merging subword tokens.
        Args:
            token_ids_batch (torch.Tensor): A batch of token IDs (e.g., from `topk`).
            tokenizer: The tokenizer used for encoding/decoding.
        Returns:
            list of list of str: Decoded tokens (words/subwords) for each sequence in the batch.
        """
        decoded_tokens = []
        for token_ids in token_ids_batch:
            # Decode each token ID in the batch, joining subwords correctly
            tokens = [tokenizer.decode([token_id]).strip() for token_id in token_ids]
            decoded_tokens.append(tokens)
        return decoded_tokens

    def get_batch_predictions(self, sentences, top_k=100):
        """
        Predicts the next tokens for a batch of input sentences.
        Args:
            sentences (list of str): The input sentences.
            top_k (int): Number of top tokens to return for each sentence.
        Returns:
            predictions (list of list of tuples): Top-k token predictions for each sentence.
        """
        #Increment to see how many times this function is called after a given layer of trellis.
        self.batch_prediction_count += 1


        # Tokenize inputs
        inputs = self.batch_encode(sentences)

        # Pass through the model
        with torch.no_grad():
            outputs = self.model(**inputs,use_cache = False)

        # Get logits for the last token in each sequence
        logits = outputs.logits[:, -1, :]  # Shape: (batch_size, vocab_size)


        # Compute probabilities using softmax
        probs = torch.nn.functional.softmax(logits, dim=-1)
        top_probs, top_token_ids = torch.topk(probs, k=top_k, dim=-1)
        top_tokens = self.batch_decode_top_k(top_token_ids, self.tokenizer)


        predictions = [
            [(token, prob.item()) for token, prob in zip(top_tokens[i], top_probs[i]) if token and token != "\n"]
            for i in range(len(sentences))
        ]
        return predictions

    def get_batch_prediction_count(self):
        """
        Returns the number of times batch predictions have been made.
        """
        return self.batch_prediction_count

    def reset_batch_prediction_count(self):
        """ Resets the count
        """

        self.batch_prediction_count = 0


In [None]:
model = LMHeadModel("gpt2")
tokens = model.get_batch_predictions(["I enjoy walking in the streets of London and seeing how many"])
print(tokens)

[[('people', 0.43639272451400757), ('of', 0.04618111252784729), ('different', 0.024586744606494904), ('cars', 0.012367425486445427), ('women', 0.011498896405100822), ('things', 0.010798175819218159), ('young', 0.01030801609158516), ('other', 0.009153882041573524), ('children', 0.007985253818333149), ('kids', 0.007831282913684845), ('pedestrians', 0.007828296162188053), ('beautiful', 0.007812305819243193), ('shops', 0.007584897801280022), ('times', 0.007466564420610666), ('new', 0.0064317211508750916), ('businesses', 0.0064195143058896065), ('more', 0.0061494712717831135), ('homeless', 0.006147547625005245), ('men', 0.00555956969037652), ('little', 0.004401943180710077), ('streets', 0.004189481493085623), ('families', 0.0034754821099340916), ('places', 0.0034438082948327065), ('great', 0.002904405351728201), ('houses', 0.0026904840487986803), ('tourists', 0.0026231110095977783), ('buildings', 0.0024502745363861322), ('pubs', 0.0024248340632766485), ('police', 0.0023816009052097797), ('c

In [None]:
class SearchTree:
    def __init__(self,context,probability,parent = None,child = None,parent_index = None):
        self.context = context
        self.probability = probability
        self.parent = parent
        self.child = []
        self.parent_index = parent_index  # newly created.
        if child is not None:
           self.child.append(child)

    def build_Context(self):
        context_list = []
        node = self
        while node.parent is not None:
            context_list.append(node.context)
            node = node.parent
        context_list.append(node.context)
        context_list.reverse()
        formatted_contextList = []
        for i in range(len(context_list)):
            if context_list[i] in ['.',':',',','?','!',';'] or ("'" in context_list[i]):
                if (i-1>= 0):
                    if context_list[i-1] not in  ['.',':',',','?','!',';'] and ("'" not in context_list[i-1]):#if two consecutive contexts are , ' etc.
                        word = context_list[i-1]+context_list[i]

                        formatted_contextList.remove(context_list[i-1])
                        formatted_contextList.append(word)
                    else:
                        formatted_contextList.append(context_list[i])
            else:

                 formatted_contextList.append(context_list[i])
        return ' '.join(formatted_contextList)

    def create_child(self):
        if self.parent is not None:
           self.parent.child.append(self)

    def replace_parent(self,new_parent):
        self.parent = new_parent

    def calcProbTillNow(self):
      prob = self.probability
      node = self
      while node.parent is not None:
        prob = prob*node.parent.probability
        node = node.parent
      return prob    #can make this negative log probability.

    def assign_parent_index(self,parent_index):
      self.parent_index = parent_index





def most_frequent(List):
       return max(set(List), key=List.count)
def sort_with_indices(arr):
  """Sorts a list and returns the original indices of the sorted elements."""
  indices = list(range(len(arr)))
  indices.sort(key=lambda i: arr[i], reverse = True)
  return indices

# Now, have the probability matrix ready in which one list contains the probability to reach that state from previous list of tokens
#To make the probs ready find the unique tokens and then number them/store in a list and then find top 3 tokens given those tokens, find unique
#tokens and then extract probs of getting those from the previous list. [state transition probmat] and then run viterbi!!

# def findProbability2(InitialToken, FinalToken, model):
#     context = InitialToken.build_Context()
#     tokens_50K = model.get_batch_predictions([context], 300)  # Generate top-k predictions
#     print("InitialToken.context:  ", InitialToken.build_Context())
#     print("FinalToken: ", FinalToken.context)
#     # Use a dictionary for faster lookups
#     token_probs = {token: prob for token, prob in tokens_50K[0]}

#     # Return the probability if the token exists, otherwise return 0
#     prob = token_probs.get(FinalToken.context,0)
#     print("prob: ", prob)
#     return prob




def findProbability(InitialToken,FinalToken,model):
    context = InitialToken.build_Context()
    tokens_50K = model.get_batch_predictions([context],500)  # can do this - I mean that I can set the topk to be equal to 50 and if it is not there then I will discard the probability as 0.

    for token,prob in tokens_50K[0]:
        if token == FinalToken.context:
            return prob
    return 0

def findProbability_cached(InitialToken, FinalToken, model,cached_probs):
    context = InitialToken.build_Context()

    # Check if context has been previously processed
    if context in cached_probs:
        # Return cached result if available
        return cached_probs[context].get(FinalToken.context, 0)

    # Otherwise, calculate the probabilities
    tokens_50K = model.get_batch_predictions([context], 500)

    # Store probabilities for this context in the cache
    context_probs = {token: prob for token, prob in tokens_50K[0]}
    cached_probs[context] = context_probs

    return context_probs.get(FinalToken.context, 0)




def find_overlap_children(arr):
    "Find all the children of top two tokens and return their overlap"
    children1 = {child.context for child in arr[0].child}
    children2 = {child.context for child in arr[1].child}
    common = len(children1.intersection(children2))
    return common / len(arr[0].child) if arr[0].child else 0



def VITERBI_Lists(state_transition_probmat, initial_state_prob):

    viterbi_mat = []
    backpointer = []
    viterbi_1stLayer = []
    for i in range(len(initial_state_prob)):
        viterbi_1stLayer.append(float(initial_state_prob[i]))
    viterbi_mat.append(viterbi_1stLayer)

    for time_step in range(len(state_transition_probmat)):
        viterbi_layer = []
        backpointer_layer = []
        for state in range(len(state_transition_probmat[time_step])):
            iteration_vec = [viterbi_mat[time_step][i]*state_transition_probmat[time_step][state][i] for i in range(len(viterbi_mat[time_step]))]

            maxval = max(iteration_vec)
            maxind = iteration_vec.index(maxval)
            viterbi_layer.append(maxval)
            backpointer_layer.append(maxind)

        viterbi_mat.append(viterbi_layer)
        backpointer.append(backpointer_layer)

    best_path_prob = max(viterbi_mat[-1])
    # max_index = max(range(len(viterbi_mat[-1])), key = lambda i: viterbi_mat[-1][i])
    max_index = viterbi_mat[-1].index(best_path_prob)
    best_backpointer = max_index
    best_path = [best_backpointer]
    j = 0
    for i in reversed(range(len(state_transition_probmat))):
        best_path.append(backpointer[i][best_path[j]])
        j += 1
    best_path = best_path[::-1]
    return best_path, viterbi_mat,best_path_prob
def decodePath(best_path,unique_tokens_list,root_string):
    resultant_string = ''
    for i in range(len(best_path)):
      if unique_tokens_list[i][best_path[i]] in ['.',':',',','?','!',';']:
            if (i-1>= 0):
                resultant_string = resultant_string+unique_tokens_list[i][best_path[i]]
      elif "'" in unique_tokens_list[i][best_path[i]]:
              resultant_string = resultant_string + unique_tokens_list[i][best_path[i]]
      else:
            resultant_string = resultant_string + ' '+ unique_tokens_list[i][best_path[i]]
    return root_string+resultant_string






In [None]:
def generateIntermediates(root,numTokens = 3, loop_runner = 4):
  sentence = SearchTree(root,1)
  context = []
  prob_list = []
  num_tokens = numTokens
  content = []
  probability = []
  model = LMHeadModel("gpt2")
  tokens_50K = model.get_batch_predictions([sentence.context],numTokens+3)
  children = []
  overlap = []
  most_common = []
  #unique_elements = []   # to store unique elements at each iteration
  unique_tokens = set()
  probabilityMatrix = []
  uniqueTokensList = []
  new_content = []
  uniqueTokenLength = []

  flops_counter = {}
  cached_probs = {}
  batch_size = 75
  holdout_number = 5
  for i in range(num_tokens):
    context = tokens_50K[0][i][0]
    unique_tokens.add(context)
    new_content.append(context)
    prob = tokens_50K[0][i][1]
    probability.append(prob)
    context = SearchTree(context,prob,sentence,parent_index = 0)
    context.create_child()
    uniqueTokensList.append(context)
    children.append(context)

  content.append(new_content)
  previousUniqueLength = num_tokens
  #unique_elements.append(unique_tokens)
  initialStateProbability = probability
  uniqueTokenLength.append(num_tokens)
  for i in range(2,loop_runner):
    unique_tokens = set()
    probability = []
    new_content = []
    total_predictions = []
    previousSetLength = 0
    batch_sentences = [child.build_Context() for child in uniqueTokensList]
    if len(batch_sentences)>holdout_number:
        batch_sentences2 = batch_sentences[0:-holdout_number]
        batch_predictions = model.get_batch_predictions(batch_sentences2,numTokens+2)
        total_predictions = []
        total_predictions.extend(batch_predictions)
        batch_predictions1 = model.get_batch_predictions(batch_sentences[-holdout_number:],numTokens+2)
        total_predictions.extend(batch_predictions1)
    else:
        total_predictions = model.get_batch_predictions(batch_sentences,numTokens+2)
    # batch_sentences =  ["I enjoy walking in the park, but I'm not sure", "I enjoy walking in the park, but I'm not a", "I enjoy walking in the park, but I'm not going", "I enjoy walking in the park, but I'm also very", "I enjoy walking in the park, but I'm also not", "I enjoy walking in the park, but I'm afraid I", "I enjoy walking in the park, but I'm afraid that", "I enjoy walking in the park, but I'm afraid to", "I enjoy walking in the park, but I don't like", "I enjoy walking in the park, but I don't want", "I enjoy walking in the park, but I don't think", "I enjoy walking in the park, but I don' t", 'I enjoy walking in the park, but I don � c', 'I enjoy walking in the park, but I don � ve', 'I enjoy walking in the park, but I also enjoy the', 'I enjoy walking in the park, but I also enjoy being', 'I enjoy walking in the park, but I also enjoy walking', "I enjoy walking in the park, but it's a little", "I enjoy walking in the park, but it's a bit", "I enjoy walking in the park, but it's a lot", "I enjoy walking in the park, but it's hard for", 'I enjoy walking in the park, but it is very difficult', 'I enjoy walking in the park, but it is very hard', 'I enjoy walking in the park, but it is very quiet', 'I enjoy walking in the park, but it can get really', 'I enjoy walking in the park, but it can get pretty', "I enjoy walking in the park, but when I'm in", "I enjoy walking in the park, but when I'm out", 'I enjoy walking in the park, but when I walk down', 'I enjoy walking in the park, but when I walk into', 'I enjoy walking in the park, but when I go back', "I enjoy walking in the park, but when you're on", "I enjoy walking in the park, but when it's raining", "I enjoy walking in the park, but when it's time", "I enjoy walking in the park, but when it's dark", 'I enjoy walking in the park, but when it rains,', 'I enjoy walking in the park, but when it rains it', 'I enjoy walking in the park, and the people there are', 'I enjoy walking in the park, and the people there have', 'I enjoy walking in the park, and the people there seem', 'I enjoy walking in the park, and the people who live', 'I enjoy walking in the park, and the people who come', 'I enjoy walking in the park, and the people are always', 'I enjoy walking in the park, and the people are so', 'I enjoy walking in the park, and the people are nice', 'I enjoy walking in the park, and the view is amazing', 'I enjoy walking in the park, and the view is beautiful', 'I enjoy walking in the park, and the view is great', 'I enjoy walking in the park, and the view from my', 'I enjoy walking in the park, and the view from here', 'I enjoy walking in the park, and the view of this', 'I enjoy walking in the park, and the view of Lake', 'I enjoy walking in the park, and the smell and smell', 'I enjoy walking in the park, and the smell and taste', 'I enjoy walking in the park, I like to play with', 'I enjoy walking in the park, I like to watch movies', 'I enjoy walking in the park, I like the view.', 'I enjoy walking in the park, I like the view of', 'I enjoy walking in the park, I like the smell and', 'I enjoy walking in the park, I like the way they', 'I enjoy walking in the park, I like being in front', 'I enjoy walking in the park, I like being around people', 'I enjoy walking in the park, I like being around other', 'I enjoy walking in the park, I enjoy playing with friends', 'I enjoy walking in the park, I enjoy playing the game', 'I enjoy walking in the park, I enjoy playing the guitar', 'I enjoy walking in the park, I enjoy playing the piano', 'I enjoy walking in the park and seeing people. I love', "I enjoy walking in the park and seeing people. I'm", "I enjoy walking in the park and seeing people. It's", 'I enjoy walking in the park and seeing people. It is', 'I enjoy walking in the park and seeing people. It makes', "I enjoy walking in the park and seeing people. We're", 'I enjoy walking in the park and seeing people, but when', 'I enjoy walking in the park and seeing people and seeing what', 'I enjoy walking in the park and seeing people and animals ,"', 'I enjoy walking in the park and seeing all of these different', 'I enjoy walking in the park and seeing all of our neighbors', 'I enjoy walking in the park and seeing all of our favorite', 'I enjoy walking in the park and seeing all these beautiful trees', 'I enjoy walking in the park and seeing all these beautiful birds', 'I enjoy walking in the park and seeing all these different things', 'I enjoy walking in the park and seeing all these different species', 'I enjoy walking in the park and seeing all these different kinds', "I enjoy walking in the park and it's a great place", "I enjoy walking in the park and it's a great way", "I enjoy walking in the park and it's a great experience", "I enjoy walking in the park and it's a nice feeling", "I enjoy walking in the park and it's nice to see", "I enjoy walking in the park and it's nice to be", "I enjoy walking in the park and it's nice. It", "I enjoy walking in the park and it's nice. The", "I enjoy walking in the park and it's nice that we", "I enjoy walking in the park and it's nice that you", "I enjoy walking in the park and it's great for me", 'I enjoy walking in the park and it makes me want more', 'I enjoy walking in the park and it makes my day ."', 'I enjoy walking in the park and it makes my life easier', 'I enjoy walking in the park and it makes my body feel', 'I enjoy walking in the park and it makes my body look', 'I enjoy walking in the park and it makes my body stronger', 'I enjoy walking in the park. The park itself has been', 'I enjoy walking in the park. The park itself has some', 'I enjoy walking in the park. The park itself, though', 'I enjoy walking in the park. The trees are so tall', 'I enjoy walking in the streets of the United Arab Emir ate', 'I enjoy walking in the streets of the United Arab Emir ates', 'I enjoy walking in the streets of the United Arab Emir ati', 'I enjoy walking in the streets of London, but it was', 'I enjoy walking in the streets of London and seeing how many']

    # batch_predictions = model.get_batch_predictions(batch_sentences,numTokens+2)
    for j in range(len(uniqueTokensList)):


      for s in range(num_tokens):
        context = total_predictions[j][s][0]
        prob = total_predictions[j][s][1]

        if (i == loop_runner-1):
           print(context, end = " ")
           print(prob)


        unique_tokens.add(context)

        context = SearchTree(context,prob,uniqueTokensList[j])   #probably redundant: Because I should only create SearchTree of unique tokens
        # context.create_child() Removed this 2/19/2025
        if (len(unique_tokens)>previousSetLength):
          previousSetLength = len(unique_tokens)
          uniqueTokensList.append(context)
          new_content.append(context.context)


    #unique_elements.append(unique_tokens) # append the unique tokens list at each iteration to unique_elements list
    content.append(new_content) # for storing tokens which will pass to the decode_path function.

    for token in uniqueTokensList[previousUniqueLength:]:
      probs = []  #for making probability matrix and applying viterbi.
      probs2 = [] #for finding the new parent.
      for prevToken in uniqueTokensList[:previousUniqueLength]:
        probabilityCalc = findProbability(prevToken,token,model)


        probs.append(probabilityCalc)
        probs2.append(probabilityCalc*prevToken.calcProbTillNow())
        #new code below inserted:
      if not probs2:
        continue
      else:
        max_value = max(probs2)
        max_index = my_list.index(max_value)
        token.replace_parent(uniqueTokensList[:previousUniqueLength][max_index])
        token.assign_parent_index(max_index)

      probability.append(probs)
    probabilityMatrix.append(probability)
    flops_counter[i-1] = model.get_batch_prediction_count()
    model.reset_batch_prediction_count()


    uniqueTokenLength.append(len(uniqueTokensList[previousUniqueLength:]))

    previousUniqueLength = len(uniqueTokensList[previousUniqueLength:])
    uniqueTokensList = uniqueTokensList[len(uniqueTokensList)-previousUniqueLength:]


  return probabilityMatrix, initialStateProbability, content,uniqueTokenLength, flops_counter


In [None]:
P,S,C,length,flops = generateIntermediates("I enjoy walking in the", 3,9)

In [None]:
print(length)

[3, 4, 10, 23, 38, 75, 110, 8]


In [None]:
best_path,viterbi_mat,best_path_prob = VITERBI_Lists(P,S)

decodedString = decodePath(best_path,C,"I enjoy walking in the")
print(decodedString)
print(best_path_prob)

In [None]:
# batch_size = 105
# total_predictions = []
batch_sentences =  ["I enjoy walking in the park, but I'm not sure", "I enjoy walking in the park, but I'm not a", "I enjoy walking in the park, but I'm not going", "I enjoy walking in the park, but I'm also very", "I enjoy walking in the park, but I'm also not", "I enjoy walking in the park, but I'm afraid I", "I enjoy walking in the park, but I'm afraid that", "I enjoy walking in the park, but I'm afraid to", "I enjoy walking in the park, but I don't like", "I enjoy walking in the park, but I don't want", "I enjoy walking in the park, but I don't think", "I enjoy walking in the park, but I don' t", 'I enjoy walking in the park, but I don � c', 'I enjoy walking in the park, but I don � ve', 'I enjoy walking in the park, but I also enjoy the', 'I enjoy walking in the park, but I also enjoy being', 'I enjoy walking in the park, but I also enjoy walking', "I enjoy walking in the park, but it's a little", "I enjoy walking in the park, but it's a bit", "I enjoy walking in the park, but it's a lot", "I enjoy walking in the park, but it's hard for", 'I enjoy walking in the park, but it is very difficult', 'I enjoy walking in the park, but it is very hard', 'I enjoy walking in the park, but it is very quiet', 'I enjoy walking in the park, but it can get really', 'I enjoy walking in the park, but it can get pretty', "I enjoy walking in the park, but when I'm in", "I enjoy walking in the park, but when I'm out", 'I enjoy walking in the park, but when I walk down', 'I enjoy walking in the park, but when I walk into', 'I enjoy walking in the park, but when I go back', "I enjoy walking in the park, but when you're on", "I enjoy walking in the park, but when it's raining", "I enjoy walking in the park, but when it's time", "I enjoy walking in the park, but when it's dark", 'I enjoy walking in the park, but when it rains,', 'I enjoy walking in the park, but when it rains it', 'I enjoy walking in the park, and the people there are', 'I enjoy walking in the park, and the people there have', 'I enjoy walking in the park, and the people there seem', 'I enjoy walking in the park, and the people who live', 'I enjoy walking in the park, and the people who come', 'I enjoy walking in the park, and the people are always', 'I enjoy walking in the park, and the people are so', 'I enjoy walking in the park, and the people are nice', 'I enjoy walking in the park, and the view is amazing', 'I enjoy walking in the park, and the view is beautiful', 'I enjoy walking in the park, and the view is great', 'I enjoy walking in the park, and the view from my', 'I enjoy walking in the park, and the view from here', 'I enjoy walking in the park, and the view of this', 'I enjoy walking in the park, and the view of Lake', 'I enjoy walking in the park, and the smell and smell', 'I enjoy walking in the park, and the smell and taste', 'I enjoy walking in the park, I like to play with', 'I enjoy walking in the park, I like to watch movies', 'I enjoy walking in the park, I like the view.', 'I enjoy walking in the park, I like the view of', 'I enjoy walking in the park, I like the smell and', 'I enjoy walking in the park, I like the way they', 'I enjoy walking in the park, I like being in front', 'I enjoy walking in the park, I like being around people', 'I enjoy walking in the park, I like being around other', 'I enjoy walking in the park, I enjoy playing with friends', 'I enjoy walking in the park, I enjoy playing the game', 'I enjoy walking in the park, I enjoy playing the guitar', 'I enjoy walking in the park, I enjoy playing the piano', 'I enjoy walking in the park and seeing people. I love', "I enjoy walking in the park and seeing people. I'm", "I enjoy walking in the park and seeing people. It's", 'I enjoy walking in the park and seeing people. It is', 'I enjoy walking in the park and seeing people. It makes', "I enjoy walking in the park and seeing people. We're", 'I enjoy walking in the park and seeing people, but when', 'I enjoy walking in the park and seeing people and seeing what', 'I enjoy walking in the park and seeing people and animals ,"', 'I enjoy walking in the park and seeing all of these different', 'I enjoy walking in the park and seeing all of our neighbors', 'I enjoy walking in the park and seeing all of our favorite', 'I enjoy walking in the park and seeing all these beautiful trees', 'I enjoy walking in the park and seeing all these beautiful birds', 'I enjoy walking in the park and seeing all these different things', 'I enjoy walking in the park and seeing all these different species', 'I enjoy walking in the park and seeing all these different kinds', "I enjoy walking in the park and it's a great place", "I enjoy walking in the park and it's a great way", "I enjoy walking in the park and it's a great experience", "I enjoy walking in the park and it's a nice feeling", "I enjoy walking in the park and it's nice to see", "I enjoy walking in the park and it's nice to be", "I enjoy walking in the park and it's nice. It", "I enjoy walking in the park and it's nice. The", "I enjoy walking in the park and it's nice that we", "I enjoy walking in the park and it's nice that you", "I enjoy walking in the park and it's great for me", 'I enjoy walking in the park and it makes me want more', 'I enjoy walking in the park and it makes my day ."', 'I enjoy walking in the park and it makes my life easier', 'I enjoy walking in the park and it makes my body feel', 'I enjoy walking in the park and it makes my body look', 'I enjoy walking in the park and it makes my body stronger', 'I enjoy walking in the park. The park itself has been', 'I enjoy walking in the park. The park itself has some', 'I enjoy walking in the park. The park itself, though', 'I enjoy walking in the park. The trees are so tall', 'I enjoy walking in the streets of the United Arab Emir ate', 'I enjoy walking in the streets of the United Arab Emir ates', 'I enjoy walking in the streets of the United Arab Emir ati', 'I enjoy walking in the streets of London, but it was', 'I enjoy walking in the streets of London and seeing how many']
# batch_sentences1 = batch_sentences[0:50]
# batch_predictions = model.get_batch_predictions(batch_sentences[0:70],3+2)
# print("batch_predictions: ", batch_predictions)
holdout_number = 5
batch_sentences2 = batch_sentences[0:-holdout_number]
batch_predictions = model.get_batch_predictions(batch_sentences2,3+2)
print("batch_predictions: ", batch_predictions)
total_predictions = []
total_predictions.extend(batch_predictions)
batch_predictions1 = model.get_batch_predictions(batch_sentences[-holdout_number:],3+2)
print("batch_predictions: ", batch_predictions1)
total_predictions.extend(batch_predictions1)
print("total_predictions: ",total_predictions)
# iteration_number = (len(batch_sentences))//batch_size
# for k in range(iteration_number):
#     print(k)
#     clear_cache()  # Free GPU memory before processing

#     batch_predictions1 = model.get_batch_predictions(batch_sentences[batch_size*k:batch_size*(k+1)],6)
#     print("batch_predictions: ", batch_predictions1)
#     total_predictions.extend(batch_predictions1)
#     print("total_predictions: ",total_predictions)

# print("semifinal total_predictions: ", total_predictions)
# print(len(total_predictions))
# batch_sentences = batch_sentences[batch_size*(k+1):]
# print("check: ", len(batch_sentences))
# batch_predictions = model.get_batch_predictions(batch_sentences[0:45],3+2)
# print(batch_predictions)
clear_cache()  # Free GPU memory before processing


# if (len(batch_sentences)%batch_size):
#     start = batch_size*(k+1)
#     end = len(batch_sentences)

#     print("end-start: ", end-start)
#     print("batch_sentences[0]: ",batch_sentences[start] )
#     batch_predictions = model.get_batch_predictions(batch_sentences[start:end-4],3+3)
#     print("batch_predictions: ", batch_predictions)
#     total_predictions.extend(batch_predictions)
#     print("final: ", total_predictions)
# print(batch_sentences[-1])
# clear_cache()  # Free GPU memory before processing
# batch_sentences = batch_sentences[106:]

# batch_predictions = model.get_batch_predictions(batch_sentences,3+3)
# print("finalbatch_predictions: ", batch_predictions)
# total_predictions.extend(batch_predictions)
# print("final", total_predictions)
print(len(batch_sentences))
print(len(total_predictions))


batch_predictions:  [[('I', 0.22196993231773376), ('if', 0.21653705835342407), ('how', 0.14745603501796722), ('what', 0.0852440595626831), ('that', 0.0523461289703846)], [('big', 0.11360680311918259), ('fan', 0.10415048897266388), ('huge', 0.03251166641712189), ('great', 0.02184179238975048), ('very', 0.016514241695404053)], [('to', 0.864355742931366), ('anywhere', 0.040128689259290695), ('there', 0.01755983941257), ('out', 0.010512413457036018), ('back', 0.008705461397767067)], [('curious', 0.03471408784389496), ('interested', 0.030560605227947235), ('concerned', 0.03020295314490795), ('much', 0.025488410145044327), ('nervous', 0.025145182386040688)], [('a', 0.15220940113067627), ('sure', 0.1036079078912735), ('used', 0.03327564522624016), ('really', 0.03120976872742176), ('afraid', 0.03113960474729538)], [("'m", 0.2475501447916031), ("'ll", 0.2088497132062912), ('can', 0.06224760413169861), ('won', 0.06211286783218384), ('don', 0.04461279511451721)], [('I', 0.18447834253311157), ('if

In [None]:
batch_sentences =  ["I enjoy walking in the park, but I'm not sure", "I enjoy walking in the park, but I'm not a", "I enjoy walking in the park, but I'm not going", "I enjoy walking in the park, but I'm also very", "I enjoy walking in the park, but I'm also not", "I enjoy walking in the park, but I'm afraid I", "I enjoy walking in the park, but I'm afraid that", "I enjoy walking in the park, but I'm afraid to", "I enjoy walking in the park, but I don't like", "I enjoy walking in the park, but I don't want", "I enjoy walking in the park, but I don't think", "I enjoy walking in the park, but I don' t", 'I enjoy walking in the park, but I don � c', 'I enjoy walking in the park, but I don � ve', 'I enjoy walking in the park, but I also enjoy the', 'I enjoy walking in the park, but I also enjoy being', 'I enjoy walking in the park, but I also enjoy walking', "I enjoy walking in the park, but it's a little", "I enjoy walking in the park, but it's a bit", "I enjoy walking in the park, but it's a lot", "I enjoy walking in the park, but it's hard for", 'I enjoy walking in the park, but it is very difficult', 'I enjoy walking in the park, but it is very hard', 'I enjoy walking in the park, but it is very quiet', 'I enjoy walking in the park, but it can get really', 'I enjoy walking in the park, but it can get pretty', "I enjoy walking in the park, but when I'm in", "I enjoy walking in the park, but when I'm out", 'I enjoy walking in the park, but when I walk down', 'I enjoy walking in the park, but when I walk into', 'I enjoy walking in the park, but when I go back', "I enjoy walking in the park, but when you're on", "I enjoy walking in the park, but when it's raining", "I enjoy walking in the park, but when it's time", "I enjoy walking in the park, but when it's dark", 'I enjoy walking in the park, but when it rains,', 'I enjoy walking in the park, but when it rains it', 'I enjoy walking in the park, and the people there are', 'I enjoy walking in the park, and the people there have', 'I enjoy walking in the park, and the people there seem', 'I enjoy walking in the park, and the people who live', 'I enjoy walking in the park, and the people who come', 'I enjoy walking in the park, and the people are always', 'I enjoy walking in the park, and the people are so', 'I enjoy walking in the park, and the people are nice', 'I enjoy walking in the park, and the view is amazing', 'I enjoy walking in the park, and the view is beautiful', 'I enjoy walking in the park, and the view is great', 'I enjoy walking in the park, and the view from my', 'I enjoy walking in the park, and the view from here', 'I enjoy walking in the park, and the view of this', 'I enjoy walking in the park, and the view of Lake', 'I enjoy walking in the park, and the smell and smell', 'I enjoy walking in the park, and the smell and taste', 'I enjoy walking in the park, I like to play with', 'I enjoy walking in the park, I like to watch movies', 'I enjoy walking in the park, I like the view.', 'I enjoy walking in the park, I like the view of', 'I enjoy walking in the park, I like the smell and', 'I enjoy walking in the park, I like the way they', 'I enjoy walking in the park, I like being in front', 'I enjoy walking in the park, I like being around people', 'I enjoy walking in the park, I like being around other', 'I enjoy walking in the park, I enjoy playing with friends', 'I enjoy walking in the park, I enjoy playing the game', 'I enjoy walking in the park, I enjoy playing the guitar', 'I enjoy walking in the park, I enjoy playing the piano', 'I enjoy walking in the park and seeing people. I love', "I enjoy walking in the park and seeing people. I'm", "I enjoy walking in the park and seeing people. It's", 'I enjoy walking in the park and seeing people. It is', 'I enjoy walking in the park and seeing people. It makes', "I enjoy walking in the park and seeing people. We're", 'I enjoy walking in the park and seeing people, but when', 'I enjoy walking in the park and seeing people and seeing what', 'I enjoy walking in the park and seeing people and animals ,"', 'I enjoy walking in the park and seeing all of these different', 'I enjoy walking in the park and seeing all of our neighbors', 'I enjoy walking in the park and seeing all of our favorite', 'I enjoy walking in the park and seeing all these beautiful trees', 'I enjoy walking in the park and seeing all these beautiful birds', 'I enjoy walking in the park and seeing all these different things', 'I enjoy walking in the park and seeing all these different species', 'I enjoy walking in the park and seeing all these different kinds', "I enjoy walking in the park and it's a great place", "I enjoy walking in the park and it's a great way", "I enjoy walking in the park and it's a great experience", "I enjoy walking in the park and it's a nice feeling", "I enjoy walking in the park and it's nice to see", "I enjoy walking in the park and it's nice to be", "I enjoy walking in the park and it's nice. It", "I enjoy walking in the park and it's nice. The", "I enjoy walking in the park and it's nice that we", "I enjoy walking in the park and it's nice that you", "I enjoy walking in the park and it's great for me", 'I enjoy walking in the park and it makes me want more', 'I enjoy walking in the park and it makes my day ."', 'I enjoy walking in the park and it makes my life easier', 'I enjoy walking in the park and it makes my body feel', 'I enjoy walking in the park and it makes my body look', 'I enjoy walking in the park and it makes my body stronger', 'I enjoy walking in the park. The park itself has been', 'I enjoy walking in the park. The park itself has some', 'I enjoy walking in the park. The park itself, though', 'I enjoy walking in the park. The trees are so tall', 'I enjoy walking in the streets of the United Arab Emir ate', 'I enjoy walking in the streets of the United Arab Emir ates', 'I enjoy walking in the streets of the United Arab Emir ati', 'I enjoy walking in the streets of London, but it was', 'I enjoy walking in the streets of London and seeing how many']
batch_predictions = model.get_batch_predictions(batch_sentences,6)
print(batch_predictions)

print(len(batch_predictions))

[[('The', 0.06441471725702286), ('"', 0.029232027009129524), ('I', 0.024992607533931732), ('A', 0.017693907022476196), ('This', 0.016091689467430115)], [('The', 0.03021872788667679), ('"', 0.010039023123681545), ('I', 0.00871141254901886), ('A', 0.007432189770042896), ('This', 0.006580688059329987)], [('The', 0.06791429221630096), ('"', 0.032660551369190216), ('I', 0.02276371791958809), ('A', 0.018162254244089127), ('This', 0.01436851266771555)], [('The', 0.06292367726564407), ('"', 0.02983095310628414), ('I', 0.022361278533935547), ('A', 0.019140776246786118), ('This', 0.013515595346689224)], [('The', 0.054077185690402985), ('"', 0.022345803678035736), ('I', 0.016759106889367104), ('A', 0.014416421763598919), ('This', 0.010597935877740383)], [('The', 0.05732255056500435), ('"', 0.01784597709774971), ('I', 0.017432045191526413), ('A', 0.015546905808150768), ('This', 0.012565701268613338)], [('The', 0.08464467525482178), ('I', 0.03375137224793434), ('"', 0.02832474187016487), ('A', 0.02