In [1]:
import torch
import random

from transformers import AutoTokenizer, AutoModelWithLMHead

In [2]:
tokenizer = AutoTokenizer.from_pretrained("openai-gpt")

model = AutoModelWithLMHead.from_pretrained("openai-gpt")


In [3]:
def generate_text(input_text: str, tokens_to_generate: int):
    text_generated = torch.tensor([tokenizer.encode(input_text)], dtype=torch.long)
    with torch.no_grad():
        for _ in range(tokens_to_generate):
            predictions = model(text_generated)
            next_token_logits = predictions[0][:, -1, :]

            next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)
            
            text_generated = torch.cat((text_generated, next_token_id), dim=1)
    result = tokenizer.decode(text_generated.squeeze().tolist())
    return result

In [93]:
text = "My favourite movie is"

In [94]:
text_encoded = torch.tensor([tokenizer.encode(text)], dtype=torch.long)

In [95]:
model.eval()
with torch.no_grad():
    predictions = model(text_encoded)

In [96]:
print(generate_text(text, 100))

my favourite movie is the one where the guy is in love with the girl. " 
 " i don't think that's a good idea. " 
 " why not? " 
 " because it's not a good idea. " 
 " why not? " 
 " because it's not a good idea. " 
 " why not? " 
 " because it's not a good idea. " 
 " why not? " 
 " because it's not a good idea. " 
 " why not? "


In [97]:
def gen_topK_text(input_text: str, tokens_to_generate: int, k: int):
    text_generated = torch.tensor([tokenizer.encode(input_text)], dtype=torch.long)
    with torch.no_grad():
        for _ in range(tokens_to_generate):
            predictions = model(text_generated)
            
            next_token_logits = predictions[0][:, -1, :]
            next_token_id = torch.LongTensor(random.choices(next_token_logits.topk(k, dim=-1).indices.tolist()[0], torch.nn.functional.softmax(next_token_logits).topk(k, dim=-1).values.tolist()[0])).unsqueeze(-1)            
            text_generated = torch.cat((text_generated, next_token_id), dim=1)
    result = tokenizer.decode(text_generated.squeeze().tolist())
    return result

In [98]:
print(gen_topK_text(text, 100, 10))

  


my favourite movie is the one about that time of year where they were all together and all the boys got into a game of hide and seek. the boys were all in the front seat of the car with one boy in the back while the other boy sat on the seat beside them. they didn't have to worry about being caught and that is why they didn't want to be seen in public together. 
 they got out the car and walked into the park. there were a few other boys playing games and it looked


In [99]:
print(gen_topK_text(text, 100, 3))


  


my favourite movie is the one where the guy is a vampire. he has a vampire in his arms and is about to bite her. " 
 " that's a good movie! " i said. " i love that one. " 
 " i know, " she said. " but it's a lot of fun to watch. " 
 we sat on the floor, watching the movie. 
 " you're so cute when you're nervous, " she said. " i love that. " 
 " i love


In [100]:
def gen_topK_text_penalized(input_text: str, tokens_to_generate: int, k: int, penalty: float):
    text_generated = torch.tensor([tokenizer.encode(input_text)], dtype=torch.long)
    with torch.no_grad():
        for _ in range(tokens_to_generate):
            predictions = model(text_generated)
            
            next_token_logits = predictions[0][:, -1, :]
            
            #applying penalty (>1 to cut repetition) to logits that are already in text_generated
            #we add (penalty-1)% to logits < 0 and substract (penalty-1)% if logits > 0

            if penalty != 1:
              for i in range(len(next_token_logits)):
                if i in text_generated[0]:
                  if next_token_logits[0][i]>=0:
                    next_token_logits[0][i] *= (2-penalty)  
                  else:
                    next_token_logits[0][i] *= penalty

            next_token_id = torch.LongTensor(random.choices(next_token_logits.topk(k, dim=-1).indices.tolist()[0], torch.nn.functional.softmax(next_token_logits).topk(k, dim=-1).values.tolist()[0])).unsqueeze(-1)   
            
            text_generated = torch.cat((text_generated, next_token_id), dim=1)
    result = tokenizer.decode(text_generated.squeeze().tolist())
    return result


In [107]:
print(gen_topK_text_penalized(text, 200, 2, 10))




my favourite movie is the one where you're a princess and you have to be rescued from the castle. i'm not going to let you go, " i said, pulling her into my arms. 
 " i'm not leaving you. " she looked up at me, her eyes full of determination. 
 " i'm not going anywhere, " i said. 
 " then let me go. " 
 " i'm not letting you go. " 
 " then let me go, " she said. " please, i need to go. " 
 " i can't let you go, " i said. 
 " then i 'll go. " she pulled away from me and looked at me. 
 " i can't let you go. " 
 " then let me go. " 
 i looked down at the floor, feeling the tears coming. " i can't. " 
 " then let me go. " 
 i looked up at the ceiling, feeling the tears coming. " i can't


In [133]:
def gen_topP_text_penalized(input_text: str, tokens_to_generate: int, p: float, penalty: float):
    text_generated = torch.tensor([tokenizer.encode(input_text)], dtype=torch.long)
    with torch.no_grad():
        for _ in range(tokens_to_generate):
            predictions = model(text_generated)
            
            next_token_logits = predictions[0][:, -1, :]
            
            #applying penalty (>1 to cut repetition) to logits that are already in text_generated
            #we add (penalty-1)% to logits < 0 and substract (penalty-1)% if logits > 0

            if penalty != 1:
              for i in range(len(next_token_logits)):
                if i in text_generated[0]:
                  if next_token_logits[0][i]>=0:
                    next_token_logits[0][i] *= (2-penalty)  
                  else:
                    next_token_logits[0][i] *= penalty
 
            
            cum_sum_prob = 0
            chosen_logits_indices = []
            sorted_softmax_values, indices = torch.sort(torch.nn.functional.softmax(next_token_logits), descending=True)
            
            for i in range(len(sorted_softmax_values[0])):
              if cum_sum_prob <= p:
                cum_sum_prob += sorted_softmax_values[0][i]
                chosen_logits_indices.append(indices[0][i].tolist())
              else:
                break
            
            next_token_id = torch.LongTensor(random.choices(chosen_logits_indices)).unsqueeze(-1)
            
            text_generated = torch.cat((text_generated, next_token_id), dim=1)
    result = tokenizer.decode(text_generated.squeeze().tolist())
    return result


In [139]:
print(gen_topP_text_penalized(text, 200, 0.7, 5))



my favourite movie is dirty and beautiful, " jack replied. " that one is really beautiful. " 
 i frowned at the sad sight. i wanted to hold him close, kiss him and kiss him, but he would be afraid if i tried. so i leaned over and hugged him, taking a small piece of him into my heart and wrapping my fingers around his chin, trying to stop the tears that wanted to escape. 
 " why are you crying? " he asked. i wiped the tears away with the back of my hand. 
 " because it was my favourite movie ever. it was my mom's favourite too, " i said. 
 " so? " 
 " so it was also the last time she watched it. " 
 jack was silent for a minute. his eyes filled with emotion, as he gazed at me, my beautiful beautiful face with the crooked nose, which looked perfect in the late morning light. he then took my hand and kissed it, but his touch was as awkward as ever


In [140]:
def gen_topP_penalized_w_temp(input_text: str, tokens_to_generate: int, p: float, penalty: float, temperature: int =1):
    text_generated = torch.tensor([tokenizer.encode(input_text)], dtype=torch.long)
    with torch.no_grad():
        for _ in range(tokens_to_generate):
            predictions = model(text_generated)
            
            next_token_logits = predictions[0][:, -1, :]
            
            #applying penalty (>1 to cut repetition) to logits that are already in text_generated
            #we add (penalty-1)% to logits < 0 and substract (penalty-1)% if logits > 0

            if penalty != 1:
              for i in range(len(next_token_logits)):
                if i in text_generated[0]:
                  if next_token_logits[0][i]>=0:
                    next_token_logits[0][i] *= (2-penalty)  
                  else:
                    next_token_logits[0][i] *= penalty
 
            
            cum_sum_prob = 0
            chosen_logits_indices = []
            sorted_softmax_values, indices = torch.sort(torch.nn.functional.softmax(next_token_logits), descending=True)
            
            if temperature != 0:
                sorted_softmax_values = sorted_softmax_values/temperature
            
            for i in range(len(sorted_softmax_values[0])):
              if cum_sum_prob <= p:
                cum_sum_prob += sorted_softmax_values[0][i]
                chosen_logits_indices.append(indices[0][i].tolist())
              else:
                break
            
            next_token_id = torch.LongTensor(random.choices(chosen_logits_indices)).unsqueeze(-1)
            
            text_generated = torch.cat((text_generated, next_token_id), dim=1)
    result = tokenizer.decode(text_generated.squeeze().tolist())
    return result

In [142]:
print(gen_topP_penalized_w_temp(text, 200, 0.4, 5, 1.1))



my favourite movie is playing. " 
 " that's my favorite movie too. " 
 " i know. " he smiled. " it's a love story. " 
 " what do you mean? " 
 " i mean, it's about a girl who gets a bad case of heartburn. " 
 " what kind of girl? " 
 " i don't know. i think she's a big girl. " 
 " oh. " i nodded. " well, i'm sure she 'll be fine. " 
 " she's not the only one. " he laughed. " i'm sure there are other girls out there who would love to have a girlfriend like her. " 
 " well, i don't think she 'd want to have a boyfriend like me. " 
 " why not? " 
 " i don't know. " i shrugged. " i just don't think she 'd want to have a boyfriend like me. " 
 " well, she's a pretty girl. "
