# Querying GPT2 Locally
This notebook will perform the first experiments on querying a small GPT2 model for personality traits

In [38]:
# Imports
import torch, json, tqdm, sys, random
import torch.nn.functional as F
import numpy as np
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    BartConfig,
    BartTokenizer,
    BartForSequenceClassification
)

In [39]:
# GPT2 Generation
model_name = 'gpt2-medium' # <-- Change this per model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
sent_tokenizer = BartTokenizer.from_pretrained('textattack/facebook-bart-large-SST-2')
sent_model = BartForSequenceClassification.from_pretrained('textattack/facebook-bart-large-SST-2', num_labels=2)

def logit_to_single_score(logits: torch.Tensor):
    # Label 0 is negative, label 1 is positive
    logits = logits[0]
    logits = F.softmax(logits, dim=0)
    logits = logits.cpu().detach().numpy()
    neg_score = -1 * float(logits[0])
    pos_score = float(logits[1])
    return pos_score + neg_score

def get_sent_score(phrase: str):
    inputs = sent_tokenizer(phrase, return_tensors="pt")
    labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
    outputs = sent_model(**inputs, labels=labels)
    return logit_to_single_score(outputs.logits)

# Load the filtered questions
with open('./data/questions.json', 'r') as f:
    questions = json.load(f)
# TODO: Output JSON?
output_dict = {}
for i, question in enumerate(tqdm.tqdm(questions)):
    input_ids = tokenizer.encode(question['question'], return_tensors='pt')
    sample_outputs = model.generate(
        input_ids,
        do_sample=True, 
        max_length=len(input_ids[0]) + 16, 
        top_k=200, 
        top_p=0.95,
        num_return_sequences=100
    )

    # Perform scoring and storing outputs
    output_dict[i] = {}
    output_dict[i]['question'] = question['text'].strip()
    output_dict[i]['question_score'] = get_sent_score(question['text'])
    output_dict[i]['responses'] = []

    for sample_output in sample_outputs:
        response_dict = {}
        out_str = tokenizer.decode(sample_output, skip_special_tokens=True)[len(question['question']) - 1:]
        out_str = out_str.split('\n')[0]
        response_dict['text'] = out_str
        response_dict['score'] = get_sent_score(out_str)
        response_dict['facet'] = question['facet']
        response_dict['domain'] = question['domain']
        response_dict['reverse_score'] = question['reverse_score']
        output_dict[i]['responses'].append(response_dict)

with open(f"./data/{model_name}-out.json", 'w') as f:
    json.dump(output_dict, f, indent=4)


  0%|          | 0/120 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 1/120 [00:32<1:03:39, 32.10s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▏         | 2/120 [01:05<1:04:17, 32.69s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▎         | 3/120 [01:38<1:04:23, 33.02s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  3%|▎         | 4/120 [02:07<1:00:25, 31.25s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 5/120 [02:38<1:00:10, 31.40s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  5%|▌         | 6/120 [03:09<58:54, 31.01s/it]  Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  6%|▌         | 7/120 [03:38<57:33, 30.57s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  7%|▋         | 8/120 [04:12<59:01, 31.62s/

In [7]:
print(len(input_ids[0]))

62


In [10]:
print()

50256
