# CSV to Prompt Data Pre-Processing
**Last Edited On: 5/30/2023**<br>
**Last Edited By: Kyle Williams**

**Motivation:** The code in this file takes a CSV of CommonsenseQA questions and formats each question into a text prompt, saving the list of prompts together with the list of each question's corresponding answer. These experiment files should be easy to upload to Colab so that the model's inference can be done on a GPU. 

In [117]:
'''
Necessary Imports, Path Constants
'''
import ast
import json
import os
import pickle

import numpy as np
import pandas as pd
import torch
from transformers import GPT2Tokenizer

# Input folder holding the CSV splits, and output folder for the generated files.
READ_FOLDER = "csv_splits/"
WRITE_FOLDER = "prompt_splits/"

# Names of the splits to process. The test set is ignored for now because it
# doesn't have answer labels.
READ_FILES = [
    "TRAINsplit",
    "DEVsplit",
]

# Parallel to READ_FILES: whether the prompts of that split include the answer.
INCLUDE_ANSWERS = [
    True,   # TRAINsplit
    False,  # DEVsplit: we won't include the answer in the dev split's prompts
]

In [120]:
'''
Define a function to read the csv and format its contents into plain-text prompts
(no tokenization happens here). 
Prompts will look like the following example:

Question: A revolving door is convenient for two direction travel, but it also serves as a security measure at a what? 
Choices: bank, library, department store, mall, new york.
Answer: bank
'''
def question_to_prompt(csv_path, include_answer=True):
    """Build one text prompt per CSV row and pickle the prompts and answers.

    Reads ``READ_FOLDER + csv_path + ".csv"`` and writes two pickle files into
    ``WRITE_FOLDER`` (created if it does not exist yet):
    ``<csv_path>_prompts.pkl``, a list of prompt strings, and
    ``<csv_path>_answers.pkl``, a list with the answer text of each row (an
    empty string when no choice label matches the row's ``answerKey``).

    Args:
        csv_path: Name of the split, without the folder and without ".csv".
        include_answer: If True, each prompt ends with the answer text followed
            by ``<|endoftext|>``; if False, it ends with a bare ``"Answer: "``.

    Returns:
        None. The results are only written to disk.
    """
    frame = pd.read_csv(READ_FOLDER + csv_path + ".csv")
    # The CSVs were saved with a leading index column that we can ignore.
    # errors="ignore" keeps CSVs that were saved without that column working.
    frame = frame.drop(columns=['Unnamed: 0'], errors="ignore")

    prompts = []
    answers = []
    # Append in row order rather than indexing the lists by the DataFrame's
    # index label, so the result does not depend on the index being 0..n-1.
    for _, row in frame.iterrows():
        choices = _parse_choices(row['question.choices'])
        answer_text = _find_answer_text(choices, row['answerKey'])
        prompts.append(
            _build_prompt(row['question.stem'], choices, answer_text, include_answer)
        )
        answers.append(answer_text)

        # TODO: I should continue from here within the loop to tokenize the question keywords and connected concepts

    if WRITE_FOLDER:
        os.makedirs(WRITE_FOLDER, exist_ok=True)
    with open(WRITE_FOLDER + csv_path + "_prompts.pkl", "wb") as file:
        pickle.dump(prompts, file)
    with open(WRITE_FOLDER + csv_path + "_answers.pkl", "wb") as file:
        pickle.dump(answers, file)


def _parse_choices(choices_str):
    """Parse the stringified list of ``{'label': ..., 'text': ...}`` dicts.

    The column was saved as a string with single-quoted keys, i.e. it looks
    like a Python literal rather than JSON. ``ast.literal_eval`` parses such a
    literal directly (and only accepts literals, so it does not execute code),
    which also handles texts containing apostrophes or double quotes. Genuine
    JSON that is not a valid Python literal falls back to ``json.loads``.
    """
    try:
        return ast.literal_eval(choices_str)
    except (ValueError, SyntaxError):
        return json.loads(choices_str)


def _find_answer_text(choices, answer_key):
    """Return the text of the choice labelled ``answer_key``, or "" if none is."""
    answer_text = ""
    for choice in choices:
        if choice['label'] == answer_key:
            answer_text = choice['text']
    return answer_text


def _build_prompt(stem, choices, answer_text, include_answer):
    """Format the question stem, the choice texts and the answer line as one prompt."""
    # Joining the texts ends the line correctly for any number of choices,
    # instead of relying on the last choice being labelled 'E'.
    choice_texts = ", ".join(str(choice['text']) for choice in choices)
    prompt = f"Question: {stem}\nChoices: {choice_texts}.\n"
    if include_answer:
        # Add <|endoftext|> so fine-tuned model learns to end generation after it answers
        return prompt + f"Answer: {answer_text} <|endoftext|>"
    # Leave out the actual answer so the model may fill it in when freely generating
    return prompt + "Answer: "

In [121]:
'''
Call the function on each of the splits to create the pickled prompt and answer files. 
'''
# READ_FILES and INCLUDE_ANSWERS are parallel lists. Check up front that every
# split has a flag, so a mismatch fails before any file is processed instead of
# part-way through the loop.
if len(INCLUDE_ANSWERS) < len(READ_FILES):
    raise ValueError("INCLUDE_ANSWERS needs one entry for every file in READ_FILES")

for split_name, include_answer in zip(READ_FILES, INCLUDE_ANSWERS):
    question_to_prompt(split_name, include_answer=include_answer)