# Keyword Connection Pre-Processing
**Last Edited On: 5/30/2023**<br>
**Last Edited By: Kyle Williams**

**Motivation:** The code in this file reads a CSV of CommonsenseQA questions whose keywords were previously extracted with BERT. For each question, ConceptNet is queried with those keywords to pull in potentially related concepts. The resulting concept lists will be used to constrain the vocabulary of GPT-2 during decoding in our experiments, with the aim of improving its generations.

### Setup

In [1]:
'''
Necessary Imports, Path Constants
'''
import ast
import itertools
import json
import pickle
from functools import lru_cache

import pandas as pd

from ConceptNetRequestor import ConceptNetRequestor

# Constants
# Output location: one "<split>_keywords.pkl" file is written here per input split.
WRITE_FOLDER = "prompt_splits/"
# Input location and the split names (CSV file stems) to process.
# Only the dev split is enabled; "TRAINsplit" can be added to the list.
# The test set is ignored for now because it doesn't have answer labels.
READ_FOLDER = "csv_splits/"
READ_FILES = ["DEVsplit"]
# Our interface for querying data from ConceptNet
CNR = ConceptNetRequestor()

In [2]:
'''
Script Hyperparameters
'''
# Largest size of keyword permutation that is joined with '_' and sent to
# ConceptNet as a single multi-word query (sizes 2..N_GRAMS are tried).
N_GRAMS = 2
# How many ConceptNet edges away from a starting keyword the traversal goes.
DEPTH = 2

### Function Definitions

In [3]:
def connect_keywords(keywords, *, depth=None, n_grams=None, requestor=None):
    """
    Build the list of ConceptNet concepts related to a question's keywords.

    Every keyword, plus every '_'-joined permutation of 2..n_grams keywords,
    is used as a starting node. From each starting node the ConceptNet graph
    is explored up to `depth` edges away, and every concept reached is added
    to the returned vocabulary (with '_' replaced by ' ').

    Args:
        keywords: The BERT-extracted keywords for one question (list of str).
        depth: Number of edges to traverse from each starting node.
            Defaults to the module-level DEPTH.
        n_grams: Largest permutation size to query. Defaults to the
            module-level N_GRAMS. Values below 2 disable permutations.
        requestor: Object exposing `get_edges(concept)`, which must return an
            iterable of concept names (empty/falsy when the concept is
            unknown). Defaults to the module-level CNR.

    Returns:
        List of concept strings in first-seen order. Starting nodes for which
        `get_edges` returns nothing are not included.
    """
    depth = DEPTH if depth is None else depth
    n_grams = N_GRAMS if n_grams is None else n_grams
    requestor = CNR if requestor is None else requestor

    vocab = []   # Concepts collected so far, in space-separated display form
    depths = {}  # Raw concept name -> largest remaining depth it was explored with

    def add_to_vocab(keyword, curr_depth):
        '''
        Add 'keyword' to the vocabulary and, if 'curr_depth' is positive,
        recursively add the concepts connected to it.

        NOTE: 'depths' is keyed by the raw (underscore) name while 'vocab'
        stores the space-separated form, so "have we seen this?" must always
        be answered from 'depths', never from 'vocab'.
        '''
        key = keyword.replace("_", " ")

        # CASE 1) No depth budget left: record the node without traversing it.
        # A non-positive check (rather than == 0) keeps a negative starting
        # depth from recursing without bound.
        if curr_depth <= 0:
            if keyword not in depths:
                vocab.append(key)
                depths[keyword] = curr_depth
            return

        if keyword in depths:
            # CASE 2) Already seen with at least this much budget: work is done.
            if depths[keyword] >= curr_depth:
                return
            # Seen before, but with a smaller budget: re-explore further out.
            depths[keyword] = curr_depth
        else:
            # CASE ELSE) First time we see this node.
            vocab.append(key)
            depths[keyword] = curr_depth

        # RECURSIVE STEP: pull in everything connected to this node.
        edges = requestor.get_edges(keyword)
        if edges:
            for edge in edges:
                add_to_vocab(edge, curr_depth - 1)
        else:
            # No edges came back, so this is treated as not being a real
            # ConceptNet node and is dropped again.
            vocab.remove(key)
            del depths[keyword]

    # Multi-word queries first: every ordered arrangement of 2..n_grams keywords.
    # The range is empty when n_grams < 2, so no separate guard is needed.
    for size in range(2, n_grams + 1):
        for combo in itertools.permutations(keywords, size):
            # Multi-word concepts are separated by '_' in the API path
            add_to_vocab('_'.join(combo), depth)

    # Then the original keywords on their own.
    for keyword in keywords:
        add_to_vocab(keyword, depth)

    return vocab

### Main

In [4]:
# For every configured split: load its CSV, expand each question's keywords
# into related ConceptNet concepts, and pickle the per-question concept lists.
for file in READ_FILES:
    csv = pd.read_csv(READ_FOLDER + file + ".csv")
    csv = csv.drop(columns = ['Unnamed: 0']) # the CSVs were saved with a leading index column that we can ignore

    # List[List[str]]: all concepts related to the keywords of each question,
    # in row order. The 'keywords' column is parsed with ast.literal_eval
    # (it is assumed to hold the text form of a Python list, e.g. "['a', 'b']"),
    # so keywords that contain an apostrophe or a double quote are read correctly
    # instead of being corrupted by a blanket quote replacement.
    # Building the list positionally also avoids relying on the DataFrame's
    # index labels matching list positions.
    q_concepts = [
        connect_keywords(ast.literal_eval(raw_keywords))
        for raw_keywords in csv['keywords']
    ]

    with open(WRITE_FOLDER + file + "_keywords.pkl", "wb") as f:
        pickle.dump(q_concepts, f)

In [None]:
# Sanity check: read back the pickle written for the most recently processed
# split ('file' still holds the last value from the loop over READ_FILES).
# NOTE: pickle.load should only be used on files this script produced itself.
with open("".join((WRITE_FOLDER, file, "_keywords.pkl")), mode="rb") as f:
    my_list = pickle.load(f)