# Advanced Data Processing
---
We will enhance our data processing in order to attain (hopefully) better results. The things we will do are:
- Create questions about what acronyms stand for (e.g., ARM)
- Summarize the answer passages using GPT-3 so that responses aren't cut off by max_tokens
- Maybe other stuff

In [28]:
# Imports
import pandas as pd
import numpy as np
import os
import re
import string
import warnings
from numpy.random import Generator, PCG64

# Silence warnings whose message starts with the "Unverified HTTPS request" text.
warnings.filterwarnings("ignore", message="Unverified HTTPS request is being made to host")
# NOTE(review): an empty CURL_CA_BUNDLE presumably turns off TLS certificate
# verification for HTTP clients that honor it -- confirm this is intended and
# only used on a trusted network.
os.environ["CURL_CA_BUNDLE"] = ""
# Do not truncate long text columns when data frames are displayed.
pd.set_option("display.max_colwidth", None)
# Fixed-seed generator so the randomized question wording is reproducible.
rand = Generator(PCG64(seed=13))

## Acronyms
---
A common question is what something stands for. We don't want the model to elaborate more than it needs to, so we will create questions from the data whose answers are short and to the point.

In [2]:
# Locations of the two CFPB source tables
key_path = "../data/cfpb_key_terms.csv"
question_path = "../data/cfpb_mortgage_questions.csv"

# Read each CSV into its own data frame
key_df = pd.read_csv(key_path)
quest_df = pd.read_csv(question_path)

In [18]:
def find_acronym_meaning(text):
    """
    From a passage, pull out all acronyms in parentheses
    and extract their meanings.

    An acronym is a run of capital letters in parentheses, optionally
    followed by a plural "s" (e.g. "(APR)" or "(ARMs)"). Its meaning is
    rebuilt by walking backwards through the words that precede it and
    keeping the words whose first letters spell the acronym.

    Args:
        text: The passage to scan.

    Returns:
        A list of {"acronym": ..., "meaning": ...} dicts, one per distinct
        acronym for which a non-empty meaning was found (the list may be
        empty). Returns None when `text` is not a string, contains no
        parenthesized acronym, or contains "(TTY)" and at most one
        parenthesized acronym.
    """
    # Non-string cells (e.g. a missing value read as NaN) have nothing to parse
    if not isinstance(text, str):
        return None

    # Format the text: quotes/hyphens become spaces, possessive 's is dropped
    text = re.sub(r"[\"\-]", " ", text)
    text = re.sub(r"([a-zA-Z])(\'s)", r"\1", text)

    # Because the pattern has one capture group, re.split alternates
    # text, "(ACRONYM", text, "(ACRONYM", ..., text -- the captured acronyms
    # always sit at the odd indices.
    pieces = re.split(r"(\([A-Z]+)[s]?\)", text)
    if len(pieces) <= 1 or ("(TTY)" in text and len(pieces) <= 3):
        return None

    # Walk the acronyms from last to first so that, for a repeated acronym,
    # the words preceding its FIRST mention are the ones that are kept.
    # Only the odd (captured) positions are visited; ordinary text that
    # happens to start with "(" must not be mistaken for an acronym.
    acronyms = {}
    for idx in range(len(pieces) - 2, 0, -2):
        acronyms[pieces[idx][1:]] = pieces[idx - 1].split()

    # Filler words are kept inside a meaning but do not consume a letter.
    # Words that are neither fillers, letter matches, nor title-case
    # (e.g. "on") are skipped.
    ignore = {"the", "and", "a", "of", "an", "to"} | set(string.punctuation)
    answers = []
    for acronym, before in acronyms.items():
        # Look backwards through what precedes acronym in text
        match = acronym.lower()[::-1]
        # The empty sentinel lets answer[-1] be read before any word is added
        answer = [""]

        for word in reversed(before):
            lowered = word.lower()

            # If there's no more matches and previous word isn't 'of', stop
            if not match and lowered != "of" and answer[-1].lower() != "of":
                break

            # If word is a filler word, include it
            if lowered in ignore:
                answer.append(word)
                continue

            # Add words that start with acronym letters
            for i, letter in enumerate(match):
                if lowered.startswith(letter):
                    answer.append(word)
                    # Remove those letters from acronym
                    match = match[i + 1:]
                    break
            # Any intervening capitalized words
            if word.istitle() and word != answer[-1]:
                answer.append(word)

        # Remove any starting filler words; the "" sentinel (now last) is
        # never a filler, so a starting position is always found.
        answer.reverse()
        start = next(
            i for i, word in enumerate(answer) if word.lower() not in ignore
        )
        definition = " ".join(answer[start:]).strip()

        # get the appropriate case: keep Title Case only if every
        # non-filler word is title-cased, otherwise lower-case everything
        if not all(c.istitle() for c in definition.split() if c not in ignore):
            definition = definition.lower()
        if definition:
            answers.append({"acronym": acronym, "meaning": definition})

    return answers

In [27]:
# Collect every (acronym, meaning) pair found in either data set, keeping one
# meaning per acronym (the alphabetically first one).
acronymns = (
    pd.concat([
        key_df.definition.apply(find_acronym_meaning),
        quest_df.content.apply(find_acronym_meaning)
    ], ignore_index=True)
    # One row per extracted dict; passages with no result become missing values
    .explode()
    # Drop missing entries BEFORE building the frame: expanding a NaN row would
    # add a stray column and make the later dropna() discard every row.
    .dropna()
    .pipe(lambda found: pd.DataFrame(found.tolist(), columns=["acronym", "meaning"]))
    .sort_values("meaning")
    .drop_duplicates(subset=["acronym"], keep="first")
    .dropna()
    .reset_index(drop=True)
)


acronymns

Unnamed: 0,acronym,meaning
0,APR,Annual Percentage Rate
1,AAA,Area Agencies Aging
2,APOR,Average Prime Offer Rate
3,COE,Certificate of Eligibility
4,CCPA,Consumer Credit Protection Act
5,HUD,Department of Housing and Urban Development
6,VA,Department of Veteran Affairs
7,ECOA,Equal Credit Opportunity Act
8,FEMA,Federal Emergency Management Agency
9,FHA,Federal Housing Administration


In [52]:
def make_abbreviation_question(a):
    """
    Build two questions asking what the acronym `a` means.

    The acronym as given and its lower-cased form are assigned to
    the two questions in random order, and each question independently
    may or may not end with a question mark. Randomness comes from the
    module-level `rand` generator.
    """
    def maybe_question_mark():
        # Question mark when the draw is above 0.5, otherwise nothing
        if rand.random() > 0.5:
            return "?"
        return ""

    # Draw without replacement so both spellings are used, in random order
    first, second = rand.choice([a, a.lower()], size=2, replace=False)

    stand_for_question = f"What does {first} stand for" + maybe_question_mark()
    mean_question = f"What does {second} mean" + maybe_question_mark()
    return [stand_for_question, mean_question]

# Pair each acronym's two generated questions with its meaning, then give
# every question its own row (both rows keep the acronym's original index).
adf = pd.DataFrame(
    {
        "prompt": acronymns["acronym"].apply(make_abbreviation_question),
        "completion": acronymns["meaning"],
    }
).explode("prompt")


adf.head(4)

Unnamed: 0,prompt,completion
0,What does APR stand for?,Annual Percentage Rate
0,What does apr mean?,Annual Percentage Rate
1,What does AAA stand for,Area Agencies Aging
1,What does aaa mean?,Area Agencies Aging
