# Custom Chatbot Project

TODO: In this cell, explain which dataset you have chosen and why it is appropriate for this task.

## Prep

In [1]:

import pandas as pd
import requests
import tiktoken
from dotenv import load_dotenv
from openai import OpenAI
from scipy.spatial.distance import cosine

# Must run before the OpenAI client is created further down in this cell.
# NOTE(review): presumably this loads the API key from a local .env file into
# the environment — confirm the key name the client expects.
load_dotenv()

# Model used by answer_question for text completions.
COMPLETION_MODEL_NAME = "gpt-3.5-turbo-instruct"
# Model used for both the dataset rows and the question embeddings.
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
# tiktoken encoding name used by create_prompt to count tokens.
TOKENIZER_ENCODING = "cl100k_base"
# Default `max_tokens` passed to the completion request in answer_question.
MAX_TOKENS = 2048

# Demonstration questions asked both with and without the custom context.
QUESTION_1 = "Who is the judge presiding over the 'The People of the State of New York v. Donald J. Trump' trial?"
QUESTION_2 = "What are the indictments filed against Donald J. Trump?"

# Shared client used by every embeddings and completions call in this notebook.
client = OpenAI()

## Data Wrangling

TODO: In the cells below, load your chosen dataset into a `pandas` dataframe with a column named `"text"`. This column should contain all of your text data, separated into at least 20 rows.

In [2]:
# Titles (URL form) of the Wikipedia articles that make up the dataset.
wikipedia_articles = ["E._Jean_Carroll_v._Donald_J._Trump", "Stormy_Daniels–Donald_Trump_scandal",
                      "Prosecution_of_Donald_Trump_in_New_York", "Indictments_against_Donald_Trump"]

# MediaWiki query that returns one article's extract as plain text inside a JSON document.
wikipedia_api_url_template = "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exlimit=1&titles={article_name}&explaintext=1&formatversion=2&format=json"

wikipedia_urls = [wikipedia_api_url_template.format(article_name=article_name) for article_name in wikipedia_articles]

# Give up on an unresponsive server instead of hanging the kernel forever.
WIKIPEDIA_REQUEST_TIMEOUT_SECONDS = 30

# Collect the lines of every article first and build the dataframe once;
# growing a dataframe with pd.concat inside the loop re-copies all earlier rows
# on every iteration.
article_lines = []
for wikipedia_url in wikipedia_urls:
    res = requests.get(wikipedia_url, timeout=WIKIPEDIA_REQUEST_TIMEOUT_SECONDS)
    # Surface HTTP failures here rather than as an opaque KeyError/JSON error below.
    res.raise_for_status()
    article_lines.extend(res.json()["query"]["pages"][0]["extract"].strip().split("\n"))

df = pd.DataFrame({"text": article_lines})

# Drop blank lines and "== Section ==" headings. The original row labels are
# kept (hence the gaps in the index); .copy() makes the result an independent
# frame so later cells can add columns without a SettingWithCopyWarning.
df = df[(df["text"].str.len() > 0) & (~df["text"].str.startswith("=="))].copy()
df

Unnamed: 0,text
0,E. Jean Carroll v. Donald J. Trump is the name...
1,"In November 2022, Carroll filed her second sui..."
2,"In September 2023, Kaplan issued a partial sum..."
7,"On June 21, 2019, E. Jean Carroll published an..."
8,The allegations were made during the Trump adm...
...,...
512,Trump was indicted on state charges in a March...
517,Trump was indicted in June 2023 in the United ...
522,Trump was indicted in August 2023 in the Unite...
527,Trump was indicted on state charges in an Augu...


## Custom Query Completion

TODO: In the cells below, compose a custom query using your chosen dataset and retrieve results from an OpenAI `Completion` model. You may copy and paste any useful code from the course materials.

In [3]:
# Number of text rows sent to the embeddings endpoint per request.
batch_size = 100
embeddings = []
for i in range(0, len(df), batch_size):
    # Send one positional slice of the text column to the embedding model
    response = client.embeddings.create(
        input=df["text"].iloc[i:i + batch_size].tolist(),
        model=EMBEDDING_MODEL_NAME
    )

    # Keep one embedding vector per input row, in request order
    embeddings.extend([data.embedding for data in response.data])

# Attach the vectors as a new column on a fresh frame instead of writing into
# `df` in place: `df` comes from a boolean filter in the previous cell, and
# setting a column on such a slice emits SettingWithCopyWarning.
# The list is matched to rows by position, so its length must equal len(df).
df = df.assign(embeddings=embeddings)
df

Unnamed: 0,text,embeddings
0,E. Jean Carroll v. Donald J. Trump is the name...,"[-0.014774243347346783, -0.0011845094850286841..."
1,"In November 2022, Carroll filed her second sui...","[-0.02475082129240036, -0.013566769659519196, ..."
2,"In September 2023, Kaplan issued a partial sum...","[-0.017187822610139847, 0.001904560485854745, ..."
7,"On June 21, 2019, E. Jean Carroll published an...","[-0.025747627019882202, -0.008002640679478645,..."
8,The allegations were made during the Trump adm...,"[-0.036374982446432114, 0.0016292688669636846,..."
...,...,...
512,Trump was indicted on state charges in a March...,"[-0.03097999095916748, -0.02709445357322693, -..."
517,Trump was indicted in June 2023 in the United ...,"[-0.022944077849388123, -0.029442088678479195,..."
522,Trump was indicted in August 2023 in the Unite...,"[-0.037238024175167084, -0.018452312797307968,..."
527,Trump was indicted on state charges in an Augu...,"[-0.021089570596814156, -0.026820292696356773,..."


In [4]:


def get_question_embeddings(question: str) -> list[float]:
    """
    Embed a single question with the notebook's embedding model.
    :param question: The question text to embed.
    :return: The embedding vector of the first (only) item in the API response.
    """
    embedding_response = client.embeddings.create(model=EMBEDDING_MODEL_NAME, input=question)
    first_item = embedding_response.data[0]
    return first_item.embedding


def get_rows_sorted_by_relevance(question: str, df: pd.DataFrame) -> pd.DataFrame:
    """
    Rank the rows of a dataframe by how relevant they are to a question.

    :param question: The question to rank the rows against.
    :param df: Dataframe with an "embeddings" column holding one embedding
        vector per row. It is not modified.
    :return: A new dataframe with an added "distances" column, sorted from
        MOST to LEAST relevant (smallest cosine distance first).
    """

    # Get embeddings for the question text
    question_embeddings = get_question_embeddings(question)

    # Cosine distance between the question and every row.
    #
    # Modified based on https://github.com/openai/openai-python/blob/075ab0349a5a7d94528bfef0fd7adc8e2baf7c5d/openai/embeddings_utils.py#L139
    # since embeddings_utils.py is no longer available in newer versions of the OpenAI SDK
    distances = df["embeddings"].apply(lambda x: cosine(question_embeddings, x))

    # assign() returns a new frame, so the caller's dataframe is left untouched;
    # shorter distance = more relevant, so we sort in ascending order.
    return df.assign(distances=distances).sort_values("distances", ascending=True)


def create_prompt(question: str, df: pd.DataFrame, max_token_count: int) -> str:
    """
    Build a context-augmented prompt for the Completion model.

    Rows of ``df`` are added to the context in order of relevance to the
    question until adding the next row would push the estimated prompt size
    past ``max_token_count``.

    :param question: The question to answer.
    :param df: Dataframe with "text" and "embeddings" columns.
    :param max_token_count: Token budget for the whole prompt (template,
        question, context rows and the separators between them).
    :return: The formatted prompt text.
    """
    # Create a tokenizer that is designed to align with our embeddings
    tokenizer = tiktoken.get_encoding(TOKENIZER_ENCODING)

    prompt_template = """
Answer the question based on the context below, and if the question
can't be answered based on the context, say "I don't know"

Context: 

{}

---

Question: {}
Answer:"""

    # Text placed between consecutive context rows; it costs tokens too.
    context_separator = "\n\n###\n\n"
    separator_token_count = len(tokenizer.encode(context_separator))

    # Start from the fixed cost of the template and the question
    current_token_count = len(tokenizer.encode(prompt_template)) + \
                          len(tokenizer.encode(question))

    context = []
    for text in get_rows_sorted_by_relevance(question, df)["text"].values:

        # Cost of this row, plus the separator that will precede it
        # whenever it is not the first row in the context
        text_token_count = len(tokenizer.encode(text))
        if context:
            text_token_count += separator_token_count

        # Stop at the first row that does not fit in the remaining budget
        if current_token_count + text_token_count > max_token_count:
            break

        current_token_count += text_token_count
        context.append(text)

    return prompt_template.format(context_separator.join(context), question)


def answer_question(prompt: str, max_tokens: int = MAX_TOKENS) -> str:
    """
    Send a prompt to the completion model and return its text answer.
    :param prompt: The full prompt text sent to the model.
    :param max_tokens: The maximum number of tokens the model may generate.
    :return: The stripped text of the first choice, or an empty string if
        anything goes wrong (the error is printed, not raised).
    """
    try:
        response = client.completions.create(
            model=COMPLETION_MODEL_NAME,
            prompt=prompt,
            max_tokens=max_tokens,
        )
        first_choice = response.choices[0]
        return first_choice.text.strip()
    except Exception as error:
        # Best effort: report the failure and let the notebook keep running
        print(error)
        return ""


## Custom Performance Demonstration

TODO: In the cells below, demonstrate the performance of your custom query using at least 2 questions. For each question, show the answer from a basic `Completion` model query as well as the answer from your custom query.

### Question 1

In [5]:

# Baseline: the bare question, with no extra context
first_prompt = f"""
Question: "{QUESTION_1}"
Answer:
"""
question_1_original_answer = answer_question(first_prompt)

# Custom query: the same question, preceded by the most relevant dataset rows
question_1_custom_prompt = create_prompt(QUESTION_1, df, 1800)
question_1_embeddings_answer = answer_question(question_1_custom_prompt, MAX_TOKENS)

# Show both answers side by side
print(f"""
Question: {QUESTION_1}
---------------------------------------------
Original Answer: {question_1_original_answer}
---------------------------------------------
Embeddings Answer: {question_1_embeddings_answer}
""")


Question: Who is the judge presiding over the 'The People of the State of New York v. Donald J. Trump' trial?
---------------------------------------------
Original Answer: There is currently no legal case with this exact name in the state of New York. It is possible that the question is referring to one of the many ongoing legal cases involving President Donald Trump, in which case the judge presiding over the trial would depend on the specific case in question. Some prominent cases involving Trump in New York include:

- People of the State of New York v. The Trump Organization, Donald J. Trump, and others, which is a civil lawsuit filed by the New York Attorney General's office alleging financial misconduct by the Trump Foundation.
- The United States v. Michael Cohen, in which Trump's former lawyer pleaded guilty to various offenses, including tax fraud and making illegal campaign contributions at Trump's direction.
- Joseph Bondy, Esq. v. Donald J. Trump and others, in which a le

### Question 2

In [6]:
# Baseline: the bare question, with no extra context
second_prompt = f"""
Question: "{QUESTION_2}"
Answer:
"""
question_2_original_answer = answer_question(second_prompt, MAX_TOKENS)

# Custom query: the same question, preceded by the most relevant dataset rows
question_2_custom_prompt = create_prompt(QUESTION_2, df, 1800)
question_2_embeddings_answer = answer_question(question_2_custom_prompt, MAX_TOKENS)

# Show both answers side by side
print(f"""
Question: {QUESTION_2}
---------------------------------------------
Original Answer: {question_2_original_answer}
---------------------------------------------
Embeddings Answer: {question_2_embeddings_answer}
""")



Question: What are the indictments filed against Donald J. Trump?
---------------------------------------------
Original Answer: As of June 2021, there are no indictments filed against former President Donald J. Trump. However, there have been multiple investigations and legal proceedings against him, including:
- Impeachment proceedings in 2019 and 2021, both of which resulted in acquittal by the Senate
- Criminal investigation by the Manhattan District Attorney's Office into financial and insurance fraud allegations
- Civil lawsuits filed by multiple women accusing him of sexual misconduct and defamation
- Investigation by the New York Attorney General into the Trump Organization's business practices, including tax and loan fraud
- Multiple lawsuits filed by Trump and his allies alleging election fraud in the 2020 Presidential election (all of which have been dismissed)
---------------------------------------------
Embeddings Answer: In total, there were four indictments filed again