In [29]:
from openai import OpenAI
from dotenv import load_dotenv
import pandas as pd
import os
import json
from ast import literal_eval


# Load environment variables from the .env file so that later os.getenv lookups
# (such as the OPENAI_API_KEY read when the client is built) can find them
load_dotenv()

True

In [30]:
# Load the labelled train and test splits into pandas dataframes
train_df = pd.read_csv("data/airline_train.csv")
test_df = pd.read_csv("data/airline_test.csv")

# The "airlines" column holds the string representation of a list; parse it back into a real list
train_df["airlines"] = train_df["airlines"].apply(literal_eval)
test_df["airlines"] = test_df["airlines"].apply(literal_eval)

# No traditional ML model is trained in this notebook, so there is no need to keep the splits apart.
# Stacking the train rows on top of the test rows gives a larger, better evaluation set.
train_test_df = pd.concat([train_df, test_df], ignore_index=True)

In [31]:
# Build the API client; the key is read from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

In [32]:
def get_openai_ner_response(client: OpenAI, tweet_text: str, model="gpt-3.5-turbo") -> str:
    """Ask a chat model which airlines are mentioned in a single tweet.

    A fixed system prompt and a few-shot user prompt (with ``tweet_text``
    interpolated into it) are sent through ``client.chat.completions.create``
    with ``temperature=0``.

    Args:
        client: Client object used to issue the chat-completion request.
        tweet_text: Text of the tweet to analyse.
        model: Name of the chat model to query.

    Returns:
        The ``message`` object of the first completion choice; the model's
        reply text is read from its ``.content`` attribute.
        NOTE(review): the ``-> str`` annotation does not match what is
        returned; it is left as-is here so the signature stays unchanged.
    """
    # Instructions describing the extraction task (system role)
    ner_instructions = """You are an expert NER model. Your task is to read a tweet and identify any airline
                    companies mentioned. Return a JSON object with a list of identified airline names.
                    If no airline names are found, return an empty list. Do not include other types of entities.
                    Consider common abbreviations (e.g., 'AA' for American Airlines) as well, if relevant."""

    # Few-shot examples followed by the tweet to process (user role)
    few_shot_request = f"""
    Below are examples of tweets and the corresponding JSON output.

    Example 1:
    Tweet: "Had a terrible experience with United yesterday!"
    Output: {{"airlines": ["United Airlines"]}}

    Example 2:
    Tweet: "I just booked a flight on Southwest, I hope it goes well."
    Output: {{"airlines": ["Southwest Airlines"]}}

    Example 3:
    Tweet: "AA lost my luggage again."
    Output: {{"airlines": ["American Airlines"]}}

    Example 4:
    Tweet: "No airline can beat Emirates in terms of luxury."
    Output: {{"airlines": ["Emirates"]}}

    ---
    Now, follow the exact same format and process the tweet below.
    Tweet: "{tweet_text}"

    Remember:
    1. Only list airline names if they are explicitly or implicitly mentioned.
    2. Consider common abbreviations or short forms.
    3. Return the output as a JSON object with the key 'airlines' pointing to a list of strings.
    4. If no airline name is mentioned, return {{"airlines": []}}.
    Remember to include the full airline name, not just the abbreviation.
    Some airlines may be misspelled in the tweet, but you should still include them in the output with the correct name."""

    chat_messages = [
        {"role": "system", "content": ner_instructions},
        {"role": "user", "content": few_shot_request},
    ]

    # temperature=0 keeps the output as deterministic as possible
    chat_completion = client.chat.completions.create(
        model=model,
        messages=chat_messages,
        temperature=0,
    )

    first_choice = chat_completion.choices[0]
    return first_choice.message


In [33]:
# Smoke-test the single-tweet helper on one row of the combined dataset
tweet = train_test_df.loc[3, "tweet"]
print(tweet)

response = get_openai_ner_response(client, tweet)
print(response.content)

Southwest Airlines My bags are on the way to Chicago, without me! Help! I was confirmed for 2 flights and told there isn't room and I'm screwed.
{
    "airlines": ["Southwest Airlines"]
}


In [34]:
# Batch process the tweets for a whole dataframe
def get_openai_ner_response_batch(client: OpenAI, df: pd.DataFrame, model="gpt-3.5-turbo") -> list:
    """Extract airline names for every tweet in ``df["tweet"]``.

    Each tweet is sent to ``get_openai_ner_response`` one at a time and the
    reply's ``.content`` is parsed as JSON with an ``"airlines"`` key.

    Args:
        client: Client object forwarded to ``get_openai_ner_response``.
        df: Dataframe with a ``"tweet"`` column.
        model: Name of the chat model to query.

    Returns:
        One list per tweet, in the order of ``df["tweet"]``. Each list holds
        the de-duplicated airline names in the order the model returned them.
        A tweet whose request or parsing fails yields an empty list (the error
        is printed), so every element of the result has the same type.
    """
    responses = []
    total = len(df)
    for position, tweet_text in enumerate(df["tweet"], start=1):
        print(f"Processing tweet {position}/{total}")
        try:
            response = get_openai_ner_response(client, tweet_text, model)
            airlines = json.loads(response.content)["airlines"]
            if isinstance(airlines, str):
                # A bare string would otherwise be split into single characters below.
                airlines = [airlines]
            elif not isinstance(airlines, list):
                raise TypeError(
                    f"expected a list under 'airlines', got {type(airlines).__name__}"
                )
            # dict.fromkeys drops duplicates like set() does, but keeps a
            # deterministic first-seen order.
            responses.append(list(dict.fromkeys(airlines)))
        except Exception as e:
            # Best effort: keep one entry per tweet so the result still lines up
            # with the dataframe. An empty list (rather than "") keeps the column
            # type consistent and survives a CSV round trip as "[]".
            responses.append([])
            print(f"Error processing tweet: {e}")
    return responses

In [None]:
# Run NER extraction over the dataframe.
# Only the test set is processed here because the API is rate limited.
# This adds an "airlines_predicted" column to test_df in place, with one entry per tweet.
# NOTE(review): nothing in this cell writes the predictions to disk, while a later cell
# reads "results/results_gpt3_5.csv" -- confirm where that file is produced.
test_df["airlines_predicted"] = get_openai_ner_response_batch(client, test_df)

In [26]:
def _to_entity_set(value) -> set:
    """Normalise one cell of an entity column into a set of entity names."""
    if isinstance(value, str):
        text = value.strip()
        if not text:
            return set()
        try:
            # Cells loaded from CSV hold the string form of a list, e.g. "['United Airlines']".
            value = literal_eval(text)
        except (ValueError, SyntaxError):
            # Not a Python literal: treat the raw text as one entity name.
            return {text}
        if isinstance(value, str):
            return {value}
    # Missing cells (None / NaN / pd.NA) count as "no entities".
    if value is None or value is pd.NA or (isinstance(value, float) and pd.isna(value)):
        return set()
    return set(value)


def evaluate_prompt_responses(df):
    """Print row-level accuracy, false-positive and false-negative rates.

    The ``"airlines"`` (labels) and ``"airlines_predicted"`` (predictions)
    columns are compared row by row as sets of entity names. Cells may be real
    lists or the string form of a list as read back from CSV; passing a string
    straight to ``set()`` would compare sets of characters instead of names.

    Metrics, each a fraction of the rows in ``df``:
        * Accuracy: rows where the predicted set equals the label set.
        * False Positive Rate: rows with at least one predicted entity that is
          not in the labels.
        * False Negative Rate: rows with at least one labelled entity that is
          not in the predictions.

    Args:
        df: Dataframe with ``"airlines"`` and ``"airlines_predicted"`` columns.

    Returns:
        None. The three metrics are printed with two decimals; an empty
        dataframe prints a notice instead of dividing by zero.
    """
    # Plain lists are iterated positionally, so the result does not depend on
    # the dataframe's index labels.
    labels = [_to_entity_set(cell) for cell in df["airlines"]]
    predictions = [_to_entity_set(cell) for cell in df["airlines_predicted"]]

    total = len(labels)
    if total == 0:
        print("No rows to evaluate.")
        return

    pairs = list(zip(labels, predictions))
    exact_matches = sum(label == predicted for label, predicted in pairs)
    rows_with_extra = sum(bool(predicted - label) for label, predicted in pairs)
    rows_with_missing = sum(bool(label - predicted) for label, predicted in pairs)

    accuracy = "{:.2f}".format(exact_matches / total)
    false_positive_rate = "{:.2f}".format(rows_with_extra / total)
    false_negative_rate = "{:.2f}".format(rows_with_missing / total)

    print(f"Accuracy: {accuracy}")
    print(f"False Positive Rate: {false_positive_rate}")
    print(f"False Negative Rate: {false_negative_rate}")

In [27]:
def _parse_airline_list(cell):
    """Turn one raw CSV cell into a list of airline names (empty cell -> empty list)."""
    text = cell.strip()
    if not text:
        return []
    try:
        return literal_eval(text)
    except (ValueError, SyntaxError):
        # Not a Python literal: keep the raw text as a single name rather than failing the load.
        return [text]


# Parse both entity columns back into real lists on load, mirroring what is done for
# the train/test CSVs. Without this the evaluator receives strings such as
# "['United Airlines']" and would compare sets of characters.
# NOTE(review): assumes the results CSV stores these columns as the string form of a list -- confirm.
results_df = pd.read_csv(
    "results/results_gpt3_5.csv",
    converters={
        "airlines": _parse_airline_list,
        "airlines_predicted": _parse_airline_list,
    },
)
evaluate_prompt_responses(results_df)

Accuracy: 0.91
False Positive Rate: 0.05
False Negative Rate: 0.05


*Evaluation Metrics:*

- **Accuracy:** 0.91 -- For 91 out of every 100 tweets, the predicted set of airlines exactly matched the labeled set.

- **False Positive Rate:** 0.05 -- For 5 out of every 100 tweets, the model predicted at least one airline name that was not in the labels.

- **False Negative Rate:** 0.05 -- For 5 out of every 100 tweets, the model missed at least one airline name that was in the labels.

In [None]:
# Predict airlines for the test set with the gpt-4o-mini model, working on a copy of test_df
gpt4_test_df = test_df.copy()
gpt4_test_df["airlines_predicted"] = get_openai_ner_response_batch(
    client,
    test_df,
    "gpt-4o-mini-2024-07-18",
)

# Persist the predictions alongside the labels as a CSV file
gpt4_test_df.to_csv("results/results_gpt4o.csv", index=False)

### If you get rate limited with a 429 error, you can try again after waiting.