# LLM: try OpenAI's gpt models

Here we repeat the classification task, this time using OpenAI's GPT-4 model through the OpenAI API, with function calling via LangChain.

In [None]:
from typing import Optional
import pandas as pd

from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_openai_fn_runnable,
    create_structured_output_chain,
    create_structured_output_runnable,
)
from langchain.chat_models import ChatOpenAI

from langchain.prompts import ChatPromptTemplate
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

In [None]:
%store -r df1 # from classification.ipynb
%store -r df # from consolidate_classes.ipynb

In [None]:
# Chat model used for every classification call in this notebook.
# temperature=0 is presumably chosen to keep answers as repeatable as possible
# between runs — TODO confirm.
llm = ChatOpenAI(model="gpt-4", temperature=0)

In [None]:
# JSON schema for the structured answer requested from the model: an object
# with a single required string property, "category".
# NOTE(review): the property description asks for one of the listed values,
# while the tip message in the prompt below allows up to two classes joined by
# a semicolon — confirm which of the two is intended.
json_schema = {
    "title": "Company",
    "description": "Identifying information about activities of a company based on its description.",
    "type": "object",
    "properties": {
        "category": {
            "title": "Company's Category",
            "description": "The companies predicted category, it should be part one of the following values: ['Assembly, Packaging & Interconnects','Lithography, Photomasks & Imaging', 'Logic Chip Design & Software','Material & Wafer Fabrication', 'Planarization, Inspection & Metrology'] ",
            "type": "string",
        },
    },
    "required": ["category"],
}

# Three-message chat prompt: a system role description, the task message whose
# {input} placeholder receives the company description, and a formatting tip
# that repeats the allowed class names.
# NOTE(review): the message texts contain typos ("an supply chain",
# "seperated") and the task message ends with "categories of the value chain:"
# right before the description is inserted. The strings are left untouched
# here because changing them changes what the model sees and would make new
# runs incomparable with the results recorded at the end of this notebook.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an supply chain expert that classifies semiconductor companies based on their activities.",
        ),
        (
            "human",
            "Here is the description of a semiconductor company. Classify the firm based on its activity into one of these categories of the value chain: {input}",
        ),
        (
            "human",
            "Tip: Make sure to answer in the correct format, only return these values: ['Assembly, Packaging & Interconnects','Lithography, Photomasks & Imaging', 'Logic Chip Design & Software','Material & Wafer Fabrication', 'Planarization, Inspection & Metrology'] A firm can be part of up to 2 classes at the same time, seperated with semicolon;.",
        ),
    ]
)

In [None]:
# Create a new runnable for classification from the schema, the chat model and
# the prompt defined above.
# NOTE(review): presumably each invoke() returns a dict carrying a "category"
# key (run() reads it with .get) — confirm against the installed LangChain
# version.
runnable = create_structured_output_runnable(json_schema, llm, prompt)

In [None]:
# function for running and saving results

# Module-level accumulator so that several partial jobs (different index
# ranges passed to run()) all append to the same list.
results = []


def run(start_index, end_index):
    """Classify the companies at positions ``start_index`` .. ``end_index - 1``.

    For each position the firm description from ``df1["firm_descript"]`` is
    sent to ``runnable`` and the pair ``(company name, predicted category)``
    is appended to the module-level ``results`` list.

    Args:
        start_index: First positional index into ``df1`` (inclusive).
        end_index: Last positional index into ``df1`` (exclusive).

    Raises:
        IndexError: If an index in the range lies outside ``df1``.
    """
    # Materialize both columns once instead of rebuilding the full lists on
    # every iteration.
    names = df1["georgetown_name"].to_list()
    descriptions = df1["firm_descript"].to_list()

    for i in range(start_index, end_index):
        # Run the model prediction using the description at position i
        category_dict = runnable.invoke({"input": descriptions[i]})
        # Default to 'Unknown' if the 'category' key is missing from the answer
        category = category_dict.get("category", "Unknown")
        # Store the company name together with its predicted category
        results.append((names[i], category))

In [None]:
# Run the classification for every row of df1 (pass a narrower index range to
# run() to process the data in several partial jobs instead).

#### ACTION #### here is where the action happens: one model invocation per row

run(0, len(df1))

In [None]:
# Once all desired partial jobs are done, convert the results list to a DataFrame
# (one row per (name, category) tuple appended by run()).
results_df = pd.DataFrame(results, columns=["georgetown_name", "predicted_class"])

In [None]:
# manually inspect values

from sklearn.preprocessing import MultiLabelBinarizer


def _split_labels(raw):
    """Split a semicolon-separated label string into clean class names.

    Surrounding whitespace is stripped and empty fragments are dropped, so
    answers such as ``"A; B"`` or ``"A;"`` still match the ground-truth names.
    """
    return [part.strip() for part in raw.split(";") if part.strip()]


def evaluate_predictions(df1, results_df):
    """Join ground-truth and predicted classes per company for inspection.

    Args:
        df1: DataFrame with a ``georgetown_name`` column and one indicator
            column per class. Every column other than ``georgetown_name``,
            ``firm_descript`` and ``firm_descript_processed`` is treated as a
            class column. The indicator columns are assumed to hold only 0/1
            values — TODO confirm; binarize them beforehand if they can hold
            counts.
        results_df: DataFrame with ``georgetown_name`` and ``predicted_class``
            (class names separated by ``;``). A list-valued
            ``predicted_classes`` column is added to this frame in place.

    Returns:
        DataFrame with the columns ``georgetown_name``, ``real_classes``,
        ``predicted_classes``, ``correct`` (label sets are equal) and
        ``partially_correct`` (label sets share at least one class).
    """
    # Work on a copy so the ground-truth frame is not modified
    df_temp = df1.copy()

    # Specify class columns, excluding non-class columns
    class_columns = df_temp.columns.difference(
        ["georgetown_name", "firm_descript", "firm_descript_processed"]
    )

    # Fit the binarizer on the class names only, so it can map indicator rows
    # back to names.
    # NOTE(review): this relies on class_columns and mlb.classes_ being in the
    # same (sorted) order — confirm if the column selection ever changes.
    mlb = MultiLabelBinarizer()
    mlb.fit([class_columns])

    # Inverse transform the binary matrix to get class labels
    gt_classes = mlb.inverse_transform(df_temp[class_columns].values)

    # Add the real class labels back to df_temp
    df_temp["real_classes"] = [list(classes) for classes in gt_classes]

    # Turn the raw model answer into a list of cleaned class names
    results_df["predicted_classes"] = results_df["predicted_class"].apply(
        _split_labels
    )

    # Join the DataFrames on 'georgetown_name'
    joined_df = pd.merge(
        df_temp[["georgetown_name", "real_classes"]],
        results_df[["georgetown_name", "predicted_classes"]],
        on="georgetown_name",
        how="inner",
    )

    # Determine complete and partial correctness
    joined_df["correct"] = joined_df.apply(
        lambda row: set(row["real_classes"]) == set(row["predicted_classes"]), axis=1
    )
    joined_df["partially_correct"] = joined_df.apply(
        lambda row: bool(set(row["real_classes"]) & set(row["predicted_classes"])),
        axis=1,
    )

    # Return the joined DataFrame for inspection
    return joined_df


# Usage example: build the inspection table once at notebook level; the
# summary statistics cell further down reads `joined_df` directly.
joined_df = evaluate_predictions(df1, results_df)

In [None]:
# Preview the first rows of the joined table for a quick manual check.
joined_df.head()

In [None]:
# Initialize MultiLabelBinarizer (globally)
# The metric functions in the following cells read `mlb.classes_` to build
# their per-class indicator vectors.

# Specify class columns, excluding non-class columns
class_columns = df1.columns.difference(
    ["georgetown_name", "firm_descript", "firm_descript_processed"]
)
mlb = MultiLabelBinarizer()

# Fit the MultiLabelBinarizer with the class labels
# (a single "sample" containing every class column name, so that all of them
# end up in mlb.classes_)
mlb.fit([class_columns])

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd


def evaluate_predictions_with_metrics(df1, results_df):
    """Return the joined inspection table extended with per-company metrics.

    Each row's real and predicted classes are converted to 0/1 vectors over
    the module-level ``mlb.classes_`` and scored with binary precision,
    recall and F1.

    Args:
        df1: Ground-truth DataFrame, passed through to
            ``evaluate_predictions``.
        results_df: Predictions DataFrame, passed through to
            ``evaluate_predictions``.

    Returns:
        The DataFrame from ``evaluate_predictions`` with three extra columns:
        ``precision``, ``recall`` and ``f1``.
    """
    # Use the existing evaluate_predictions function to get the joined DataFrame
    joined_df = evaluate_predictions(df1, results_df)

    precision_list = []
    recall_list = []
    f1_list = []

    for _, row in joined_df.iterrows():
        real = set(row["real_classes"])
        predicted = set(row["predicted_classes"])

        # Convert real and predicted classes to binary format for sklearn metrics
        y_true = [1 if cls in real else 0 for cls in mlb.classes_]
        y_pred = [1 if cls in predicted else 0 for cls in mlb.classes_]

        # A single row often has no predicted (or no true) positives, which
        # makes a metric undefined. zero_division=0 scores those cases as 0
        # explicitly instead of emitting a warning for every such row.
        precision_list.append(
            precision_score(y_true, y_pred, average="binary", zero_division=0)
        )
        recall_list.append(
            recall_score(y_true, y_pred, average="binary", zero_division=0)
        )
        f1_list.append(f1_score(y_true, y_pred, average="binary", zero_division=0))

    # Add the metrics to the joined_df
    joined_df["precision"] = precision_list
    joined_df["recall"] = recall_list
    joined_df["f1"] = f1_list

    # Return the updated DataFrame
    return joined_df


# Usage example
# evaluation_df = evaluate_predictions_with_metrics(df1, results_df)
# evaluation_df.head()

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np


def evaluate_model_metrics(df1, results_df):
    """Compute micro-averaged precision, recall and F1 over all companies.

    The joined table from ``evaluate_predictions`` is turned into two 0/1
    matrices (one row per company, one column per entry of the module-level
    ``mlb.classes_``), which are then scored as a whole.

    Args:
        df1: Ground-truth DataFrame, passed through to
            ``evaluate_predictions``.
        results_df: Predictions DataFrame, passed through to
            ``evaluate_predictions``.

    Returns:
        Tuple ``(precision, recall, f1)``.
    """
    table = evaluate_predictions(df1, results_df)
    known_classes = mlb.classes_

    def to_indicator(labels):
        # 1 where the class is among the given labels, 0 otherwise
        return [1 if cls in labels else 0 for cls in known_classes]

    # One indicator row per company, stacked into 2D arrays
    all_y_true = np.array([to_indicator(labels) for labels in table["real_classes"]])
    all_y_pred = np.array(
        [to_indicator(labels) for labels in table["predicted_classes"]]
    )

    # Same averaging mode for all three global scores
    precision, recall, f1 = (
        scorer(all_y_true, all_y_pred, average="micro")
        for scorer in (precision_score, recall_score, f1_score)
    )

    return precision, recall, f1


# Usage example: compute the global (micro-averaged) scores and print them
# rounded to two decimals.
precision, recall, f1 = evaluate_model_metrics(df1, results_df)
print(
    f"Precision: {round(precision, 2)} \nRecall: {round(recall, 2)} \nF1 Score: {round(f1, 2)}"
)

### Output Notes
So here we are using `average='micro'` because it is better suited to our problem with class imbalance, whereas the 'macro' argument would count all classes as equal, thereby not taking imbalances into account. In fact, in our example changing the argument to 'macro' would even give an error.

Also, what is a true positive? In this case every binary prediction is evaluated independently. That is, there is no notion of 'partial correctness' as in the column of the manual inspection before. So if the model predicts 2 classes for one company and only one of them is correct, the two predictions are simply counted independently of each other.

In [None]:
# Summary of the manual-inspection flags in `joined_df`: how many companies
# were classified completely correctly, partially correctly, or not at all.

# Calculate the counts of completely correct predictions
complete_correct_counts = joined_df["correct"].value_counts()

# Calculate the counts of partially correct predictions
# For partially correct, consider only those instances that are not completely correct
partially_correct_counts = joined_df.loc[
    ~joined_df["correct"], "partially_correct"
].value_counts()

# Read every count with .get(True, 0): value_counts() omits values that never
# occur, so plain [True] indexing raises KeyError when a run has no hits.
n_total = len(joined_df)
n_complete = complete_correct_counts.get(True, 0)
n_partial = partially_correct_counts.get(True, 0)

# Print counts and percentages
print("Completely Correct Predictions:")
print(complete_correct_counts)
print(
    f"Percentage of Completely Correct Predictions: {100 * n_complete / n_total:.2f}%"
)

print("\nPartially Correct Predictions (excluding completely correct ones):")
print(partially_correct_counts)
print(
    f"Percentage of Partially Correct Predictions: {100 * n_partial / n_total:.2f}%"
)

# Calculate and print the percentage of completely incorrect predictions
# These are instances that are not partially correct and not completely correct
completely_incorrect_percentage = 100 * (n_total - n_complete - n_partial) / n_total
print(
    f"\nCompletely Incorrect Predictions Percentage: {completely_incorrect_percentage:.2f}%"
)

# calculate the partial correct in total
print(
    f"\nAt least one correct prediction: {100 - completely_incorrect_percentage:.2f}%"
)

### Results for gpt-3.5-turbo:
Completely Correct Predictions:
correct
False    96
True     44
Name: count, dtype: int64
Percentage of Completely Correct Predictions: 31.43%

Partially Correct Predictions (excluding completely correct ones):
partially_correct
True     63
False    33
Name: count, dtype: int64
Percentage of Partially Correct Predictions: 45.00%

Completely Incorrect Predictions Percentage: 23.57%

At least one correct prediction: 76.43%

#### metrics gpt-3.5-turbo:

time: 2m30s

cost: 0.20 USD

Precision: 0.54 

Recall: 0.72 

F1 Score: 0.62



### Results for gpt-4:

Completely Correct Predictions:
correct
False    72
True     68
Name: count, dtype: int64
Percentage of Completely Correct Predictions: 48.57%

Partially Correct Predictions (excluding completely correct ones):
partially_correct
False    44
True     28
Name: count, dtype: int64
Percentage of Partially Correct Predictions: 20.00%

Completely Incorrect Predictions Percentage: 31.43%

At least one correct prediction: 68.57%


#### metrics gpt-4:
time: 11m30s

cost: 2.00 USD

Precision: 0.68 

Recall: 0.63 

F1 Score: 0.65