In [None]:
from typing import Optional
import pandas as pd

from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_openai_fn_runnable,
    create_structured_output_chain,
    create_structured_output_runnable,
)
from langchain.chat_models import ChatOpenAI

from langchain.prompts import ChatPromptTemplate
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

In [None]:
# Restore df_cc and input_desc from IPython's %store persistence.
# NOTE(review): both must have been saved with `%store` beforehand (presumably by an
# upstream notebook — confirm which one); on a fresh machine these lines restore nothing.
%store -r df_cc
%store -r input_desc

In [None]:
# Preview the first three rows of input_desc.
input_desc.head(3)

In [None]:
# Preview the first three rows of df_cc.
df_cc.head(3)

In [None]:
# Restrict to rows whose "language" value is "en"; other languages are left out for now.
df_en = df_cc.loc[df_cc["language"].eq("en")]

In [None]:
# Number of rows left after the English-only filter.
len(df_en)

In [None]:
# Inspect the row at position 1 (the second row) of the English subset.
df_en.iloc[1]

In [None]:
# Chat model used by the classification chain below.
# temperature=0 is passed to the constructor (presumably to keep the predicted
# categories as repeatable as possible — confirm intent).
llm = ChatOpenAI(
    model="gpt-3.5-turbo-1106", temperature=0
)  # this model variant was chosen for its longer context length

In [None]:
# Candidate category labels: the "input_name" column of input_desc as a plain list.
# The list is interpolated into the JSON schema description in the next cell.
input_names = input_desc["input_name"].tolist()

In [None]:
# Schema of the structured answer requested from the model: one required
# string field, "category". The allowed labels are embedded in the field's
# description as the Python list repr of input_names.
# NOTE(review): "category" is typed as a single string, while the prompt allows up to
# four classes separated by semicolons — the value may therefore hold several labels.
category_property = {
    "title": "Company's Category",
    "description": f"The companies predicted category, it should be part one of the following values: {input_names} ",
    "type": "string",
}

json_schema = {
    "title": "Company",
    "description": "Identifying information about activities of a company based on its description.",
    "type": "object",
    "properties": {"category": category_property},
    "required": ["category"],
}

# Three-message chat prompt: a system role, the task carrying the company
# description, and a formatting hint. {input} is the only template variable.
# NOTE(review): the task text announces "these categories of the value chain:" and then
# inserts {input}; if {input} is the company description, the category list reaches the
# model only through the schema description — confirm this wording is intended.
system_message = (
    "system",
    "You are an supply chain expert that classifies semiconductor companies based on their activities.",
)
task_message = (
    "human",
    "Here is the description of a semiconductor company. Classify the firm based on its activity into one of these categories of the value chain: {input}",
)
format_hint_message = (
    "human",
    "Tip: Make sure to answer in the correct format, only return the names of your predicted class out of list before. A firm can be part of up to 4 classes at the same time, seperated with semicolon; Keep in mind that big firms can make multiple steps while small firms probably specialize on just one step.",
)

prompt = ChatPromptTemplate.from_messages(
    [system_message, task_message, format_hint_message]
)

In [None]:
# Create a new runnable for classification from the schema, the chat model and the prompt.
# It is invoked with {"input": <description>}; run() below reads the "category" key
# of whatever it returns.
runnable = create_structured_output_runnable(json_schema, llm, prompt)

In [None]:
# function for running and saving results

# Initialize an empty list to store the results outside the function, so that
# several partial runs accumulate into the same list.
# Re-executing this cell resets the list.
results = []


def run(start_index, end_index, df=None):
    """Classify the rows at positions ``start_index`` .. ``end_index - 1``.

    For every position the row's ``extr_text`` is sent to the module-level
    ``runnable`` and the pair ``(provider_name, category)`` is appended to the
    module-level ``results`` list. Nothing is returned.

    Args:
        start_index: First row position to process (inclusive).
        end_index: Row position to stop at (exclusive). A value past the end
            of the frame is clamped to the number of rows, so the final batch
            may overshoot without raising ``IndexError`` half-way through.
        df: Frame providing the ``provider_name`` and ``extr_text`` columns.
            Defaults to the module-level ``df_cc`` (the previous behaviour).

    NOTE(review): ``df_en`` (the English-only subset) is built earlier in the
    notebook but was never used here; pass ``df=df_en`` if only English
    descriptions should be classified. Positions refer to the frame passed in.
    """
    source = df_cc if df is None else df

    # Convert each column once instead of rebuilding the full list on every iteration.
    names = source["provider_name"].to_list()
    descriptions = source["extr_text"].to_list()

    stop = min(end_index, len(names))
    for i in range(start_index, stop):
        # Run the model prediction using the description
        category_dict = runnable.invoke({"input": descriptions[i]})
        # Default to 'Unknown' if the 'category' key is not in the answer
        category = category_dict.get("category", "Unknown")
        # Append the company name and its predicted category to the results list
        results.append((names[i], category))

In [None]:
# Run the function for the first set of indices (row positions 0 and 1).
# Each processed row triggers one runnable.invoke call; results are appended to `results`.

#### ACTION #### here is where the action happens

run(0, 2)

In [None]:
# Once all desired partial jobs are done, convert the results list to a DataFrame.
# Each tuple is (provider_name, category); the name column is labelled "georgetown_name"
# (presumably to line up with the same-named column of df1 — confirm).
results_df = pd.DataFrame(results, columns=["georgetown_name", "predicted_class"])

In [None]:
# Initialize MultiLabelBinarizer (globally); evaluate_model_metrics reads mlb.classes_.
from sklearn.preprocessing import MultiLabelBinarizer

# Specify class columns, excluding non-class columns:
# every df1 column other than the three listed below is treated as a class label.
# NOTE(review): df1 is not defined in any cell visible here — it must already exist
# in the kernel, otherwise this cell fails after Restart & Run All.
class_columns = df1.columns.difference(
    ["georgetown_name", "firm_descript", "firm_descript_processed"]
)
mlb = MultiLabelBinarizer()

# Fit the MultiLabelBinarizer with the class labels
# (a single "sample" holding all class column names, so classes_ is that label set).
mlb.fit([class_columns])

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np


def evaluate_model_metrics(df1, results_df):
    """Return micro-averaged precision, recall and F1 for the predictions.

    ``df1`` and ``results_df`` are handed to ``evaluate_predictions`` (defined
    outside this cell), which must return a frame with ``real_classes`` and
    ``predicted_classes`` columns. Every row of that frame is converted into
    two 0/1 indicator vectors with one slot per entry of the module-level
    ``mlb.classes_``; the scores are then computed over all rows at once.

    Args:
        df1: Passed unchanged to ``evaluate_predictions``.
        results_df: Passed unchanged to ``evaluate_predictions``.

    Returns:
        Tuple ``(precision, recall, f1)``, each computed with ``average="micro"``.

    NOTE(review): membership is tested with ``in``. If the two columns hold
    plain strings rather than collections of labels, that is a substring
    match — confirm what ``evaluate_predictions`` produces.
    """
    merged = evaluate_predictions(df1, results_df)

    def indicator(assigned):
        # 1 where the class label is among the assigned classes, 0 otherwise
        return [int(label in assigned) for label in mlb.classes_]

    rows = [row for _, row in merged.iterrows()]
    all_y_true = np.array([indicator(row["real_classes"]) for row in rows])
    all_y_pred = np.array([indicator(row["predicted_classes"]) for row in rows])

    # Same order as the returned tuple: precision, recall, F1.
    return tuple(
        score(all_y_true, all_y_pred, average="micro")
        for score in (precision_score, recall_score, f1_score)
    )


# Usage example: score the predictions in results_df against df1 and print each
# metric rounded to two decimals.
# NOTE(review): relies on df1 and evaluate_predictions already existing in the kernel;
# neither is defined in the cells visible here.
precision, recall, f1 = evaluate_model_metrics(df1, results_df)
print(
    f"Precision: {round(precision, 2)} \nRecall: {round(recall, 2)} \nF1 Score: {round(f1, 2)}"
)