# Import Libraries

In [1]:
import json
import time

from tqdm import tqdm
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, f1_score

from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage

In [None]:
import os

# Use the placeholder only when no key is configured yet, so that a real key
# already exported in the environment is never overwritten by this cell.
os.environ.setdefault("OPENAI_API_KEY", "API_KEY")

# Define constants

In [2]:
# Number of test rows to evaluate on, and the seed that makes the sample
# reproducible.
GOLDEN_FOLD_SAMPLE_SIZE = 10_000
RANDOM_STATE = 42

# Per-model settings: price in $ per 1K input/output tokens and the number of
# reviews sent to the model in a single request.
MODELS_PRICES_SPEC = {
    "gpt-4o-mini": dict(
        price_input_per_1K=0.00015,
        price_output_per_1K=0.0006,
        batch_size=30,
    ),
    "gpt-4o": dict(
        price_input_per_1K=0.0025,
        price_output_per_1K=0.01,
        batch_size=50,
    ),
}

# Load data

In [3]:
# Dataset location on the Hugging Face Hub and the CSV file of each split.
dataset_root = "hf://datasets/yassiracharki/Amazon_Reviews_Binary_for_Sentiment_Analysis/"
splits = {'train': 'train.csv', 'test': 'test.csv'}

# Read both splits, train first and test second.
df_train, df_test = (
    pd.read_csv(dataset_root + splits[split_name])
    for split_name in ("train", "test")
)

## Prepare data for test

In [None]:
# Draw a fixed-size, seeded random sample of the test split; every model is
# evaluated on this same sample.
golden_fold = df_test.sample(
    n=GOLDEN_FOLD_SAMPLE_SIZE,
    random_state=RANDOM_STATE,
)
golden_fold.head()

# Compute Predictions

In [8]:
# Instruction template sent to the model for every batch. The single `{}`
# placeholder is filled through str.format with the JSON-encoded list of
# {"id", "text"} examples, so any literal brace added to this text would have
# to be doubled. The model is asked for a JSON object mapping each id to
# 1 (positive) or 0 (negative).
prompt = """
    Classify the sentiment of each example in the following JSON array as "positive" or "negative".
    Respond only in JSON format where each ID is a key and its value is 1 for "positive" and 0 for "negative".
    
    Examples: {}
    
    Deliver the response here in plain text without any formatting.
    """

In [9]:
def clear_response(text):
    """Strip a Markdown code fence from a model reply so it can be parsed as JSON.

    Args:
        text: Raw reply text returned by the model.

    Returns:
        ``text`` unchanged when, ignoring surrounding whitespace, it does not
        start with a backtick (this includes the empty string). Otherwise the
        fenced payload without the surrounding backticks, without a leading
        ``json`` language tag, and stripped of surrounding whitespace.
    """
    stripped = text.strip()
    # An empty or unfenced reply is passed through as-is; indexing text[0]
    # here would raise IndexError on an empty reply.
    if not stripped.startswith("`"):
        return text

    payload = stripped.strip("`").lstrip()
    # Drop only a leading language tag: a global replace of "json" would also
    # corrupt any key or value in the payload that contains that substring.
    if payload[:4].lower() == "json":
        payload = payload[4:]
    return payload.strip()

In [None]:
# Outputs and run metadata collected during inference, keyed by model name.
results: dict = {}

In [None]:

# Run every configured model over the golden fold in batches and collect its
# predictions, token usage, wall-clock time, and any batches whose reply could
# not be used.
for model, spec in tqdm(MODELS_PRICES_SPEC.items(), desc="Models"):

    chat_model = ChatOpenAI(model=model, temperature=0)
    batch_size = spec["batch_size"]

    # Accumulators for the current model.
    model_results = {}              # id as returned by the model -> predicted label
    error_inputs = []               # [batch_data, input_data, error] per unusable reply
    model_results_raw = []          # cleaned reply text of each successfully parsed batch
    model_total_input_tokens = []   # input tokens reported for each request
    model_total_output_tokens = []  # output tokens reported for each request
    start_time = time.time()

    for i in tqdm(range(0, len(golden_fold), batch_size), desc="Batches"):
        # Get the current batch of data
        batch_data = golden_fold.iloc[i:i + batch_size]

        # Prepare the prompt; the DataFrame index label serves as the example id
        examples = [{"id": idx, "text": row["review_text"]} for idx, row in batch_data.iterrows()]
        input_data = prompt.format(json.dumps(examples))

        response = chat_model.invoke([HumanMessage(content=input_data)])

        # Record token usage for every request, not only for replies that
        # parse: a request whose reply is unusable has still been made, and
        # leaving it out would understate the per-row token and cost figures.
        usage = response.usage_metadata or {}
        model_total_input_tokens.append(usage.get("input_tokens", 0))
        model_total_output_tokens.append(usage.get("output_tokens", 0))

        # Clear the response text
        response_text = clear_response(response.content)
        try:
            batch_results = json.loads(response_text)
            # Valid JSON that is not an object (e.g. a list) cannot be merged
            # into the id -> label mapping; treat it like a malformed reply
            # instead of letting dict.update abort the whole run.
            if not isinstance(batch_results, dict):
                raise TypeError(
                    f"expected a JSON object, got {type(batch_results).__name__}"
                )
        except (json.JSONDecodeError, TypeError) as e:
            # Keep the failing batch so it can be inspected or retried later
            print(f"Error at batch {i} with error {e}")
            error_inputs.append([batch_data, input_data, e])
        else:
            model_results.update(batch_results)
            model_results_raw.append(response_text)

    # Store the results and metadata for the current model
    results[model] = {
        "results": model_results,
        "total_input_tokens": model_total_input_tokens,
        "total_output_tokens": model_total_output_tokens,
        "total_time": time.time() - start_time,
        "raw_results": model_results_raw,
        "error_inputs": error_inputs
    }


# Evaluate predictions

In [13]:
# Keep only the parsed id -> label mapping of each model.
models_preds = {
    model_name: model_output["results"]
    for model_name, model_output in results.items()
}

In [14]:
# One column per model, one row per id returned by any model.
models_preds = pd.DataFrame(models_preds)

# The ids come back from the LLM as JSON keys (strings). Convert them to
# integers so they line up with the golden_fold index, and drop ids that are
# not whole numbers: a single invented, non-numeric id would otherwise make
# the integer cast raise and block the whole evaluation.
numeric_ids = pd.to_numeric(pd.Series(models_preds.index), errors="coerce").astype("float64")
valid_ids = (numeric_ids.notna() & (numeric_ids % 1 == 0)).to_numpy()
models_preds = models_preds.loc[valid_ids]
models_preds.index = numeric_ids[valid_ids].astype(int).to_numpy()

models_preds.columns = [f"pred from {model}" for model in models_preds.columns]

In [None]:
# Shift the predictions from the prompt's 0/1 encoding to 1/2 and place them
# next to the golden fold rows, aligned on the index.
# NOTE(review): presumably class_index uses 1 = negative, 2 = positive - confirm.
shifted_preds = models_preds + 1
evaluation_df = pd.concat([golden_fold, shifted_preds], axis=1)
evaluation_df.head()

In [None]:
# An id invented by the LLM has no matching row in golden_fold, so its
# class_index is missing in evaluation_df; report how many there are and
# drop them.
is_hallucinated = evaluation_df["class_index"].isna()
print("Number of hallucinations in the evaluation_df: ", int(is_hallucinated.sum()))
evaluation_df = evaluation_df[~is_hallucinated]

In [None]:
# Predictions were shifted by +1 before evaluation, so the labels compared
# here are 1 (negative) and 2 (positive).
NEGATIVE_LABEL, POSITIVE_LABEL = 1, 2

# Accuracy, F1 and a full classification report for every model.
metrics_by_models = {}
for model in MODELS_PRICES_SPEC:
    col = f"pred from {model}"
    # Score only the rows for which this model returned a prediction.
    has_prediction = ~evaluation_df[col].isna()
    model_data_for_metrics = evaluation_df[has_prediction]
    y_true = model_data_for_metrics["class_index"]
    y_pred = model_data_for_metrics[col]

    metrics_by_models[model] = {
        "accuracy": accuracy_score(y_true, y_pred),
        # f1_score defaults to pos_label=1, which is the *negative* class
        # here; name the positive label explicitly so F1 describes "positive".
        "f1": f1_score(y_true, y_pred, pos_label=POSITIVE_LABEL),
    }
    print(model)
    # Passing labels explicitly keeps target_names aligned even if one of the
    # classes is absent from this model's scored rows.
    print(classification_report(
        y_true,
        y_pred,
        labels=[NEGATIVE_LABEL, POSITIVE_LABEL],
        target_names=["negative", "positive"],
    ))
    print("\n\n")

# Prepare results for table

In [19]:
# Add token usage, cost and latency figures to each model's metrics,
# normalised by the number of rows in the golden fold.
n_rows = len(golden_fold)
for model in MODELS_PRICES_SPEC:
    run = results[model]
    prices = MODELS_PRICES_SPEC[model]
    model_metrics = metrics_by_models[model]

    input_total = sum(run["total_input_tokens"])
    output_total = sum(run["total_output_tokens"])
    input_per_row = input_total / n_rows
    output_per_row = output_total / n_rows

    model_metrics["input_tokens_per_row"] = input_per_row
    model_metrics["output_tokens_per_row"] = output_per_row
    model_metrics["total_input_tokens"] = input_total
    model_metrics["total_output_tokens"] = output_total

    # Prices are quoted per 1K tokens, hence the division by 1000.
    input_cost = input_per_row / 1000 * prices["price_input_per_1K"]
    output_cost = output_per_row / 1000 * prices["price_output_per_1K"]
    model_metrics["price_per_row"] = input_cost + output_cost

    model_metrics["sec_per_row"] = run["total_time"] / n_rows

In [20]:
# Metrics as rows, models as columns; then extrapolate the per-row cost and
# latency to the full test split.
pred_final_table = pd.DataFrame(metrics_by_models)
n_test_rows = df_test.shape[0]

price_per_row = pred_final_table.loc["price_per_row", :]
pred_final_table.loc["total price estimation, $", :] = price_per_row * n_test_rows

sec_per_row = pred_final_table.loc["sec_per_row", :]
pred_final_table.loc["total time estimation, hour", :] = sec_per_row * n_test_rows / 3600

pred_final_table

# Final Result

In [23]:
# Columns shown in the final report.
cols = [
    "accuracy",
    "f1",
    "total price estimation, $",
    "total time estimation, hour",
    "total_input_tokens",
    "total_output_tokens",
]

# One row per model, indexed by (approach, model); every row built here is
# tagged as approach 1.
per_model_table = pred_final_table.T
final_table = per_model_table.reset_index(names="model")
final_table["approach"] = 1
final_table = final_table.set_index(keys=["approach", "model"])

In [None]:
# Display the selected report columns rounded to three decimals.
final_table.loc[:, cols].round(decimals=3)