# Import Libraries

In [None]:
import time
import json

import pandas as pd
from tqdm.auto import tqdm

from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer

In [3]:
import os

# "API_KEY" is only a placeholder and must never be replaced by a real secret
# committed with the notebook. setdefault keeps a key that is already exported
# in the environment instead of clobbering it with the placeholder.
os.environ.setdefault("OPENAI_API_KEY", "API_KEY")

# Define constants

In [4]:
# Number of training rows sent to the LLMs for annotation, and the seed used
# for every random sample drawn below.
TRAIN_FOLD_SAMPLE_SIZE = 100
RANDOM_STATE = 42

# Per-model settings: prices per 1K tokens (input / output) and the number of
# reviews packed into a single request.
MODELS_PRICES_SPEC = {
    "gpt-4o-mini": dict(
        price_input_per_1K=0.000150,
        price_output_per_1K=0.000600,
        batch_size=30,
    ),
    "gpt-4o": dict(
        price_input_per_1K=0.00250,
        price_output_per_1K=0.01000,
        batch_size=50,
    ),
}

# Load data

In [None]:
# Read both CSV splits of the Amazon binary-sentiment dataset straight from
# the Hugging Face hub.
dataset_root = "hf://datasets/yassiracharki/Amazon_Reviews_Binary_for_Sentiment_Analysis/"
splits = {'train': 'train.csv', 'test': 'test.csv'}
df_train, df_test = (
    pd.read_csv(dataset_root + splits[split_name])
    for split_name in ("train", "test")
)

## Prepare data for train/test

In [None]:
# Reproducible random subset of the training split that the LLMs will label.
train_fold_sample = df_train.sample(
    random_state=RANDOM_STATE,
    n=TRAIN_FOLD_SAMPLE_SIZE,
)
train_fold_sample.head()

# Compute LLM Predictions

In [8]:
# Prompt template for batch sentiment annotation. The single `{}` placeholder
# is filled through str.format with the JSON-encoded list of
# {"id", "text"} examples; the model is asked to answer with a JSON object
# mapping each id to 1 (positive) or 0 (negative).
# The template must not contain any other curly braces, otherwise
# str.format would treat them as placeholders.
prompt = """
    Classify the sentiment of each example in the following JSON array as "positive" or "negative".
    Respond only in JSON format where each ID is a key and its value is 1 for "positive" and 0 for "negative".
    
    Examples: {}
    
    Deliver the response here in plain text without any formatting.
    """

In [9]:
def clear_response(text):
    """Strip a Markdown code fence from an LLM reply so it can be JSON-parsed.

    Replies that are not wrapped in backticks are returned unchanged. For
    fenced replies only the surrounding backticks and an optional leading
    ``json`` language tag are removed; backticks or the word "json" inside
    the payload are left intact.

    Args:
        text: Raw reply text; may be empty or surrounded by whitespace.

    Returns:
        The reply without its fence, or ``text`` itself when no fence is found.
    """
    stripped = text.strip()
    # An empty reply, or one that does not open with a fence, needs no cleanup.
    if not stripped.startswith('`'):
        return text

    # Drop the opening and closing backtick runs, whatever their length.
    payload = stripped.strip('`')
    # Drop the language tag that follows the opening fence, if present.
    if payload[:4].lower() == 'json':
        payload = payload[4:]
    return payload.strip()

In [None]:


# Annotate the training sample with every configured LLM.
# For each model, results[model] holds the merged id -> label predictions,
# the per-request token counts, the wall-clock time, the raw replies of the
# successfully parsed batches, and the batches whose reply could not be used.
results = {}

for model, spec in tqdm(MODELS_PRICES_SPEC.items(), desc="Models"):

    chat_model = ChatOpenAI(model=model, temperature=0)
    batch_size = spec["batch_size"]

    # Initialize variables to store results and metadata for the current model
    model_results = {}
    model_results_raw = []
    model_total_input_tokens = []
    model_total_output_tokens = []
    error_examples = []
    start_time = time.time()

    for i in tqdm(range(0, len(train_fold_sample), batch_size), desc="Batches"):
        # Get the current batch of data
        batch_data = train_fold_sample.iloc[i:i + batch_size]

        # Prepare the prompt; the DataFrame index label is used as example id
        examples = [{"id": idx, "text": row["review_text"]} for idx, row in batch_data.iterrows()]
        input_data = prompt.format(json.dumps(examples))

        response = chat_model.invoke([HumanMessage(content=input_data)])

        # Record token usage before parsing: the request has been made
        # whether or not its reply turns out to be usable, so failed batches
        # must count towards the cost figures too.
        usage = response.usage_metadata or {}
        model_total_input_tokens.append(usage.get("input_tokens", 0))
        model_total_output_tokens.append(usage.get("output_tokens", 0))

        # Clear the response text
        response_text = clear_response(response.content)
        try:
            # Parse the response text as JSON
            batch_results = json.loads(response_text)
        # Handle JSON decode errors and store the error inputs
        # when the response is not a valid JSON
        except json.JSONDecodeError as e:
            print(f"Error at batch {i} with error {e}")
            error_examples.append([batch_data, input_data, e, response_text])
            continue

        # Valid JSON that is not an object (e.g. a list) cannot be merged as
        # id -> label pairs; treat it like any other unusable reply.
        if not isinstance(batch_results, dict):
            e = TypeError(f"expected a JSON object, got {type(batch_results).__name__}")
            print(f"Error at batch {i} with error {e}")
            error_examples.append([batch_data, input_data, e, response_text])
            continue

        model_results.update(batch_results)
        model_results_raw.append(response.content)

    # Store the results and metadata for the current model
    results[model] = {
        "results": model_results,
        "total_input_tokens": model_total_input_tokens,
        "total_output_tokens": model_total_output_tokens,
        "total_time": time.time() - start_time,
        "raw_results": model_results_raw,
        "error_examples": error_examples
    }


# Prepare LLM Predictions

In [11]:
# Keep only the id -> label mapping produced by each model.
models_preds = {
    model_name: model_output["results"]
    for model_name, model_output in results.items()
}

In [12]:
# One column per model, one row per example id. The ids come back from JSON
# as strings, so the index is cast to int.
preds_frame = pd.DataFrame(models_preds)
preds_frame.index = preds_frame.index.astype(int)
preds_frame.columns = [f"pred from {column}" for column in preds_frame]
models_preds = preds_frame

In [None]:
# Attach the LLM predictions to the sampled rows.
# The LLMs answer 0/1 while the evaluation below treats class_index 1 as
# negative and 2 as positive, hence the +1 shift.
# A left join keeps exactly the sampled rows: an id invented by an LLM is
# dropped instead of adding a row with no review text, and sampled rows
# without a prediction stay in place with NaN.
evaluation_df = train_fold_sample.join(models_preds + 1, how="left")
evaluation_df.head()

# Compute Embeddings

In [14]:
# Sentence encoder used to embed both the annotated sample and the test fold.
embedding_model_name = 'all-MiniLM-L6-v2'
embeddings_model = SentenceTransformer(embedding_model_name)

## Train fold sample Embeddings

In [None]:
# Embed every review text of the annotated training sample.
train_texts = evaluation_df["review_text"].to_list()
embeddings = embeddings_model.encode(
    train_texts,
    batch_size=500,
    show_progress_bar=True,
)

In [16]:
# Store each embedding as a plain list in its row so that the rows can later
# be filtered together with their vectors.
embedding_rows = embeddings.tolist()
evaluation_df["embedding"] = embedding_rows

## Golden Fold Embeddings

In [17]:
# Wall-clock start of the golden-fold stage; it is read again when the final
# table computes "total time estimation, hour".
start_time_for_golden_fold = time.time()

In [None]:
# Shuffle the whole test split reproducibly (the sample is as large as the
# split itself). The following cell re-assigns test_sample with n=100.
test_sample = df_test.sample(random_state=RANDOM_STATE, n=len(df_test))

In [18]:
# Reproducible 100-row subset of the test split used as the golden fold.
test_sample = df_test.sample(
    random_state=RANDOM_STATE,
    n=100,
)

In [None]:
# Embed the golden-fold review texts with the same encoder.
test_texts = test_sample["review_text"].to_list()
embeddings_test = embeddings_model.encode(
    test_texts,
    batch_size=100,
    show_progress_bar=True,
)

# Train Models

In [20]:
# Fit one logistic-regression classifier per LLM, using that LLM's
# predictions as training labels and the sentence embeddings as features.
models_models = {}
for llm_name in MODELS_PRICES_SPEC:
    label_column = f"pred from {llm_name}"
    # Rows for which this LLM returned no label cannot be used for training.
    has_label = evaluation_df[label_column].notna()
    labelled_rows = evaluation_df[has_label]
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(
        labelled_rows["embedding"].to_list(),
        labelled_rows[label_column],
    )
    models_models[llm_name] = classifier
    

# Evaluate Models

In [None]:

# Score each classifier on the golden fold.
# Labels are 1 = negative and 2 = positive, so F1 must be told explicitly
# that 2 is the positive class (the default pos_label=1 would score the
# negative class). Passing labels=[1, 2] keeps the report aligned with
# target_names even when one class is missing from the predictions.
metrics_by_models = {}
for model in MODELS_PRICES_SPEC:
    model_lr = models_models[model]
    test_sample[model] = model_lr.predict(embeddings_test)
    y_true = test_sample["class_index"]
    y_pred = test_sample[model]
    metrics_by_models[model] = {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, pos_label=2)
    }
    print(model)
    print(classification_report(y_true, y_pred, labels=[1, 2], target_names=["negative", "positive"]))
    print("\n\n")

# Prepare results for table

In [22]:
# Extend each model's metrics with token usage, price and timing figures,
# both in total and per annotated row.
annotated_rows = len(train_fold_sample)
for model in MODELS_PRICES_SPEC:
    run_stats = results[model]
    price_spec = MODELS_PRICES_SPEC[model]
    summary = metrics_by_models[model]

    input_tokens = sum(run_stats["total_input_tokens"])
    output_tokens = sum(run_stats["total_output_tokens"])

    summary["input_tokens_per_row"] = input_tokens / annotated_rows
    summary["output_tokens_per_row"] = output_tokens / annotated_rows
    summary["total_input_tokens"] = input_tokens
    summary["total_output_tokens"] = output_tokens

    # Prices are quoted per 1K tokens, hence the division by 1000.
    summary["price_per_row"] = (
        summary["input_tokens_per_row"] / 1000 * price_spec["price_input_per_1K"] +
        summary["output_tokens_per_row"] / 1000 * price_spec["price_output_per_1K"]
    )
    summary["price_total"] = (
        summary["total_input_tokens"] / 1000 * price_spec["price_input_per_1K"] +
        summary["total_output_tokens"] / 1000 * price_spec["price_output_per_1K"]
    )
    summary["total_time_annot"] = run_stats["total_time"]
    summary["sec_per_row"] = run_stats["total_time"] / annotated_rows

In [None]:
# Display the per-model metrics dictionary as the cell output.
metrics_by_models

In [None]:
# Metrics as a table (one column per model) plus two summary rows: the total
# price, and the total time in hours (LLM annotation time plus the time
# elapsed since the golden-fold stage started).
pred_final_table = pd.DataFrame(metrics_by_models)
total_price = pred_final_table.loc["price_total", :]
pred_final_table.loc["total price estimation, $", :] = total_price
golden_fold_seconds = time.time() - start_time_for_golden_fold
annotation_seconds = pred_final_table.loc["total_time_annot", :]
pred_final_table.loc["total time estimation, hour", :] = (golden_fold_seconds + annotation_seconds) / 3600
pred_final_table

# Final Result

In [25]:
# Columns shown in the final summary table, in display order.
cols = [
    "accuracy",
    "f1",
    "total price estimation, $",
    "total time estimation, hour",
    "total_input_tokens",
    "total_output_tokens",
]

In [None]:
# One row per model, indexed by (approach, model); every row of this
# notebook is tagged as approach 2. Only the selected columns are shown,
# rounded to three decimals.
per_model_rows = pred_final_table.T
final_table = per_model_rows.reset_index(names="model")
final_table["approach"] = 2
final_table = final_table.set_index(["approach", "model"])
final_table[cols].round(3)