# Import Libraries

In [None]:
import time
import json

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

In [2]:
import os

# Use the key that is already exported in the environment when there is one.
# The placeholder below is only a fallback for a fresh session: replace it
# locally and never commit a real key. Assigning it unconditionally would
# silently overwrite a valid key with the placeholder.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = "API_KEY"

# Define constants

In [3]:
# Seed reused by every random sampling step in the notebook.
RANDOM_STATE = 42
# Number of rows drawn from the training CSV to form the unlabeled pool.
TRAIN_FOLD_SAMPLE_SIZE = 300_000
# Number of LLM labels to collect per model before active learning stops.
NUMBER_OF_LABELS_FOR_AL = 5_000

# Per-model settings: token prices per 1K tokens (input / output) and the
# number of examples sent to the model in a single request.
MODELS_PRICES_SPEC = {
    "gpt-4o-mini": dict(
        price_input_per_1K=0.000150,
        price_output_per_1K=0.000600,
        batch_size=30,
    ),
    "gpt-4o": dict(
        price_input_per_1K=0.00250,
        price_output_per_1K=0.01000,
        batch_size=50,
    ),
}

In [4]:
# The unlabeled pool must hold at least as many rows as the number of labels
# requested from the LLM. Equality is allowed, so the message says so.
assert TRAIN_FOLD_SAMPLE_SIZE >= NUMBER_OF_LABELS_FOR_AL, (
    "TRAIN_FOLD_SAMPLE_SIZE should be greater than or equal to NUMBER_OF_LABELS_FOR_AL"
)

# Load data

In [5]:
# Raw review data. Later cells read the `review_text` column from both frames
# and the `class_index` column from the test frame.
RAW_DATA_DIR = "../data/raw"
df_train = pd.read_csv(f"{RAW_DATA_DIR}/ama_train.csv")
df_test = pd.read_csv(f"{RAW_DATA_DIR}/ama_test.csv")

## Prepare data for train/test

In [None]:
# Reproducible random subset of the training data; this is the pool the
# active-learning loop selects examples from.
train_fold_sample = df_train.sample(
    random_state=RANDOM_STATE,
    n=TRAIN_FOLD_SAMPLE_SIZE,
)
# Preview the first rows as the cell output.
train_fold_sample.head()

# Compute Train fold sample Embeddings

In [7]:
import torch

# Pick the fastest available device instead of assuming Apple Metal ("mps"),
# which raises on machines without it. "mps" keeps priority, so behaviour on
# Apple Silicon is unchanged.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    embeddings_device = "mps"
elif torch.cuda.is_available():
    embeddings_device = "cuda"
else:
    embeddings_device = "cpu"

# Sentence-embedding model used for both the train pool and the golden fold.
embeddings_model = SentenceTransformer('all-MiniLM-L6-v2', device=embeddings_device)

In [8]:
# Wall-clock start of the train-embedding stage; the elapsed time is computed
# once the embeddings have been stored on the DataFrame.
start_train_embs = time.time()

In [None]:
# Encode every review of the train pool, 200 texts per encoder batch.
train_texts = train_fold_sample["review_text"].to_list()
embedings_train = embeddings_model.encode(
    train_texts,
    batch_size=200,
    show_progress_bar=True,
)

In [10]:
# Store one embedding per row as a plain Python list so later cells can pass
# `train_fold_sample["embedding"].to_list()` straight to scikit-learn / modAL.
train_fold_sample["embedding"] = embedings_train.tolist()

In [None]:
# Despite the `end_` prefix this is a duration: seconds elapsed since
# `start_train_embs`. It is added to the total time estimate in the final table.
end_train_embs = time.time()-start_train_embs
# Show the duration as the cell output.
end_train_embs

# Compute LLM Predictions + Active Learning Loop

In [15]:
# Instruction template sent to the chat model. The single `{}` placeholder is
# filled via `str.format` with a JSON array of {"id", "text"} objects; the
# model is asked to answer with a JSON object mapping each id to 1 (positive)
# or 0 (negative).
prompt = """
    Classify the sentiment of each example in the following JSON array as "positive" or "negative".
    Respond only in JSON format where each ID is a key and its value is 1 for "positive" and 0 for "negative".
    
    Examples: {}
    
    Deliver the response here in plain text without any formatting.
    """

In [16]:
def clear_response(text):
    """Strip a Markdown code fence from an LLM response.

    Chat models sometimes wrap their JSON answer in a fenced block
    (```json ... ```) even when asked for plain text. Only the surrounding
    fence and an optional leading ``json`` language tag are removed; the
    payload itself is left untouched.

    Args:
        text: Raw response content returned by the model.

    Returns:
        The payload without the fence and without surrounding whitespace when
        the response starts with a backtick (ignoring leading whitespace);
        otherwise ``text`` unchanged. Empty input is returned as is.
    """
    # Empty responses used to crash on ``text[0]``; let the JSON parser
    # report them instead.
    if not text:
        return text

    stripped = text.strip()
    if not stripped.startswith('`'):
        return text

    # Remove only the fence characters at both ends, never backticks that are
    # part of the payload.
    body = stripped.strip('`').lstrip()
    # Drop the optional language tag that follows the opening fence.
    if body[:4].lower() == 'json':
        body = body[4:]
    return body.strip()

In [17]:
# Per-model output of the active-learning loop, keyed by model name
# (labels, per-step results, token counts and total time).
results = {}

In [None]:
# Active-learning labelling loop, run once per model in MODELS_PRICES_SPEC.
#
# For each model:
#   1. a random seed batch of `batch_size` rows is sent to the LLM;
#   2. the returned labels are merged into `labels`;
#   3. a logistic regression is fitted on the embeddings of every labelled row;
#   4. uncertainty sampling picks the next batch from the still-unlabelled rows;
#   5. the loop stops once NUMBER_OF_LABELS_FOR_AL labels have been collected,
#      or after more than 3 consecutive unusable responses.
#
# `labels` is keyed by the row index as a *string* (JSON object keys are
# always strings), while `train_fold_sample.index` holds integers. Every
# comparison with the index therefore goes through `labeled_ids`.
for model, spec in MODELS_PRICES_SPEC.items():
    chat_model = ChatOpenAI(model=model, temperature=0)
    print("-------------------")
    print(f"Model: {model}")

    # Labels collected so far: {"<row index>": label}
    labels = {}
    # Per-step results and token usage for the current model
    model_results = []
    model_total_input_tokens = []
    model_total_output_tokens = []
    start_time = time.time()

    # Number of examples per request (seed batch and every queried batch)
    n_initial = spec["batch_size"]

    # Seed batch: random rows, reproducible through RANDOM_STATE
    np.random.seed(RANDOM_STATE)
    query_idx = np.random.choice(train_fold_sample.index, size=n_initial, replace=False)

    iteration = 0
    # Consecutive unusable responses; reset after every successful step
    error_count = 0

    # Strict `<` so the loop stops as soon as the target is reached instead of
    # paying for one extra batch when the target is hit exactly.
    while len(labels) < NUMBER_OF_LABELS_FOR_AL:
        print(f"Iteration {iteration} Len labels: {len(labels)} \nerror count: {error_count}")

        # Get the examples to label and build the prompt
        batch_data = train_fold_sample.loc[query_idx]
        examples = [{"id": idx, "text": row["review_text"]} for idx, row in batch_data.iterrows()]
        input_data = prompt.format(json.dumps(examples))

        response = chat_model.invoke([HumanMessage(content=input_data)])

        # Every call is billed, including the ones whose answer cannot be
        # used, so usage is recorded before the response is validated.
        model_total_input_tokens.append(response.usage_metadata["input_tokens"])
        model_total_output_tokens.append(response.usage_metadata["output_tokens"])

        # Clear the response text (strip a Markdown fence if present)
        response_text = clear_response(response.content)

        # Validate the response: it must be a JSON object that labels at
        # least one of the requested ids.
        failure = None
        batch_results = {}
        try:
            parsed = json.loads(response_text)
        except json.JSONDecodeError as e:
            failure = e
        else:
            if isinstance(parsed, dict):
                # Keep only keys that are ids of this batch (the LLM can
                # return keys as words).
                valid_ids = set(batch_data.index.astype(str))
                batch_results = {key: value for key, value in parsed.items() if key in valid_ids}
                if not batch_results:
                    failure = "response contains no label for the requested ids"
            else:
                failure = "response is not a JSON object"

        if failure is not None:
            print(f"Error: {failure}")
            iteration += 1
            error_count += 1
            # Stop once more than 3 consecutive responses were unusable;
            # otherwise retry the same batch.
            if error_count > 3:
                print("Error count exceeded 3. Stopping the loop.")
                break
            continue

        model_results.append(batch_results)

        # Update the labels with the new results
        labels.update(batch_results)

        # Integer view of the labelled ids, matching the dtype of the index
        labeled_ids = [int(key) for key in labels]

        # Prepare the data for the active learner (copy: a column is added)
        step_df_with_labels = train_fold_sample.loc[labeled_ids].copy()
        step_df_with_labels["label"] = [labels[str(idx)] for idx in step_df_with_labels.index]

        # Train the active learner on everything labelled so far
        learner = ActiveLearner(
            estimator=LogisticRegression(),
            query_strategy=uncertainty_sampling,
            X_training=step_df_with_labels["embedding"].to_list(),
            y_training=step_df_with_labels["label"],
        )

        # Exclude rows that are already labelled. Comparing the integer index
        # with the string keys of `labels` never matches, which left labelled
        # rows in the pool and allowed the same rows to be queried forever.
        step_df = train_fold_sample[~train_fold_sample.index.isin(labeled_ids)]

        # Query the next examples to label
        query_idx_raw, query_inst = learner.query(
            step_df["embedding"].to_list(),
            n_instances=n_initial,
        )

        # Translate pool positions back to row labels of train_fold_sample
        query_idx = step_df.iloc[query_idx_raw].index

        iteration += 1
        error_count = 0

    results[model] = {
        "labels": labels,
        "step_results": model_results,
        "total_input_tokens": model_total_input_tokens,
        "total_output_tokens": model_total_output_tokens,
        "total_time": time.time() - start_time
    }

# Prepare LLM + Active Learning Predictions

In [19]:
# One column of LLM labels per model, indexed by the (integer) row label of
# train_fold_sample. Rows a model never labelled are NaN in its column.
labels_by_model = {name: run["labels"] for name, run in results.items()}

models_preds = pd.DataFrame(labels_by_model)
# Label keys come back from JSON as strings; restore the integer index.
models_preds.index = models_preds.index.astype(int)
models_preds = models_preds.add_prefix("pred from ")

In [None]:
# Shift the LLM labels by one before joining them to the train pool.
# NOTE(review): presumably this maps 0/1 onto the 1/2 coding of
# `class_index` - confirm against the dataset.
shifted_preds = models_preds + 1
evaluation_df = pd.concat([train_fold_sample, shifted_preds], axis=1)
# Preview the first rows as the cell output.
evaluation_df.head()

# Train Models

In [22]:
# Fit one logistic regression per LLM, using only the rows that model labelled
# (rows without a label are NaN in its prediction column).
models_models = {}
for model in MODELS_PRICES_SPEC:
    col = f"pred from {model}"
    has_label = evaluation_df[col].notna()
    data_part_for_train = evaluation_df[has_label]
    features = data_part_for_train["embedding"].to_list()
    targets = data_part_for_train[col]
    # `fit` returns the fitted estimator itself.
    models_models[model] = LogisticRegression(max_iter=1000).fit(features, targets)

# Compute Golden fold Embeddings

In [36]:
# Wall-clock start of the golden-fold stage; read back when the final table
# is assembled.
start_time_for_golden_fold = time.time()

In [37]:
# Use the whole test set as the golden fold: sampling every row is a
# reproducible shuffle.
golden_fold = df_test.sample(random_state=RANDOM_STATE, n=len(df_test))

In [None]:
# Encode every review of the golden fold, 100 texts per encoder batch.
golden_texts = golden_fold["review_text"].to_list()
embeddings_test = embeddings_model.encode(
    golden_texts,
    batch_size=100,
    show_progress_bar=True,
)

In [40]:
# Store one embedding per golden-fold row as a plain Python list, mirroring
# the layout used for the train pool.
golden_fold["embedding"] = embeddings_test.tolist()

# Evaluate Models

In [None]:
# Score each per-model classifier on the golden fold.
#
# The feature list and the ground truth are the same for every model, so they
# are built once instead of being re-materialised on each iteration.
# Predictions are also stored on `golden_fold` in a column named after the model.
#
# NOTE(review): f1_score is called with its default pos_label=1. If
# `class_index` is coded 1=negative / 2=positive (as the target_names order
# suggests), the reported F1 is for the negative class - confirm intended.
metrics_by_models = {}
golden_features = golden_fold["embedding"].to_list()
golden_truth = golden_fold["class_index"]
for model in MODELS_PRICES_SPEC:
    golden_fold[model] = models_models[model].predict(golden_features)
    golden_pred = golden_fold[model]

    metrics_by_models[model] = {
        "accuracy": accuracy_score(golden_truth, golden_pred),
        "f1": f1_score(golden_truth, golden_pred)
    }
    print(model)
    print(classification_report(golden_truth, golden_pred, target_names=["negative", "positive"]))
    print("\n\n")

# Prepare results for table

In [42]:
# Add token, price and time figures to each model's metrics.
# Per-row values are averaged over the whole train fold sample, not only over
# the rows that were sent to the LLM. Prices are quoted per 1K tokens.
n_pool_rows = len(train_fold_sample)
for model, pricing in MODELS_PRICES_SPEC.items():
    run = results[model]
    total_in = sum(run["total_input_tokens"])
    total_out = sum(run["total_output_tokens"])
    in_per_row = total_in / n_pool_rows
    out_per_row = total_out / n_pool_rows

    # Keys are inserted in this order; it defines the row order of the table.
    metrics_by_models[model].update({
        "input_tokens_per_row": in_per_row,
        "output_tokens_per_row": out_per_row,
        "total_input_tokens": total_in,
        "total_output_tokens": total_out,
        "price_per_row": (
            in_per_row / 1000 * pricing["price_input_per_1K"]
            + out_per_row / 1000 * pricing["price_output_per_1K"]
        ),
        "price_total": (
            total_in / 1000 * pricing["price_input_per_1K"]
            + total_out / 1000 * pricing["price_output_per_1K"]
        ),
        "total_time_annot": run["total_time"],
        "sec_per_row": run["total_time"] / n_pool_rows,
    })

In [None]:
# Show the raw metrics dictionary as the cell output.
metrics_by_models

In [None]:
# Tabulate the metrics: one column per model, one row per metric.
pred_final_table = pd.DataFrame.from_dict(metrics_by_models, orient="columns")
pred_final_table

In [45]:
# Seconds elapsed since the golden-fold timer was started (a duration, despite
# the `end_` prefix).
end_time_for_golden_fold = time.time() - start_time_for_golden_fold

# Total price is the LLM annotation price.
price_row = pred_final_table.loc["price_total", :]
pred_final_table.loc["total price estimation, $", :] = price_row

# Total time = golden-fold stage + LLM annotation + train embeddings, in hours.
annotation_seconds = pred_final_table.loc["total_time_annot", :]
total_seconds = end_time_for_golden_fold + annotation_seconds + end_train_embs
pred_final_table.loc["total time estimation, hour", :] = total_seconds / 3600

In [None]:
# Show the table including the two "total ... estimation" rows.
pred_final_table

# Final Result

In [47]:
# Metrics reported in the final table, in display order.
cols = [
    "accuracy",
    "f1",
    "total price estimation, $",
    "total time estimation, hour",
    "total_input_tokens",
    "total_output_tokens",
]

In [None]:
# One row per model, tagged with the approach number (3), indexed by
# (approach, model).
final_table = (
    pred_final_table.transpose()
    .reset_index(names="model")
    .assign(approach=3)
    .set_index(["approach", "model"])
)
# Show the selected metrics rounded to three decimals.
final_table[cols].round(3)