# Import Libraries and setup Hugging Face

In [1]:
import time
import os
from dotenv import load_dotenv
import torch
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from datasets import load_dataset
from tqdm.notebook import tqdm
from transformers import BatchEncoding
from prettytable import PrettyTable

# Update HF cache directory
env_path = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '.env'))
load_dotenv(env_path)
hf_cache_dir = os.getenv('TRANSFORMERS_CACHE')
os.makedirs(hf_cache_dir, exist_ok=True)
print(f"Hugging Face cache directory set to: {hf_cache_dir}")

from transformers import AutoModelForCausalLM, AutoTokenizer

# Check cuda version torch is using
print(f"Using torch {torch.__version__} with cuda {torch.version.cuda}")

workspace_dir = os.getenv('WORKSPACE_DIR')

seed = 42

Hugging Face cache directory set to: /root/repos/DL-Final-Project/.cache/huggingface
Using torch 2.5.1 with cuda 12.1




# Import the dataset
It is stored in the dataset directory which is gitignored so run this block to repopulate if needed

In [2]:
# # Check if dataset is present
# dataset_dir = os.path.join(workspace_dir, 'datasets')
# os.makedirs(dataset_dir, exist_ok=True)

# if not os.path.exists(os.path.join(dataset_dir, 'IMDB Dataset.csv')):
#     !kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews --path {dataset_dir} --unzip

# # Load dataset into dataframe
# dataset = pd.read_csv(os.path.join(dataset_dir, 'IMDB Dataset.csv'))
# print(dataset.head())

# _, test_set = train_test_split(dataset, test_size=0.2, random_state=seed)

imdb = load_dataset("imdb")

test_dataset = imdb['test'].shuffle(seed=seed).select([i for i in list(range(100))])
train_dataset = imdb['train'].shuffle(seed=seed)

print(f"Train dataset: {len(train_dataset)}")
print(f"Test dataset: {len(test_dataset)}")
print(test_dataset[0])

Train dataset: 25000
Test dataset: 100
{'text': "<br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining King Lear story and of course Michelle Pfeiffer was in it, so what could go wrong?<br /><br />Very quickly, however, I realized that this story was about A Thousand Other Things besides just Acres. I started crying and couldn't stop until long after the movie ended. Thank you Jane, Laura and Jocelyn, for bringing us such a wonderfully subtle and compassionate movie! Thank you cast, for being involved and portraying the characters with such depth and gentleness!<br /><br />I recognized the Angry sister; the Runaway sister and the sister in Denial. I recognized the Abusive Husband and why he was there and then the Father, oh oh the Father... all superbly played. I also recognized myself and this movie was an eye-opener, a relief, a chance to face my OWN truth and finally doing something about it. I truly hope A Thousand Acres has had the same e

# Define Experiment Functions

In [3]:
"""
To calculate the values of accuracy, recall, specificity, precision, and F-score, you need the confusion matrix or the key components: True Positives (TP), True Negatives (TN), False Positives (FP), and False Negatives (FN). Here's how each metric is calculated:

1. **Accuracy**: The proportion of correctly classified instances (both positive and negative) out of all instances.
   \[
   \text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}
   \]

2. **Recall (Sensitivity)**: The proportion of actual positives correctly identified.
   \[
   \text{Recall} = \frac{TP}{TP + FN}
   \]

3. **Specificity**: The proportion of actual negatives correctly identified.
   \[
   \text{Specificity} = \frac{TN}{TN + FP}
   \]

4. **Precision**: The proportion of predicted positives that are actually positive.
   \[
   \text{Precision} = \frac{TP}{TP + FP}
   \]

5. **F-score**: The harmonic mean of precision and recall, balancing the two.
   \[
   \text{F-Score} = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}
   \]

"""


def evaluate_model_zero_shot(model, tokenizer, device, dataset, top_k=50):
    # Data preparation
    reviews = [example['text'] for example in dataset]
    true_labels = [example['label'] for example in dataset]  # 0 for negative, 1 for positive

    # Define the prompts
    prompts = [f"Movie Review: {review} \n Only Answer if this Movie Review is Positive or Negative:" for review in reviews]

    # Perform inference
    predictions = []
    inference_times = []

    for idx, example in tqdm(enumerate(dataset), total=len(dataset), desc="Processing", leave=True):
        # Tokenize the input
        inputs = tokenizer.encode(prompts[idx], return_tensors="pt").to(device)

        # Perform inference
        start_time = time.time()
        with torch.no_grad():
            outputs = model(inputs)
            logits = outputs.logits
        end_time = time.time()

        # Get the probabilities for the next token
        next_token_logits = logits[:, -1, :]  # Only consider the last token's logits
        probabilities = torch.softmax(next_token_logits, dim=-1)

        # Get the top k most likely tokens
        top_k_probs, top_k_indices = torch.topk(probabilities, top_k) # these are sorted in order of most likely to least likely

        # Decode the top k tokens
        top_k_tokens = [tokenizer.decode([token]) for token in top_k_indices[0]]

        # Extract the sentiment prediction from the top k tokens, default to negative for confusion matrix calculation
        for token in top_k_tokens:
            token_lower = token.strip().lower()
            if token_lower == 'positive':
                pred = 1
                break
            else:
                pred = 0

        # If the model did not predict a sentiment, default to negative
        predictions.append(pred)
        
        inference_times.append(end_time - start_time)


    # Calculate confusion matrix    
    tn, fp, fn, tp = confusion_matrix(true_labels, predictions).ravel()

    # Calculate metrics
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    f_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Calculate total and average inference times
    total_inference_time = sum(inference_times)
    average_inference_time = total_inference_time / len(inference_times)

    return {
        "accuracy": accuracy,
        "recall": recall,
        "specificity": specificity,
        "precision": precision,
        "f_score": f_score,
        "total_inference_time": total_inference_time,
        "average_inference_time": average_inference_time
    }



def evaluate_model_few_shot(model, tokenizer, device, dataset, top_k=50):
    # Data preparation
    reviews = [example['text'] for example in dataset]
    true_labels = [example['label'] for example in dataset]  # 0 for negative, 1 for positive

    # Define the prompts
    # example_review = "This movie was an absolute masterpiece with stunning visuals and a gripping story!"
    # example_review_neg = "This movie was terrible and I hated it."
    # example_negative_review_2 = "I really think this movie is not that good. It was a waste of time."
    few_shot_rev_1 = "Movie Review: I loved this movie ! So good plot ! \n Only Answer if this Movie Review is Positive or Negative: Positive \n"
    few_shot_rev_2 = "Movie Review: I hated this, could be a lot better \n Only Answer if this Movie Review is Positive or Negative: Negative \n"
 
    prompts = [f"{few_shot_rev_1} {few_shot_rev_2} Movie Review: {review} \n Only Answer if this Movie Review is Positive or Negative:" for review in reviews]

    # Perform inference
    predictions = []
    inference_times = []

    for idx, example in tqdm(enumerate(dataset), total=len(dataset), desc="Processing", leave=True):
        # Tokenize the input
        inputs = tokenizer.encode(prompts[idx], return_tensors="pt").to(device)

        # Perform inference
        start_time = time.time()
        with torch.no_grad():
            outputs = model(inputs)
            logits = outputs.logits
        end_time = time.time()

        # Get the probabilities for the next token
        next_token_logits = logits[:, -1, :]  # Only consider the last token's logits
        probabilities = torch.softmax(next_token_logits, dim=-1)

        # Get the top k most likely tokens
        top_k_probs, top_k_indices = torch.topk(probabilities, top_k) # these are sorted in order of most likely to least likely

        # Decode the top k tokens
        top_k_tokens = [tokenizer.decode([token]) for token in top_k_indices[0]]

        # Extract the sentiment prediction from the top k tokens
        for token in top_k_tokens:
            token_lower = token.strip().lower()
            if token_lower == 'positive':
                pred = 1
                break
            else:
                pred = 0

        # If the model did not predict a sentiment, default to negative
        predictions.append(pred)
        
        inference_times.append(end_time - start_time)

    # Calculate confusion matrix    
    tn, fp, fn, tp = confusion_matrix(true_labels, predictions).ravel()

    # Calculate metrics
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    f_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Calculate total and average inference times
    total_inference_time = sum(inference_times)
    average_inference_time = total_inference_time / len(inference_times)

    return {
        "accuracy": accuracy,
        "recall": recall,
        "specificity": specificity,
        "precision": precision,
        "f_score": f_score,
        "total_inference_time": total_inference_time,
        "average_inference_time": average_inference_time
    }


def create_results_table(results_dict, model_name="Model Results"):
    """
    Creates a formatted table from the results dictionary.
    
    Parameters:
        results_dict (dict): Dictionary containing evaluation metrics.
        model_name (str): Name of the model being evaluated.
    
    Returns:
        str: Formatted table as a string.
    """
    # Initialize a PrettyTable
    table = PrettyTable()
    
    # Set the table title
    table.title = f"Results for {model_name}"
    
    # Add columns
    table.field_names = ["Metric", "Value"]
    
    # Add rows for each metric
    table.add_row(["Accuracy", f"{results_dict['accuracy']:.2f}"])
    table.add_row(["Recall (Sensitivity)", f"{results_dict['recall']:.2f}"])
    if "specificity" in results_dict:  # Specificity might not be included in some results
        table.add_row(["Specificity", f"{results_dict['specificity']:.2f}"])
    table.add_row(["Precision", f"{results_dict['precision']:.2f}"])
    table.add_row(["F-Score", f"{results_dict['f_score']:.2f}"])
    table.add_row(["Total Inference Time (s)", f"{results_dict['total_inference_time']:.2f}"])
    table.add_row(["Average Inference Time (s)", f"{results_dict['average_inference_time']:.2f}"])
    
    # Return the table as a string
    return table.get_string()

  """


# SmolLM-135M


In [4]:
device = "cuda" # for GPU usage or "cpu" for CPU usage

# Clears cuda from last run
if device == "cuda":
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.reset_peak_memory_stats()

#FROM https://huggingface.co/HuggingFaceTB/SmolLM2-135M TODO: Dont forget to cite the model in report
checkpoint = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

## Zero Shot

In [5]:
zero_135_results = evaluate_model_zero_shot(model, tokenizer, device, test_dataset)

zero_135_table = create_results_table(zero_135_results, model_name="SmolLM2-135M Zero-Shot")
print(zero_135_table)

Processing:   0%|          | 0/100 [00:00<?, ?it/s]

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


+--------------------------------------+
|  Results for SmolLM2-135M Zero-Shot  |
+-----------------------------+--------+
|            Metric           | Value  |
+-----------------------------+--------+
|           Accuracy          |  0.64  |
|     Recall (Sensitivity)    |  0.51  |
|         Specificity         |  0.75  |
|          Precision          |  0.65  |
|           F-Score           |  0.57  |
|   Total Inference Time (s)  |  1.87  |
|  Average Inference Time (s) |  0.02  |
+-----------------------------+--------+


## Few Shot

In [6]:
few_135_results = evaluate_model_few_shot(model, tokenizer, device, test_dataset)

few_135_table = create_results_table(few_135_results, model_name="SmolLM2-135M Few-Shot")
print(few_135_table)

Processing:   0%|          | 0/100 [00:00<?, ?it/s]

+------------------------------------+
| Results for SmolLM2-135M Few-Shot  |
+----------------------------+-------+
|           Metric           | Value |
+----------------------------+-------+
|          Accuracy          |  0.47 |
|    Recall (Sensitivity)    |  1.00 |
|        Specificity         |  0.00 |
|         Precision          |  0.47 |
|          F-Score           |  0.64 |
|  Total Inference Time (s)  |  1.81 |
| Average Inference Time (s) |  0.02 |
+----------------------------+-------+


# SmolLM-360M

In [7]:
# Clears cuda from last run
if device == "cuda":
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.reset_peak_memory_stats()
    
checkpoint = "HuggingFaceTB/SmolLM2-360M"
device = "cuda" # for GPU usage or "cpu" for CPU usage
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

## Zero Shot

In [8]:
zero_360_results = evaluate_model_zero_shot(model, tokenizer, device, test_dataset)

zero_360_table = create_results_table(zero_360_results, model_name="SmolLM2-360M Zero-Shot")
print(zero_360_table)

Processing:   0%|          | 0/100 [00:00<?, ?it/s]

+--------------------------------------+
|  Results for SmolLM2-360M Zero-Shot  |
+-----------------------------+--------+
|            Metric           | Value  |
+-----------------------------+--------+
|           Accuracy          |  0.47  |
|     Recall (Sensitivity)    |  1.00  |
|         Specificity         |  0.00  |
|          Precision          |  0.47  |
|           F-Score           |  0.64  |
|   Total Inference Time (s)  |  1.90  |
|  Average Inference Time (s) |  0.02  |
+-----------------------------+--------+


## Few Shot

In [9]:
few_360_results = evaluate_model_few_shot(model, tokenizer, device, test_dataset)

few_360_table = create_results_table(few_360_results, model_name="SmolLM2-360M Few-Shot")
print(few_360_table)

Processing:   0%|          | 0/100 [00:00<?, ?it/s]

+------------------------------------+
| Results for SmolLM2-360M Few-Shot  |
+----------------------------+-------+
|           Metric           | Value |
+----------------------------+-------+
|          Accuracy          |  0.47 |
|    Recall (Sensitivity)    |  1.00 |
|        Specificity         |  0.00 |
|         Precision          |  0.47 |
|          F-Score           |  0.64 |
|  Total Inference Time (s)  |  1.80 |
| Average Inference Time (s) |  0.02 |
+----------------------------+-------+


# SmolLM2-1.7B

In [10]:
# # Clears cuda from last run
# if device == "cuda":
#     torch.cuda.empty_cache()
#     torch.cuda.ipc_collect()
#     torch.cuda.reset_peak_memory_stats()

# checkpoint = "HuggingFaceTB/SmolLM2-1.7B"
# device = "cuda" # for GPU usage or "cpu" for CPU usage
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

## Zero Shot

In [11]:
zero_17_results = evaluate_model_zero_shot(model, tokenizer, device, test_dataset)

zero_17_table = create_results_table(zero_17_results, model_name="SmolLM2-1.7B Zero-Shot")
print(zero_17_table)

Processing:   0%|          | 0/100 [00:00<?, ?it/s]

+--------------------------------------+
|  Results for SmolLM2-1.7B Zero-Shot  |
+-----------------------------+--------+
|            Metric           | Value  |
+-----------------------------+--------+
|           Accuracy          |  0.47  |
|     Recall (Sensitivity)    |  1.00  |
|         Specificity         |  0.00  |
|          Precision          |  0.47  |
|           F-Score           |  0.64  |
|   Total Inference Time (s)  |  1.89  |
|  Average Inference Time (s) |  0.02  |
+-----------------------------+--------+


## Few Shot

In [12]:
few_17_results = evaluate_model_few_shot(model, tokenizer, device, test_dataset)

few_17_table = create_results_table(few_17_results, model_name="SmolLM2-1.7B Few-Shot")
print(few_17_table)

Processing:   0%|          | 0/100 [00:00<?, ?it/s]

+------------------------------------+
| Results for SmolLM2-1.7B Few-Shot  |
+----------------------------+-------+
|           Metric           | Value |
+----------------------------+-------+
|          Accuracy          |  0.47 |
|    Recall (Sensitivity)    |  1.00 |
|        Specificity         |  0.00 |
|         Precision          |  0.47 |
|          F-Score           |  0.64 |
|  Total Inference Time (s)  |  1.89 |
| Average Inference Time (s) |  0.02 |
+----------------------------+-------+


# Print All Results

In [19]:
print("All models evaluated successfully!")

results_dicts = [
    zero_135_results,
    few_135_results,
    zero_360_results,
    few_360_results,
    zero_17_results,
    few_17_results,
]

# List of model names
model_names = [
    "SmolLM2-135M Zero-Shot",
    "SmolLM2-135M Few-Shot",
    "SmolLM2-360M Zero-Shot",
    "SmolLM2-360M Few-Shot",
    "SmolLM2-1.7B Zero-Shot",
    "SmolLM2-1.7B Few-Shot",
]

table = PrettyTable()

# Define the columns
table.field_names = [
    "Model",
    "Accuracy",
    "Recall (Sensitivity)",
    "Specificity",
    "Precision",
    "F-Score",
    "Total Inference Time (s)",
    "Avg Inference Time (s)"
]

# Populate the table
for model_name, results in zip(model_names, results_dicts):
    table.add_row([
        model_name,
        f"{results['accuracy']:.2f}",
        f"{results['recall']:.2f}",
        f"{results['specificity']:.2f}",
        f"{results['precision']:.2f}",
        f"{results['f_score']:.2f}",
        f"{results['total_inference_time']:.2f}",
        f"{results['average_inference_time']:.2f}",
    ])

print(table)

All models evaluated successfully!
+------------------------+----------+----------------------+-------------+-----------+---------+--------------------------+------------------------+
|         Model          | Accuracy | Recall (Sensitivity) | Specificity | Precision | F-Score | Total Inference Time (s) | Avg Inference Time (s) |
+------------------------+----------+----------------------+-------------+-----------+---------+--------------------------+------------------------+
| SmolLM2-135M Zero-Shot |   0.64   |         0.51         |     0.75    |    0.65   |   0.57  |           1.87           |          0.02          |
| SmolLM2-135M Few-Shot  |   0.47   |         1.00         |     0.00    |    0.47   |   0.64  |           1.81           |          0.02          |
| SmolLM2-360M Zero-Shot |   0.47   |         1.00         |     0.00    |    0.47   |   0.64  |           1.90           |          0.02          |
| SmolLM2-360M Few-Shot  |   0.47   |         1.00         |     0.00  

# Cleanup Cuda

In [14]:
# Clears cuda from last run
if device == "cuda":
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.reset_peak_memory_stats()