# Import Libraries and setup Hugging Face

In [1]:
# Pace Setup
# !module load anaconda3
# !module load gcc/12.3.0
# !module load cuda/12.6.1

In [2]:
!nvidia-smi

import os

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import time
from dotenv import load_dotenv
import torch
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from tqdm import tqdm
from datasets import load_dataset
from prettytable import PrettyTable

# Update HF cache directory
env_path = os.path.abspath(os.path.join(os.getcwd(), '..', '.env'))
print(env_path)
load_dotenv(env_path)
hf_cache_dir = os.getenv('TRANSFORMERS_CACHE')
os.makedirs(hf_cache_dir, exist_ok=True)
print(f"Hugging Face cache directory set to: {hf_cache_dir}")

from transformers import AutoModelForCausalLM, AutoTokenizer

# Check cuda version torch is using
print(f"Using torch {torch.__version__} with cuda {torch.version.cuda}")

workspace_dir = os.getenv('WORKSPACE_DIR')

seed = 42

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Sun Dec  8 01:19:42 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.02              Driver Version: 555.42.02      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB          On  |   00000000:C1:00.0 Off |                    0 |
| N/A   31C    P0             42W /  250W |       1MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                



Using torch 2.5.1 with cuda 12.4
Using device: cuda


# Import the dataset
It is stored in the dataset directory which is gitignored so run this block to repopulate if needed

In [3]:
# # Check if dataset is present
# dataset_dir = os.path.join(workspace_dir, 'datasets')
# os.makedirs(dataset_dir, exist_ok=True)

# if not os.path.exists(os.path.join(dataset_dir, 'IMDB Dataset.csv')):
#     !kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews --path {dataset_dir} --unzip

# # Load dataset into dataframe
# dataset = pd.read_csv(os.path.join(dataset_dir, 'IMDB Dataset.csv'))
# print(dataset.head())

# _, test_set = train_test_split(dataset, test_size=0.2, random_state=seed)

imdb = load_dataset("imdb")

test_dataset = imdb['test'].shuffle(seed=seed)#.select([i for i in list(range(500))])
train_dataset = imdb['train'].shuffle(seed=seed)

print(f"Train dataset: {len(train_dataset)}")
print(f"Test dataset: {len(test_dataset)}")
print(test_dataset[0])

Train dataset: 25000
Test dataset: 25000
{'text': "<br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining King Lear story and of course Michelle Pfeiffer was in it, so what could go wrong?<br /><br />Very quickly, however, I realized that this story was about A Thousand Other Things besides just Acres. I started crying and couldn't stop until long after the movie ended. Thank you Jane, Laura and Jocelyn, for bringing us such a wonderfully subtle and compassionate movie! Thank you cast, for being involved and portraying the characters with such depth and gentleness!<br /><br />I recognized the Angry sister; the Runaway sister and the sister in Denial. I recognized the Abusive Husband and why he was there and then the Father, oh oh the Father... all superbly played. I also recognized myself and this movie was an eye-opener, a relief, a chance to face my OWN truth and finally doing something about it. I truly hope A Thousand Acres has had the same

# Define Experiment Functions

# HUGE NOTE, the order of pos vs neg in the asking of sentiment matters alot, first one listed tends to be the default models answer

In [4]:
"""
To calculate the values of accuracy, recall, specificity, precision, and F-score, you need the confusion matrix or the key components: True Positives (TP), True Negatives (TN), False Positives (FP), and False Negatives (FN). Here's how each metric is calculated:

1. **Accuracy**: The proportion of correctly classified instances (both positive and negative) out of all instances.
   \[
   \text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}
   \]

2. **Recall (Sensitivity)**: The proportion of actual positives correctly identified.
   \[
   \text{Recall} = \frac{TP}{TP + FN}
   \]

3. **Specificity**: The proportion of actual negatives correctly identified.
   \[
   \text{Specificity} = \frac{TN}{TN + FP}
   \]

4. **Precision**: The proportion of predicted positives that are actually positive.
   \[
   \text{Precision} = \frac{TP}{TP + FP}
   \]

5. **F-score**: The harmonic mean of precision and recall, balancing the two.
   \[
   \text{F-Score} = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}
   \]

"""

def evaluate_model(model, tokenizer, device, dataset, top_k=50, shot_type='zero'):
    # Data preparation
    reviews = [example['text'] for example in dataset]
    true_labels = [example['label'] for example in dataset]  # 0 for negative, 1 for positive

    few_shot_rev_1 = "Movie Review: I loved this movie ! So good plot ! \n Only Answer if this Movie Review is Positive or Negative: Positive \n"
    few_shot_rev_2 = "Movie Review: I hated this, could be a lot better \n Only Answer if this Movie Review is Positive or Negative: Negative \n"
    few_shot_rev_3 = "Movie Review: This move was so good I would recommend to all my friends! \n Only Answer if this Movie Review is Positive or Negative: Positive \n"

    # One function for both
    if shot_type == 'zero':
        # Define the prompts
        prompts = [f"Movie Review: {review} \n Only Answer if this Movie Review is Positive or Negative:" for review in reviews]
    elif shot_type == 'few':
        # Need to have prompts that do not ask if pos or neg vs neg vs pos since the model will just answer the first one
        prompts = [f"{few_shot_rev_1} {few_shot_rev_2} {few_shot_rev_3} Movie Review: {review} \n Only Answer if this Movie Review is Positive or Negative:" for review in reviews]

    # Perform inference
    predictions = []
    inference_times = []
    idk_predictions = 0

    for idx, example in tqdm(enumerate(dataset), total=len(dataset), desc="Processing", leave=True):
        # Tokenize the input
        inputs = tokenizer.encode(prompts[idx], return_tensors="pt").to(device)

        # Perform inference
        start_time = time.time()
        with torch.no_grad():
            outputs = model(inputs)
            logits = outputs.logits
        end_time = time.time()

        # Get the probabilities for the next token
        next_token_logits = logits[:, -1, :]  # Only consider the last token's logits
        probabilities = torch.softmax(next_token_logits, dim=-1)

        # Get the top k most likely tokens
        top_k_probs, top_k_indices = torch.topk(probabilities, top_k) # these are sorted in order of most likely to least likely

        # Decode the top k tokens
        top_k_tokens = [tokenizer.decode([token]) for token in top_k_indices[0]]

        # print(f"Top-k tokens for review {reviews[idx]}: {top_k_tokens}")
        
        # Extract the sentiment prediction from the top k tokens, if the model did not predict a sentiment, default to negative
        pred = -1
        for token in top_k_tokens:
            token_lower = token.strip().lower()
            if token_lower == 'positive':
                pred = 1
                break
            elif token_lower == 'negative':
                pred = 0
                break
            
        if pred == -1:
            idk_predictions += 1
            pred = 0

        # If the model did not predict a sentiment, default to negative
        predictions.append(pred)
        
        inference_times.append(end_time - start_time)

        
    # print(predictions)
    # Calculate confusion matrix    
    tn, fp, fn, tp = confusion_matrix(true_labels, predictions).ravel()

    # Calculate metrics
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    f_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Calculate true and false answer percentages
    total_samples = len(true_labels)
    true_percent = predictions.count(1) / total_samples * 100
    false_percent = predictions.count(0) / total_samples * 100
    
    # Calculate total and average inference times
    total_inference_time = sum(inference_times)
    average_inference_time = total_inference_time / len(inference_times)

    return {
        "accuracy": accuracy,
        "recall": recall,
        "specificity": specificity,
        "precision": precision,
        "f_score": f_score,
        "true_percent": true_percent,
        "false_percent": false_percent,
        "unknown_predictions": idk_predictions,
        "total_inference_time": total_inference_time,
        "average_inference_time": average_inference_time
    }


def create_results_table(results_dict, model_name="Model Results"):
    """
    Creates a formatted table from the results dictionary.
    
    Parameters:
        results_dict (dict): Dictionary containing evaluation metrics.
        model_name (str): Name of the model being evaluated.
    
    Returns:
        str: Formatted table as a string.
    """
    # Initialize a PrettyTable
    table = PrettyTable()
    
    # Set the table title
    table.title = f"Results for {model_name}"
    
    # Add columns
    table.field_names = ["Metric", "Value"]
    
    # Add rows for each metric
    table.add_row(["Accuracy", f"{results_dict['accuracy']:.2f}"])
    table.add_row(["Recall (Sensitivity)", f"{results_dict['recall']:.2f}"])
#     if "specificity" in results_dict:  # Specificity might not be included in some results
    table.add_row(["Specificity", f"{results_dict['specificity']:.2f}"])
    table.add_row(["Precision", f"{results_dict['precision']:.2f}"])
    table.add_row(["F-Score", f"{results_dict['f_score']:.2f}"])
    table.add_row(["% True Predictions", f"{results_dict['true_percent']:.2f}%"])
    table.add_row(["% False Predictions", f"{results_dict['false_percent']:.2f}%"])
    table.add_row(["Unknown Predictions", f"{results_dict['unknown_predictions']:.2f}%"])
    table.add_row(["Total Inference Time (s)", f"{results_dict['total_inference_time']:.2f}"])
    table.add_row(["Average Inference Time (s)", f"{results_dict['average_inference_time']:.2f}"])
    
    # Return the table as a string
    return table.get_string()

  """


# SmolLM-135M


In [5]:
# Clears cuda from last run
# if device == "cuda":
#     torch.cuda.empty_cache()
#     torch.cuda.ipc_collect()
#     torch.cuda.reset_peak_memory_stats()

#FROM https://huggingface.co/HuggingFaceTB/SmolLM2-135M TODO: Dont forget to cite the model in report
checkpoint = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

## Zero Shot

In [6]:
zero_135_results = evaluate_model(model, tokenizer, device, test_dataset, shot_type='zero')

zero_135_table = create_results_table(zero_135_results, model_name="SmolLM2-135M Zero-Shot")
print(zero_135_table)

Processing: 100%|██████████| 25000/25000 [09:16<00:00, 44.93it/s]

+---------------------------------------+
|   Results for SmolLM2-135M Zero-Shot  |
+----------------------------+----------+
|           Metric           |  Value   |
+----------------------------+----------+
|          Accuracy          |   0.50   |
|    Recall (Sensitivity)    |   0.01   |
|        Specificity         |   1.00   |
|         Precision          |   0.78   |
|          F-Score           |   0.02   |
|     % True Predictions     |  0.78%   |
|    % False Predictions     |  99.22%  |
|    Unknown Predictions     | 2993.00% |
|  Total Inference Time (s)  |  464.59  |
| Average Inference Time (s) |   0.02   |
+----------------------------+----------+





## Few Shot

In [7]:
few_135_results = evaluate_model(model, tokenizer, device, test_dataset, shot_type='few')

few_135_table = create_results_table(few_135_results, model_name="SmolLM2-135M Few-Shot")
print(few_135_table)

Processing: 100%|██████████| 25000/25000 [09:34<00:00, 43.53it/s]

+-------------------------------------+
|  Results for SmolLM2-135M Few-Shot  |
+----------------------------+--------+
|           Metric           | Value  |
+----------------------------+--------+
|          Accuracy          |  0.59  |
|    Recall (Sensitivity)    |  0.80  |
|        Specificity         |  0.37  |
|         Precision          |  0.56  |
|          F-Score           |  0.66  |
|     % True Predictions     | 71.86% |
|    % False Predictions     | 28.14% |
|    Unknown Predictions     | 0.00%  |
|  Total Inference Time (s)  | 467.02 |
| Average Inference Time (s) |  0.02  |
+----------------------------+--------+





# SmolLM-360M

In [8]:
# Clears cuda from last run
# if device == "cuda":
#     torch.cuda.empty_cache()
#     torch.cuda.ipc_collect()
#     torch.cuda.reset_peak_memory_stats()
    
checkpoint = "HuggingFaceTB/SmolLM2-360M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

## Zero Shot

In [9]:
zero_360_results = evaluate_model(model, tokenizer, device, test_dataset, shot_type='zero')

zero_360_table = create_results_table(zero_360_results, model_name="SmolLM2-360M Zero-Shot")
print(zero_360_table)

Processing: 100%|██████████| 25000/25000 [12:23<00:00, 33.62it/s]


+-------------------------------------+
|  Results for SmolLM2-360M Zero-Shot |
+----------------------------+--------+
|           Metric           | Value  |
+----------------------------+--------+
|          Accuracy          |  0.56  |
|    Recall (Sensitivity)    |  1.00  |
|        Specificity         |  0.11  |
|         Precision          |  0.53  |
|          F-Score           |  0.69  |
|     % True Predictions     | 94.26% |
|    % False Predictions     | 5.74%  |
|    Unknown Predictions     | 2.00%  |
|  Total Inference Time (s)  | 499.90 |
| Average Inference Time (s) |  0.02  |
+----------------------------+--------+


## Few Shot

In [10]:
few_360_results = evaluate_model(model, tokenizer, device, test_dataset, shot_type='few')

few_360_table = create_results_table(few_360_results, model_name="SmolLM2-360M Few-Shot")
print(few_360_table)

Processing: 100%|██████████| 25000/25000 [13:51<00:00, 30.05it/s]

+-------------------------------------+
|  Results for SmolLM2-360M Few-Shot  |
+----------------------------+--------+
|           Metric           | Value  |
+----------------------------+--------+
|          Accuracy          |  0.66  |
|    Recall (Sensitivity)    |  0.99  |
|        Specificity         |  0.33  |
|         Precision          |  0.60  |
|          F-Score           |  0.75  |
|     % True Predictions     | 82.74% |
|    % False Predictions     | 17.26% |
|    Unknown Predictions     | 0.00%  |
|  Total Inference Time (s)  | 490.53 |
| Average Inference Time (s) |  0.02  |
+----------------------------+--------+





# SmolLM2-1.7B

In [None]:
# Clears cuda from last run
# if device == "cuda":
#     torch.cuda.empty_cache()
#     torch.cuda.ipc_collect()
#     torch.cuda.reset_peak_memory_stats()

checkpoint = "HuggingFaceTB/SmolLM2-1.7B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

## Zero Shot

In [None]:
zero_17_results = evaluate_model(model, tokenizer, device, test_dataset, shot_type='zero')

zero_17_table = create_results_table(zero_17_results, model_name="SmolLM2-1.7B Zero-Shot")
print(zero_17_table)

## Few Shot

In [None]:
few_17_results = evaluate_model(model, tokenizer, device, test_dataset, shot_type='few')

few_17_table = create_results_table(few_17_results, model_name="SmolLM2-1.7B Few-Shot")
print(few_17_table)

# Print All Results

In [None]:
print("All models evaluated successfully!")

results_dicts = [
    zero_135_results,
    few_135_results,
    zero_360_results,
    few_360_results,
    zero_17_results,
    few_17_results,
]

# List of model names
model_names = [
    "SmolLM2-135M Zero-Shot",
    "SmolLM2-135M Few-Shot",
    "SmolLM2-360M Zero-Shot",
    "SmolLM2-360M Few-Shot",
    "SmolLM2-1.7B Zero-Shot",
    "SmolLM2-1.7B Few-Shot",
]

table = PrettyTable()

# Define the columns
table.field_names = [
    "Model",
    "Accuracy",
    "Recall (Sensitivity)",
    "Specificity",
    "Precision",
    "F-Score",
    "% True Predictions",
    "% False Predictions",
    "Unknown Predictions",
    "Total Inference Time (s)",
    "Avg Inference Time (s)"
]

# Populate the table
for model_name, results in zip(model_names, results_dicts):
    table.add_row([
        model_name,
        f"{results['accuracy']:.2f}",
        f"{results['recall']:.2f}",
        f"{results['specificity']:.2f}",
        f"{results['precision']:.2f}",
        f"{results['f_score']:.2f}",
        f"{results['true_percent']:.2f}%",
        f"{results['false_percent']:.2f}%",
        f"{results['unknown_predictions']:.2f}%",
        f"{results['total_inference_time']:.2f}",
        f"{results['average_inference_time']:.2f}",
    ])

print(table)
print(results_dicts)

# Models tend to say true more because they see it first in the prompt, but to counter this they do default to negative. MENTION IN REPORT

# Cleanup Cuda

In [None]:
# Clears cuda from last run
# if device == "cuda":
#     torch.cuda.empty_cache()
#     torch.cuda.ipc_collect()
#     torch.cuda.reset_peak_memory_stats()