# Import Libraries and setup Hugging Face

In [1]:
# Pace Setup
# !module load anaconda3
# !module load gcc/12.3.0
# !module load cuda/12.6.1

In [2]:
!nvidia-smi

import os

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import time
from dotenv import load_dotenv
import torch
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from tqdm import tqdm
from datasets import load_dataset
from prettytable import PrettyTable

# Point Hugging Face at the cache directory configured in ../.env.
# This runs before `transformers` is imported below; presumably the library
# reads TRANSFORMERS_CACHE at import time -- TODO confirm.
env_path = os.path.abspath(os.path.join(os.getcwd(), '..', '.env'))
print(env_path)
load_dotenv(env_path)
hf_cache_dir = os.getenv('TRANSFORMERS_CACHE')
if hf_cache_dir:
    os.makedirs(hf_cache_dir, exist_ok=True)
    print(f"Hugging Face cache directory set to: {hf_cache_dir}")
else:
    # os.makedirs(None) would raise an opaque TypeError; report the missing
    # setting instead and leave the library's default cache in effect.
    print(f"TRANSFORMERS_CACHE is not set (checked {env_path}); using the default Hugging Face cache location")

from transformers import AutoModelForCausalLM, AutoTokenizer

# Report the torch build and the CUDA version it was compiled against
print(f"Using torch {torch.__version__} with cuda {torch.version.cuda}")

# Project root read from the environment; None when WORKSPACE_DIR is not set
workspace_dir = os.getenv('WORKSPACE_DIR')

# Seed passed to the dataset shuffles so the sampled subsets are reproducible
seed = 42

# Use the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Sun Dec  8 01:01:23 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.02              Driver Version: 555.42.02      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB          On  |   00000000:C1:00.0 Off |                    0 |
| N/A   31C    P0             42W /  250W |       1MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                



Using torch 2.5.1 with cuda 12.4
Using device: cuda


# Import the dataset
It is stored in the dataset directory, which is gitignored, so run this block to repopulate it if needed.

In [3]:
# # Check if dataset is present
# dataset_dir = os.path.join(workspace_dir, 'datasets')
# os.makedirs(dataset_dir, exist_ok=True)

# if not os.path.exists(os.path.join(dataset_dir, 'IMDB Dataset.csv')):
#     !kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews --path {dataset_dir} --unzip

# # Load dataset into dataframe
# dataset = pd.read_csv(os.path.join(dataset_dir, 'IMDB Dataset.csv'))
# print(dataset.head())

# _, test_set = train_test_split(dataset, test_size=0.2, random_state=seed)

# Number of shuffled test reviews used for evaluation.
NUM_TEST_SAMPLES = 500

imdb = load_dataset("imdb")

# Shuffle both splits with the shared seed; keep only the first
# NUM_TEST_SAMPLES rows of the shuffled test split.
test_dataset = imdb['test'].shuffle(seed=seed).select(range(NUM_TEST_SAMPLES))
train_dataset = imdb['train'].shuffle(seed=seed)

print(f"Train dataset: {len(train_dataset)}")
print(f"Test dataset: {len(test_dataset)}")
print(test_dataset[0])

Train dataset: 25000
Test dataset: 500
{'text': "<br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining King Lear story and of course Michelle Pfeiffer was in it, so what could go wrong?<br /><br />Very quickly, however, I realized that this story was about A Thousand Other Things besides just Acres. I started crying and couldn't stop until long after the movie ended. Thank you Jane, Laura and Jocelyn, for bringing us such a wonderfully subtle and compassionate movie! Thank you cast, for being involved and portraying the characters with such depth and gentleness!<br /><br />I recognized the Angry sister; the Runaway sister and the sister in Denial. I recognized the Abusive Husband and why he was there and then the Father, oh oh the Father... all superbly played. I also recognized myself and this movie was an eye-opener, a relief, a chance to face my OWN truth and finally doing something about it. I truly hope A Thousand Acres has had the same e

# Define Experiment Functions

# HUGE NOTE: the order of "Positive" vs "Negative" in the sentiment question matters a lot; the first one listed tends to be the model's default answer

In [4]:
"""
To calculate the values of accuracy, recall, specificity, precision, and F-score, you need the confusion matrix or the key components: True Positives (TP), True Negatives (TN), False Positives (FP), and False Negatives (FN). Here's how each metric is calculated:

1. **Accuracy**: The proportion of correctly classified instances (both positive and negative) out of all instances.
   \[
   \text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}
   \]

2. **Recall (Sensitivity)**: The proportion of actual positives correctly identified.
   \[
   \text{Recall} = \frac{TP}{TP + FN}
   \]

3. **Specificity**: The proportion of actual negatives correctly identified.
   \[
   \text{Specificity} = \frac{TN}{TN + FP}
   \]

4. **Precision**: The proportion of predicted positives that are actually positive.
   \[
   \text{Precision} = \frac{TP}{TP + FP}
   \]

5. **F-score**: The harmonic mean of precision and recall, balancing the two.
   \[
   \text{F-Score} = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}
   \]

"""

def _build_prompts(reviews, shot_type):
    """Return one sentiment-classification prompt per review.

    'zero' asks the question directly; 'few' prepends three worked examples.
    Raises ValueError for any other shot_type.
    """
    # The question always lists "Positive" before "Negative"; the model tends
    # to default to whichever option is named first, so keep the order fixed.
    zero_shot_prompts = [f"Movie Review: {review} \n Only Answer if this Movie Review is Positive or Negative:" for review in reviews]

    if shot_type == 'zero':
        return zero_shot_prompts

    if shot_type == 'few':
        few_shot_rev_1 = "Movie Review: I loved this movie ! So good plot ! \n Only Answer if this Movie Review is Positive or Negative: Positive \n"
        few_shot_rev_2 = "Movie Review: I hated this, could be a lot better \n Only Answer if this Movie Review is Positive or Negative: Negative \n"
        few_shot_rev_3 = "Movie Review: This move was so good I would recommend to all my friends! \n Only Answer if this Movie Review is Positive or Negative: Positive \n"
        return [f"{few_shot_rev_1} {few_shot_rev_2} {few_shot_rev_3} {prompt}" for prompt in zero_shot_prompts]

    raise ValueError(f"shot_type must be 'zero' or 'few', got {shot_type!r}")


def _sentiment_from_tokens(tokens):
    """Return 1/0 for the first 'positive'/'negative' token, or None if neither appears."""
    for token in tokens:
        token_lower = token.strip().lower()
        if token_lower == 'positive':
            return 1
        if token_lower == 'negative':
            return 0
    return None


def evaluate_model(model, tokenizer, device, dataset, top_k=50, shot_type='zero'):
    """Classify each review via next-token prediction and compute binary metrics.

    For every example the prompt is run through the model once, the top_k most
    likely next tokens are decoded, and the first one equal to "positive" or
    "negative" (case-insensitive, whitespace-stripped) becomes the prediction.
    If neither appears, the review is reported and counted as negative.

    Parameters:
        model: Callable returning an object with a `.logits` tensor indexed as
            [batch, position, vocab] when given the encoded prompt.
        tokenizer: Object providing `encode(text, return_tensors="pt")` and
            `decode([token_id])`.
        device: Device the encoded prompt is moved to (must match the model).
        dataset: Iterable of examples with 'text' and 'label' keys
            (label 0 = negative, 1 = positive).
        top_k (int): How many of the most likely next tokens to inspect.
        shot_type (str): 'zero' or 'few'.

    Returns:
        dict: accuracy, recall, specificity, precision, f_score, true_percent
        (share of predictions equal to 1), false_percent (share equal to 0),
        total_inference_time and average_inference_time in seconds.

    Raises:
        ValueError: If the dataset is empty or shot_type is not recognised.
    """
    # Data preparation
    reviews = [example['text'] for example in dataset]
    true_labels = [example['label'] for example in dataset]  # 0 for negative, 1 for positive
    if not reviews:
        raise ValueError("dataset is empty; nothing to evaluate")

    prompts = _build_prompts(reviews, shot_type)

    # CUDA kernels launch asynchronously, so the wall clock only reflects the
    # forward pass if the device is synchronised around it.
    on_cuda = torch.device(device).type == 'cuda'

    predictions = []
    inference_times = []

    for review, prompt in tqdm(zip(reviews, prompts), total=len(prompts), desc="Processing", leave=True):
        # Tokenize the input
        inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)

        # Perform inference (timed)
        if on_cuda:
            torch.cuda.synchronize(device)
        start_time = time.perf_counter()
        with torch.no_grad():
            logits = model(inputs).logits
        if on_cuda:
            torch.cuda.synchronize(device)
        end_time = time.perf_counter()
        inference_times.append(end_time - start_time)

        # Only the logits at the last position describe the next token.
        next_token_logits = logits[0, -1, :]

        # Softmax is monotonic, so ranking raw logits gives the same order as
        # ranking probabilities. Clamp k so a large top_k cannot exceed the vocabulary.
        k = min(top_k, next_token_logits.size(-1))
        top_k_indices = torch.topk(next_token_logits, k).indices  # most likely first
        top_k_tokens = [tokenizer.decode([token]) for token in top_k_indices.tolist()]

        pred = _sentiment_from_tokens(top_k_tokens)
        if pred is None:
            # The model did not offer a sentiment word: default to negative
            print(f"Could not predict sentiment for review: {review}")
            print(f"Top k tokens: {top_k_tokens}")
            pred = 0

        predictions.append(pred)

    print(predictions)

    # Pin the label order so the matrix is always 2x2, even when only one
    # class occurs among the labels and predictions.
    tn, fp, fn, tp = confusion_matrix(true_labels, predictions, labels=[0, 1]).ravel()

    # Calculate metrics
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    f_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Share of positive (1) and negative (0) predictions
    total_samples = len(true_labels)
    true_percent = predictions.count(1) / total_samples * 100
    false_percent = predictions.count(0) / total_samples * 100

    # Calculate total and average inference times
    total_inference_time = sum(inference_times)
    average_inference_time = total_inference_time / len(inference_times)

    return {
        "accuracy": accuracy,
        "recall": recall,
        "specificity": specificity,
        "precision": precision,
        "f_score": f_score,
        "true_percent": true_percent,
        "false_percent": false_percent,
        "total_inference_time": total_inference_time,
        "average_inference_time": average_inference_time
    }


def create_results_table(results_dict, model_name="Model Results"):
    """
    Render the evaluation metrics as a two-column text table.

    Parameters:
        results_dict (dict): Metrics keyed by 'accuracy', 'recall',
            'specificity', 'precision', 'f_score', 'true_percent',
            'false_percent', 'total_inference_time' and
            'average_inference_time'.
        model_name (str): Label shown in the table title.

    Returns:
        str: The table rendered as a string.
    """
    # (row label, results_dict key, text appended after the 2-decimal value)
    row_specs = (
        ("Accuracy", "accuracy", ""),
        ("Recall (Sensitivity)", "recall", ""),
        ("Specificity", "specificity", ""),
        ("Precision", "precision", ""),
        ("F-Score", "f_score", ""),
        ("% True Predictions", "true_percent", "%"),
        ("% False Predictions", "false_percent", "%"),
        ("Total Inference Time (s)", "total_inference_time", ""),
        ("Average Inference Time (s)", "average_inference_time", ""),
    )

    report = PrettyTable()
    report.title = f"Results for {model_name}"
    report.field_names = ["Metric", "Value"]

    for label, key, suffix in row_specs:
        report.add_row([label, f"{results_dict[key]:.2f}{suffix}"])

    return report.get_string()

  """


# SmolLM2-135M


In [5]:
# Clears cuda from last run
# if device == "cuda":
#     torch.cuda.empty_cache()
#     torch.cuda.ipc_collect()
#     torch.cuda.reset_peak_memory_stats()

# Checkpoint source: https://huggingface.co/HuggingFaceTB/SmolLM2-135M
# TODO: Don't forget to cite the model in the report
checkpoint = "HuggingFaceTB/SmolLM2-135M"
# Load the tokenizer and causal-LM weights for the checkpoint, then move the model to `device`
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

## Zero Shot

In [6]:
# Zero-shot run on the sampled test set using the currently loaded model and tokenizer
zero_135_results = evaluate_model(model, tokenizer, device, test_dataset, shot_type='zero')

# Format the metrics dictionary as a text table and show it
zero_135_table = create_results_table(zero_135_results, model_name="SmolLM2-135M Zero-Shot")
print(zero_135_table)

Processing:   3%|▎         | 16/500 [00:00<00:12, 38.55it/s]

Could not predict sentiment for review: Don't pay any attention to the rave reviews of this film here. It is the worst Van Damme film and one of the worst of any sort I have ever seen. It would appeal to somebody with no depth whatever who requires nothing more than gunfire and explosions to be entertained.<br /><br />Seeing that this is directed by Peter Hyams it has made me realise that Peter has no talent as a director, but is very good at filming explosions and the like. However, movies need other elements as well; for example, a story. This one didn't have one. This might explain the awfulness of some of Mr. Hyams' more recent films, hardly any better than this one, really.<br /><br />One can't help wondering how some people ever were put behind a camera.
Top k tokens: [' <', ' ', ' I', ' It', ' This', '\n', ' The', ' Yes', '<', ' No', ' "', ' If', ' Don', ' You', ' A', ' yes', ' Not', ' Van', ' \n', '\n ', ' Good', ' There', ' it', '</', ' Y', ' (', ' In', 'This', ' Very', '\n  '

Processing:   8%|▊         | 41/500 [00:01<00:10, 45.40it/s]

Could not predict sentiment for review: This is just one of those films which cannot justify much of anything that happens. These people are going on a trek: the young girl wants to photograph animals. There really are no Piranhas, but I guess the psychotic hunter guy is the real piranha. Anyway, there are lots of animals and there is lots of driving. There is considerable anti-gun talk, but we all know where that is going. Toward the end, there's lots of action and a rape thrown in. Somebody must pay, and they do. It would have been nice to have a couple of piranhas to sort of fill the thing out. There were lots of monkeys. If you fast forward through the dull parts, you have a tight little five minutes.
Top k tokens: [' I', ' \n', ' This', ' Yes', ' ', '\n', ' It', ' The', ' If', ' No', ' There', ' You', ' Not', ' A', '\n ', ' Well', ' In', ' Very', ' We', ' Maybe', ' [', ' As', ' Y', ' "', ' yes', ' Some', ' My', ' At', ' Okay', ' Yeah', ' Probably', ' Absolutely', ' What', ' That',

Processing:  12%|█▏        | 61/500 [00:01<00:09, 46.12it/s]

Could not predict sentiment for review: This series takes a classic story and makes rubbish of it. Robin is somehow mystified by an elk-man in the forest and is embroiled in all sorts of druidism and outright satanic episodes. The story is more about him avoiding the evil sheriff than helping the poor. This is barely watchable. And to top all the ridiculousness they kill Robin at the end of series 2 and replace him with another actor. Some people may like this show as a fantasy show but it is NOT a Robin Hood show. If you want Robin fighting in king Richards name against Prince John and the sheriff and if you want Robin feeding the poor and oppressed, watch the classic series or the newest from the BBC.
Top k tokens: ['\n', ' ', ' This', ' \n', ' Yes', ' I', ' The', ' If', ' It', ' No', '\n ', ' [', '\n\n', ' You', '\n  ', ' A', ' this', ' **', '  \n', '  ', ' Not', ' "', ' None', ' There', ' *', ' Y', ' \n\n', ' Very', ' yes', ' Only', ' As', ' In', ' Watch', '\n   ', ' <', ' Good', '

Processing:  14%|█▍        | 71/500 [00:01<00:09, 46.34it/s]

Could not predict sentiment for review: There are certain scenes in this film (like the hero's first meeting with super-villainess Shirley Eaton) where it seems to be on the edge of breaking sexual taboos and doing its premise (females want to rule the world by making men slaves) justice, but it never dares to. The result is a film with no sexuality and some tame violence. Despite the choppy plot, the film is not overly bad until its climax, where its amateurishness runs rampant (terrible editing, overuse of stock footage). Worth seeing only as a curio. (*1/2)
Top k tokens: [' ', ' Yes', ' I', '\n', ' This', ' It', ' The', ' \n', ' There', ' *', ' No', ' If', ' **', ' You', ' Not', '\n ', ' [', ' "', ' A', ' Only', ' ***', ' (', ' As', ' Y', ' My', ' yes', ' None', '\n  ', ' Very', ' In', ' For', '\n\n', ' We', ' Absolutely', ' <', ' At', ' it', ' Good', ' That', ' Please', ' Okay', ' Great', ' this', ' What', ' Some', ' One', ' Well', ' True', ' F', ' Based']
Could not predict sentime

Processing:  17%|█▋        | 86/500 [00:01<00:08, 46.24it/s]

Could not predict sentiment for review: All I can do is echo the sentiment already expressed by some of the other commenters. This is CITY OF GOD meets HAPPY DAYS. The bipolarity of the ruthless thug (one minute a ruthless killer, the next minute a Luv's diaper commercial) is completely unconvincing. You can approach it in one of two ways: (1) A gritty, realistic movie turned sappy; or (2) a sappy, ABC-afterschool-special with profanity, violence and animal cruelty. Either way it just don't fly, do it? Why then has it received so much praise? As others have implied, it gets the "conscience vote" from the west. Show us pictures of poverty to contrast against our fluffy, double-wide theatre seats and 44-oz cokes, and we'll applaud in a heartbeat. But--oh--don't forget to candy coat it, because the bitter pill of reality (tantalizing as it is) is hard for us to swallow.<br /><br />I'm terribly disappointed that this film would receive so many awards and accolades, especially when there ar

Processing:  19%|█▉        | 96/500 [00:02<00:08, 45.13it/s]

Could not predict sentiment for review: Just watched this today on TCM, where the other reviewers here saw it.<br /><br />Sorry that I was the only one to find Davies a weak actress, with a truly awful attempt at an Irish (Irish-American or otherwise) accent. As she's the star, it was sort of hard for me to get past that -- especially as the other reviewers have said that this was her finest performance.<br /><br />Another particularly terrible Davies performance was in "Marianne" (1929), which I also watched today. In this film, given a 9 of 10 rating here, her accent switches from that of a (correct) French woman to an odd combination of Italian and Swiss.<br /><br />Interestingly, in TCM's one-hour bio of Davies -- "Captured on Film: The True Story of Marion Davies" (2001) -- film historian Jeanine Basinger claims that "one of the things that you note about Marion Davies in her sound work is how good she is at doing accents." Of course this bio also includes commentary by fans (make

Processing:  22%|██▏       | 111/500 [00:02<00:08, 46.52it/s]

Could not predict sentiment for review: Great cult flick for MST-3K types: Richard Boone is a mess -- bad hair, arthritis, even his dark glasses aren't right; about as good as a bad dino-flick can get... actually, that charging saber-toothed Styracosaurus was pretty cool -- maybe Spielberg should take a couple of notes from that one.
Top k tokens: [' Yes', ' ', ' I', '\n', ' It', ' The', ' This', ' No', ' Not', ' \n', ' Great', ' If', ' Very', ' A', ' Absolutely', ' You', ' "', ' Y', ' [', ' *', ' <', ' Good', ' Yeah', ' Sp', ' Well', ' Probably', ' M', ' There', ' Richard', ' We', ' In', ' That', ' Bad', '\n ', ' H', ' As', ' Maybe', ' R', ' Def', ' My', ' **', ' For', ' Only', ' S', ' He', ' Wow', ' N', ' None', ' F', ' What']
Could not predict sentiment for review: It was probably watching this TV movie that got me interested in the debate as to whether "Anna" was really Tsar Nicholas's daughter Anastasia. Since seeing it I have made a point of watching various documentaries and als

Processing:  24%|██▍       | 121/500 [00:02<00:08, 43.33it/s]

Could not predict sentiment for review: This interesting film noir features three very good performances: Sanders, Patrick, and Blackmer. The scenes between Sanders and Patrick are particularly outstanding. Demming, as the detective, is unfortunately not nearly as good. He lacks the intelligence, strength, and cynical world view of a Bogart. Had Humphrey played this part, we could have had a classic.<br /><br />Pace, location (a library), and atmosphere are all good. But there are a few plot holes. Sanders strongly fears Blackmer and the ruthless organization (Nazis) he represents. Yet after mistakenly killing Blackmer, Sanders seems to experience no anxiety or remorse. Sanders then seizes the library and its occupants by using the ruse that he and his men are detectives investigating the murder. However, Sanders' hit man later tries to kill Demming by shooting him (without a silencer), even though the many other detainees could have been expected to hear, and become alarmed by, the no

Processing:  27%|██▋       | 136/500 [00:03<00:08, 44.46it/s]

Could not predict sentiment for review: A woman in love with her husband (he's suicidal) decides to have a baby to save his life. She's been to a fertility clinic - as has the lover she takes - so both know how artificial insemination works; but, instead of using the method thousands of people use every year around the world (the $5 turkey baster), they engage in coitus. We also are to believe that although the immigrant is in love with his fiancée, he doesn't suggest the obvious alternative to intercourse. Further, even though this is a business arrangement, the first time she's with her sperm donor, she takes off all her clothes, as if it's a seduction. Plus, her husband doesn't notice when $30,000 goes missing from their bank accounts. Does all this seem to demand more willing suspension of disbelief than even most Hollywood fare? Far fetched on all counts.
Top k tokens: ['\n', ' ', ' \n', ' Yes', ' A', ' I', ' The', ' If', ' This', ' It', ' No', '\n ', ' "', ' Not', ' We', ' You', 

Processing:  29%|██▉       | 146/500 [00:03<00:08, 43.85it/s]

Could not predict sentiment for review: Don't listen to what the critics have always said about this cute, charming little movie. Madonna is GREAT in this clever comedy. I worked at a video store for several years and suggested this movie to lots of customers- no one EVER brought it back and screamed at me for telling them to rent it. Everyone always enjoyed it. It's actually a great movie for kids, too.
Top k tokens: [' ', '\n', ' Yes', ' I', ' \n', ' It', ' This', ' The', ' Don', ' No', ' If', ' "', ' Y', ' Not', ' <', ' You', '\n ', ' [', ' A', ' Absolutely', ' *', ' yes', ' Good', ' Mad', ' **', ' Very', '\n\n', ' We', '\n  ', ' There', ' NO', ' Only', ' My', ' it', ' Do', ' Bad', ' Great', ' Please', '  ', ' None', ' N', ' What', '  \n', ' In', ' Wow', ' Yeah', ' M', ' Maybe', ' For', ' this']
Could not predict sentiment for review: Before seeing this film, I suggest the viewer puts away any expectations that the victims of the crimes depicted will get equal treatment and consider

Processing:  34%|███▍      | 171/500 [00:03<00:07, 45.68it/s]

Could not predict sentiment for review: This short was director Del Lord's last and only Shemp short. The problem: It was quite weak and the cafe scene was pretty much a carbon copy of a Curly short "Busy Buddies" (1944). The interrogation scene was pretty funny, and the beginning part of the cafe part. But there are a lot of plotholes in this short. For example, why are the stooges hiding in the garbage can when the police come? In the remake, "Of Cash And Hash"(1955), director Jules White fixes this and the reason for the stooges hiding in the garbage can is because there is a gunfight between the police and the armored car robbers. The scene in which Moe is having trouble with the oyster was done before with Curly in "Dutiful But Dumb" (1941). The spooky house part wasn't all that great except for the hilarious scene on the outside of the spooky house. To top it off, the ending had no sting to it. Rating: C-
Top k tokens: [' This', ' ', ' I', ' The', '\n', ' "', ' Yes', ' It', ' If'

Processing:  36%|███▌      | 181/500 [00:04<00:06, 45.95it/s]

Could not predict sentiment for review: Gordon Scott made some good Tarzan movies, but this is not one of them.<br /><br />As I watched it, wincing at the bad, obviously interior sets and the hollow wooden "clonking" sounds as they walked across supposedly dirt trails, and cringing at the bad dialog and worse acting among the supporting cast, I kept thinking, "Sheesh! This is TV show level!" Then I find out it was, indeed, three TV show pilot episodes woven seam-fully into one.<br /><br />It's nice to see Scott get outside (alone), away from the lame sets, in a few of the scenes; and the fights do have some pretty nice moves... but oh, ow, and ouch as to the dialog. And did I mention the acting? Heck, Cheetah (or "Cheta," in this version) was a better actor than most of the humans.<br /><br />And that's not saying much.<br /><br />It is kind of a stitch to see a younger Sherman (i.e. Scatman) Carothers acting as a native. But probably not worth the overall time-investment.
Top k tokens

Processing:  39%|███▉      | 196/500 [00:04<00:06, 46.73it/s]

Could not predict sentiment for review: When I first saw this film around 6 months ago, I considered it interesting, but little more. But it stuck with me. That interest grew and grew, and I wondered whether my initial boredom and response had more to do with the actual VHS quality rather than the film itself. I purchased the Criterion DVD box set, and it turns out that I was right the second time. Alexander Nevsky is a great film. It is rousing, and I'm sure it succeeded in its main aim: propaganda against the Germans.<br /><br />That is the most common criticism against this film, and against Eisenstein, that it is merely propagandist and nothing else. It's untrue. He is an amazing film artist, one of the most important whoever lived. By now, the world is far enough beyond Joseph Stalin to be able to watch Eisenstein's films as art.
Top k tokens: [' ', ' <', ' I', '\n', ' The', ' Yes', ' If', ' It', ' This', ' No', ' When', ' \n', '\n ', '<', ' You', ' "', ' A', ' yes', ' Alexander',

Processing:  45%|████▌     | 226/500 [00:05<00:05, 46.80it/s]

Could not predict sentiment for review: I got this movie out a week after the death of Ichikawa Kon - I suppose if there is one way to mark the passing of a great director, its to raise a glass of wine to him while watching one of his greatest movies. Ichikawa had one of the finest careers in Japanese film, but as he never had a distinctive style or theme he often seems to be overlooked compared to his near contemporaries such as Ozu and Kurosawa (he was a little younger than them, but not by much). He is one of those directors who defies auteur theories - its likely that his wife (who wrote the screenplay for this and many other of his movies) was as much responsible for the quality of the movies as he was. But at his best, he was as good as any Japanese film maker at the time. In particular, he had great technical skills, allowing him to tell complex stories in an accessible manner. But in terms of theme, this movie could hardly be simpler - war is hell. No really, its seriously hell

Processing:  52%|█████▏    | 261/500 [00:05<00:05, 46.30it/s]

Could not predict sentiment for review: This is an OK adaptation of the breath taking book of Dan Brown. I can't say it is novel or very good but they made a movie that you can enjoy. Given the excellent story, the result could have been better though. The movie is pretty long but at the end I was feeling like some things were missing. Sound effects and sound tracks were very good. Acting was well done but the character development phase was very weak. For people who didn't read the book, things may look happening too quickly. From my point of view, instead of trying to put as much as stuff from the book, they could have tried to do the important scenes more proper. What makes the book very good was all the puzzle like story combined with the excellent portrait of Vatican. You see neither of it in the movie. Too much rush and using the time not in a good way, these are main problems of the movie. So, it is worth watching but could have been done better.
Top k tokens: ['\n', ' I', ' ', 

Processing:  55%|█████▌    | 276/500 [00:06<00:04, 46.35it/s]

Could not predict sentiment for review: algernon4's comment that Ms Paget's "ultra lewd dance in (this film) is the most erotic in the history of films" is certainly one doozy of an exaggeration. It isn't even Debra Paget's most erotic dance. Her near nude gyrations in Fritz Lang's "The Indian Tomb" make this number look decidedly tame. As for being the most erotic in the history of dance. Well! Where do I start? Salma Hayek's performance as Santanico Pandemonium in "From Dusk to Dawn" (1996); Jamie Lee Curtis in "True Lies" (1994); Jessica Alba in "Sin City" (2005); Rose McGowan in "Terror Planet" (2007); Sheila Kelley in "Dancing at the Blue Iguana" (2000), blah, blah, blah.<br /><br />Don't get me wrong. I love the sequence and have included it in my "Cheesecake Dance" series on Youtube. I just think that making a claim like "most erotic in the history of film" is really going out on a very fragile limb.
Top k tokens: [' <', ' ', '\n', ' I', ' alg', ' The', ' Alg', ' "', ' Yes', '<'

Processing:  57%|█████▋    | 286/500 [00:06<00:04, 45.10it/s]

Could not predict sentiment for review: If it wasn't meant to be a comedy, the filmmakers sure goofed. If they intended for it to be a comedy, they hit the mark. Our critic says Homegrown is a wonderful film filled with family values and community spirit, recommends it for all audiences, and says that he really liked Jamie Lee Curtis's performance. It deserves a theatrical re-release.
Top k tokens: [' If', ' ', ' \n', ' Yes', '\n', ' Home', ' I', ' It', ' No', ' This', ' The', ' [', ' We', ' Absolutely', ' Not', ' A', ' You', '\n ', ' if', ' Y', ' "', ' Only', ' *', ' My', ' None', '<|endoftext|>', ' As', ' Maybe', ' There', ' <', ' **', '  ', ' For', ' House', ' In', '\n\n', ' \n\n', ' H', '\n  ', '  \n', ' Please', ' yes', ' Thank', ' What', ' Of', 'Home', ' \n   ', ' Would', ' True', '\n   ']
Could not predict sentiment for review: If you really loved GWTW, you will find quite disappointing the story. Those who may think this is just about a romantic story and the south, will be pro

Processing:  63%|██████▎   | 316/500 [00:07<00:04, 43.62it/s]

Could not predict sentiment for review: Oliver Stone, always ready to make politically-themed movies, makes another one here. "Talk Radio" is loosely based on the career of Alan Berg, a radio talk show host in Denver who was murdered by white supremacists. In this case, the character is Barry Champlain (Eric Bogosian), an outspoken talk show host in Dallas who loves nothing more than to irk the people who call in. As it is, most of the people who call in are a bunch of pigheaded racists. And things may be heating up more than anyone realizes.<br /><br />Bogosian's performance brings a light comical tone to an otherwise serious movie. I really liked the scene where he jabs at a redneck who calls in. Granted, I wouldn't call this Oliver Stone's greatest movie ever, but it's a good reference in an era when media gets more and more concentrated. Good performances by Ellen Greene and Alec Baldwin also help.
Top k tokens: [' ', '\n', ' <', ' "', ' I', ' The', ' Yes', ' This', ' If', ' \n', '

Processing:  69%|██████▉   | 346/500 [00:07<00:03, 46.63it/s]

Could not predict sentiment for review: Well, "built" Doris Day (as Ethel S. "Dynamite" Jackson) is mistaken for thespian Ethel Barrymore, and falls in love with dancer Ray Bolger (as S. "Sam" Winthrop Putnam). Older Frenchman Claude Dauphin (as Philippe Fouquet) also digs Doris. Honestly What were they thinking? - This wildly inappropriate musical does feature Ms. Day prettily singing the standard "April in Paris", and others. Certainly, there nothing as good as her Columbia recordings from the time; and, nothing approaches Day's stunning and forthcoming "Secret Love". Although the material does not serve him well, it's nice to see Mr. Bolger performing. Some of the musical numbers are obnoxious.<br /><br />**** April in Paris (12/24/52) David Butler ~ Doris Day, Ray Bolger, Claude Dauphin
Top k tokens: [' Yes', ' I', ' ', ' This', ' **', ' No', ' "', ' The', ' <', ' It', ' -', '\n', ' *', ' If', ' Well', ' \n', ' Not', ' A', ' (', ' April', ' You', ' Very', ' __________________', ' 

Processing:  71%|███████   | 356/500 [00:07<00:03, 45.27it/s]

Could not predict sentiment for review: What are Forest Whitaker and Clifton Collins Jr. doing in this? Light It Up is a ridiculously melodramatic piece on problems in low income area schools. While the topic is one that needs to be addressed, the film uses every cliche in the genre and comes off as a textbook popcorn flick. The characters are cutouts from the inner city version of The Breakfast Club or even The Faculty. Watch this with your children when they turn 13 or 14. With them, it could be an outlet for a lesson on current social problems. For anyone older, it will be nothing more than something to watch and spit on at 4 in the morning, as I did recently on Bravo. Matter of fact, what was this doing on Bravo?
Top k tokens: ['\n', ' ', ' I', ' \n', ' This', ' The', ' Yes', ' It', ' If', ' What', ' No', ' You', ' [', ' A', ' Not', ' There', ' In', ' Light', ' Dark', '  ', ' Y', ' We', '\n ', '  \n', ' My', ' "', ' F', ' For', ' Only', ' <', ' Good', ' As', '\n\n', ' Forest', ' Ho

Processing:  74%|███████▍  | 371/500 [00:08<00:02, 45.64it/s]

Could not predict sentiment for review: 'Nuff said. An undercover cop from the state capital is sent to a small county where moonshine running is rampant. He ends up getting run off the road by some local hicks who have no idea he's an undercover cop (so they just drive away as blissfully dopey as ever). He is soon being taken care of by a woman and her three daughters who all wear low-cut tops and short shorts (gotta luv the '70s). He falls in love with one of the girls but in the meantime he still has to find out who's making all the moonshine and driving it to all the local bars and restaurants. He also has to contend with a fat sheriff and his incompetent deputy who think he's the moonshiner 'cause he's new in town.<br /><br />Life in small town America, 70s style. YEE HAAAAAAAAAAA.
Top k tokens: [" '", ' ', ' <', ' I', ' "', '\n', ' Yes', ' It', ' No', ' The', ' This', ' If', ' \n', ' A', " ''", ' N', ' Y', ' Not', ' An', ' You', '\n ', ' yes', ' **', '<', ' (', ' [', '\n  ', ' *'

Processing:  76%|███████▌  | 381/500 [00:08<00:02, 45.12it/s]

Could not predict sentiment for review: this is a really great series. i love the show and i am so glad it isn't canceled yet. it has really good humor and shows the realistic bond between a young mother and her daughter. o yes for Gilmore girls! it is very awesome. they are such a sarcastic humorous bunch. they do everything together just like my mom and me. ya for Gilmore girls. um, i'm running out of lines. but i love how Luke and Lorelei's relationship is finally shaping up. they so needed to be together. and i absolutely just love Sookie St. James! she is so awesome . and the show wouldn't be anywhere without Michel. the whole show is dry humor, sarcasm, and life in a very small town where everyone knows each other....especially the Gilmore Girls.
Top k tokens: ['\n', ' \n', ' this', ' ', ' I', ' yes', ' This', ' it', ' Yes', '\n\n', ' The', ' It', '\n ', ' the', ' [', ' no', ' i', ' No', ' If', ' "', ' if', ' \n\n', ' <', ' You', ' you', ' *', ' not', '\n  ', '  \n', ' NO', ' my'

Processing:  80%|████████  | 401/500 [00:08<00:02, 46.24it/s]

Could not predict sentiment for review: It was so very long ago (1960), but I have never forgotten this series and often wished it would reappear. So taken with it, I corresponded with Mr. Rathbun, then president of Standard Oil, which sponsored the presentation on PBS. He sent me a photo of the tapestry (actually a charcoal rendering) used behind the credits.<br /><br />To the opening theme music of Bayco's "Elizabethan Masque," my family and I gathered around our black & white TV to drink in Shakespeare's words as spoken by a group of excellent but relatively unknown players (at least to American audiences at the time).<br /><br />We were introduced to such actors as Sean Connery, Dame Judi Dench, Tom Fleming, Patrick Garland, Julian Glover and Robert Hardy. I have continued to enjoy their accomplishments ever since. One of the most interesting things was the way in which the actors continued to age in their respective roles as Shakespeare's "King" plays were presented, perhaps for t

Processing:  86%|████████▌ | 431/500 [00:09<00:01, 44.95it/s]

Could not predict sentiment for review: Now I remember what the 'indie' filmmakers were ripping off before Pulp Fiction. It was David Lynch, right?<br /><br />I hunted this thing down to see Kyle Secor. What a waste of a perfectly good Bayliss. It was so painful to watch him, sort of like when someone you love is horribly sick and there's nothing you can do.<br /><br />Nearly every cliche in the book: the desert, the psycho, the quirky mob boss, the biker, Tracy Walker (who fortunately was only in one scene, but I kept expecting him to reappear and say something strange and profound like "If a man wants to know where he's going, he's got to look at where he's been," or some contrived garbage like that). I have a theory as to why so many indies are short on location in the desert. I think it's because they can save money on lighting.<br /><br />If you like to be in pain, find this movie and give it a viewing. If you're a fan of Kyle Secor, watch reruns of Homicide on Court TV. If you wa

Processing:  88%|████████▊ | 441/500 [00:09<00:01, 44.48it/s]

Could not predict sentiment for review: Bedrooms and Hallways gives its audience a look into the mind of a man who thinks he's found himself, only to find out that he's not so sure he found the right guy. If you think that all gay comedies are the same, check this one out. Although the movie ends without much resolution, the hilarious one-liners, peculiar situations, and quirky characters are sure to satisfy.
Top k tokens: ['\n', ' Yes', ' ', ' I', ' If', ' The', ' Bed', ' This', ' It', ' [', ' No', ' You', ' \n', ' "', '\n ', ' Not', ' A', ' *', '\n  ', ' **', ' Only', '<|endoftext|>', ' <', '\n\n', ' Y', ' In', ' We', ' There', ' As', '\n   ', ' Although', ' My', ' Based', ' For', ' None', ' Most', ' yes', '  ', ' Good', ' (', ' What', ' At', ' Please', ' ***', ' Very', ' Maybe', ' Even', ' While', ' Absolutely', ' All']
Could not predict sentiment for review: i enjoyed this film immensely, due to pungent scenes (humorous as well as ironic, some even "tragical"), believable performan

Processing:  90%|█████████ | 451/500 [00:10<00:01, 45.46it/s]

Could not predict sentiment for review: I bought this video on a throw-out table at the video store expecting a good cast in what was touted as an award-winning Brit sex comedy. I guess I should have read the finer print. I rarely write a panning review, but here goes.<br /><br />These actors in gay roles really play games with your memories of a lot of far more worthy films. This comedy was a very cruel joke at the expense of the actors, the theatre-going public and of all the nice films that have contributed to their reputations.<br /><br />I repeat: is the joke about trashing the actors' other highly respectable on-screen personae with this scurrilously trashy flick? Can the reference to the Austen classics 'Pride and Prejudice' and 'Sense and Sensibility' be anything else? How much of a political statement was it to produce this melodrama using these stars? Are we meant to simply take it as a lay-down misere that all actors are gay and thus letting their on-screen roleplay affect o

Processing:  92%|█████████▏| 461/500 [00:10<00:00, 46.13it/s]

Could not predict sentiment for review: I blow hot and cold over Carné. He really can be a puzzle for me. I think perhaps his inspiration left him a little earlier than it did for other directors of his generation. Certainly a man who came to maturity in the Thirties with the Popular Front seems ill at ease in the France of the Fifties, with its rampant commercialism and heavy American influence. He is almost thirty years older than his young stars, and it shows. The party scenes go on much longer than they should, as if he were trying to buy time for the anemic scenario to work. Roland Lesaffre's character--he plays Pascale Petit's older brother--seems to exist only to reassure the director that his old-style ideas are still sound.<br /><br />At two hours, this picture is far too long. Still, let me praise Pascale Petit for her game performance; she was a natural who should have challenged Brigitte Bardot for sexpot supremacy, but somehow lost her way. Andrea Parisy is excellent too a

Processing:  94%|█████████▍| 471/500 [00:10<00:00, 45.26it/s]

Could not predict sentiment for review: Buddy Manucci(Roy Scheider, solid in a chance leading role)heads a secret undercover police squad called the Seven-Ups whose tactics don't necessarily follow the exact ways of the law. They get the job done in their own way without anything being leaked to the press, and this gives them a freedom to expand their means of getting to the criminals most working detectives and policeman just can not nab. Buddy has a pal from childhood named Vito(Tony Lo Bianco)who swaps information with him regarding mob types and shysters working the streets in NYC. What Buddy doesn't know is that Vito is hatching a scheme using names from Buddy's "check list"(he has this book open taking down notes provided by Vito, but doesn't know that his friend has copied those very names written within his mind)to set up mob families in a series of mob kidnappings eliciting cash thanks to two cop-posers, Moon(the always-villainous Richard Lynch)and Bo(Bill Hickman)working with

Processing:  97%|█████████▋| 486/500 [00:10<00:00, 46.32it/s]

Could not predict sentiment for review: The trio are a pleasant, nostalgic journey to that first hint of desire--when it was still about simple exploration of the unknown--before we "grew up" and added those complexities of HIV status, emotional baggage and gotta-run-my-pager-just-went-off into the emotional mix.<br /><br />The angst portrayed is pure adolescent angst, but it rings true in all three stories. Their sweetness and positivity make you feel good that you are gay. And those kinds of films are few and far between.<br /><br />Good news! Both Boys Life and Boys Life 2 are now readily available on DVD as of September 1999.
Top k tokens: [' <', '<', ' The', ' ', '\n', ' Yes', ' \n', '</', ' I', ' "', ' Good', ' No', ' </', ' If', ' Boys', ' It', ' This', ' A', '\n ', ' You', '\n  ', ' Y', ' (', '<|endoftext|>', ' Bad', 'The', ' *', ' yes', ' Not', ' There', ' What', ' In', ' We', '  ', '\n\t', ' My', 'Yes', ' One', ' **', ' That', ' [', ' Boy', ' Please', ' For', ' B', '\n\n', ' 

Processing: 100%|██████████| 500/500 [00:11<00:00, 45.12it/s]

Could not predict sentiment for review: The only time I seem to trawl through IMDb comments is when I've seen a duff film. I guess I'm looking to find reassurance that it's not just me. For me, then, Lonesome Jim was a duff film packed with unbelievable characters in unbelievable situations which limped on lamely and boringly towards a cop-out hackneyed conclusion. So I check out what other people have to say and feel a bit like Jim, out on a limb, alienated, as page after page of multiple star ratings and plaudits leave me doubting my critical faculties. Yet maybe I should check the settings for the comments presentation, since after a while the gushing dies down and I'm relieved to see appreciations that mirror my own. I feel vindicated. It IS a rubbish film, it DOESN'T hang together and it DOES constitute a wasted evening sitting through it. Praise be to kindred spirits.
Top k tokens: ['\n', ' ', ' \n', ' I', ' The', ' Yes', ' L', ' It', ' No', ' This', ' If', '\n ', ' A', ' [', '  




## Few Shot

In [7]:
# Few-shot run for whichever checkpoint is currently bound to `model` / `tokenizer`.
few_135_results = evaluate_model(
    model,
    tokenizer,
    device,
    test_dataset,
    shot_type='few',
)

# Build and print the results table labelled for this run.
few_135_table = create_results_table(
    few_135_results,
    model_name="SmolLM2-135M Few-Shot",
)
print(few_135_table)

Processing: 100%|██████████| 500/500 [00:11<00:00, 44.61it/s]

[1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 




# SmolLM2-360M

In [8]:
# Release the previous checkpoint before loading the next one.
# Clearing the CUDA cache alone cannot reclaim the old weights while `model`
# still references them, so the names are dropped first.
import gc

model = None
tokenizer = None
gc.collect()

# `device` is a torch.device, so inspect `.type` instead of comparing it to a str.
if device.type == "cuda":
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.reset_peak_memory_stats()

checkpoint = "HuggingFaceTB/SmolLM2-360M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

## Zero Shot

In [9]:
# Zero-shot run for whichever checkpoint is currently bound to `model` / `tokenizer`.
zero_360_results = evaluate_model(
    model,
    tokenizer,
    device,
    test_dataset,
    shot_type='zero',
)

# Build and print the results table labelled for this run.
zero_360_table = create_results_table(
    zero_360_results,
    model_name="SmolLM2-360M Zero-Shot",
)
print(zero_360_table)

Processing: 100%|██████████| 500/500 [00:13<00:00, 36.09it/s]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 




## Few Shot

In [10]:
# Few-shot run for whichever checkpoint is currently bound to `model` / `tokenizer`.
few_360_results = evaluate_model(
    model,
    tokenizer,
    device,
    test_dataset,
    shot_type='few',
)

# Build and print the results table labelled for this run.
few_360_table = create_results_table(
    few_360_results,
    model_name="SmolLM2-360M Few-Shot",
)
print(few_360_table)

Processing: 100%|██████████| 500/500 [00:15<00:00, 32.52it/s]

[1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 




# SmolLM2-1.7B

In [11]:
# Release the previous checkpoint before loading the next one.
# Clearing the CUDA cache alone cannot reclaim the old weights while `model`
# still references them, so the names are dropped first.
import gc

model = None
tokenizer = None
gc.collect()

# `device` is a torch.device, so inspect `.type` instead of comparing it to a str.
if device.type == "cuda":
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.reset_peak_memory_stats()

checkpoint = "HuggingFaceTB/SmolLM2-1.7B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

## Zero Shot

In [12]:
# Zero-shot run for whichever checkpoint is currently bound to `model` / `tokenizer`.
zero_17_results = evaluate_model(
    model,
    tokenizer,
    device,
    test_dataset,
    shot_type='zero',
)

# Build and print the results table labelled for this run.
zero_17_table = create_results_table(
    zero_17_results,
    model_name="SmolLM2-1.7B Zero-Shot",
)
print(zero_17_table)

Processing: 100%|██████████| 500/500 [00:38<00:00, 12.93it/s]

[1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 




## Few Shot

In [13]:
# Few-shot run for whichever checkpoint is currently bound to `model` / `tokenizer`.
few_17_results = evaluate_model(
    model,
    tokenizer,
    device,
    test_dataset,
    shot_type='few',
)

# Build and print the results table labelled for this run.
few_17_table = create_results_table(
    few_17_results,
    model_name="SmolLM2-1.7B Few-Shot",
)
print(few_17_table)

Processing: 100%|██████████| 500/500 [00:48<00:00, 10.33it/s]

[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 




# Print All Results

In [14]:
print("All models evaluated successfully!")

# Each display name is paired with its results dict in a single place, so the
# two can never drift out of alignment (zip() over two separate lists would
# silently truncate on a length mismatch).
summary_rows = [
    ("SmolLM2-135M Zero-Shot", zero_135_results),
    ("SmolLM2-135M Few-Shot", few_135_results),
    ("SmolLM2-360M Zero-Shot", zero_360_results),
    ("SmolLM2-360M Few-Shot", few_360_results),
    ("SmolLM2-1.7B Zero-Shot", zero_17_results),
    ("SmolLM2-1.7B Few-Shot", few_17_results),
]

# Still exposed as separate lists for any code that reads them.
model_names = [label for label, _ in summary_rows]
results_dicts = [run for _, run in summary_rows]

# (column header, key in the results dict, format spec, suffix)
# The per-sample average uses four decimals: at two decimals the small models
# all round to the same value and the column cannot tell them apart.
metric_columns = [
    ("Accuracy", "accuracy", ".2f", ""),
    ("Recall (Sensitivity)", "recall", ".2f", ""),
    ("Specificity", "specificity", ".2f", ""),
    ("Precision", "precision", ".2f", ""),
    ("F-Score", "f_score", ".2f", ""),
    ("% True Predictions", "true_percent", ".2f", "%"),
    ("% False Predictions", "false_percent", ".2f", "%"),
    ("Total Inference Time (s)", "total_inference_time", ".2f", ""),
    ("Avg Inference Time (s)", "average_inference_time", ".4f", ""),
]

table = PrettyTable()

# Define the columns
table.field_names = ["Model"] + [header for header, _, _, _ in metric_columns]

# Populate the table: one row per evaluated configuration
for model_name, results in summary_rows:
    table.add_row(
        [model_name]
        + [f"{results[key]:{spec}}{suffix}" for _, key, spec, suffix in metric_columns]
    )

print(table)
print(results_dicts)

All models evaluated successfully!
+------------------------+----------+----------------------+-------------+-----------+---------+--------------------+---------------------+--------------------------+------------------------+
|         Model          | Accuracy | Recall (Sensitivity) | Specificity | Precision | F-Score | % True Predictions | % False Predictions | Total Inference Time (s) | Avg Inference Time (s) |
+------------------------+----------+----------------------+-------------+-----------+---------+--------------------+---------------------+--------------------------+------------------------+
| SmolLM2-135M Zero-Shot |   0.51   |         0.01         |     0.99    |    0.60   |   0.02  |       1.00%        |        99.00%       |           9.43           |          0.02          |
| SmolLM2-135M Few-Shot  |   0.59   |         0.74         |     0.43    |    0.56   |   0.64  |       65.40%       |        34.60%       |           9.36           |          0.02          |
| Smo

**Note (TODO: mention in report):** Models tend to say "true" more often because they see it first in the prompt; to counter this, they default to negative.

# Cleanup Cuda

In [15]:
# Release the last checkpoint and clear CUDA state left over from the run.
# The references are dropped first: the cache can only give back memory that
# no live tensor is still using.
import gc

model = None
tokenizer = None
gc.collect()

# `device` is a torch.device, so inspect `.type` instead of comparing it to a str.
if device.type == "cuda":
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.reset_peak_memory_stats()