# Import Libraries and Set Up Hugging Face

In [1]:
import time
import os
from dotenv import load_dotenv
import torch
from sklearn.metrics import accuracy_score
from datasets import load_dataset
from tqdm.notebook import tqdm

# Load the project's .env file, expected two directories above the notebook's
# working directory, so the Hugging Face cache location is in the environment.
env_path = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '.env'))
load_dotenv(env_path)
hf_cache_dir = os.getenv('TRANSFORMERS_CACHE')
if not hf_cache_dir:
    # Fail with an actionable message instead of the TypeError that
    # os.makedirs(None) would otherwise raise.
    raise RuntimeError(
        f"TRANSFORMERS_CACHE is not set; define it in {env_path} or in the environment."
    )
os.makedirs(hf_cache_dir, exist_ok=True)
print(f"Hugging Face cache directory set to: {hf_cache_dir}")

# Every transformers import is kept below the environment setup on purpose:
# the library is expected to pick up its cache location from the environment
# when it is first imported, so importing it earlier (as the BatchEncoding
# import previously did) would make the configured cache directory ineffective.
from transformers import AutoModelForCausalLM, AutoTokenizer, BatchEncoding

# Check cuda version torch is using
print(f"Using torch {torch.__version__} with cuda {torch.version.cuda}")

# Root directory of the project workspace (None when WORKSPACE_DIR is unset)
workspace_dir = os.getenv('WORKSPACE_DIR')

# Seed used for the dataset shuffles in this notebook
seed = 42

Hugging Face cache directory set to: /root/repos/DL-Final-Project/.cache/huggingface
Using torch 2.5.1 with cuda 12.1




# Import the dataset
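The IMDB dataset is loaded from the Hugging Face Hub with `load_dataset("imdb")`, which downloads it on first use and caches it locally. The commented-out code in the cell below is the earlier Kaggle CSV workflow, which stored the data in the gitignored `datasets` directory; uncomment it only if you need to repopulate that directory.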

In [2]:
# --- Disabled: earlier Kaggle CSV workflow, kept for reference only ---
# # Check if dataset is present
# dataset_dir = os.path.join(workspace_dir, 'datasets')
# os.makedirs(dataset_dir, exist_ok=True)

# if not os.path.exists(os.path.join(dataset_dir, 'IMDB Dataset.csv')):
#     !kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews --path {dataset_dir} --unzip

# # Load dataset into dataframe
# dataset = pd.read_csv(os.path.join(dataset_dir, 'IMDB Dataset.csv'))
# print(dataset.head())

# _, test_set = train_test_split(dataset, test_size=0.2, random_state=seed)

# --- Active path: IMDB via the Hugging Face `datasets` library ---
imdb = load_dataset("imdb")

# Both splits are shuffled with the notebook-wide seed; only the first
# 1000 shuffled test reviews are kept for evaluation.
train_dataset = imdb['train'].shuffle(seed=seed)
test_dataset = imdb['test'].shuffle(seed=seed).select(list(range(1000)))

for split_name, split in (("Train", train_dataset), ("Test", test_dataset)):
    print(f"{split_name} dataset: {len(split)}")
print(test_dataset[0])

Train dataset: 25000
Test dataset: 1000
{'text': "<br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining King Lear story and of course Michelle Pfeiffer was in it, so what could go wrong?<br /><br />Very quickly, however, I realized that this story was about A Thousand Other Things besides just Acres. I started crying and couldn't stop until long after the movie ended. Thank you Jane, Laura and Jocelyn, for bringing us such a wonderfully subtle and compassionate movie! Thank you cast, for being involved and portraying the characters with such depth and gentleness!<br /><br />I recognized the Angry sister; the Runaway sister and the sister in Denial. I recognized the Abusive Husband and why he was there and then the Father, oh oh the Father... all superbly played. I also recognized myself and this movie was an eye-opener, a relief, a chance to face my OWN truth and finally doing something about it. I truly hope A Thousand Acres has had the same 

# Define Experiment Function

In [3]:
# NOTE: the triple-quoted string below is disabled scratch code, not a docstring.
# It is a bare string expression, so nothing inside it is executed. It sketches
# an earlier experiment: concatenate a few labelled example reviews plus one
# unlabelled review into a single prompt, run `model.generate` for 5 new tokens,
# and print the decoded text together with the wall-clock inference time.
"""
# Define the movie review for classification
example_review = "This movie was an absolute masterpiece with stunning visuals and a gripping story!"
example_review_neg = "This movie was terrible and I hated it."
example_negative_review_2 = "I really think this movie is not that good. It was a waste of time."

example_inference_review = "what a movie ! changed my life! I love luke's character and actor"
# Prompt for zero-shot classification
prompt = [
    ['Review :', example_review, ' Sentiment[positive/negative] :', ' positive','\n'],
    ['Review :', example_review_neg, ' Sentiment[positive/negative] :', ' negative','\n'],
    ['Review :', example_negative_review_2, ' Sentiment[positive/negative] :', ' negative','\n'],
    ['Review :', example_inference_review, ' Sentiment[positive/negative] :'],
]

prompt_=[]
for i in prompt:
    prompt_.append(''.join(i))
#make all attached
prompt = ''.join(prompt_)

# Tokenize and set up for inference
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate the output
start_inf_time = time.time()

with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=5)

end_inf_time = time.time()

# Decode and print the output
output_str = tokenizer.decode(output[0], skip_special_tokens=True)
print(output_str)

print(f"Inference time: {end_inf_time - start_inf_time:.2f} seconds")
"""


# NOTE: the triple-quoted string below is disabled scratch code, not a docstring.
# It is a bare string expression, so nothing inside it is executed. It holds an
# earlier evaluator, `test_model`, which scans the 10 most likely next tokens
# for ' positive' / ' negative' and falls back to positive (pred = 1) when
# neither appears. It references `few_shot_prompt` and `small_test_dataset`,
# neither of which is defined in the visible part of this notebook.
"""
from tqdm import tqdm
def test_model(few_shot_prompt,test_dataset):                       
    total_right = 0
    
    for entry in tqdm(test_dataset):
        rev = entry['text']
        posNeg = entry['label']
        input_text = f"{few_shot_prompt}" + f'"{rev}" This movie review is'
        
    
        input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

        # Get model predictions
        with torch.no_grad():
            outputs = model(input_ids)
            logits = outputs.logits

        # Get the probabilities for the next token
        next_token_logits = logits[:, -1, :]  # Only consider the last token's logits
        probabilities = torch.softmax(next_token_logits, dim=-1)

        # Get the top 10 most likely tokens
        top_k = 10
        top_k_probs, top_k_indices = torch.topk(probabilities, top_k)

        # Decode the top 10 tokens
        top_k_tokens = [tokenizer.decode([token]) for token in top_k_indices[0]]
        pred = 1
        for tok in top_k_tokens:
            if tok == ' positive':
                #print('pssssss')
                pred = 1
                break
            elif tok == ' negative':
                #print('nggg')
                pred = 0
                break
        if pred == posNeg:
            total_right+=1
        
        
#         print(rev)
#         print(f"pred is {pred}")
#         print(f"the target was {posNeg}")
#         print('\n')
        
    print(total_right/len(test_dataset))


test_model(few_shot_prompt,small_test_dataset)

"""



def _split_reviews_and_labels(dataset):
    """Collect review texts and labels from ``dataset`` in a single pass."""
    reviews = []
    true_labels = []
    for example in dataset:
        reviews.append(example['text'])
        true_labels.append(example['label'])  # 0 for negative, 1 for positive
    return reviews, true_labels


def _format_query(review):
    """Build the unanswered sentiment question for one review."""
    return f"Movie Review: {review} \n Only Answer if this Movie Review is Positive or Negative:"


def _first_sentiment_label(candidate_tokens):
    """Return 1/0 for the first 'positive'/'negative' candidate, or -1 if none match."""
    for token in candidate_tokens:
        normalized = token.strip().lower()
        if normalized == 'positive':
            return 1
        if normalized == 'negative':
            return 0
    return -1


def _evaluate_prompts(model, tokenizer, device, prompts, true_labels, top_k):
    """Score next-token sentiment predictions for already-built prompts.

    Shared implementation behind the zero-shot and few-shot evaluators. For
    every prompt it runs one forward pass, takes the ``top_k`` most likely
    next tokens (most likely first), and uses the first one that reads
    "positive" or "negative" (case-insensitive, surrounding whitespace
    ignored) as the prediction.

    Returns:
        ``(accuracy, total_inference_time, average_inference_time)`` with
        times in seconds; only the forward pass is timed.

    Raises:
        ValueError: if ``prompts`` is empty.
    """
    if len(prompts) == 0:
        raise ValueError("dataset is empty: nothing to evaluate")

    target_device = torch.device(device)
    on_cuda = target_device.type == "cuda"

    predictions = []
    inference_times = []

    for prompt in tqdm(prompts, total=len(prompts), desc="Processing", leave=True):
        # Tokenize the input
        inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)

        # CUDA work is queued asynchronously, so wait for the device before
        # starting and before stopping the clock; otherwise the measurement
        # can end before the forward pass has actually finished.
        if on_cuda:
            torch.cuda.synchronize(target_device)
        start_time = time.perf_counter()
        with torch.no_grad():
            logits = model(inputs).logits
        if on_cuda:
            torch.cuda.synchronize(target_device)
        end_time = time.perf_counter()

        # Probabilities for the next token (last position only)
        next_token_logits = logits[:, -1, :]
        probabilities = torch.softmax(next_token_logits, dim=-1)

        # torch.topk rejects k larger than the vocabulary, so clamp it.
        k = min(top_k, probabilities.shape[-1])
        _, top_k_indices = torch.topk(probabilities, k)  # sorted most -> least likely

        # Decode lazily so decoding stops at the first sentiment token found.
        candidate_tokens = (
            tokenizer.decode([token_id]) for token_id in top_k_indices[0].tolist()
        )

        # -1 means no sentiment token was among the candidates; it matches
        # neither label, so such examples count as incorrect.
        predictions.append(_first_sentiment_label(candidate_tokens))
        inference_times.append(end_time - start_time)

    accuracy = accuracy_score(true_labels, predictions)
    total_inference_time = sum(inference_times)
    average_inference_time = total_inference_time / len(inference_times)
    return accuracy, total_inference_time, average_inference_time


def evaluate_model_zero_shot(model, tokenizer, device, dataset, top_k=50):
    """Evaluate zero-shot sentiment classification via next-token prediction.

    Args:
        model: Causal language model; called with a tensor of input ids and
            expected to return an object exposing ``.logits``.
        tokenizer: Tokenizer providing ``encode(..., return_tensors="pt")``
            and ``decode``.
        device: Device the inputs are moved to (e.g. ``"cuda"`` or ``"cpu"``).
        dataset: Iterable of examples with a ``'text'`` review and a
            ``'label'`` (0 for negative, 1 for positive).
        top_k: Number of most likely next tokens searched for a sentiment
            word; clamped to the vocabulary size.

    Returns:
        ``(accuracy, total_inference_time, average_inference_time)``; times
        are in seconds. Examples where no sentiment word is found among the
        candidates are predicted as -1 and therefore count as incorrect.

    Raises:
        ValueError: if ``dataset`` is empty.
    """
    reviews, true_labels = _split_reviews_and_labels(dataset)
    prompts = [_format_query(review) for review in reviews]
    return _evaluate_prompts(model, tokenizer, device, prompts, true_labels, top_k)


def evaluate_model_few_shot(model, tokenizer, device, dataset, top_k=50):
    """Evaluate few-shot sentiment classification via next-token prediction.

    Identical to ``evaluate_model_zero_shot`` except that every prompt is
    prefixed with one positive and one negative worked example.

    Args:
        model: Causal language model; called with a tensor of input ids and
            expected to return an object exposing ``.logits``.
        tokenizer: Tokenizer providing ``encode(..., return_tensors="pt")``
            and ``decode``.
        device: Device the inputs are moved to (e.g. ``"cuda"`` or ``"cpu"``).
        dataset: Iterable of examples with a ``'text'`` review and a
            ``'label'`` (0 for negative, 1 for positive).
        top_k: Number of most likely next tokens searched for a sentiment
            word; clamped to the vocabulary size.

    Returns:
        ``(accuracy, total_inference_time, average_inference_time)``; times
        are in seconds. Examples where no sentiment word is found among the
        candidates are predicted as -1 and therefore count as incorrect.

    Raises:
        ValueError: if ``dataset`` is empty.
    """
    reviews, true_labels = _split_reviews_and_labels(dataset)

    # Worked examples shown to the model before the review being classified
    few_shot_rev_1 = "Movie Review: I loved this movie ! So good plot ! \n Only Answer if this Movie Review is Positive or Negative: Positive \n"
    few_shot_rev_2 = "Movie Review: I hated this, could be a lot better \n Only Answer if this Movie Review is Positive or Negative: Negative \n"

    prompts = [f"{few_shot_rev_1} {few_shot_rev_2} {_format_query(review)}" for review in reviews]
    return _evaluate_prompts(model, tokenizer, device, prompts, true_labels, top_k)

# SmolLM2-135M


In [4]:
# Prefer the GPU when one is available and fall back to the CPU otherwise, so
# the notebook still runs on machines without CUDA. Set this to "cpu" by hand
# to force CPU execution.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on device: {device}")

# Clears cuda from last run
if device == "cuda":
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.reset_peak_memory_stats()

#FROM https://huggingface.co/HuggingFaceTB/SmolLM2-135M TODO: Dont forget to cite the model in report
checkpoint = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

## Zero Shot

In [5]:
# Run the zero-shot evaluation on the test subset and report its metrics
zero_135_accuracy, zero_135_total_time, zero_135_average_time = evaluate_model_zero_shot(
    model=model,
    tokenizer=tokenizer,
    device=device,
    dataset=test_dataset,
)

print(
    f"Accuracy: {zero_135_accuracy:.2f}",
    f"Total inference time: {zero_135_total_time:.2f} seconds",
    f"Average inference time: {zero_135_average_time:.2f} seconds",
    sep="\n",
)

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Accuracy: 0.44
Total inference time: 15.33 seconds
Average inference time: 0.02 seconds


## Few Shot

In [6]:
# Run the few-shot evaluation on the test subset and report its metrics
few_135_accuracy, few_135_total_time, few_135_average_time = evaluate_model_few_shot(
    model=model,
    tokenizer=tokenizer,
    device=device,
    dataset=test_dataset,
)

print(
    f"Accuracy: {few_135_accuracy:.2f}",
    f"Total inference time: {few_135_total_time:.2f} seconds",
    f"Average inference time: {few_135_average_time:.2f} seconds",
    sep="\n",
)

Processing:   0%|          | 0/1000 [00:00<?, ?it/s]

Accuracy: 0.49
Total inference time: 16.97 seconds
Average inference time: 0.02 seconds


# Clean Up CUDA

In [7]:
# After the run: empty the CUDA cache, collect IPC memory, and reset the
# peak-memory statistics. Skipped entirely when not running on CUDA.
if device == "cuda":
    for _cuda_cleanup in (
        torch.cuda.empty_cache,
        torch.cuda.ipc_collect,
        torch.cuda.reset_peak_memory_stats,
    ):
        _cuda_cleanup()