# NLP Final Project Notebook

This notebook contains the main analysis and experiments for the NLP final project.

# Setup

Uncomment and run the cells below if you are working in Google Colab.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# # Replace this by the folder path where you put your assignment. If you are working with local, skip executing the previous cell and add the path to local directory here.
# # folder_path = '.'
# import os
# folder_path = '/content/drive/My Drive/Colab Notebooks/NLPFinalProject-output_test' # Change the path to the folder where the assignment is stored in Google Drive.

# # Files in the folder -
# os.listdir(folder_path)

# os.chdir(folder_path)

# print(os.listdir())

# print('Current working directory -', os.getcwd())

In [None]:
# import importlib, sys
# sys.modules["imp"] = importlib

In [None]:
# from importlib import reload

In [None]:
# # Install components
# !curl https://ollama.ai/install.sh | sh
# !pip install ollama

# !echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections

# import os
# # Set LD_LIBRARY_PATH so the system NVIDIA library
# os.environ.update({'LD_LIBRARY_PATH': '/usr/lib64-nvidia'})

# # Start server
# import subprocess
# proccess = subprocess.Popen(['ollama', 'serve'])

First, follow the setup instructions in README.md.

Check if Ollama is properly installed and running.

In [None]:
# Shell out to the repository's check script (path is relative to the notebook's working directory).
!./scripts/check_ollama.sh

## Env Example

Below is an example cell showing how to import and call functions that we define in separate source files.

In [None]:
# Import helpers from the project's src package to confirm the package is importable from this notebook.
from src.visualization.utils import example_plot, example_function

# Call each helper once; any failure here points at an environment/path problem rather than the analysis code.
example_plot()
example_function()

# Global Imports

In [None]:
# Load IPython's autoreload extension; the %reload_ext line re-initialises it when this cell is re-run.
%load_ext autoreload
%reload_ext autoreload
# Mode 2: reload all imported modules before executing code, so edits to the
# src/ modules are picked up without restarting the kernel.
%autoreload 2

In [None]:
# Standard-library and third-party imports
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Custom Imports
from src.io.ollama_client import OllamaClient
from src.io.ollama_client import ConversationEntry, SavedConversationData

# Our function imports
from src.experiments.output_task import OutputTest
from src.experiments.telephone import TelephoneTest
from src.metrics.similarity import compute_cosine_similarity_matrix
from src.metrics.drift import velocity, acceleration, normalize_0_1, ema


# Example Usage

## OllamaClient Usage

This client wraps the Ollama API; instantiate one client per model.

In [None]:
# Client for the "qwen3:0.6b" model with conversation logging enabled, so the history calls below have data to show.
qwen = OllamaClient("qwen3:0.6b", log_conversations=True)

Run the cell below to generate text from the model. This produces a standalone response (no prior context).

In [None]:
# Single standalone generation; as the last expression in the cell, any returned value is displayed as the cell output.
qwen.generate_text("Hello, world!")

After running the cell above, the conversation history will be populated with the prompt and response

In [None]:
# Walk the read-only accessors in order: the number of logged conversations,
# the full conversation history, then only the most recent conversation.
for accessor in (
    qwen.get_conversation_count,
    qwen.get_conversation_history,
    qwen.get_last_conversation,
):
    print(accessor())
# The conversation history can be cleared by calling the clear_conversation_history method
# qwen.clear_conversation_history()

## Conversation Persistence

You can save and load conversation histories to disk for experiment tracking.


In [None]:
# Client that writes its saved histories under ./experiments
client = OllamaClient("qwen3:0.6b", output_dir="./experiments", log_conversations=True)

# Two short prompts, generated and echoed one after the other
demo_prompts = (
    "What is machine learning? Explain like I'm 5 in 20 words or less.",
    "Explain neural networks? Explain like I'm 5 in 20 words or less.",
)
demo_responses = []
for prompt_number, prompt in enumerate(demo_prompts, start=1):
    reply = client.generate_text(prompt)
    demo_responses.append(reply)
    print(f"Response {prompt_number}: {reply}\n")

# Keep the individual names available for later cells
response1, response2 = demo_responses

# Persist the logged conversations under the "test_experiment" name
saved_path = client.flush_conversation_history("test_experiment")
print(f"Saved conversation history to: {saved_path}")


### Loading Conversation History

You can reload conversation histories from disk. By default, it loads the most recent file.

- Method 1: Using get_conversation_history() to iterate through the in-memory loaded conversations
- Method 2: Using iter_experiment_conversations() to iterate directly from files (memory-efficient for large experiments)

In [None]:
# Fresh client pointed at the same output directory as the one that saved the history
new_client = OllamaClient("qwen3:0.6b", output_dir="./experiments", log_conversations=True)

# With no file specified, the most recent "test_experiment" history is loaded
count = new_client.load_conversation_history("test_experiment")
print(f"Loaded {count} conversations from disk\n")

# Confirm what ended up in memory
loaded_total = new_client.get_conversation_count()
print(f"Conversation count: {loaded_total}")
latest_text = new_client.get_last_response_text()
print(f"Last response: {latest_text}")

divider = "=" * 80
print(f"\n{divider}\n")

# Method 1: Iterate through in-memory conversation history
# print("METHOD 1: Iterating through in-memory conversation history")
# print("="*80)
# for i, conv in enumerate[ConversationEntry](new_client.get_conversation_history(), 1):
#     print(f"\n--- Conversation {i} ---")
#     print(f"Prompt:   {conv['input'][:80]}...")  # First 80 chars
#     print(f"Response: {conv['response'][:80]}...")  # First 80 chars

# print("\n" + "="*80 + "\n")

# Method 2: Memory-efficient file iterator (doesn't load all into memory)
# print("METHOD 2: Iterating using file iterator (memory-efficient)")
# print("="*80)
# for i, conv in enumerate[ConversationEntry](new_client.iter_experiment_conversations("test_experiment"), 1):
#     print(f"\n--- Conversation {i} ---")
#     print(f"Prompt:   {conv['input'][:80]}...")  # First 80 chars
#     print(f"Response: {conv['response'][:80]}...")  # First 80 chars

### Viewing Saved JSON Structure

The saved conversation history contains all the details needed for analysis.


In [None]:
# Inspect the first loaded conversation entry field by field
conversation = new_client.get_conversation_history()[0]

print("Conversation Structure:")
# (label shown, key read) pairs, printed in this order
field_labels = (
    ("Input", "input"),
    ("Response", "response"),
    ("Type", "type"),
    ("Timestamp", "timestamp"),
)
for label, key in field_labels:
    print(f"{label}: {conversation[key]}")

# "details" is a nested mapping; only its keys are listed here
detail_keys = conversation['details'].keys()
print(f"\nDetails keys: {detail_keys}")


The response returned by Ollama and stored in the client contains a lot of information if needed. Note the difference between the already-parsed thinking and response sections; this could be useful later.

In [None]:
# "details" is the per-conversation payload stored alongside the input and response
# (presumably the raw Ollama response — confirm in OllamaClient).
resp = qwen.get_last_conversation().get("details")
# Pretty-print as JSON with 4-space indentation for readability.
print(json.dumps(resp, indent=4))

Or more simply

In [None]:
# Shortcut for the most recent response; the value is displayed as the cell output.
qwen.get_last_response_text()

Depending on how you choose to set up your experiments, you can either use the cached responses in the client class or keep track of the inputs and outputs outside of the client. Either is fine; just use `log_conversations=False` to prevent memory bloat if you are tracking them yourself. I imagine keeping it within the class will work better, then appending a new message like `prompt + qwen.get_last_response_text()` for the output test. This way, we can parse the message history into the formats used for calculating embeddings, drift, acceleration, etc. Use the flush functionality to save to the file system if you are running this for a long time or for a lot of iterations. The scheme for your tests should be split into data creation, then data parsing, i.e., create a cell to run the prompts and save the conversations, then create a cell that iterates through the class or files to compute embeddings.

## Embeddings

Create embeddings using the same OllamaClient with a new model name

In [None]:
# Text to embed: the latest qwen response (swap in a literal such as "Hello world!" to experiment)
string_to_embed = qwen.get_last_response_text() # "Hello world!"

# Embedding model client; conversation logging is off because nothing is being generated
embeddinggemma = OllamaClient("embeddinggemma", log_conversations=False)

# Convert whatever the client returns into a NumPy array
raw_embedding = embeddinggemma.generate_embeddings(string_to_embed)
embeddings = np.array(raw_embedding)

# Shape first, then the values, one per line
print(embeddings.shape, embeddings, sep="\n")

A quick similarity sanity check

In [None]:
# (label, first text, second text) triples covering unrelated, related and identical inputs
cases = [
    (
        "Not Similar",
        "The quick brown fox jumps over the lazy dog",
        "Machine learning is a subset of artificial intelligence",
    ),
    (
        "Similar",
        "A dog is playing in the park",
        "A puppy is running in the garden",
    ),
    ("Identical", "Hello world!", "Hello world!"),
]


def _embed_as_row(text):
    """Embed ``text`` with the embeddinggemma client and return it as a 1 x d row."""
    return np.array(embeddinggemma.generate_embeddings(text)).reshape(1, -1)


for label, text1, text2 in cases:
    # Embed the first text, then the second, as single-row matrices
    emb1, emb2 = (_embed_as_row(text) for text in (text1, text2))
    # cosine_similarity returns a 1 x 1 matrix for two single-row inputs
    similarity = cosine_similarity(emb1, emb2)[0][0]
    print(f"{label:12s} | Similarity: {similarity:.4f}")

A quick drift sanity check

In [None]:
# Synthetic similarity curves with known shapes, used to eyeball the drift metrics.
linear = np.linspace(1.0, 0.0, 10)             # steadily declining
quadratic = 1 - (np.linspace(0, 1, 10) ** 2)   # slow decline at first, accelerating toward the end
plateau = np.concatenate([np.ones(5), np.linspace(1, 0.5, 5)])  # flat then drop

cases = {"linear": linear, "quadratic": quadratic, "plateau": plateau}

for name, series in cases.items():
    # Run every drift helper on the same input sequence.
    vel = velocity(series)
    acc = acceleration(series)
    norm = normalize_0_1(series)
    smooth = ema(norm, alpha=0.3)  # smoothing is applied to the normalized series

    print(f"\n=== {name.upper()} SEQUENCE ===")
    print("Original:", np.round(series, 3))
    print("Velocity:", np.round(vel, 3))
    print("Acceleration:", np.round(acc, 3))
    # These two were computed but never displayed, so normalize_0_1 and ema
    # were not actually being checked. Assumes both return numeric
    # array-likes, as velocity/acceleration do — TODO confirm.
    print("Normalized:", np.round(norm, 3))
    print("EMA (alpha=0.3):", np.round(smooth, 3))

# Output Test

Create test env setup below

In [None]:
# Client and experiment wrapper for a single output-test run on qwen3:0.6b
output_test_qwen = OllamaClient("qwen3:0.6b", output_dir="./experiments", log_conversations=True)
output_test = OutputTest(output_test_qwen)

# Keyword settings for the run; the first positional argument (50) is passed separately below
output_test_run_options = {
    "save_history_every": 50,
    "experiment_name": "output_task_qwen3:0.6b",
}
output_test.run(50, **output_test_run_options)

In [None]:
# Plot the single-run output test; the name is the same experiment_name passed to output_test.run above.
from src.visualization.utils import visualize_output_test
visualize_output_test("output_task_qwen3:0.6b")

## Run average batch

In [None]:
from src.experiments.output_task import run_batch_output_test

# Every batch client saves into the same directory
batch_output_dir = "./output_task_experiments"

# One client per model under comparison
output_test_qwen_batch = OllamaClient("qwen3:0.6b", output_dir=batch_output_dir, log_conversations=True)
output_test_gemma_batch = OllamaClient("gemma3:1b-it-q8_0", output_dir=batch_output_dir, log_conversations=True)
output_test_llama_batch = OllamaClient("llama3.2:1b-instruct-q4_K_M", output_dir=batch_output_dir, log_conversations=True)

# Order of the clients handed to the batch runner: qwen, gemma, llama
output_test_batch_clients = [
    output_test_qwen_batch,
    output_test_gemma_batch,
    output_test_llama_batch,
]

# 30 runs of 15 iterations each, with save_history_every equal to the run length
run_batch_output_test(
    clients=output_test_batch_clients,
    num_runs=30,
    iterations_per_run=15,
    save_history_every=15,
)

In [None]:
# Aggregated view of the batch runs for one model, read from ./output_task_experiments,
# with the individual runs drawn as well (show_individual_runs=True).
# NOTE(review): the experiment name presumably follows run_batch_output_test's
# "output_task_<model>" naming — confirm against src/experiments/output_task.py.
from src.visualization.utils import visualize_output_test_aggregated
visualize_output_test_aggregated("output_task_llama3.2:1b-instruct-q4_K_M", show_individual_runs=True, output_dir="./output_task_experiments")

In [None]:
# Compare the experiments stored under ./output_task_experiments
# (presumably every model run by the batch cell — confirm in src/visualization/utils.py).
from src.visualization.utils import visualize_output_test_comparison
visualize_output_test_comparison(output_dir="./output_task_experiments")

# Telephone Test


Create test env setup below

In [None]:
# Telephone-test text client for qwen3:0.6b; output goes to the shared ./telephone_task_experiments directory.
telephone_test_qwen06_batch = OllamaClient("qwen3:0.6b", output_dir="./telephone_task_experiments", log_conversations=True)

In [None]:
# Telephone-test text client for qwen3:8b; output goes to the shared ./telephone_task_experiments directory.
telephone_test_qwen8_batch = OllamaClient("qwen3:8b", output_dir="./telephone_task_experiments", log_conversations=True)

In [None]:
# telephone_test_qwen235_batch = OllamaClient("qwen3:235b", output_dir="./telephone_task_experiments", log_conversations=True)

In [None]:
# Telephone-test text client for llama3.1:70b; output goes to the shared ./telephone_task_experiments directory.
telephone_test_llama70_batch = OllamaClient("llama3.1:70b", output_dir="./telephone_task_experiments", log_conversations=True)

In [None]:
# Telephone-test text client for llama3.1:8b; output goes to the shared ./telephone_task_experiments directory.
telephone_test_llama8_batch = OllamaClient("llama3.1:8b", output_dir="./telephone_task_experiments", log_conversations=True)

In [None]:
# Telephone-test text client for Gemma3 4B; output goes to the shared ./telephone_task_experiments directory.
# NOTE(review): the tag is written in mixed case ("Gemma3:4b") while the other cells use lowercase tags
# (e.g. "gemma3:1b-it-q8_0") — confirm it resolves to the intended model and that any experiment name
# derived from it matches what the visualization cells expect.
telephone_test_gemma4_batch = OllamaClient("Gemma3:4b", output_dir="./telephone_task_experiments", log_conversations=True)

In [None]:
# Telephone-test text client for GPT-OSS 20B; output goes to the shared ./telephone_task_experiments directory.
# NOTE(review): the tag is written in upper case ("GPT-OSS:20b") while the other cells use lowercase tags —
# confirm it resolves to the intended model and that any experiment name derived from it matches what the
# visualization cells expect.
telephone_test_gptoss_batch = OllamaClient("GPT-OSS:20b", output_dir="./telephone_task_experiments", log_conversations=True)

In [None]:
from src.experiments.telephone import run_batch_telephone_test

# A single embedding client is shared across all text models; it does not log conversations
embedding_model = OllamaClient("embeddinggemma", output_dir="./telephone_task_experiments", log_conversations=False)

# Text-generating clients under test, in the order they are handed to the batch runner
telephone_text_clients = [
    telephone_test_qwen06_batch,
    telephone_test_qwen8_batch,
    telephone_test_llama70_batch,
    telephone_test_llama8_batch,
    telephone_test_gemma4_batch,
    telephone_test_gptoss_batch,
]

# No initial text is passed, so the runner uses DEFAULT_INITIAL_TEXT from telephone.py
run_batch_telephone_test(
    text_clients=telephone_text_clients,
    embedding_client=embedding_model,
    num_runs=300,
    iterations_per_run=20,
    save_history_every=20,
)

In [None]:
from src.visualization.utils import visualize_telephone_test_aggregated

# Plot settings for a single model's aggregated telephone runs
aggregated_plot_options = {
    "experiment_name": "telephone_qwen3:0.6b",
    "output_dir": "./telephone_task_experiments",
    "show_individual_runs": True,
    "confidence_level": 0.95,
}
visualize_telephone_test_aggregated(**aggregated_plot_options)

In [None]:
from src.visualization.utils import visualize_telephone_test_comparison

# The comparison auto-discovers all telephone_* experiments in the output directory
comparison_plot_options = {
    "output_dir": "./telephone_task_experiments",
    "confidence_level": 0.95,
}
visualize_telephone_test_comparison(**comparison_plot_options)

# Visualization and Analytics

Create test env setup below

In [None]:
# Call boilerplate code and specific constructs you need from your src files here