In [26]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [1]:
import sys, os, subprocess, hashlib, torch
from mlx_lm import load, generate

#### 2. Run shell commands with live output
* Runs any shell command.
* Prints the output line by line instead of waiting until the end.
* Very useful for training loops that take minutes to hours.

In [2]:
def run_command_with_live_output(command: list[str]) -> None:
    """
    Courtesy of ChatGPT:
    Runs a command and prints its output line by line as it executes.

    Args:
        command (List[str]): The command and its arguments to be executed.

    Returns:
        None
    """
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    # Print the output line by line
    while True:
        output = process.stdout.readline()
        if output == '' and process.poll() is not None:
            break
        if output:
            print(output.strip())
        
    # Print the error output, if any
    err_output = process.stderr.read()
    if err_output:
        print(err_output)

#### 3. Format shell commands for easier copy/paste
* Converts a Python list command into a clean string.
* Example: `['python', 'scripts/convert.py', '--hf-path', 'model'] → "python scripts/convert.py --hf-path model"`

In [3]:
def construct_shell_command(command: list[str]) -> str:
    
    return str(command).replace("'","").replace("[","").replace("]","").replace(",","")

#### 4. Build prompts for testing inference
* Defines a role prompt for instruction-tuned models.
* Example: If a user says “Great content, thank you!”, the prompt tells the model how to respond like “ShawGPT”.
* prompt_builder wraps user comments in this instruction format.

In [4]:
# prompt format
intstructions_string = f"""ShawGPT, functioning as a virtual data science consultant on YouTube, communicates in clear, accessible language, escalating to technical depth upon request. \
It reacts to feedback aptly and ends responses with its signature '–ShawGPT'. \
ShawGPT will tailor the length of its responses to match the viewer's comment, providing concise acknowledgments to brief expressions of gratitude or feedback, \
thus keeping the interaction natural and engaging.

Please respond to the following comment.
"""

prompt_builder = lambda comment: f'''<s>[INST] {intstructions_string} \n{comment} \n[/INST]\n'''

#### 5. Convert Hugging Face model → MLX format / Quantize Model (optional)
* Downloads Hugging Face Mistral model.
* Converts it to MLX format (.npz files) for Apple Silicon.
* -q quantizes the model → smaller & faster.
* Prints the runnable command so you can also run it directly in the terminal.

In [5]:
hf_model_path = "mistralai/Mistral-7B-Instruct-v0.2"

In [6]:
# define command to convert hf model to mlx format and save locally (-q flag quantizes model)
command = ['python', 'scripts/convert.py', '--hf-path', hf_model_path, '-q']

# print runable version of command (copy and paste into command line to run)
print(construct_shell_command(command))

python scripts/convert.py --hf-path mistralai/Mistral-7B-Instruct-v0.2 -q


#### 6. Load quantized MLX model & test inference / Run inference with quantized model
* Loads the 4-bit quantized Mistral model.
* Builds a test prompt with prompt_builder.
* Runs inference with generate.
* max_tokens=140 → limits response length.
* ✅ Baseline inference before fine-tuning.

In [7]:
model_path = "mlx-community/Mistral-7B-Instruct-v0.2-4bit"
prompt = prompt_builder("Great content, thank you!")
max_tokens = 140

In [8]:
model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.2-4bit")
response = generate(model, tokenizer, prompt=prompt, max_tokens = max_tokens,verbose=True)

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

–ShawGPT: I'm glad you're finding the content helpful and enjoyable! If you have any specific questions or topics you'd like me to cover in more depth, feel free to ask. Otherwise, I'll keep providing clear and accessible explanations for all things data science. Thanks for tuning in!
Prompt: 121 tokens, 177.215 tokens-per-sec
Generation: 69 tokens, 36.989 tokens-per-sec
Peak memory: 4.547 GB


#### 7. Deduplication Helpers (CS + DR)

In [19]:
forward_cache = {}  # for compute reuse
fastcdc_path = "/Users/sanjeeb/Coding/HSSL/fastcdc-go/cmd/fastcdc/fastcdc"  # path to your binary

def hash_batch(batch_tensor: torch.Tensor) -> str:
    return hashlib.sha256(batch_tensor.cpu().numpy().tobytes()).hexdigest()

def run_training_with_cache(num_iters=100, seed=None):
    if seed is not None:
        torch.manual_seed(seed)   # ensure same batches each run
    hits, misses = 0, 0
    for step in range(num_iters):
        fake_batch = torch.randint(0, 1000, (8, 32))
        h = hash_batch(fake_batch)
        if h in forward_cache:
            hits += 1
        else:
            forward_cache[h] = fake_batch.sum().item()
            misses += 1
    return hits, misses


def run_fastcdc(input_file, output_file):
    cmd = [fastcdc_path, "-file", input_file, "-min", "16384", "-avg", "32768", "-max", "65536"]
    with open(output_file, "w") as f:
        subprocess.run(cmd, stdout=f, stderr=subprocess.PIPE, text=True)
    print(f"✅ FastCDC finished: {input_file} → {output_file}")

def load_hashes(chunk_file):
    hashes = []
    with open(chunk_file) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 3:
                hashes.append(parts[2])
    return set(hashes)


#### 8. Hyperparameter Sweep with Compute + Storage Deduplication


In [13]:
num_iters = "100"
steps_per_eval = "10"
val_batches = "-1"
num_layers = 16

learning_rates = ["1e-5", "5e-5"]
results = []
prev_hashes = None

for lr in learning_rates:
    adapter_file = f"adapters_lr{lr}.npz"
    chunk_file = f"chunks_lr{lr}.txt"

    if os.path.exists(adapter_file) and os.path.exists(chunk_file):
        print(f"Skipping fine-tuning for LR={lr}, artifacts already exist.")
    else:
        print(f"\n=== Fine-tuning with LR={lr} ===")

        # LoRA fine-tuning run
        command = [
            'python', 'scripts/lora.py',
            '--model', model_path,
            '--train',
            '--iters', num_iters,
            '--steps-per-eval', steps_per_eval,
            '--val-batches', val_batches,
            '--learning-rate', lr,
            '--lora-layers', str(num_layers)
        ]
        run_command_with_live_output(command)

        # Save adapters
        adapter_file = f"adapters_lr{lr}.npz"
        os.rename("adapters.npz", adapter_file)

        # Run FastCDC
        chunk_file = f"chunks_lr{lr}.txt"
        run_fastcdc(adapter_file, chunk_file)

        # Compute Savings (CS)
        hits, misses = run_training_with_cache(num_iters=100, seed=42)

        cs = hits / (hits + misses)
        print(f"Compute Savings (CS): {cs:.2%} (Hits={hits}, Misses={misses})")

        # Deduplication Ratio (DR)
        hashes = load_hashes(chunk_file)
        if prev_hashes is not None:
            common = len(prev_hashes.intersection(hashes))
            unique_curr = len(hashes - prev_hashes)
            if (len(prev_hashes) + unique_curr) > 0:
                dr = (len(prev_hashes) + len(hashes)) / (len(prev_hashes) + unique_curr)
            else:
                dr = 1.0
            print(f"Shared chunks: {common}, New unique chunks: {unique_curr}")
            print(f"Deduplication Ratio (DR): {dr:.2f}")
        else:
            dr = 1.00
            print("Baseline run: DR = 1.00")

        results.append({"lr": lr, "CS": cs, "DR": dr})
        prev_hashes = hashes


Skipping fine-tuning for LR=1e-5, artifacts already exist.
Skipping fine-tuning for LR=5e-5, artifacts already exist.


## Deduplication Analysis

In [27]:
### Deduplication Analysis (Byte-level vs Value-level)

import numpy as np

learning_rates = ["1e-5", "5e-5"]
results = []
prev_hashes = None
prev_adapter = None

for lr in learning_rates:
    print(f"\n=== Deduplication Analysis for LR={lr} ===")

    adapter_file = f"adapters_lr{lr}.npz"
    chunk_file = f"chunks_small_lr{lr}.txt"

    if not os.path.exists(adapter_file):
        print(f"❌ Missing adapter file: {adapter_file}")
        continue

    # --- Run FastCDC again with smaller chunk sizes ---
    cmd = [
        fastcdc_path,
        "-file", adapter_file,
        "-min", "1024",  # 1 KB
        "-avg", "2048",  # 2 KB
        "-max", "4096"  # 4 KB
    ]
    with open(chunk_file, "w") as f:
        subprocess.run(cmd, stdout=f, stderr=subprocess.PIPE, text=True)
    print(f"✅ Re-chunked {adapter_file} → {chunk_file}")

    # --- Compute Savings (CS) ---
    hits, misses = run_training_with_cache(num_iters=100, seed=42)
    cs = hits / (hits + misses) if (hits + misses) > 0 else 0.0
    print(f"Compute Savings (CS): {cs:.2%} (Hits={hits}, Misses={misses})")

    # --- Byte-level Deduplication (FastCDC DR) ---
    hashes = load_hashes(chunk_file)
    if prev_hashes is not None:
        common = len(prev_hashes.intersection(hashes))
        unique_curr = len(hashes - prev_hashes)
        if (len(prev_hashes) + unique_curr) > 0:
            dr_byte = (len(prev_hashes) + len(hashes)) / (len(prev_hashes) + unique_curr)
        else:
            dr_byte = 1.0
        print(f"[Byte-level] Shared chunks: {common}, New unique chunks: {unique_curr}, DR={dr_byte:.2f}")
    else:
        dr_byte = 1.00
        print("[Byte-level] Baseline run: DR = 1.00")

    # --- Value-level Similarity (NumPy arrays) ---
    with np.load(adapter_file) as arrs:
        if prev_adapter is not None:
            diffs = []
            for k in arrs.files:
                if k in prev_adapter:
                    diff = np.mean(np.abs(arrs[k] - prev_adapter[k]))
                    diffs.append(diff)
            mean_diff = float(np.mean(diffs)) if diffs else 0.0
            print(f"[Value-level] Mean weight difference: {mean_diff:.6f}")
        else:
            mean_diff = 0.0
            print("[Value-level] Baseline run (no comparison).")

    results.append({"lr": lr, "CS": cs, "DR_byte": dr_byte, "MeanDiff_val": mean_diff})
    prev_hashes = hashes
    prev_adapter = dict(np.load(adapter_file))

# --- Show Results Table ---
import pandas as pd

df = pd.DataFrame(results)
print("\n=== Results (Byte-level vs Value-level) ===")
print(df.to_string(index=False))



=== Deduplication Analysis for LR=1e-5 ===
✅ Re-chunked adapters_lr1e-5.npz → chunks_small_lr1e-5.txt
Compute Savings (CS): 100.00% (Hits=100, Misses=0)
[Byte-level] Baseline run: DR = 1.00
[Value-level] Baseline run (no comparison).

=== Deduplication Analysis for LR=5e-5 ===
✅ Re-chunked adapters_lr5e-5.npz → chunks_small_lr5e-5.txt
Compute Savings (CS): 100.00% (Hits=100, Misses=0)
[Byte-level] Shared chunks: 0, New unique chunks: 0, DR=1.00
[Value-level] Mean weight difference: 0.006872

=== Results (Byte-level vs Value-level) ===
  lr  CS  DR_byte  MeanDiff_val
1e-5 1.0      1.0      0.000000
5e-5 1.0      1.0      0.006872
