In [None]:
"""
1) Read from the same movie text files (X lines, 5000 for now but can specify)
2) Create a prompt for an LLM which tells the LLM to generate a hierarchy of movies based on their script
3) Use the transformers .from_pretrained method with mistralai/Mistral-7B-v0.1, to feed the prompt to this model, and have it generate a hierarchy
4) Then, create a graph based on this hierarchy, and compare the graph created by the LLM to the one created manually
"""

In [1]:
"""
What is the point? What can we do?
1) Discover which features/words from graphs are most salient for deciding categories
2) We really want to understand how an LLM will understand concepts.. how to do this?
3) E.g. we can probe the attention heads and create movie representations..??
4) E.g. we can do the same when it is looking at the movie reviews.. how do the spaces look?
5) Beyond just comparing how they look, need a way of putting labels on movies back..
"""

'\nWhat is the point? What can we do?\n'

In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.52.2-py3-none-any.whl.metadata (40 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.31.4-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.11.6-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.52.2-py3-none-any.whl (10.5 MB)
   ---------------------------------------- 0.0/10.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.5 MB ? eta -:--:--
   ---------------- ----------------------- 4.2/10.5 MB 25.9 MB/s eta 0:00:01
   ---------------------------------------  10.2/10.5 MB 26.9 MB/s eta 0:00:01
   ---------------------------------------- 1

In [6]:
# Hugging Face access settings.
# The access token is read from the HF_TOKEN environment variable so that it is
# never stored in the notebook; export it before starting the kernel.
# NOTE(review): the token that used to be hard-coded in this cell has been
# exposed and should be revoked and re-issued.
import os

llama_api_code = os.environ.get("HF_TOKEN")  # None when the variable is unset
llama_model_name = "meta-llama/Llama-2-7b-chat-hf"

In [None]:
#!/usr/bin/env python3
# llm_hierarchy_graph.py
# Build script-similarity graph, ask Llama-2 for a hierarchy, compare & plot.

import os
import glob
import itertools
import re

import torch
import networkx as nx
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# ─── AUTH ────────────────────────────────────────────────────────────────────
# The access token is taken from the environment (HF_TOKEN, falling back to
# HUGGINGFACEHUB_API_TOKEN) instead of being committed with the notebook.
# NOTE(review): the token that used to be hard-coded here has been exposed and
# should be revoked and re-issued.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if HF_TOKEN:
    # os.environ only accepts strings, so export the token only when one was found.
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN
# When no token is configured HF_TOKEN stays None.
# NOTE(review): confirm the downstream loaders then fall back to a cached
# `huggingface-cli login` credential.

# ─── CONFIG ─────────────────────────────────────────────────────────────────
# Input data
SCRIPT_DIR = "Action/Action"  # directory scanned for *.txt movie scripts
MAX_LINES = 5_000             # number of leading lines read from each script

# Similarity graph
SIM_THRESH = 0.25  # minimum cosine similarity for an edge to be kept
TOP_K = 5          # nearest neighbours considered per movie; falsy = threshold only

# Text generation
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
MAX_TOKENS = 512   # upper bound on newly generated tokens
TEMPERATURE = 0.7  # sampling temperature

# ─── STEP 1: load scripts & build similarity graph ───────────────────────────
# Every *.txt file in SCRIPT_DIR is one movie; its title is the file name
# without the extension and its text is the first MAX_LINES lines.
paths = sorted(glob.glob(os.path.join(SCRIPT_DIR, "*.txt")))
if not paths:
    # Fail early with an actionable message instead of an obscure vectoriser
    # error on an empty corpus further down.
    raise FileNotFoundError(f"No *.txt scripts found in {SCRIPT_DIR!r}")

titles = []
docs   = []

for p in paths:
    with open(p, "r", encoding="utf-8", errors="ignore") as f:
        docs.append(" ".join(itertools.islice(f, MAX_LINES)))
    titles.append(os.path.splitext(os.path.basename(p))[0])

# TF-IDF + cosine: sim[i, j] is the cosine similarity between scripts i and j.
tfidf = TfidfVectorizer(min_df=2, max_df=0.9, stop_words="english")
X     = tfidf.fit_transform(docs)
sim   = cosine_similarity(X)

# Nodes are numbered from 1 in title order; the title is kept as an attribute.
sim_graph = nx.Graph()
for i, title in enumerate(titles, start=1):
    sim_graph.add_node(i, title=title)

n = len(titles)
for i in range(n):
    if TOP_K:
        # Rank all movies by similarity (best first) and drop the movie itself
        # *before* slicing, so exactly TOP_K neighbours are considered even
        # when ties push the self-similarity out of the leading positions.
        order  = sim[i].argsort()[::-1]
        neighs = order[order != i][:TOP_K]
    else:
        neighs = [j for j in range(n) if j != i and sim[i, j] >= SIM_THRESH]

    for j in neighs:
        w = float(sim[i, j])
        if w < SIM_THRESH:
            continue  # too dissimilar to be worth an edge
        # int() keeps node ids plain Python ints (argsort yields numpy ints).
        sim_graph.add_edge(i + 1, int(j) + 1, weight=w)

# ─── STEP 2: build LLM prompt ────────────────────────────────────────────────
# The prompt is an instruction header, then one section per movie holding the
# first 1000 characters of its script (newlines flattened to spaces), then a
# trailing cue for the model to continue from.
prompt_parts = [
    "You are an expert film critic.\n"
    "Produce a HIERARCHICAL clustering (indented plain text) of these ACTION movies.\n\n"
]
for movie_title, script_text in zip(titles, docs):
    excerpt = script_text[:1000].replace("\n", " ")
    prompt_parts.append(f"--- {movie_title} ---\n{excerpt}\n\n")
prompt_parts.append("Hierarchy:\n")

prompt = "".join(prompt_parts)

# ─── STEP 3: load LLM & generate ─────────────────────────────────────────────
# Use the first GPU when one is available, otherwise the CPU. The pipeline
# expects an integer device index (-1 meaning CPU), hence the two variables.
device_str = "cuda" if torch.cuda.is_available() else "cpu"
device_id  = 0      if torch.cuda.is_available() else -1

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
    use_fast=True
)

# `token=` replaces the deprecated `use_auth_token=` keyword and matches the
# tokenizer call above.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN
).to(device_str)

gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=MAX_TOKENS,
    do_sample=True,
    temperature=TEMPERATURE,
    device=device_id
)

# Sampling is stochastic; seeding makes the generated hierarchy repeatable
# across "Restart & Run All".
torch.manual_seed(42)

# return_full_text=False keeps only the continuation, not the echoed prompt.
generated = gen(prompt, return_full_text=False)[0]["generated_text"]
print("=== LLM hierarchy ===\n")
print(generated)

# ─── STEP 4: parse hierarchy into DiGraph ───────────────────────────────────
# Each non-empty line of the generated text becomes a node; a line's parent is
# the closest preceding line with a strictly smaller indentation. Lines whose
# text equals a known title get that movie's 1-based id, everything else
# becomes a "group:<name>" node.
hier_graph = nx.DiGraph()

# Leading list markers only: bullets ("-", "•", "*") or numbering ("1." / "2)").
# Numbering must be followed by whitespace, so titles that merely start or end
# with digits ("2012", "Die Hard 2", "3.10 to Yuma") are left intact.
marker_re = re.compile(r"^(?:[-•*]+\s*|\d+[.)]\s+)+")

# The sentinel sits at indent -1, below every real indent, so it is never
# popped and top-level lines (indent 0) get parent None instead of crashing.
stack = [(-1, None)]   # (indent_level, node_id)

for line in generated.splitlines():
    if not line.strip():
        continue
    indent = len(line) - len(line.lstrip())
    # Drop list markers, then surrounding whitespace / markdown asterisks.
    name   = marker_re.sub("", line.strip()).strip("* ")
    if not name:
        continue

    try:
        idx = titles.index(name) + 1
    except ValueError:
        idx = f"group:{name}"

    # find correct parent: unwind to the nearest shallower entry
    while indent <= stack[-1][0]:
        stack.pop()
    parent = stack[-1][1]

    hier_graph.add_node(idx)
    if parent is not None:
        hier_graph.add_edge(parent, idx)
    stack.append((indent, idx))

# ─── STEP 5: compare graphs ─────────────────────────────────────────────────
# graph_edit_distance requires both graphs to be of the same type, so the
# directed hierarchy is compared through its undirected view. The search is
# capped at 10 seconds, which is why the result is reported as approximate.
ged = nx.graph_edit_distance(sim_graph, hier_graph.to_undirected(), timeout=10)
print(f"\nApproximate graph-edit distance: {ged}")

# ─── STEP 6: visualize both graphs ──────────────────────────────────────────
# Two panels side by side: the TF-IDF similarity graph on the left and the
# LLM-generated hierarchy on the right.
fig, (ax_sim, ax_hier) = plt.subplots(1, 2, figsize=(12, 5))

# similarity graph (node labels are the 1-based movie ids)
pos1 = nx.spring_layout(sim_graph, k=0.5, seed=42, weight="weight")
nx.draw(
    sim_graph, pos1,
    ax=ax_sim,
    with_labels=True,
    node_color="#1f77b4",
    node_size=400,
    font_color="white"
)
ax_sim.set_title("Script-similarity graph")

# LLM hierarchy: prefer a layered Graphviz layout, fall back to a spring
# layout when it cannot be computed. Catching Exception (not a bare except)
# keeps the fallback without swallowing KeyboardInterrupt / SystemExit.
try:
    pos2 = nx.nx_agraph.graphviz_layout(hier_graph, prog="dot")
except Exception:
    pos2 = nx.spring_layout(hier_graph, seed=42)
nx.draw(
    hier_graph, pos2,
    ax=ax_hier,
    with_labels=True,
    node_color="#d62728",
    node_size=400
)
ax_hier.set_title("LLM-generated hierarchy")

fig.tight_layout()
plt.show()


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Device set to use cpu
