In [65]:
import json
import numpy as np
from pprint import pprint
import re
import torch
from transformers import CLIPProcessor, CLIPModel
from tqdm.notebook import tqdm
from umap import UMAP

In [2]:
# Maximum number of whitespace-separated words allowed in one text chunk;
# stanzas with more words than this are split on line boundaries.
MAX_WORDS = 50

In [3]:
# Read the whole book. The encoding is given explicitly so decoding does not
# depend on the platform's default locale.
# NOTE(review): assumes the text file is UTF-8 encoded — confirm.
with open("./../data/leaves-of-grass.txt", "r", encoding="utf-8") as in_file:
    text = in_file.read()

# Collapse runs of three or more newlines into exactly one blank line.
text = re.sub(r'\n{3,}', '\n\n', text)
lines = text.split("\n")

In [39]:
def split_stanza(stanza, MAX_WORDS=MAX_WORDS):
    """Split a stanza into chunks of whole lines of at most MAX_WORDS words.

    Lines are accumulated in order. The running chunk is closed as soon as
    adding the next line would push its word count past MAX_WORDS. A single
    line that is itself longer than MAX_WORDS is emitted unsplit as its own
    chunk.

    Args:
        stanza: Text of one stanza, lines separated by newline characters.
        MAX_WORDS: Maximum number of whitespace-separated words per chunk.

    Returns:
        A list of chunk strings, each made of one or more stanza lines
        joined by newline characters, in their original order.
    """
    chunks = []
    current_chunk = []
    current_word_count = 0

    for line in stanza.split('\n'):
        line_word_count = len(line.split())

        # Close the running chunk if this line would overflow it.
        if current_chunk and current_word_count + line_word_count > MAX_WORDS:
            chunks.append('\n'.join(current_chunk))
            current_chunk = []
            current_word_count = 0

        if line_word_count > MAX_WORDS:
            # If the line is too long, we'll just let CLIP truncate.
            # The running chunk is always empty here: an over-long line
            # necessarily triggered the flush above, so no second flush
            # is needed before emitting the line on its own.
            chunks.append(line)
        else:
            current_chunk.append(line)
            current_word_count += line_word_count

    # Emit whatever is left in the running chunk.
    if current_chunk:
        chunks.append('\n'.join(current_chunk))

    return chunks

# Parse the flat list of text lines into poems.
#
# A non-empty line that does not start with a space is a heading: an
# all-uppercase heading sets the current book, any other heading sets the
# current poem title. Every other line (indented or blank) is body text of the
# poem currently being collected.
poems = []
book = ""
title = ""
body_lines = []

for line in lines:
    is_heading = line != "" and not line.startswith(" ")
    if not is_heading:
        body_lines.append(line.strip())
        continue

    # A heading closes the poem collected so far, if it has any text at all.
    if any(body_line != "" for body_line in body_lines):
        poems.append({
            "book": book,
            "title": title,
            "lines": body_lines,
        })
    if line.upper() == line:
        book = line.strip()
    else:
        title = line.strip()
    body_lines = []

# Flush the last poem with the same non-empty check used above, so trailing
# blank lines or a trailing heading do not produce an empty poem.
if any(body_line != "" for body_line in body_lines):
    poems.append({
        "book": book,
        "title": title,
        "lines": body_lines,
    })

for poem in poems:
    # Stanzas are separated by one blank line. Whitespace-only source lines
    # can leave empty stanzas behind; drop them so no empty chunk is embedded.
    stripped_stanzas = [stanza.strip() for stanza in "\n".join(poem["lines"]).split("\n\n")]
    poem["stanzas"] = [stanza for stanza in stripped_stanzas if stanza]
    poem["chunks"] = []
    for stanza in poem["stanzas"]:
        # str.split() with no argument splits on any whitespace, newlines included.
        total_words = len(stanza.split())
        if total_words <= MAX_WORDS:
            poem["chunks"].append(stanza)
        else:
            poem["chunks"].extend(split_stanza(stanza))

In [40]:
# Load the CLIP model and its processor from the same pretrained checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

In [41]:
# Compute one unit-length CLIP text embedding per chunk and store them, in
# chunk order, under poem["embeddings"] as plain Python lists.
# Gradients are never needed here, so the whole loop runs under no_grad.
with torch.no_grad():
    for poem in tqdm(poems):
        poem["embeddings"] = []
        for chunk in poem["chunks"]:
            encoded = processor(
                text=[chunk],
                return_tensors="pt",
                padding=True,
                truncation=True,
            ).to("cpu")
            features = model.get_text_features(**encoded)
            # L2-normalise each row of the feature matrix.
            unit_features = features / features.norm(dim=1, keepdim=True)
            poem["embeddings"].append(unit_features[0].numpy().tolist())

  0%|          | 0/383 [00:00<?, ?it/s]

In [75]:
# Flatten the per-poem embeddings into one list of records, remembering which
# poem and which chunk position each vector came from.
embeddings = []
for poem_index, poem in enumerate(poems):
    for embedding_index, embedding in enumerate(poem["embeddings"]):
        embeddings.append({
            "poem_index": poem_index,
            "embedding_index": embedding_index,
            "embedding": embedding
        })

# Both reducers are fitted on the same vectors; build that list only once.
embedding_vectors = [d["embedding"] for d in embeddings]

# Low-dimensional projection using UMAP's default number of components.
reducer = UMAP(random_state=42, n_neighbors=20)
reduced = reducer.fit_transform(embedding_vectors)

# 10-component projection. This must be fitted with `reducer_long`; fitting
# with `reducer` again would just reproduce the low-dimensional projection.
reducer_long = UMAP(random_state=42, n_neighbors=20, n_components=10)
reduced_long = reducer_long.fit_transform(embedding_vectors)

# Attach both projections to the record they were computed from.
for record, point, point_long in zip(embeddings, reduced, reduced_long):
    record["reduced"] = point
    record["long"] = point_long

  warn(


In [76]:
# Load the image records. JSON text is decoded explicitly as UTF-8 so the
# result does not depend on the platform's default locale encoding.
with open("./../data/vectors.json", "r", encoding="utf-8") as in_file:
    images = json.load(in_file)

In [77]:
def find_most_similar(needle, haystack):
    """Find the haystack row with the highest cosine similarity to needle.

    Args:
        needle: A single vector (any sequence accepted by np.array).
        haystack: A sequence of vectors, one per row, each with the same
            length as needle.

    Returns:
        A tuple (index, score): the row index of the best match in haystack
        and its cosine similarity to needle.
    """
    query = np.array(needle)
    candidates = np.array(haystack)

    # Scale the query and every candidate row to unit length, so the dot
    # product below is the cosine similarity.
    unit_query = query / np.linalg.norm(query)
    row_norms = np.linalg.norm(candidates, axis=1)
    unit_candidates = candidates / row_norms[:, np.newaxis]

    scores = unit_candidates @ unit_query
    best = np.argmax(scores)
    return best, scores[best]

# The image vectors are the same for every query, so build the matrix once
# instead of rebuilding the list on every iteration.
image_vectors = np.array([image["vector"] for image in images])

# Tag every text embedding with the filename of its most similar image.
for d in tqdm(embeddings):
    index, _ = find_most_similar(d["embedding"], image_vectors)
    d["image_filename"] = images[index]["image_filename"]

  0%|          | 0/3469 [00:00<?, ?it/s]

In [82]:
# Index the flat embedding records by poem once, so each poem's records are a
# dictionary lookup instead of a scan over every record. Records keep the
# order they have in `embeddings`.
needles_by_poem = {}
for record in embeddings:
    needles_by_poem.setdefault(record["poem_index"], []).append(record)

out = []
# Wrap `poems` itself (not the enumerate object) so tqdm knows the total.
for poem_index, poem in enumerate(tqdm(poems)):
    if "title" not in poem:
        print(poem)
        continue
    needles = needles_by_poem.get(poem_index, [])
    # Chunks and records are paired by position, so skip (and report) any poem
    # whose counts do not line up rather than pairing them incorrectly.
    if len(needles) != len(poem["chunks"]):
        print("needles", len(needles))
        print("chunks", len(poem["chunks"]))
        continue
    datum = {
        "book": poem["book"],
        "title": poem["title"],
        "chunks": [
            {
                "body": chunk,
                # .tolist() turns the stored arrays into JSON-serialisable lists.
                "embedding": needle["long"].tolist(),
                "image": needle["image_filename"],
                "reduced": needle["reduced"].tolist()
            } for chunk, needle in zip(poem["chunks"], needles)
        ]
    }
    out.append(datum)


0it [00:00, ?it/s]

In [83]:
# Write the assembled records to disk as pretty-printed JSON.
with open("./../src/lib/data.json", "w") as handle:
    json.dump(out, handle, indent=2)