# Imports

In [8]:
from datasets import load_dataset
import weaviate
import uuid


# Data

Download the Reddit TIFU dataset:

In [9]:
num_samples = 100

# Load the "short" Reddit TIFU training split, take a fixed-seed shuffled sample,
# tag each sampled post with a random UUID, and keep only the text columns.
dataset = (
    load_dataset("reddit_tifu", "short", split="train")
    .shuffle(seed=42)
    .select(range(num_samples))
    .add_column("id", [str(uuid.uuid4()) for _ in range(num_samples)])
    .remove_columns(["ups", "num_comments", "upvote_ratio", "score"])
    .rename_column("documents", "content")
)

# Pandas view indexed by id, kept so the model summary can be checked
# against the original post later on.
df = dataset.to_pandas().set_index("id")


Found cached dataset reddit_tifu (/home/vscode/.cache/huggingface/datasets/reddit_tifu/short/1.1.0/1c73fb08807b54ec26b025829b2a3d90c6f7466dac20801c825571af9514c049)
Loading cached shuffled indices for dataset at /home/vscode/.cache/huggingface/datasets/reddit_tifu/short/1.1.0/1c73fb08807b54ec26b025829b2a3d90c6f7466dac20801c825571af9514c049/cache-224f5f019baecc2e.arrow
Loading cached processed dataset at /home/vscode/.cache/huggingface/datasets/reddit_tifu/short/1.1.0/1c73fb08807b54ec26b025829b2a3d90c6f7466dac20801c825571af9514c049/cache-c578ad205720b401.arrow


Upload to Weaviate:

In [10]:
# Connect to the Weaviate instance at localhost:8080.
client = weaviate.Client("http://localhost:8080")

# Schema of the "Post" class: a single text property holding the post body.
post_class_schema = {
    "class": "Post",
    "description": "A reddit post",
    "properties": [{
        "name": "content",
        "dataType": ["text"]
    }]
}

# Create the class only when it is missing, so this cell can be re-run
# (e.g. Restart & Run All) against an instance that already holds the schema
# instead of failing on a duplicate class.
existing_classes = {
    cls["class"] for cls in (client.schema.get().get("classes") or [])
}
if post_class_schema["class"] not in existing_classes:
    client.schema.create_class(post_class_schema)


In [11]:
# Import every sampled post into the "Post" class, reusing the dataset id
# as the object's UUID.
with client.batch(batch_size=20, dynamic=True, num_workers=1) as batch:
    for post in dataset:
        batch.add_data_object(
            data_object={"content": post["content"]},
            class_name='Post',
            uuid=post["id"],
        )


Summarize a randomly chosen post:

In [14]:
# Pick one post at random from the sampled data.
post_id = df.sample(1).index.values[0]

# Where-filter matching the object whose id equals the sampled post's id.
post_id_filter = {
    "path": ["id"],
    "operator": "Equal",
    "valueString": post_id
}

# Ask for the object's id plus a summary of its "content" property.
result = (
    client.query
    .get('Post', ['_additional  {id summary ( properties: ["content"]) { result }}'])
    .with_where(post_id_filter)
    .do()
)

# Surface query failures explicitly instead of an opaque KeyError/IndexError
# further down when the response is indexed.
if "errors" in result:
    raise RuntimeError(f"Weaviate summary query failed: {result['errors']}")

matched_posts = result["data"]["Get"]["Post"]
if not matched_posts:
    raise LookupError(f"No Post object found for id {post_id}")

summaries = matched_posts[0]["_additional"]["summary"]
if not summaries:
    raise LookupError(f"No summary returned for post {post_id}")

model_tldr = summaries[0]["result"]

# Look the row up once; fall back to the title when the post has no tldr.
post_row = df.loc[post_id]
content = post_row["content"]
true_tldr = post_row["tldr"] or post_row["title"]


In [15]:
# Show the original post, the model summary and the reference summary,
# separated by 80-character rules.
divider = "-" * 80
sections = [
    f"Content:\n{content}",
    f"model tldr:\n{model_tldr}",
    f"true tldr:\n{true_tldr}",
]
print(f"\n{divider}\n".join(sections))


Content:
obligatory this didn't happen today. i was going to post this on the day it happened so that i could feel above and better than everyone who says "obligatory this didn't happen today", but i didn't. so i don't.  

anyways it all started with a game of volleyball, was a lovely fun game with friends and family and involved sand. sand is usually fun and soft to fall on. however it is not recommended to use as lubricant. i came straight home, a short walk, after our fun little session of play and decided to embark on my own, personal session. by which i mean i masturbated.  

now i didn't have the time to wash my hands thoroughly before my misadventure and so i realised halfway through that there were still grains of sand, rubbing against my shaft. bravely, i continued, relaxing my hand and caressing myself with a lovers embrace rather than a death clinch with a thrust great enough to achieve geostationary orbit, and so was able to tenaciously proceed... albeit with caution.  

as