In [13]:
from datasets import load_dataset
from datetime import datetime, timezone
from dateutil import parser

# Load the "train" split of the alpindale/two-million-bluesky-posts dataset.
ds = load_dataset(
    path="alpindale/two-million-bluesky-posts",
    split="train",
)

In [19]:
# Print the first record so the available fields can be inspected.
print("First post:\n", ds[0])

First post:
 {'text': "This is really interesting polling data about national public attitudes re: California.  It's from the LA Times, in January.  I wonder if this will change substantially in the next two years?  5233025.fs1.hubspotusercontent-na1.net/hubfs/523302...", 'created_at': '2024-11-27T07:53:47.202Z', 'author': 'did:plc:5ug6fzthlj6yyvftj3alekpj', 'uri': 'at://did:plc:5ug6fzthlj6yyvftj3alekpj/app.bsky.feed.post/3lbw33zxvik24', 'has_images': False, 'reply_to': None}


In [20]:
# UTC bounds of the hour to extract: 07:00 up to 08:00 on 2024-11-27.
start_time = datetime(year=2024, month=11, day=27, hour=7, tzinfo=timezone.utc)
end_time = start_time.replace(hour=8)

def in_target_hour(example):
    """Return True when the post's ``created_at`` lies in [start_time, end_time).

    Args:
        example: A dataset row (mapping) whose ``"created_at"`` value is an
            ISO-8601 timestamp string such as ``"2024-11-27T07:53:47.202Z"``.

    Returns:
        bool: True if the parsed timestamp is at or after ``start_time`` and
        strictly before ``end_time``; False otherwise.

    Raises:
        ValueError: Propagated from ``parser.isoparse`` when ``created_at``
            is not a valid ISO-8601 string.
    """
    # isoparse accepts the trailing 'Z' (UTC) designator directly, so the
    # string is parsed as-is rather than rewriting every 'Z' it contains.
    dt = parser.isoparse(example["created_at"])
    if dt.tzinfo is None:
        # A timestamp without an offset cannot be compared with the aware
        # window bounds (TypeError would abort the whole filter).
        # NOTE(review): treating such values as UTC is an assumption — confirm
        # against the data source.
        dt = dt.replace(tzinfo=timezone.utc)
    return start_time <= dt < end_time

# Keep only the rows for which the in_target_hour predicate returns True.
subset_time = ds.filter(function=in_target_hour)
# Row predicate used to narrow the data to English when predicted_language exists:
def is_english(example):
    """Return True when the row's 'predicted_language' value is exactly 'en'.

    A row that has no 'predicted_language' key is treated as non-English.
    """
    language = example.get('predicted_language', '')
    return language == 'en'

# Report how many posts matched and preview up to three of them. The
# English-only subset is built only when the dataset has a
# 'predicted_language' column; otherwise the time-filtered subset is shown.
has_language = 'predicted_language' in ds.column_names
if has_language:
    subset_en = subset_time.filter(is_english)
    preview_rows = subset_en
    summary = f"\nFound {len(subset_en)} English posts in the hour 07:00–08:00.\n"
else:
    preview_rows = subset_time
    summary = f"\nFound {len(subset_time)} posts in the hour 07:00–08:00 (no language predictions available).\n"

print(summary)
for i in range(min(3, len(preview_rows))):
    print(preview_rows[i])


Found 16866 posts in the hour 07:00–08:00 (no language predictions available).

{'text': "This is really interesting polling data about national public attitudes re: California.  It's from the LA Times, in January.  I wonder if this will change substantially in the next two years?  5233025.fs1.hubspotusercontent-na1.net/hubfs/523302...", 'created_at': '2024-11-27T07:53:47.202Z', 'author': 'did:plc:5ug6fzthlj6yyvftj3alekpj', 'uri': 'at://did:plc:5ug6fzthlj6yyvftj3alekpj/app.bsky.feed.post/3lbw33zxvik24', 'has_images': False, 'reply_to': None}
{'text': 'Niet been, iets.', 'created_at': '2024-11-27T07:53:46.656Z', 'author': 'did:plc:xfvxxrfblwuc3kdthl344vc5', 'uri': 'at://did:plc:xfvxxrfblwuc3kdthl344vc5/app.bsky.feed.post/3lbw33zh7cs2l', 'has_images': False, 'reply_to': 'at://did:plc:fwjmfthdu5wppad75pcgpalj/app.bsky.feed.post/3lbvzv3tuc226'}
{'text': 'よく考えたらchrがsbさんの腰を両脚で挟んでどうこうとかポストしてたな\nいまさらだった……', 'created_at': '2024-11-27T07:53:46.554Z', 'author': 'did:plc:33m5bavcbydbgxqv7lzukclo'

In [23]:
import os
import asyncio
import json
import numpy as np
from getpass import getpass

from huggingface_hub import (
    notebook_login,
    create_inference_endpoint,
    list_inference_endpoints,
    whoami
)
from datasets import load_dataset
from dateutil import parser
from datetime import datetime, timezone
from tqdm import tqdm

In [26]:
# Authenticate this notebook session (interactive login widget).
notebook_login()

# Look up the logged-in account and echo its name as a sanity check.
who = whoami()
account_name = who["name"]
print("Currently logged in as:", account_name)

# Namespace passed on when creating the endpoint. To charge an organization
# instead of the personal account, prompt for it or set it manually here:
# organization = getpass("What is your Hugging Face 🤗 organization (with payment method)? ")
# namespace = organization or who["name"]
namespace = account_name  # or an organization name

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Currently logged in as: amuralit


In [None]:
##
# 2. Create/Load an Inference Endpoint
##
# Name used to create the endpoint and to look it up again if it already exists.
ENDPOINT_NAME = "my-bluesky-embedding-endpoint"

# Cloud placement and hardware for the endpoint.
# These were used in Derek's tutorial:
VENDOR = "aws"
REGION = "us-east-1"
INSTANCE_SIZE = "x1"         # Single GPU
INSTANCE_TYPE = "nvidia-a100"

# Also scales the MAX_BATCH_TOKENS value (MAX_WORKERS * 2048) sent to the endpoint image.
MAX_WORKERS = 5  # How many async workers (and a factor in how many tokens we can handle at once)

# Create the Inference Endpoint, or fall back to an existing endpoint that
# already carries ENDPOINT_NAME in this namespace.
try:
    # NOTE(review): `max_position_embeddings` was dropped from this call; it is
    # not a documented keyword of `create_inference_endpoint` and would raise a
    # TypeError before any request is sent. Confirm against the installed
    # huggingface_hub version; model limits belong in the image's "env".
    endpoint = create_inference_endpoint(
        name=ENDPOINT_NAME,
        repository="nvidia/NV-Embed-v2",
        task="sentence-embeddings",
        framework="pytorch",
        accelerator="gpu",
        instance_size=INSTANCE_SIZE,
        instance_type=INSTANCE_TYPE,
        region=REGION,
        vendor=VENDOR,
        namespace=namespace,
        custom_image={
            "health_route": "/health",
            "env": {
                "MAX_BATCH_TOKENS": str(MAX_WORKERS * 2048),
                "MAX_CONCURRENT_REQUESTS": "512",
                "MODEL_ID": "/repository",
            },
            "url": "ghcr.io/huggingface/text-embeddings-inference:0.5.0",
        },
        type="protected",
    )
    print(f"Created new endpoint: {ENDPOINT_NAME}")
except Exception as e:
    # Creation fails when the name is already taken, but also for unrelated
    # reasons (bad arguments, auth, quota). Show the real error so it is not
    # mistaken for the "already exists" case.
    print(f"Endpoint already exists or could not be created ({e!r}), loading existing endpoint...")
    endpoint = next(
        (ie for ie in list_inference_endpoints(namespace=namespace) if ie.name == ENDPOINT_NAME),
        None,
    )
    if endpoint is None:
        # Nothing to fall back to: re-raise the original creation error
        # instead of failing with an unrelated IndexError.
        raise

# Block until the endpoint reports that it is running. In the recorded run
# this call raised InferenceEndpointError because the deployment failed.
print("\nWaiting for endpoint to be running...")
endpoint.wait()
print("Endpoint is running!\n")

Created new endpoint: my-bluesky-embedding-endpoint

Waiting for endpoint to be running...


InferenceEndpointError: Inference Endpoint my-bluesky-embedding-endpoint failed to deploy. Please check the logs for more information.

In [29]:
##
# 4. Embed one sample text from the time-filtered subset (subset_time)
##
example_to_embed = subset_time[0]["text"]
print("Embedding this text:\n", example_to_embed)

# Request body for the Inference Endpoint.
embed_request = {
    "inputs": example_to_embed,
    "truncate": True,  # If text is too long, we truncate it
}
# NOTE(review): newer huggingface_hub releases may deprecate the raw `post`
# helper in favour of task methods — confirm against the installed version.
response_bytes = endpoint.client.post(json=embed_request, task="feature-extraction")

# The reply arrives as bytes: decode to text, parse the JSON, wrap in a NumPy array.
response_text = response_bytes.decode()
embedding_array = np.array(json.loads(response_text))

print("\nEmbedded vector shape:", embedding_array.shape)
print("First 10 values of the embedding:\n", embedding_array[0][:10])


Embedding this text:
 This is really interesting polling data about national public attitudes re: California.  It's from the LA Times, in January.  I wonder if this will change substantially in the next two years?  5233025.fs1.hubspotusercontent-na1.net/hubfs/523302...

Embedded vector shape: (1, 768)
First 10 values of the embedding:
 [-0.04396549 -0.07015622  0.07088956  0.03038124 -0.02131925 -0.02956059
  0.01464934 -0.04396549  0.06275297  0.02435737]
