# Author Feed Extractor

This notebook extracts posts from reputable news feeds using Bluesky's API.

In [86]:
import time
import dotenv
import pandas as pd
import requests
import os
from bs4 import BeautifulSoup

This cell looks for an existing file containing Bluesky posts, and if one exists, looks loads UUIDs to prevent adding duplicate posts; if file does not exist, it creates it.

In [87]:
# Step 1: Set paths and filenames
output_dir = 'data/output/1_author_feed_extractor'
output_file = f'collected_feeds.csv'
full_path = os.path.join(output_dir, output_file)

# Step 2: Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Step 3: If file exists, load it and get existing UUIDs
if os.path.exists(full_path):
    existing_df = pd.read_csv(full_path)
    existing_uuids = set(existing_df['uuid'].dropna().unique())
else:
    existing_uuids = pd.DataFrame
    existing_df = pd.DataFrame

In [88]:
# Load environment variables from .env file
dotenv.load_dotenv('data/input/.env')

# Access environment variables
bluesky_account = os.getenv('BLUESKY_HANDLE')
bluesky_account_password = os.getenv('BLUESKY_APP_PASSWORD')

accounts = [
    "nytimes.com",
    "washingtonpost.com",
    "wsj.com",
    "cnn.com",
    "nbcnews.com",
    "cbsnews.com",
    "latimes.com",
    "expressnews.com",
    "apnews.com",
    "reuters.com",
    "npr.org",
    "msnbc.com",
    "financialtimes.com",
    "bloomberg.com",
    "theguardian.com",
    "chicagotribune.com",
    "usatoday.com",
    "cnbc.com",
    # "bbc.com",
    "axios.com",
    "thehill.com",
    "politico.com",
    "financialtimes.com",
    "economist.com",
    'thediplomat.com',
    'asia.nikkei.com',
    'scmp.com',
    'japantimes.co.jp',
    'aljazeera.com',
    'theguardian.com',
    'politico.eu',
    'sydmorningherald.bsky.social'
]

This cell handles authentication.

In [89]:
auth_resp = requests.post(
    "https://bsky.social/xrpc/com.atproto.server.createSession",
    json={"identifier": bluesky_account, "password": bluesky_account_password}
)
auth_resp.raise_for_status()
access_jwt = auth_resp.json()["accessJwt"]

headers = {"Authorization": f"Bearer {access_jwt}"}

This cell fetches an author's feed


In [90]:
def fetch_author_feed(actor, cursor=None, limit=10):
    """Fetch posts for the given author handle, with optional pagination cursor."""
    url = "https://bsky.social/xrpc/app.bsky.feed.getAuthorFeed"
    params = {"actor": actor, "limit": limit}
    # if cursor:
    #     params["cursor"] = cursor
    resp = requests.get(url, headers=headers, params=params)
    if resp.status_code == 429:
        # Rate limited; wait and retry
        print("Too many requests, retrying in 5 seconds...")
        time.sleep(5)
        # return fetch_author_feed(actor, cursor, limit)
        return fetch_author_feed(actor, limit)
    resp.raise_for_status()
    return resp.json()


The following cells begin the monitoring loop.

In [91]:
collected_posts = []

try:
    for acct in accounts:
        try:
            data = fetch_author_feed(acct, limit=10)
            posts = data.get("feed", [])
            for item in posts:
                post = item.get("post", {})
                record = post.get("record", {})
                author_info = post.get("author", {})

                if record.get("$type") == "app.bsky.feed.post":
                    uuid = post.get("uri", "")
                    text = record.get("text", "").strip()
                    timestamp = record.get("createdAt", "")
                    display_name = author_info.get("displayName", "")
                    title = record.get("embed", {}).get("external", {}).get("title", "")
                    external_uri = record.get("embed", {}).get("external", {}).get("uri", "")

                    if text and timestamp and external_uri:
                        collected_posts.append({
                            "uuid": uuid,
                            "author": acct,
                            "displayName": display_name,
                            "title": title,
                            "text": text,
                            "timestamp": timestamp,
                            "uri": external_uri,
                        })
                    else:
                        collected_posts.append({
                        "uuid": uuid,
                        "author": acct,
                        "displayName": display_name,
                        "title": title,
                        "text": text,
                        "timestamp": timestamp,
                        "uri": '',
                            })
            time.sleep(1)
        except Exception as e:
            print(f"Error fetching feed for {acct}: {e}")
    print(f"\nCollected {len(collected_posts)} posts total.")
except KeyboardInterrupt:
    print("Stopping feed monitor.")


Collected 300 posts total.


In [92]:
len(collected_posts)

300

The cell below searches for UUIDs that already exist in our collection (if any) and discards any newly collected posts that contain the same UUIDs.

In [93]:
# Convert to DataFrame
raw_df = pd.DataFrame(collected_posts)
# Drop duplicates based on UUID
raw_df = raw_df.drop_duplicates(subset='uuid')

# Filter new data to only include unseen UUIDs
new_df = raw_df[~raw_df['uuid'].isin(existing_uuids)]

new_df

Unnamed: 0,uuid,author,displayName,title,text,timestamp,uri
0,at://did:plc:eclio37ymobqex2ncko63h4r/app.bsky...,nytimes.com,The New York Times,Japan Welcomes a New Sumo Champ. Surprise: He’...,Onosato Daiki of Japan was named on Wednesday ...,2025-05-28T15:18:52.783Z,https://www.nytimes.com/2025/05/28/world/asia/...
1,at://did:plc:eclio37ymobqex2ncko63h4r/app.bsky...,nytimes.com,The New York Times,,From @theathleticfc.bsky.social: Lamine Yamal ...,2025-05-28T15:12:48.960Z,
2,at://did:plc:eclio37ymobqex2ncko63h4r/app.bsky...,nytimes.com,The New York Times,,The mere thought of ticks makes our skin crawl...,2025-05-28T14:38:26.326Z,
3,at://did:plc:eclio37ymobqex2ncko63h4r/app.bsky...,nytimes.com,The New York Times,Do I Need to Refrigerate Ketchup? An A-to-Z Gu...,Do you need to refrigerate ketchup? Soy sauce?...,2025-05-28T14:32:38.171Z,https://nyti.ms/43udGJj
4,at://did:plc:eclio37ymobqex2ncko63h4r/app.bsky...,nytimes.com,The New York Times,,How did you do in Connections today? Players m...,2025-05-28T14:29:10.910Z,
...,...,...,...,...,...,...,...
286,at://did:plc:bak7f4b3jsiqlpyo6o4ejaji/app.bsky...,politico.eu,POLITICO Europe,Christine Lagarde discussed leaving ECB early ...,Christine Lagarde has discussed leaving the Eu...,2025-05-28T10:30:36.896Z,https://ow.ly/NxCk50VZVXE
287,at://did:plc:bak7f4b3jsiqlpyo6o4ejaji/app.bsky...,politico.eu,POLITICO Europe,Prague accuses China of hacking Czech foreign ...,Prague accuses China of hacking Czech foreign ...,2025-05-28T09:25:35.827Z,https://ow.ly/3IQ250VZTO6
288,at://did:plc:e5ixzv36nmsa5brpxw5tqixg/app.bsky...,politico.eu,Camille Gijs,"A big, beautiful EU trade deal with Trump? Dre...",Donald Trump may be happy with the EU’s promis...,2025-05-28T09:15:42.803Z,https://www.politico.eu/article/eu-trade-deal-...
289,at://did:plc:kquml2hoj6p7qsw7ohlgfa5k/app.bsky...,politico.eu,Pieter Haeck,US Republicans slam EU ‘double standard’ over ...,NEW: U.S. Republican lawmakers are reproaching...,2025-05-28T07:02:24.127Z,https://www.politico.eu/article/us-republicans...


In [94]:
# Convert timestamp to datetime and sort
new_df['timestamp'] = pd.to_datetime(new_df['timestamp'], utc=True, errors='coerce')
new_df = new_df.sort_values(by='timestamp', ascending=False)

# Desired column order
cols = ['title', 'text'] + [col for col in new_df.columns if col not in ['title', 'text', 'uuid']] + ['uuid']
# Reorder the DataFrame
new_df = new_df[cols]
# Rename the 'uri' column
new_df = new_df.rename(columns={'uri': 'external_url'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['timestamp'] = pd.to_datetime(new_df['timestamp'], utc=True, errors='coerce')


In [95]:
new_df.head()

Unnamed: 0,title,text,author,displayName,timestamp,external_url,uuid
50,Capitol police chief Thomas Manger says Trump'...,Capitol police chief Thomas Manger has led — a...,cbsnews.com,CBS News,2025-05-28 15:20:01.664000+00:00,https://cbsn.ws/4jou7wO,at://did:plc:3bxtpdpr73tf7tldv5q4oyqc/app.bsky...
0,Japan Welcomes a New Sumo Champ. Surprise: He’...,Onosato Daiki of Japan was named on Wednesday ...,nytimes.com,The New York Times,2025-05-28 15:18:52.783000+00:00,https://www.nytimes.com/2025/05/28/world/asia/...,at://did:plc:eclio37ymobqex2ncko63h4r/app.bsky...
1,,From @theathleticfc.bsky.social: Lamine Yamal ...,nytimes.com,The New York Times,2025-05-28 15:12:48.960000+00:00,,at://did:plc:eclio37ymobqex2ncko63h4r/app.bsky...
110,"Hayes: We are more powerful than ‘one petty, a...",OPINION:\n\nHayes: We are more powerful than ‘...,msnbc.com,MSNBC,2025-05-28 15:11:37.400000+00:00,https://www.youtube.com/watch?v=W-cCxwxI8Fk,at://did:plc:ofbkqcjzvm6gtwuufsubnkaf/app.bsky...
200,‘He Is Ohio’: DeWine Pitches an Alternative to...,Column: Ohio Gov. Mike DeWine has a new plan t...,politico.com,Politico,2025-05-28 15:10:58.986000+00:00,https://www.politico.com/news/magazine/2025/05...,at://did:plc:yf6hctt2ug3qyfty4in64yob/app.bsky...


The following cells fetch additional metadata for each post by accessing embedded links within each post (if available)


In [96]:
def fetch_metadata(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, timeout=10, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Page title
        page_title = soup.title.string.strip() if soup.title else ''

        # Meta description (Open Graph preferred)
        meta = soup.find('meta', attrs={'property': 'og:description'}) or \
               soup.find('meta', attrs={'name': 'description'})
        description = meta['content'].strip() if meta and 'content' in meta.attrs else ''

        return page_title, description

    except Exception as e:
        return '', ''

In [97]:
if not new_df.empty:
    new_df[['page_title', 'meta_description']] = new_df['external_url'].apply(
        lambda url: pd.Series(fetch_metadata(url)))

This cell saves collected posts to file.

In [98]:
df = pd.concat([existing_df, new_df], ignore_index=True)

df.to_csv(full_path, index=False)

print(f"Appended {len(new_df)} new rows.")

Appended 220 new rows.
Final saved file: data/output/1_author_feed_extractor/collected_feeds.csv (total 280 rows)


In [99]:
df.sort_index(ascending=False)

Unnamed: 0,title,text,author,displayName,timestamp,external_url,uuid,page_title,meta_description
529,"Aboriginal man, 24, dies in police custody in ...",NT police say ‘the man stopped breathing’ shor...,aljazeera.com,,NaT,https://bit.ly/3HrRxE2,at://did:plc:2lofqead276vtc5647ye7sl2/app.bsky...,"Aboriginal man, 24, dies in police custody in ...",NT police say ‘the man stopped breathing’ shor...
528,"Gulf states, China take centre stage at summit...","The GCC, China, ASEAN countries pledged to pro...",aljazeera.com,,NaT,https://bit.ly/3HeFOZG,at://did:plc:2lofqead276vtc5647ye7sl2/app.bsky...,"Gulf states, China take centre stage at summit...","The GCC, China, ASEAN countries pledged to pro..."
527,Ukraine’s Zelenskyy to meet Germany’s Merz in ...,Talks come after German chancellor says allies...,aljazeera.com,,NaT,https://bit.ly/4mBTTAw,at://did:plc:2lofqead276vtc5647ye7sl2/app.bsky...,Merz says Germany will help Ukraine produce lo...,Zelenskyy said the two leaders agreed to coope...
526,German court rejects Peruvian farmer’s climate...,Judge rules that companies ‘may be obligated t...,aljazeera.com,,NaT,https://bit.ly/43zznrE,at://did:plc:2lofqead276vtc5647ye7sl2/app.bsky...,German court rejects Peruvian farmer’s climate...,Judge rules that companies ‘may be obligated t...
525,US pauses new student visas: What it means and...,The Trump administration has paused new visa i...,aljazeera.com,,NaT,https://bit.ly/4kHmany,at://did:plc:2lofqead276vtc5647ye7sl2/app.bsky...,US pauses new student visas: What it means and...,The Trump administration has paused new visa i...
...,...,...,...,...,...,...,...,...,...
4,"Former Times reporter sues Villanueva, L.A Cou...","Former Times reporter sues Villanueva, L.A Cou...",latimes.com,Los Angeles Times,2025-05-28 00:51:03.498000+00:00,https://www.latimes.com/california/story/2025-...,at://did:plc:d2jith367s6ybc3ldsusgdae/app.bsky...,"Former Times reporter sues Villanueva, L.A Cou...",Former Times reporter Maya Lau has filed a law...
3,"Trump touts free ""Golden Dome"" for Canada, as ...",Canadian PM Mark Carney told CBC today he want...,axios.com,Axios,2025-05-28 00:51:43.197000+00:00,https://www.axios.com/2025/05/28/trump-canada-...,at://did:plc:f6avy7jkujdhusski5n64joj/app.bsky...,,
2,SpaceX megarocket gets farther in test than la...,UPDATE: SpaceX's Starship travels farther in i...,cnn.com,CNN,2025-05-28 00:52:04.026000+00:00,https://www.cnn.com/science/live-news/spacex-s...,at://did:plc:dzezcmpb3fhcpns4n4xm4ur5/app.bsky...,SpaceX launches Starship test flight 9: Live u...,SpaceX has launched a ninth uncrewed test flig...
1,New student visas paused as State Dept. plans ...,The State Department on Tuesday suspended fore...,washingtonpost.com,The Washington Post,2025-05-28 00:52:20.780000+00:00,https://www.washingtonpost.com/education/2025/...,at://did:plc:k5nskatzhyxersjilvtnz4lh/app.bsky...,,
