In [15]:
import json
import urllib
import urllib.request
from datetime import datetime
from pathlib import Path

import pandas as pd
from pydantic.v1 import BaseModel, Field

from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser


In [16]:
# Load environment variables (including the API key) from a local .env file.
# NOTE(review): presumably this is how the OpenAI key reaches ChatOpenAI further down — confirm.
# load_dotenv() is the last expression, so its return value is shown as the cell output.
import dotenv 
dotenv.load_dotenv()

True

# Get some posts from a subreddit

In [17]:
# Name of the subreddit to fetch and classify (without the "r/" prefix).
SUBREDDIT = "todayilearned"

In [18]:
# You can put ".json" at the end of any subreddit URL to get recent posts in JSON format.
# We don't get comments here, but we can apply the same approach to an individual post URL to get them.
# Reddit throttles or rejects requests that use the default Python user agent,
# so identify this script with an explicit User-Agent header.
listing_request = urllib.request.Request(
    f"https://www.reddit.com/r/{SUBREDDIT}.json",
    headers={"User-Agent": "python:subreddit-llm-summary:v0.1 (notebook)"},
)
# A timeout keeps the cell from hanging forever if Reddit does not respond.
with urllib.request.urlopen(listing_request, timeout=30) as response:
    data = json.loads(response.read().decode("utf-8"))

In [41]:
# Example: build the full URL of a single post from its permalink.
# (Adding ".json" to a post URL like this one is how its comments could be fetched.)
permalink = "/r/todayilearned/comments/1969o7i/til_about_the_crazy_horse_memorial_the_worlds/"
permalink_url = f"https://www.reddit.com{permalink}"
# Display the URL that was just built (the cell previously referenced an undefined name).
permalink_url

'https://www.reddit.com/r/todayilearned/comments/1969o7i/til_about_the_crazy_horse_memorial_the_worlds/'

In [19]:
# Each post carries many fields; keep only the handful we need downstream.
required_keys = ["permalink", "title", "created_utc", "num_comments", "score", "selftext"]

# One trimmed-down dict per listing entry, in the order Reddit returned them.
posts = [
    {key: child["data"][key] for key in required_keys}
    for child in data["data"]["children"]
]

In [20]:
# Show the first extracted post to check the structure: one dict holding only the required keys.
posts[0]

{'permalink': '/r/todayilearned/comments/195yrit/til_new_yorks_paramount_theater_would_reek_of/',
 'title': "TIL New York's Paramount theater would reek of urine after Frank Sinatra shows because the bobby-soxers would come for the first show at 9:15 a.m. and stay for show after show, determined never to relinquish a precious seat even if it meant soaking in it.",
 'created_utc': 1705181511.0,
 'num_comments': 692,
 'score': 16535,
 'selftext': ''}

# Define a template for parsing each post

We want to send the LLM the posts to summarize plus some instructions about what to do. We also want to get the results back in JSON format.

In [21]:
# We can edit the text here to give the LLM more context about what we're interested in.
# The {subreddit}, {post} and {format_instructions} placeholders are filled in from the
# input dicts built further down, one dict per post.
template_string = """ 

This is a post from the subreddit {subreddit}:

{post}

{format_instructions}

"""


In [22]:
# We can add a custom series of fields here to give the LLM more guidance on what we're interested in.
# We specify whether each field is freeform text, a true/false flag or whatever else.
# The field names are reused further down as column names when the CSV is written.
# NOTE(review): no class docstring is added on purpose — presumably pydantic would copy it into
# the schema behind the format instructions and so change the prompt; confirm before adding one.
class Post(BaseModel):
    # Freeform text, capped at 1000 characters.
    summary: str = Field(description="A short summary of what the post is about. You don't need to explain that it is a reddit post.", max_length=1000)
    # Topic flags: one independent true/false answer per category (a post may match several).
    sport: bool = Field(description="Is this post about sport?")
    entertainment: bool = Field(description="Is this post about entertainment?")
    politics: bool = Field(description="Is this post about politics?")
    science: bool = Field(description="Is this post about science?")
    history: bool = Field(description="Is this post about history?")

In [23]:
# Parser tied to the Post model; it also supplies the format instructions
# that are inserted into the prompt's {format_instructions} placeholder.
output_parser = PydanticOutputParser(pydantic_object=Post)
format_instructions = output_parser.get_format_instructions()

# Prompt template built from the template string defined above.
prompt = PromptTemplate.from_template(template=template_string)

# Send the prompts to the LLM

We'll use GPT-4 here, but it's easy to swap in different models with LangChain.

In [24]:
# Chat model used for every post; change model_name to try a different model.
llm = ChatOpenAI(model_name='gpt-4')

In [25]:
# Pipeline: fill in the prompt, send it to the LLM, then hand the reply to the output parser.
chain = prompt | llm | output_parser

In [79]:
# One input dict per post; the keys match the placeholders in the prompt template.
inputs = [
    dict(subreddit=SUBREDDIT, post=post, format_instructions=format_instructions)
    for post in posts
]

In [81]:
# Run the chain over all the inputs in a single call.
# NOTE(review): the later cells treat `results` as one parsed Post per input, in input order.
results = chain.batch(inputs)

# Results

Put all the relevant parts into a data frame.

In [142]:
# LangChain's `batch` returns one output per input, in input order, so the rows of
# results_df line up with the rows of posts_df and can be joined by position.
results_df = pd.DataFrame([dict(r) for r in results])
posts_df = pd.DataFrame(posts)

# A positional concat silently pads with NaN if the lengths differ, so fail loudly instead.
assert len(results_df) == len(posts_df), "expected exactly one LLM result per post"

reddit = (
    pd.concat([results_df, posts_df], axis=1)
    .assign(
        # created_utc is seconds since the epoch; store it as a naive UTC timestamp
        post_datetime=lambda df: pd.to_datetime(
            df["created_utc"].astype("int64"), unit="s", utc=True
        ).dt.tz_localize(None),
        # also naive UTC (not local time), so the two datetime columns are directly comparable
        scraped_datetime=pd.Timestamp.now(tz="UTC").tz_localize(None),
        # convenient to create a full url for each post
        link=lambda df: "https://www.reddit.com" + df["permalink"],
    )
)

In [145]:
# Inspect a particular category: keep only the rows where the `sport` flag is True.
reddit.query("sport == True")

Unnamed: 0,summary,sport,entertainment,politics,science,history,permalink,title,created_utc,num_comments,score,selftext,post_datetime,scraped_datetime,link
7,"Sumo wrestlers, despite eating 10,000+ calorie...",True,False,False,True,False,/r/todayilearned/comments/195o141/til_that_sum...,"TIL that sumo wrestlers, despite eating 10,000...",1705153000.0,920,15683,,2024-01-13 13:29:29,2024-01-14 13:06:27.898256,https://www.reddit.com/r/todayilearned/comment...
22,"TIL Steve Young, 3x NFL Super Bowl Champion, i...",True,False,False,False,True,/r/todayilearned/comments/195x26i/til_steve_yo...,"TIL Steve Young, 3x NFL Super Bowl Champion, i...",1705177000.0,37,428,,2024-01-13 20:16:33,2024-01-14 13:06:27.898256,https://www.reddit.com/r/todayilearned/comment...
23,"The French developed Canne de combat, a cane f...",True,False,False,False,True,/r/todayilearned/comments/195tzjx/til_the_fren...,TIL: The French in the 19th century developed ...,1705169000.0,35,599,,2024-01-13 18:04:46,2024-01-14 13:06:27.898256,https://www.reddit.com/r/todayilearned/comment...


# Output

Write to a csv

In [140]:
# Columns to export: identifying fields first, then every field declared on the Post model.
cols = ['link', 'title', 'post_datetime', *Post.__fields__]

In [141]:
# Write one CSV per subreddit per day, e.g. ../output/todayilearned_20240114.csv
output_dir = Path("../output")
# to_csv does not create missing folders, so make sure the target directory exists first
output_dir.mkdir(parents=True, exist_ok=True)

date_suffix = datetime.today().strftime("%Y%m%d")
reddit[cols].to_csv(output_dir / f"{SUBREDDIT}_{date_suffix}.csv", index=False)