In [1]:
import json
import urllib
import urllib.request
from datetime import datetime
from pathlib import Path

import pandas as pd
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from pydantic.v1 import BaseModel, Field


In [2]:
# Load environment variables from a .env file
# (assumed to hold the OpenAI API key used by the LLM client below — TODO confirm the variable name).
# load_dotenv() is the last expression, so the cell displays its return value.
import dotenv 
dotenv.load_dotenv()

True

# Get some posts from a subreddit

In [3]:
# Name of the subreddit to fetch; it is used both in the request URL and in the LLM prompt.
SUBREDDIT = "todayilearned"

In [4]:
# You can put ".json" at the end of any subreddit URL to get recent data in JSON format. Limited but easy.
# Identify ourselves with a descriptive User-Agent (edit it to name your own script):
# Reddit's API rules ask for one, and generic default agents are heavily rate limited.
request = urllib.request.Request(
    f"https://www.reddit.com/r/{SUBREDDIT}.json",
    headers={"User-Agent": "python:subreddit-llm-summary-notebook:v1.0"},
)
# The timeout stops a stalled connection from hanging the notebook indefinitely.
with urllib.request.urlopen(request, timeout=30) as url:
    data = json.loads(url.read().decode())

In [5]:
# Each listing entry carries many fields; keep only the handful used downstream.
required_keys = ["permalink", "title", "created_utc", "num_comments", "score", "selftext"]

# One trimmed dict per post, in the same order as the listing.
posts = [
    {key: child["data"][key] for key in required_keys}
    for child in data["data"]["children"]
]

In [6]:
# Sanity check: show the first trimmed post so the reader can see the shape of the data
# (one dict per post, holding only the keys listed in required_keys).
posts[0]

{'permalink': '/r/todayilearned/comments/195sjnx/til_when_george_miller_was_casting_mad_max_fury/',
 'title': "TIL when George Miller was casting Mad Max: Fury Road, his first choice actor to play Max was Eminem who only turned down the role because he didn't want to leave his home state",
 'created_utc': 1705165290.0,
 'num_comments': 717,
 'score': 10535,
 'selftext': ''}

# Define a template for parsing each post

We want to send the LLM each post to summarize, plus some instructions about what to do. We also want to get the results back in JSON format.

In [7]:
# Prompt text sent to the LLM for every post. Edit the text here to give the LLM more
# context about what we're interested in. The {subreddit}, {post} and {format_instructions}
# placeholders are filled in with the values passed when the chain is invoked.
template_string = """ 

This is a post from the subreddit {subreddit}:

{post}

{format_instructions}

"""


In [8]:
# Schema for what we want back from the LLM for each post. Add or edit fields here to
# give the LLM more guidance on what we're interested in: each field's type says whether
# it is freeform text, a true/false flag or whatever else, and its description is the
# per-field instruction.
# NOTE(review): documented with comments rather than a class docstring, because a docstring
# would presumably be included in the schema text sent to the LLM — confirm before adding one.
class Post(BaseModel):
    # Free-text summary; max_length caps it at 1000 characters.
    summary: str = Field(description="A short summary of what the post is about. You don't need to explain that it is a reddit post.", max_length=1000)
    # One true/false topic flag per category of interest.
    sport: bool = Field(description="Is this post about sport?")
    entertainment: bool = Field(description="Is this post about entertainment?")
    politics: bool = Field(description="Is this post about politics?")
    science: bool = Field(description="Is this post about science?")
    history: bool = Field(description="Is this post about history?")

In [9]:
# Prompt built from the template text above; its placeholders are filled per post.
prompt = PromptTemplate.from_template(template_string)

# The parser is tied to the Post schema. It is used twice: here to produce the
# formatting instructions that go into the prompt, and later as the last chain step.
output_parser = PydanticOutputParser(pydantic_object=Post)
format_instructions = output_parser.get_format_instructions()

# Send the prompt to the LLM

We'll use OpenAI's GPT-4 here, but it's easy to swap in different models with LangChain.

In [10]:
# Chat model used for every post. temperature=0 makes the summaries and topic flags as
# repeatable as the API allows, instead of relying on the library's default sampling temperature.
llm = ChatOpenAI(model_name='gpt-4', temperature=0)

In [11]:
# Compose the pipeline: fill in the prompt, send it to the LLM, then hand the reply
# to the output parser built from the Post schema.
chain = prompt | llm | output_parser

In [12]:
# Run every post through the chain and collect the parsed fields as plain dicts.
# This is best-effort: a post whose call fails, or whose reply cannot be parsed,
# is reported and skipped instead of aborting the whole run.
results = []
for post in posts:
    try:
        # The chain call is the step that can actually fail (API errors, or the output
        # parser rejecting the reply), so it has to sit inside the try.
        response = chain.invoke({
            "subreddit": SUBREDDIT,
            "post": post,
            "format_instructions": format_instructions,
        })
        result = dict(response)
    except Exception as exc:  # not a bare except, so Ctrl-C still interrupts the loop
        print(f"Skipping {post['permalink']}: {exc!r}")
        continue

    # do this so we can match the results back to the original post
    result['permalink'] = post['permalink']
    results.append(result)

# Results

Put all the relevant parts into a data frame.

In [14]:
# One row per fetched post, keyed by permalink.
posts_df = pd.DataFrame(posts).set_index("permalink")

# Name the expected columns explicitly so that an empty `results` list (no post produced
# a usable LLM reply) still yields a frame with a "permalink" column instead of a KeyError.
summary_columns = ["permalink", *Post.__fields__]
summaries_df = pd.DataFrame(results, columns=summary_columns).set_index("permalink")

reddit = (
    posts_df
    # join on the permalink index; posts without an LLM result are kept with missing values
    .join(summaries_df)
    .assign(
        # clear the timezone information
        post_datetime = lambda df: pd.to_datetime(df.created_utc.astype('int'), utc=True, unit='s').dt.tz_localize(None),
        # NOTE(review): datetime.today() is local wall-clock time, while post_datetime is UTC — confirm intended
        scraped_datetime = datetime.today()
    )
    # sometimes the same post has different urls; take the latest one.
    # scraped_datetime is identical for every row of a single run, so post_datetime is
    # used as the tie-breaker to make the kept row deterministic.
    .sort_values(["scraped_datetime", "post_datetime"], ascending=False)
    .groupby("title")
    .head(1)
    .reset_index()
    # newest posts first, with a clean 0..n-1 index
    .sort_values("post_datetime", ascending=False)
    .reset_index(drop=True)
)

In [15]:
# Display the combined table: the original post fields plus the LLM summary and topic flags.
reddit

Unnamed: 0,permalink,title,created_utc,num_comments,score,selftext,summary,sport,entertainment,politics,science,history,post_datetime,scraped_datetime
0,/r/todayilearned/comments/195ycta/til_chess_ga...,TIL Chess games are banned at Russian Antarcti...,1705180000.0,5,100,,TIL Chess games are banned at Russian Antarcti...,False,True,False,True,True,2024-01-13 21:13:35,2024-01-13 22:24:29.868675
1,/r/todayilearned/comments/195wjmp/til_that_phd...,TIL that PhD students display twice as many sy...,1705176000.0,15,214,,TIL that PhD students display twice as many sy...,False,False,False,True,False,2024-01-13 19:54:47,2024-01-13 22:24:29.868675
2,/r/todayilearned/comments/195wg2t/til_in_1979_...,"TIL In 1979 a rabbit, hissing and baring its t...",1705175000.0,27,179,,"In 1979, Jimmy Carter faced an aggressive rabb...",False,False,True,False,True,2024-01-13 19:50:36,2024-01-13 22:24:29.868675
3,/r/todayilearned/comments/195v801/til_that_aft...,TIL that after being thrown off a chairlift by...,1705172000.0,219,4953,,After being thrown off a chairlift by a mental...,False,False,False,False,False,2024-01-13 18:58:03,2024-01-13 22:24:29.868675
4,/r/todayilearned/comments/195v589/til_in_1998_...,"TIL in 1998, a portion of the Titanic's hull w...",1705172000.0,37,424,,"In 1998, a part of the Titanic's hull was reco...",False,True,False,False,True,2024-01-13 18:54:32,2024-01-13 22:24:29.868675
5,/r/todayilearned/comments/195tzjx/til_the_fren...,TIL: The French in the 19th century developed ...,1705169000.0,18,227,,TIL: The French in the 19th century developed ...,True,False,False,False,True,2024-01-13 18:04:46,2024-01-13 22:24:29.868675
6,/r/todayilearned/comments/195tc23/til_the_acad...,TIL The Academic quarter was established in th...,1705167000.0,34,961,,TIL The Academic quarter was established in th...,False,False,False,False,True,2024-01-13 17:36:28,2024-01-13 22:24:29.868675
7,/r/todayilearned/comments/195sjnx/til_when_geo...,TIL when George Miller was casting Mad Max: Fu...,1705165000.0,717,10535,,George Miller's first choice for the role of M...,False,True,False,False,False,2024-01-13 17:01:30,2024-01-13 22:24:29.868675
8,/r/todayilearned/comments/195puze/til_in_1967_...,TIL in 1967 a man named Thomas Jolley fled the...,1705158000.0,126,2523,,"In 1967, Thomas Jolley fled the US for Canada ...",False,False,True,False,True,2024-01-13 15:02:01,2024-01-13 22:24:29.868675
9,/r/todayilearned/comments/195pcjz/til_the_dala...,TIL the Dalai Lama carries a high-end Swiss po...,1705157000.0,208,4081,,TIL the Dalai Lama carries a high-end Swiss po...,False,False,True,False,True,2024-01-13 14:37:32,2024-01-13 22:24:29.868675


In [20]:
# Look at a single category: the title and summary of every post flagged as sport.
# The explicit comparison with True leaves out rows whose flag is missing.
reddit.loc[reddit["sport"] == True, ["title", "summary"]]

Unnamed: 0,title,summary
5,TIL: The French in the 19th century developed ...,TIL: The French in the 19th century developed ...
10,"TIL that sumo wrestlers, despite eating 10,000...","This post talks about sumo wrestlers, who desp..."
21,TIL that the average height of a male gymnast ...,The post is about average height of a male gym...
23,TIL that the distance driven by the fastest ca...,TIL that the distance driven by the fastest ca...


# Output

Write the results to a CSV file.

In [22]:
# Columns to export: the identifying columns first, then every field declared on Post.
cols = ['title', 'post_datetime', 'permalink', *Post.__fields__]

In [23]:
# Write one CSV per run date; a second run on the same day overwrites that day's file.
date_suffix = datetime.today().strftime("%Y%m%d")
output_path = Path("../output") / f"reddit_{date_suffix}.csv"
# to_csv does not create missing folders, so make sure the target directory exists first.
output_path.parent.mkdir(parents=True, exist_ok=True)
reddit[cols].to_csv(output_path, index=False)