In [161]:
import json
import urllib
import urllib.request
from datetime import datetime
from pathlib import Path

import pandas as pd
from pydantic.v1 import BaseModel, Field

from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI


In [162]:
# Load API key from .env file: load_dotenv() reads a local .env file into the process environment.
# NOTE(review): presumably the key is OPENAI_API_KEY, which ChatOpenAI picks up later — confirm.
# The value displayed under this cell is load_dotenv()'s return value.
import dotenv 
dotenv.load_dotenv()

True

# Get some posts from a subreddit

In [163]:
# Subreddit to fetch and classify (name only, no "r/" prefix).
# It is interpolated into the request URL, the LLM prompt and the output file name.
SUBREDDIT: str = "todayilearned"

In [164]:
# Appending ".json" to any subreddit URL returns that page's listing as JSON: by default the 25
# posts on the subreddit's front page (its default ordering, which is not strictly newest-first).
# We don't get comments here, but the same trick on an individual post URL returns them.
#
# Reddit throttles or rejects anonymous requests that use a generic library User-Agent, so we
# send a descriptive one, and we set a timeout so a stalled connection cannot hang the notebook.
request = urllib.request.Request(
    f"https://www.reddit.com/r/{SUBREDDIT}.json",
    headers={"User-Agent": "python:subreddit-llm-summary:0.1 (jupyter notebook)"},
)
with urllib.request.urlopen(request, timeout=30) as response:
    # json.load reads the response body (bytes) and decodes it itself
    data = json.load(response)

In [165]:
# Each child of the listing carries dozens of fields under its "data" key; keep only these.
required_keys = ["permalink", "title", "created_utc", "num_comments", "score", "selftext"]

# One flat dict per post, in the order the listing returned them.
# A missing key raises KeyError rather than being skipped.
posts = [
    {key: child["data"][key] for key in required_keys}
    for child in data["data"]["children"]
]

In [167]:
# Sanity check: at this point each post is a flat dict holding only the required keys.
# Show the last post in the list as an example.
posts[-1]

{'permalink': '/r/todayilearned/comments/1961vie/til_that_the_uss_ward_the_first_ship_involved_in/',
 'title': "TIL that the USS Ward, the first ship involved in WW II when it sunk a sub sneaking into Pearl Harbor hours before the aerial attack, was commanded by Wm. Outerbridge. 3 years later, USS Ward was mortally damaged by Kamikaze, and ordered sunk by USS O'Brien, commanded by Wm. Outerbridge.",
 'created_utc': 1705189947.0,
 'num_comments': 39,
 'score': 409,
 'selftext': ''}

# Define a template for parsing each post

We want to send the LLM each post to summarize, plus some instructions about what to do. We also want to get the results back in JSON format.

In [168]:
# Prompt sent to the LLM for every post. Edit the wording here to give the model more context
# about what we're interested in. The placeholders in braces are filled in per post;
# {format_instructions} is where the parser's JSON-schema instructions are inserted.
# Written line by line so the exact whitespace of the prompt is visible.
template_string = (
    " \n"
    "\n"
    "This is a post from the subreddit {subreddit}:\n"
    "\n"
    "{post_title}\n"
    "{post_text}\n"
    "\n"
    "{format_instructions}\n"
    "\n"
)


In [169]:
# Schema the LLM has to fill in for each post. Add, remove or reword fields here to steer what
# the model extracts; the type of each field says whether it is free text, a true/false flag, etc.
# The field descriptions are part of the instructions the model sees, so word them carefully.
# (No class docstring on purpose: pydantic would include it in the schema sent to the LLM.)
class Post(BaseModel):
    # Free-text summary; validation rejects anything longer than 1000 characters.
    summary: str = Field(
        description=(
            "A short summary of what the post is about. "
            "You don't need to explain that it is a reddit post."
        ),
        max_length=1000,
    )

    # Topic flags. They are independent of each other: a post may match several or none.
    sport: bool = Field(description="Is this post about sport?")
    entertainment: bool = Field(description="Is this post about entertainment?")
    politics: bool = Field(description="Is this post about politics?")
    science: bool = Field(description="Is this post about science?")
    history: bool = Field(description="Is this post about history?")

In [170]:
# Turn the raw template text into a LangChain prompt; its {placeholders} become input variables.
prompt = PromptTemplate.from_template(template_string)

# The parser does double duty: it generates the "answer as JSON matching this schema" text that
# goes into the prompt, and later validates the model's reply into a Post instance.
output_parser = PydanticOutputParser(pydantic_object=Post)
format_instructions = output_parser.get_format_instructions()

# Send the prompts to the LLM

We'll use GPT-4 here, but it's easy to swap in a different model with LangChain.

In [171]:
# Chat model used for summarising and labelling. temperature=0 keeps the output as repeatable
# as the API allows, so re-running the notebook on the same posts gives (near-)identical labels
# instead of varying with the default sampling temperature.
llm = ChatOpenAI(model_name='gpt-4', temperature=0)

In [172]:
# Compose the pipeline with the | operator: fill in the prompt template, send it to the chat
# model, then parse the model's reply with the pydantic parser (yielding a Post per input).
chain = prompt | llm | output_parser

In [173]:
# One dict of template variables per post; the keys must match the placeholders in the prompt.
# The subreddit name and the format instructions are the same for every post.
inputs = [
    {
        "subreddit": SUBREDDIT,
        "post_title": entry["title"],
        "post_text": entry["selftext"],
        "format_instructions": format_instructions,
    }
    for entry in posts
]

In [174]:
# Run the chain over every input dict in a single call (one LLM request per post).
# NOTE(review): a reply that fails to parse into Post presumably aborts the whole batch —
# confirm, and consider batch(..., return_exceptions=True) if that becomes a problem.
results = chain.batch(inputs)

# Results

Put all the relevant parts into a data frame.

In [175]:
# batch() returns one output per input, in input order, so row i of the LLM results describes
# row i of the posts and the two frames can be joined side by side on their default index.
results_df = pd.DataFrame([r.dict() for r in results])
posts_df = pd.DataFrame(posts)

# pd.concat(axis=1) aligns on the index and silently pads with NaN when the lengths differ,
# which would pair summaries with the wrong posts; fail loudly instead.
assert len(results_df) == len(posts_df), (
    f"expected one LLM result per post, got {len(results_df)} results for {len(posts_df)} posts"
)

reddit = (
    pd.concat([results_df, posts_df], axis=1)
    .assign(
        # created_utc is epoch seconds; convert to a timezone-naive timestamp expressed in UTC
        post_datetime=lambda df: (
            pd.to_datetime(df["created_utc"].astype("int64"), unit="s", utc=True)
            .dt.tz_localize(None)
        ),
        # also naive UTC (not local time), so it is directly comparable with post_datetime
        scraped_datetime=pd.Timestamp.now(tz="UTC").tz_localize(None),
        # convenient to have a full url for each post
        link=lambda df: "https://www.reddit.com" + df["permalink"],
    )
)

In [179]:
# Display the combined frame: the LLM-derived columns come first, followed by the raw Reddit
# fields and the derived datetime and link columns.
reddit

Unnamed: 0,summary,sport,entertainment,politics,science,history,permalink,title,created_utc,num_comments,score,selftext,post_datetime,scraped_datetime,link
0,"In 2001, Snoop Dogg was involved in a hardcore...",False,True,False,False,False,/r/todayilearned/comments/19695o3/til_in_2001_...,TIL in 2001 Snoop Dogg was in a hardcore porn ...,1705213000.0,114,2489,,2024-01-14 06:13:22,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
1,TIL New York's Paramount theater would smell o...,False,True,False,False,True,/r/todayilearned/comments/195yrit/til_new_york...,TIL New York's Paramount theater would reek of...,1705182000.0,735,17446,,2024-01-13 21:31:51,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
2,"David Bowie, originally named David Jones, had...",False,True,False,False,True,/r/todayilearned/comments/196dihn/til_david_bo...,TIL David Bowie was born David Jones. When sta...,1705230000.0,27,671,,2024-01-14 11:04:03,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
3,"TIL Bermuda has no indigenous population, and ...",False,False,False,False,True,/r/todayilearned/comments/1960t7u/til_bermuda_...,"TIL Bermuda has no indigenous population, and ...",1705187000.0,254,5582,,2024-01-13 23:03:13,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
4,George Miller's first choice for the role of M...,False,True,False,False,False,/r/todayilearned/comments/195sjnx/til_when_geo...,TIL when George Miller was casting Mad Max: Fu...,1705165000.0,1084,17442,,2024-01-13 17:01:30,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
5,"In 1979, Jimmy Carter faced a rabbit that swam...",False,False,True,False,True,/r/todayilearned/comments/195wg2t/til_in_1979_...,"TIL In 1979 a rabbit, hissing and baring its t...",1705175000.0,333,5617,,2024-01-13 19:50:36,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
6,"In 1927, Radium-Nutex Condoms were marketed us...",False,False,False,True,True,/r/todayilearned/comments/196dx7v/til_radiumnu...,TIL Radium-Nutex Condoms from 1927 were market...,1705232000.0,22,236,,2024-01-14 11:31:33,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
7,"During WWII, RAF bomber command had a casualty...",False,False,False,False,True,/r/todayilearned/comments/196el80/til_during_w...,"TIL during WWII, RAF bomber command suffered a...",1705234000.0,15,179,,2024-01-14 12:14:23,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
8,"In 1998, a portion of the Titanic's hull was r...",False,True,False,False,True,/r/todayilearned/comments/195v589/til_in_1998_...,"TIL in 1998, a portion of the Titanic's hull w...",1705172000.0,137,4516,,2024-01-13 18:54:32,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
9,"Sumo wrestlers, despite consuming over 10,000 ...",True,False,False,True,False,/r/todayilearned/comments/195o141/til_that_sum...,"TIL that sumo wrestlers, despite eating 10,000...",1705153000.0,928,15951,,2024-01-13 13:29:29,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...


In [180]:
# Inspect a particular category: keep only the rows the LLM flagged as being about sport.
# NOTE(review): the saved output under this cell lists rows with sport=False (they all have
# history=True), so it looks stale from a different filter — re-run this cell before sharing.
reddit.loc[reddit["sport"].eq(True)]

Unnamed: 0,summary,sport,entertainment,politics,science,history,permalink,title,created_utc,num_comments,score,selftext,post_datetime,scraped_datetime,link
1,TIL New York's Paramount theater would smell o...,False,True,False,False,True,/r/todayilearned/comments/195yrit/til_new_york...,TIL New York's Paramount theater would reek of...,1705182000.0,735,17446,,2024-01-13 21:31:51,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
2,"David Bowie, originally named David Jones, had...",False,True,False,False,True,/r/todayilearned/comments/196dihn/til_david_bo...,TIL David Bowie was born David Jones. When sta...,1705230000.0,27,671,,2024-01-14 11:04:03,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
3,"TIL Bermuda has no indigenous population, and ...",False,False,False,False,True,/r/todayilearned/comments/1960t7u/til_bermuda_...,"TIL Bermuda has no indigenous population, and ...",1705187000.0,254,5582,,2024-01-13 23:03:13,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
5,"In 1979, Jimmy Carter faced a rabbit that swam...",False,False,True,False,True,/r/todayilearned/comments/195wg2t/til_in_1979_...,"TIL In 1979 a rabbit, hissing and baring its t...",1705175000.0,333,5617,,2024-01-13 19:50:36,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
6,"In 1927, Radium-Nutex Condoms were marketed us...",False,False,False,True,True,/r/todayilearned/comments/196dx7v/til_radiumnu...,TIL Radium-Nutex Condoms from 1927 were market...,1705232000.0,22,236,,2024-01-14 11:31:33,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
7,"During WWII, RAF bomber command had a casualty...",False,False,False,False,True,/r/todayilearned/comments/196el80/til_during_w...,"TIL during WWII, RAF bomber command suffered a...",1705234000.0,15,179,,2024-01-14 12:14:23,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
8,"In 1998, a portion of the Titanic's hull was r...",False,True,False,False,True,/r/todayilearned/comments/195v589/til_in_1998_...,"TIL in 1998, a portion of the Titanic's hull w...",1705172000.0,137,4516,,2024-01-13 18:54:32,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
10,"The Dalai Lama carries a Swiss pocket watch, w...",False,False,True,False,True,/r/todayilearned/comments/195pcjz/til_the_dala...,TIL the Dalai Lama carries a high-end Swiss po...,1705157000.0,319,7478,,2024-01-13 14:37:32,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
11,The Academic quarter was established in the ea...,False,False,False,False,True,/r/todayilearned/comments/195tc23/til_the_acad...,TIL The Academic quarter was established in th...,1705167000.0,67,3809,,2024-01-13 17:36:28,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...
14,"The post is about the Crazy Horse Memorial, th...",False,False,False,False,True,/r/todayilearned/comments/1969o7i/til_about_th...,"TIL about the Crazy Horse Memorial, the World’...",1705215000.0,39,276,,2024-01-14 06:45:12,2024-01-14 13:36:59.337301,https://www.reddit.com/r/todayilearned/comment...


# Output

Write to a csv

In [181]:
# Columns to export: identifying columns first, then every field we asked the LLM for
# (taken from the Post schema, in declaration order, so the list tracks schema changes).
cols = ["link", "title", "post_datetime", *Post.__fields__]

In [182]:
# Write the selected columns to ../output/<subreddit>_<YYYYMMDD>.csv.
# Create the output directory first: to_csv does not create missing parent folders, so a fresh
# checkout would otherwise fail here after all the LLM calls have already been paid for.
output_dir = Path("../output")
output_dir.mkdir(parents=True, exist_ok=True)

date_suffix = datetime.today().strftime("%Y%m%d")
reddit[cols].to_csv(output_dir / f"{SUBREDDIT}_{date_suffix}.csv", index=False)