# Reddit

In [1]:
import praw
from dotenv import load_dotenv
import os

# Expected to populate os.environ from a local .env file so the credentials
# below never have to be written into the notebook itself.
load_dotenv()

# Shared Reddit API client used by the rest of the notebook.
# Both credentials are read with os.environ[...], so a missing variable fails
# fast with KeyError here rather than later on the first API request.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="Comment Extraction (by u/trnka)",
)


# Design notes

1. Transform the product name into an appropriate query
2. Search Google, restricted to reddit.com, with the query (possibly over multiple time windows: all time, 1y, 1m?)
3. Process each post:
    - Pull the top post
    - Pull any comments
    - Identify whether the post is primarily about the product, jobs at the company, or bizdev
4. Summarize the product-related posts:
    - Timeline of posts with titles and one key excerpt, linking to the original. Also include the amount of activity on the post and the score

In [10]:
# Helper to convert a Reddit thread to text
from datetime import datetime
from praw.models import MoreComments

# strftime pattern used for every date shown in the generated text/HTML.
DATE_FORMAT = "%Y-%m-%d"

def utc_to_date(utc: float) -> str:
    """Format a POSIX timestamp as a calendar date in UTC.

    Args:
        utc: Seconds since the Unix epoch (e.g. a Reddit ``created_utc`` value).

    Returns:
        The UTC date of that instant, formatted with ``DATE_FORMAT``.
    """
    # Function-scope import so this cell does not depend on an edit to the
    # import line above it.
    from datetime import timezone

    # datetime.utcfromtimestamp() is deprecated since Python 3.12; the
    # timezone-aware call below yields the same calendar date.
    return datetime.fromtimestamp(utc, tz=timezone.utc).strftime(DATE_FORMAT)

def reddit_thread_to_text(submission):
    """Render a Reddit submission and its comments as one plain-text string.

    The result starts with the post itself (title, author, date, signed score
    and selftext) followed by one section per comment yielded by iterating
    ``submission.comments``; replies nested under those comments are not
    walked here.

    Args:
        submission: A PRAW submission object.

    Returns:
        The concatenated text of the post and its comments.
    """
    # Ask PRAW to expand "more comments" placeholders (limit=10) before reading.
    submission.comments.replace_more(limit=10)

    sections = [f"""
{submission.title} by {submission.author} on {utc_to_date(submission.created_utc)} [{submission.score:+d} votes]:
{submission.selftext}
"""]

    # Any placeholder that is still a MoreComments instance has no
    # author/body/score to print, so it is skipped.
    sections.extend(
        f"""
Comment by {comment.author} on {utc_to_date(comment.created_utc)} [{comment.score:+d} votes]:
{comment.body}
"""
        for comment in submission.comments
        if not isinstance(comment, MoreComments)
    )
    return "".join(sections)

# print(reddit_thread_to_text(submission))

# Summarize with LangChain

In [4]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# See also
# https://www.reddit.com/r/ChatGPT/comments/11twe7z/prompt_to_summarize/
# https://www.reddit.com/r/ChatGPT/comments/13na8yp/highly_effective_prompt_for_summarizing_gpt4/

# Two-message chat prompt: a fixed system instruction plus a human message that
# carries the thread. `{text}` is the only template variable; it is filled by
# runnable.invoke({"text": ...}) in process_url.
# Note: the triple-quoted strings are indented to match the code, so that
# leading whitespace is part of the prompt text.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            You're an expert at reading and summarizing public online discussions.

            Please read and summarize the following Reddit thread.
            The summary should cover all the key points and main ideas presented in the original text, while also condensing the information into a concise and easy-to-understand format. 
            Please ensure that the summary includes relevant details and examples that support the main ideas, while avoiding any unnecessary information or repetition. 
            The length of the summary should be appropriate for the length and complexity of the original text, providing a clear and accurate overview without omitting any important information.
            """,
        ),
        (
            "human", 
            """
            Reddit thread: 
            {text}
            """
            ),
    ]
)

# temperature=0 is presumably chosen to keep summaries as repeatable as
# possible between evaluation runs -- TODO confirm.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
# Pipe the formatted prompt into the chat model; callers use runnable.invoke().
runnable = prompt | llm

# Google Search on Reddit for sources

In [5]:

from googlesearch import search
from functools import lru_cache
from typing import Iterable

@lru_cache(1000)
def reddit_search(query: str, num=10, stop=10, pause=2) -> Iterable[str]:
    """Google-search reddit.com for an exact phrase and return the result URLs.

    Results are memoized with ``lru_cache`` (up to 1000 distinct argument
    combinations). The cache key includes ``num``, ``stop`` and ``pause``, so
    calls that differ in any of them are searched separately.

    Args:
        query: Phrase to look for; it is wrapped in double quotes so it is
            searched as an exact phrase.
        num: Forwarded unchanged to ``googlesearch.search``.
        stop: Forwarded unchanged to ``googlesearch.search``.
        pause: Forwarded unchanged to ``googlesearch.search``.

    Returns:
        A list of result URLs. On a cache hit the *same* list object is
        returned, so callers should slice/copy rather than mutate it.
    """
    # Fix: the phrase used to be closed with two double quotes ("{query}""),
    # which sent an unbalanced quote to the search engine.
    site_query = f'site:reddit.com "{query}"'

    return list(search(site_query, num=num, stop=stop, pause=pause))

def test_search():
    """Smoke check: print every URL returned for a known company name."""
    found_urls = reddit_search("98point6", stop=20, pause=2)
    for found_url in found_urls:
        print(found_url)

test_search()

https://www.reddit.com/r/povertyfinance/comments/bg7ip2/internet_medicine_is_awesome_98point6_was_so_so/
https://www.reddit.com/r/Chipotle/comments/l5bbt9/has_anyone_used_the_98point6_primary_care/
https://www.reddit.com/r/AmazonFC/comments/rgxxbw/has_anyone_used_amazon_care_app_or_98point6_app/
https://www.reddit.com/r/AmazonFC/comments/nqrtaw/98point6/
https://www.reddit.com/r/depressionregimens/comments/bko5k9/psa_98point6_is_an_amazing_app_for_medication/
https://www.reddit.com/r/CostcoEmployee/comments/11wu6zh/anyone_use_98point6_are_they_helpful_better_than/
https://www.reddit.com/r/WalgreensStores/comments/14n48uy/virtual_doctor/
https://www.reddit.com/r/TTC_PCOS/comments/ipmklh/98point6_pcos_appointment_experience/
https://www.reddit.com/r/QuikTrip/comments/u2j0ut/my_apologies_for_any_bad_links_qt_is_fully/
https://www.reddit.com/r/tretinoin/comments/lx0zjb/didnt_have_to_walk_into_a_dermatologist_or_even_a/
https://www.reddit.com/r/AmazonFC/comments/nqxfli/besides_using_98point

In [6]:
from typing import NamedTuple
from langchain_core.messages.ai import AIMessage

def wrap_html(content: str) -> str:
    """Embed an HTML fragment in a minimal ``<html><body>`` document.

    Args:
        content: Markup to place inside ``<body>``; it is inserted verbatim.

    Returns:
        The complete document as a string.
    """
    document = f"""
<html>
<body>
    {content}
</body>
</html>
"""
    return document

class ThreadResult(NamedTuple):
    """One processed Reddit thread: the source post, its text and its summary."""

    # PRAW submission the thread was built from.
    submission: praw.models.Submission
    # Plain-text rendering of the thread that was sent to the model.
    text: str
    # Model response; its ``content`` attribute is rendered as the summary.
    summary_result: AIMessage

    def to_html(self):
        """Render this result as an HTML fragment (heading, link, summary, thread).

        All interpolated values come from Reddit users or from the model, so
        they are HTML-escaped before being placed in the markup; otherwise a
        ``<`` or ``&`` in a post could break (or inject into) the report.

        Returns:
            An HTML fragment as a string (no ``<html>``/``<body>`` wrapper).
        """
        # Function-scope import keeps this block self-contained.
        from html import escape

        submission = self.submission
        title = escape(str(submission.title))
        author = escape(str(submission.author))
        # NOTE(review): for link posts `url` may be the linked page rather than
        # the Reddit thread itself -- confirm whether the permalink is wanted.
        url = escape(str(submission.url))
        summary = escape(str(self.summary_result.content))
        # Escape first, then insert <br>, so the line-break tags survive.
        text = escape(self.text).replace("\n", "<br>")
        return f"""
    <h1>{title} by {author} on {utc_to_date(submission.created_utc)}</h1>
    <a href="{url}">{url}</a>
    <h2>Summary</h2>
    <p>{summary}</p>

    <h2>Original Thread</h2>
    <p>{text}</p>
"""

def process_url(url: str) -> ThreadResult:
    """Fetch one Reddit thread, convert it to text and summarize it.

    Args:
        url: URL of the Reddit thread.

    Returns:
        A ``ThreadResult`` holding the submission, its text rendering and the
        response returned by ``runnable.invoke``.
    """
    thread = reddit.submission(url=url)
    thread_text = reddit_thread_to_text(thread)
    return ThreadResult(
        submission=thread,
        text=thread_text,
        summary_result=runnable.invoke({"text": thread_text}),
    )


In [7]:
from IPython.display import HTML

# Same arguments as the call inside test_search(), so this is answered from
# reddit_search's lru_cache if that cell already ran in this kernel.
thread_urls = reddit_search("98point6", stop=20, pause=2)
# NOTE(review): despite the name, this is always the second search hit, not a
# random one; it raises IndexError when the search returns fewer than two URLs.
random_url = thread_urls[1]
random_thread_result = process_url(random_url)
# Render the summary and the original thread inline in the notebook.
display(HTML(random_thread_result.to_html()))

In [8]:
def summarize_prompt(prompt):
    """Render the two message templates of a chat prompt as an HTML fragment.

    Args:
        prompt: Object whose ``messages[0]`` and ``messages[1]`` each expose
            ``.prompt.template`` (the system and human templates, in that
            order).

    Returns:
        HTML showing both templates verbatim inside ``<pre>`` blocks.
    """
    system_template = prompt.messages[0].prompt.template
    human_template = prompt.messages[1].prompt.template
    return f"""
<h1>Prompt</h1>

<h2>System prompt</h2>
<pre>
{system_template}
</pre>

<h2>Prompt</h2>
<pre>
{human_template}
</pre>
    """

display(HTML(summarize_prompt(prompt)))

In [11]:
import hashlib
import os

def short_evaluation(company_name: str):
    """Summarize the first two search hits for a company and save an HTML report.

    The report contains the current prompt followed by one section per thread
    and is written to ``evaluation/test_<id>/<timestamp>.html``, where ``<id>``
    is derived from the thread URLs so that runs over the same threads land in
    the same folder.

    Args:
        company_name: Name to search for on Reddit.
    """
    # Only cached if this exact argument combination was searched before:
    # lru_cache keys on stop/pause too, so earlier stop=20 calls do not count.
    thread_urls = reddit_search(company_name, stop=10, pause=2)[:2]

    # The ID of the test is the last 4 hex chars of the SHA-256 of the URL list
    test_id = hashlib.sha256("".join(thread_urls).encode()).hexdigest()[-4:]

    folder = f"evaluation/test_{test_id}"
    os.makedirs(folder, exist_ok=True)

    results = [process_url(url) for url in thread_urls]
    html_result = wrap_html(summarize_prompt(prompt) + "\n".join(r.to_html() for r in results))

    # Create the filename using the current timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{folder}/{timestamp}.html"

    # Reddit text routinely contains emoji and other non-ASCII characters;
    # an explicit encoding avoids UnicodeEncodeError on non-UTF-8 locales.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html_result)

short_evaluation("98point6")


In [14]:
from typing import List

def long_evaluation(companies: List[str]):
    """Summarize the first three search hits per company and save an HTML report.

    The report contains the current prompt followed by one headed section per
    company and is written to ``evaluation/test_<id>/<timestamp>.html``, where
    ``<id>`` is derived from the company names and their thread URLs.

    Args:
        companies: Company names to search for on Reddit.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    thread_lists = [reddit_search(company, stop=10, pause=2)[:3] for company in companies]

    # Last 4 hex chars of the SHA-256 over the inputs identify this test set.
    test_id = hashlib.sha256((str(companies) + str(thread_lists)).encode()).hexdigest()[-4:]

    folder = f"evaluation/test_{test_id}"
    os.makedirs(folder, exist_ok=True)

    # Collect fragments and join once instead of growing a string in the loop.
    fragments = [summarize_prompt(prompt)]
    for company, urls in zip(companies, thread_lists):
        # Escape so names such as "AT&T" render correctly.
        fragments.append(f"<h1>{escape(company)}</h1>")
        thread_results = [process_url(url) for url in urls]
        fragments.append("\n".join(r.to_html() for r in thread_results))

    # Wrap in <html>/<body> like short_evaluation does, so both reports are
    # complete documents.
    html_result = wrap_html("".join(fragments))

    # Create the filename using the current timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{folder}/{timestamp}.html"

    # Reddit text routinely contains emoji and other non-ASCII characters;
    # an explicit encoding avoids UnicodeEncodeError on non-UTF-8 locales.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html_result)

long_evaluation(["98point6", "Rad AI"])