# Reddit

In [48]:
from typing import List, NamedTuple

class CompanyProduct(NamedTuple):
    """Immutable (company, product) pair naming the target of a search/summary."""

    company: str
    product: str

    @classmethod
    def same(cls, name: str) -> "CompanyProduct":
        """Build a target whose company and product are both `name`."""
        return cls(name, name)
    
# Quick check: `same` fills both the company and the product with one name
CompanyProduct.same("98point6")

CompanyProduct(company='98point6', product='98point6')

In [1]:
import praw
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv (presumably from a local .env file)
load_dotenv()

# Reddit API client. Credentials are read from the environment so they are not
# hardcoded in the notebook; a missing variable raises KeyError here.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="Comment Extraction (by u/trnka)",
)


# Design notes

1. Transform the product name into an appropriate query
2. Search Google, restricted to reddit.com, with the query (possibly over multiple time windows: all time, 1y, 1m?)
3. Process each post:
    - Pull the top post
    - Pull any comments
    - Identify whether the post is primarily about the product, jobs at the company, or bizdev
4. Summarize the product-related posts:
    - Timeline of posts with titles and one key excerpt, linking to the original. Also include the amount of activity on the post and the score

In [112]:
# Helper to convert a Reddit thread to text
from datetime import datetime
from praw.models import MoreComments

DATE_FORMAT = "%Y-%m-%d"
def utc_to_date(utc: float) -> str:
    """
    Convert a Unix timestamp (seconds since the epoch) to a UTC date string.

    Args:
        utc: Seconds since the Unix epoch, e.g. a PRAW `created_utc` value.

    Returns:
        The UTC calendar date formatted with DATE_FORMAT (YYYY-MM-DD).
    """
    # Function-scope import: the cell only imports `datetime` from the datetime module.
    from datetime import timezone

    # datetime.utcfromtimestamp is deprecated since Python 3.12; build a
    # timezone-aware UTC datetime instead. The formatted date is the same.
    return datetime.fromtimestamp(utc, tz=timezone.utc).strftime(DATE_FORMAT)

def include_comment(comment):
    """
    Decide whether to include a comment in the output.

    A comment is kept only when it is an actual comment (not a MoreComments
    placeholder), is not stickied, and has a positive score.
    """
    if isinstance(comment, MoreComments):
        return False
    if comment.stickied:
        return False
    return comment.score > 0

def format_comment_forest(comment, level=1, parent_id=None, max_depth=4):
    """
    Recursively format a Reddit comment tree into a markdown-like text with basic filtering and depth control.

    Args:
        comment: The comment (or MoreComments placeholder) at the root of this subtree.
        level: Heading level for this comment; also its depth for the max_depth check.
        parent_id: ID of the post/comment being replied to, shown in the header when set.
        max_depth: Deepest `level` that is still rendered; applied to the whole subtree.

    Returns:
        The formatted comment followed by its formatted replies, or "" when the
        comment is filtered out by include_comment or is deeper than max_depth.
    """
    if not include_comment(comment) or level > max_depth:
        return ""

    parent_header = f" (in reply to {parent_id})" if parent_id else ""
    text = f"{'#' * level} Comment {comment.id} by {comment.author} on {utc_to_date(comment.created_utc)} [{comment.score:+d} votes]{parent_header}:\n"
    text += f"{comment.body}\n\n"

    # Forward max_depth so a caller-supplied limit applies to the whole subtree
    # (previously the recursion silently fell back to the default of 4).
    formatted_replies = (
        format_comment_forest(reply, level + 1, parent_id=comment.id, max_depth=max_depth)
        for reply in comment.replies
    )
    # Skip filtered replies (empty strings) so they do not add runs of blank lines.
    text += "\n\n".join(formatted for formatted in formatted_replies if formatted)

    return text

def format_reddit_thread(submission, pagination_limit=10):
    """
    Render a submission and its comment tree as markdown-like text.

    The post becomes a level-1 heading followed by its self text; each
    top-level comment is rendered by format_comment_forest starting at
    heading level 2, with the submission ID as its parent.

    Args:
        submission: The PRAW submission to format.
        pagination_limit: Passed to `replace_more(limit=...)` before the comments are walked.
    """
    submission.comments.replace_more(limit=pagination_limit)

    post_section = f"""
# Post {submission.id}:  {submission.title} by {submission.author} on {utc_to_date(submission.created_utc)} [{submission.score:+d} votes]
{submission.selftext}

"""

    comment_sections = [
        format_comment_forest(top_level, 2, parent_id=submission.id)
        for top_level in submission.comments
    ]
    return post_section + "\n\n".join(comment_sections)

# Smoke test: fetch one known thread by URL and print its formatted text
submission = reddit.submission(url="https://www.reddit.com/r/ChatGPT/comments/11twe7z/prompt_to_summarize/")
print(format_reddit_thread(submission))


# Post 11twe7z:  Prompt to summarize by AlarmedJury7058 on 2023-03-17 [+25 votes]
Hey, what’s the best prompt to ask Chatgpt to summarize a text with details ?



## Comment jcm7h9z by SpiritualCopy4288 on 2023-03-17 [+32 votes] (in reply to 11twe7z):
Here’s a few

“Could you please provide a concise and comprehensive summary of the given text? The summary should capture the main points and key details of the text while conveying the author's intended meaning accurately. Please ensure that the summary is well-organized and easy to read, with clear headings and subheadings to guide the reader through each section. The length of the summary should be appropriate to capture the main points and key details of the text, without including unnecessary information or becoming overly long.”

“Can you provide a comprehensive summary of the given text? The summary should cover all the key points and main ideas presented in the original text, while also condensing the information into a concise a

# Summarize with LangChain

In [77]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# See also
# https://www.reddit.com/r/ChatGPT/comments/11twe7z/prompt_to_summarize/
# https://www.reddit.com/r/ChatGPT/comments/13na8yp/highly_effective_prompt_for_summarizing_gpt4/

# Prompt for summarizing ONE Reddit thread.
# Template variables: {company}, {product}, and {text} (the formatted thread).
# The wording fix below ("should have three sections") repairs a missing verb in the instructions.
thread_summary_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Please read the following Reddit thread and summarize the key points relating to the COMPANY and PRODUCT specified.
            The summary should begin with a brief 1-2 sentence summary of the thread.
            Then it should have three sections summarizing key facts and opinions from different perspectives:
            1. User experience perspective: The key strengths and weaknesses of the PRODUCT from the perspective of current users. If there is no information from current users, say "Not applicable".
            2. Prospective employee perspective: The key strengths and weaknesses of the COMPANY from the perspective of employees. For example this could include information about the benefits, company culture, work-life balance, or other relevant information. If there is no information about current or former employees, say "Not applicable".
            3. Prospective investor perspective: Any key information about the COMPANY from the perspective of a prospective investor, such as fundraising, valuation, layoffs, partnerships, or other information indicating that the company is improving or worsening. If there is no information about the company's financials, say "Not applicable".

            The summary should be evidence-based. For any claims made, provide a list of usernames supporting the claim, as in "CLAIM (source1, source2, source3)".
            Short texts should be summarized in 1-2 sentences, while longer texts should be summarized in 3-4 sentences.
            Provide a clear and concise summary of the key points, avoiding unnecessary details.
            Do not make speculations, simply summarize the key facts and opinions stated in the thread.
            Format the results as HTML.
            """,
        ),
        (
            "human", 
            """
            COMPANY: {company}
            PRODUCT: {product}
            
            Reddit thread: 
            {text}
            """
            ),
    ]
)

# Prompt for combining several per-thread summaries into one aggregate summary.
# Template variables: {company}, {product}, and {text} (the joined thread summaries).
# The wording fix below ("should have three sections") repairs a missing verb in the instructions.
aggregation_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Please read the following summaries of Reddit threads and form an aggregated summary of the key points relating to the COMPANY and PRODUCT specified.

            The summary should begin with a brief paragraph summarizing key points about the product, then a brief paragraph summarizing key points about the company.

            Then it should have three sections summarizing key facts and opinions from different perspectives:
            1. User experience perspective: The key strengths and weaknesses of the PRODUCT from the perspective of current users. If there is no information from current users, say "Not applicable".
            2. Prospective employee perspective: The key strengths and weaknesses of the COMPANY from the perspective of employees. For example this could include information about the benefits, company culture, work-life balance, or other relevant information. If there is no information about current or former employees, say "Not applicable".
            3. Prospective investor perspective: Any key information about the COMPANY from the perspective of a prospective investor, such as fundraising, valuation, layoffs, partnerships, or other information indicating that the company is improving or worsening. If there is no information about the company's financials, say "Not applicable".
            Any claims in these three sections should have a hyperlink citation to all sources, for example "CLAIM (source1, source2, source3)".

            Then the summary should have a bulleted list of all of the sources. Each source should have a date, title, and hyperlink to the article source if possible.
            Do not make speculations, simply summarize the key facts and opinions stated in the thread.
            Format the results as HTML.
            """,
        ),
        (
            "human", 
            """
            COMPANY: {company}
            PRODUCT: {product}
            
            Summaries: 
            {text}
            """
            ),
    ]
)

# Chat model shared by the summarization chains; temperature=0 to keep outputs as repeatable as possible
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
# Single-thread summary chain (prompt piped into the model).
# NOTE(review): process_url builds its own local chain, so this global looks unused here.
runnable = thread_summary_prompt | llm

# Google Search on Reddit for sources

In [50]:

from googlesearch import search
from functools import lru_cache
from typing import Iterable
import re

url_pattern = re.compile(r".*/comments/.+")

@lru_cache(1000)
def reddit_search(target: CompanyProduct, num=10, stop=10, pause=2) -> Iterable[str]:
    """
    Google-search reddit.com for threads mentioning the target.

    Results are memoized with lru_cache, so repeated calls with the same
    arguments return the SAME list object without searching again; callers
    should slice or copy it rather than mutate it.

    Args:
        target: Company/product pair to search for (must be hashable for the cache).
        num, stop, pause: Forwarded unchanged to `googlesearch.search`.

    Returns:
        A list of result URLs that match `url_pattern` (i.e. contain "/comments/").
    """
    # Quote each name so it is searched as an exact phrase. The original query
    # ended with a stray extra double quote, leaving an unbalanced quote in the search.
    query = f'site:reddit.com "{target.company}"'
    if target.product != target.company:
        query += f' "{target.product}"'

    return [url for url in search(query, num=num, stop=stop, pause=pause) if url_pattern.match(url)]

def test_search():
    """Manual check: print every thread URL found for Singularity 6 / Palia."""
    found_urls = reddit_search(CompanyProduct("Singularity 6", "Palia"), stop=20, pause=2)
    for thread_url in found_urls:
        print(thread_url)

# Run the manual search check (calls the Google search helper for a known target)
test_search()

https://www.reddit.com/r/MMORPG/comments/1bz2e0z/palia_developers_singularity_6_axes_35_of_staff/
https://www.reddit.com/r/MMORPG/comments/1dtp97n/daybreak_acquires_singularity_6_palia_developer/
https://www.reddit.com/r/pcgaming/comments/1bwiuin/cozy_mmo_palia_developer_singularity_6_has/
https://www.reddit.com/r/Palia/comments/15ss9gg/singularity_6_is_not_an_indie_company/
https://www.reddit.com/r/CozyGamers/comments/1dt80wx/daybreak_acquires_singularity_6_developer_of_palia/
https://www.reddit.com/r/Palia/comments/1dt8dbs/palia_studio_singularity_6_acquired_by_daybreak/
https://www.reddit.com/r/Palia/comments/19bqfoy/cant_even_give_singularity_6_my_money/
https://www.reddit.com/r/Games/comments/1cu1f7i/palia_studio_singularity_6_confirms_36_workers/
https://www.reddit.com/r/Palia/comments/1dt7ujt/daybreak_game_company/
https://www.reddit.com/r/MMORPG/comments/obkdoi/singularity_6_raises_30m_to_fund_debut_mmo_game/
https://www.reddit.com/r/Palia/comments/18zqxpx/please_singularity_6/

In [82]:
from typing import NamedTuple
from langchain_core.messages.ai import AIMessage

def wrap_html(content: str) -> str:
    """Embed an HTML fragment in a minimal <html>/<body> document and return it."""
    document = f"""
<html>
<body>
    {content}
</body>
</html>
"""
    return document

class ThreadResult(NamedTuple):
    """One processed Reddit thread: the submission, its formatted text, and the LLM summary."""

    # The PRAW submission the thread was loaded from
    submission: praw.models.Submission
    # The formatted thread text, as stored by the caller
    text: str
    # The model response; its `.content` is inserted into the page as HTML
    summary_result: AIMessage

    def to_html(self):
        """
        Render the thread as an HTML fragment: heading, link, summary, and original text.

        Title, author, URL, and thread text come from Reddit users, so they are
        HTML-escaped before interpolation. The summary is requested from the
        model as HTML and is therefore inserted as-is.
        """
        # Function-scope import so this cell does not depend on a new top-level import.
        from html import escape

        title = escape(str(self.submission.title))
        author = escape(str(self.submission.author))
        url = escape(str(self.submission.url), quote=True)
        # Escape first, then convert newlines, so the inserted <br> tags survive.
        text = escape(self.text).replace("\n", "<br>")
        summary_html = self.summary_result.content
        return f"""
        <h1>{title} by {author} on {utc_to_date(self.submission.created_utc)}</h1>
        <a href="{url}">{url}</a>
        <h2>Summary</h2>
        {summary_html}

        <h2>Original Thread</h2>
        <p>{text}</p>
        """
    
class AggregationResult(NamedTuple):
    """Aggregate summary for one target, together with the inputs that produced it."""

    # inputs
    target: CompanyProduct
    summaries: List[ThreadResult]
    # The text that was passed to the aggregation prompt
    aggregation_prompt_context: str

    # outputs
    summary_result: AIMessage

    def to_html(self):
        """
        Render the aggregate summary as an HTML fragment with a heading naming the target.

        Company and product names are HTML-escaped (e.g. a name containing "&");
        the summary is requested from the model as HTML and is inserted as-is.
        """
        # Function-scope import so this cell does not depend on a new top-level import.
        from html import escape

        company = escape(str(self.target.company))
        product = escape(str(self.target.product))
        summary_html = self.summary_result.content
        return f"""
        <h1>Aggregate summary for {company} / {product}</h1>
        {summary_html}
        """


def process_url(target: CompanyProduct, url: str, text_max_chars=30000) -> ThreadResult:
    """
    Load one Reddit thread by URL, format it, and summarize it for the target.

    The formatted thread is cut to its first `text_max_chars` characters
    (with a printed notice) before being sent to the model.
    """
    submission = reddit.submission(url=url)
    thread_text = format_reddit_thread(submission)

    original_length = len(thread_text)
    if original_length > text_max_chars:
        print(f"Text too long: {original_length} > {text_max_chars}. Truncating.")
        thread_text = thread_text[:text_max_chars]

    chain = thread_summary_prompt | llm
    prompt_inputs = {"text": thread_text, "company": target.company, "product": target.product}
    summary = chain.invoke(prompt_inputs)
    return ThreadResult(submission=submission, text=thread_text, summary_result=summary)

def process_summaries(target: CompanyProduct, summaries: List[ThreadResult]) -> AggregationResult:
    """
    Combine per-thread summaries into one aggregate summary for the target.

    Each thread contributes a "date title url" header line followed by its
    summary; the sections are joined with a "-----" separator line, printed
    for inspection, and sent through the aggregation prompt.
    """
    sections = []
    for thread in summaries:
        post = thread.submission
        header = f"{utc_to_date(post.created_utc)} {post.title} {post.url}"
        sections.append(f"{header}\n\n{thread.summary_result.content}")
    context = "\n-----\n".join(sections)

    print(f"""Summary of summaries input for {target.company} {target.product}:
          {context}
          """)

    chain = aggregation_prompt | llm
    aggregate = chain.invoke({"text": context, "company": target.company, "product": target.product})

    return AggregationResult(
        target=target,
        summaries=summaries,
        aggregation_prompt_context=context,
        summary_result=aggregate,
    )


In [74]:
from IPython.display import HTML

# Demo: summarize a single thread for one target and render it inline
target = CompanyProduct("Singularity 6", "Palia")

thread_urls = reddit_search(target, stop=10, pause=2)
# NOTE: despite the name this is always the second search result (index 1);
# it raises IndexError if the search returned fewer than two thread URLs.
random_url = thread_urls[1]
random_thread_result = process_url(target, random_url)
display(HTML(random_thread_result.to_html()))

In [8]:
def summarize_prompt(prompt):
    """
    Render a two-message chat prompt as an HTML fragment.

    Reads the template text of the first message (shown as the system prompt)
    and of the second message (shown as the prompt), each in a <pre> block.
    """
    system_template = prompt.messages[0].prompt.template
    human_template = prompt.messages[1].prompt.template
    rendered = f"""
<h1>Prompt</h1>

<h2>System prompt</h2>
<pre>
{system_template}
</pre>

<h2>Prompt</h2>
<pre>
{human_template}
</pre>
    """
    return rendered

# Show the per-thread summary prompt. The original referenced `prompt`, which is
# never defined in this notebook and raises NameError on a fresh kernel.
display(HTML(summarize_prompt(thread_summary_prompt)))

In [83]:
import hashlib
import os

def short_evaluation(target: CompanyProduct):
    """
    Run the pipeline on the first two search results for one target and save an HTML report.

    The report contains the aggregate summary, the per-thread summary prompt,
    and each thread's summary and text. It is written to
    evaluation/test_<id>/<timestamp>.html, where <id> is derived from the URL list.
    """
    # reddit_search is memoized, so this is quick when the same search already ran
    thread_urls = reddit_search(target, stop=10, pause=2)[:2]

    # The ID of the test is the last 4 chars of the sha of the url list
    test_id = hashlib.sha256("".join(thread_urls).encode()).hexdigest()[-4:]

    folder = f"evaluation/test_{test_id}"
    os.makedirs(folder, exist_ok=True)

    # individual thread results
    results = [process_url(target, url) for url in thread_urls]

    # aggregation result
    aggregation_result = process_summaries(target, results)

    # make a unified page
    # (the original passed `prompt`, which is never defined in this notebook;
    # the per-thread prompt is the one being evaluated here)
    result_htmls = "\n".join(r.to_html() for r in results)
    html_result = wrap_html(f"""
{aggregation_result.to_html()}

<hr/>

{summarize_prompt(thread_summary_prompt)}

{result_htmls}
""")

    # Create the filename using the current timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{folder}/{timestamp}.html"

    # Explicit UTF-8: thread text contains non-ASCII characters (e.g. curly quotes)
    # that can fail to encode with a platform-default encoding.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html_result)

    print(f"Results saved to {filename}")

# Run the short evaluation for a target whose company and product share one name
short_evaluation(CompanyProduct.same("98point6"))


Summary of summaries input for 98point6 98point6:
          Internet medicine is awesome, 98point6 was so so helpful for me https://www.reddit.com/r/povertyfinance/comments/bg7ip2/internet_medicine_is_awesome_98point6_was_so_so/


<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Reddit Thread Summary</title>
</head>
<body>
    <h2>Reddit Thread Summary on 98point6</h2>
    <h3>Summary:</h3>
    <p>Users shared positive experiences with 98point6, highlighting its affordability and convenience in providing online medical consultations and prescriptions.</p>

    <h3>User Experience Perspective:</h3>
    <p><strong>Strengths:</strong> Affordable at $20 for a one-year membership, convenient online appointments, helpful doctors who address cost concerns, and access to prescription medications at reasonable prices through GoodRx (Fr

In [84]:
from typing import List

def long_evaluation(targets: List[CompanyProduct]):
    """
    Run the pipeline on the first three search results for each target and save one HTML report.

    The report starts with the per-thread summary prompt, then for each target
    a heading, the aggregate summary, and each thread's summary and text. It is
    written to evaluation/test_<id>/<timestamp>.html, where <id> is derived
    from the targets and their URL lists.
    """
    # Function-scope import so this cell does not depend on a new top-level import.
    from html import escape

    thread_lists = [reddit_search(target, stop=10, pause=2)[:3] for target in targets]

    test_id = hashlib.sha256((str(targets) + str(thread_lists)).encode()).hexdigest()[-4:]

    folder = f"evaluation/test_{test_id}"
    os.makedirs(folder, exist_ok=True)

    # The original passed `prompt`, which is never defined in this notebook.
    html_result = summarize_prompt(thread_summary_prompt)

    for target, thread_urls in zip(targets, thread_lists):
        # Escape the names so characters such as "&" or "<" cannot break the markup
        html_result += f"<h1>{escape(str(target.company))}: {escape(str(target.product))}</h1>"

        # individual thread summaries (kept separate from the URL list they come from)
        thread_results = [process_url(target, url) for url in thread_urls]

        # aggregated result
        aggregation_result = process_summaries(target, thread_results)

        html_result += aggregation_result.to_html()
        html_result += "\n".join(r.to_html() for r in thread_results)

        html_result += "<hr/>"

    # Create the filename using the current timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{folder}/{timestamp}.html"

    # Explicit UTF-8: thread text contains non-ASCII characters (e.g. curly quotes)
    # that can fail to encode with a platform-default encoding.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html_result)

    print(f"Results saved to {filename}")

# Run the long evaluation over four targets; three of them reuse the company name as the product name.
# TODO: Update this once we have company-product pairs instead
long_evaluation([CompanyProduct.same("98point6"), CompanyProduct.same("Rad AI"), CompanyProduct("Singularity 6", "Palia"), CompanyProduct.same("Instacart")])

Summary of summaries input for 98point6 98point6:
          Internet medicine is awesome, 98point6 was so so helpful for me https://www.reddit.com/r/povertyfinance/comments/bg7ip2/internet_medicine_is_awesome_98point6_was_so_so/


<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Reddit Thread Summary</title>
</head>
<body>
    <h2>Reddit Thread Summary for 98point6</h2>
    <h3>Summary:</h3>
    <p>Users shared positive experiences with 98point6, highlighting its affordability and convenience in providing online medical consultations and prescriptions.</p>

    <h3>User Experience Perspective:</h3>
    <p><strong>Strengths:</strong> Affordable at $20 for a one-year membership, convenient online appointments, helpful doctors providing cost-effective medication options through GoodRx (FrugalChef13).</p>
    <p><strong>Weaknesses

CompanyProduct(company='98point6', product='98point6')