# Reddit

In [1]:
from typing import NamedTuple

class CompanyProduct(NamedTuple):
    """A (company, product) pair identifying what to research."""

    company: str
    product: str

    @classmethod
    def same(cls, name: str):
        """Build a pair for a company whose product carries the company's own name."""
        return cls(name, name)
    
# Quick check: `same` fills both fields with the one name given.
CompanyProduct.same("98point6")

CompanyProduct(company='98point6', product='98point6')

In [2]:
import praw
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv (presumably from a local .env file — confirm).
load_dotenv()

# Reddit API client used by the rest of the notebook.
# os.environ[...] raises KeyError if either credential is missing from the environment.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="Comment Extraction (by u/trnka)",
)


# Design notes

1. Transform the product name into an appropriate query
2. Google search reddit.com with the query (possibly over multiple time windows: all-time, 1y, 1m?)
3. Process each post:
    - Pull the top post
    - Pull any comments
    - Identify whether the post is primarily about the product, jobs at the company, or bizdev
4. Summarize the product-related posts:
    - Timeline of posts with titles and one key excerpt, linking to the original. Also include the amount of activity on the post and the score

In [3]:
# Helper to convert a Reddit thread to text
from datetime import datetime
from praw.models import MoreComments

DATE_FORMAT = "%Y-%m-%d"
def utc_to_date(utc: float):
    """
    Convert a Unix timestamp (seconds since the epoch) to a UTC calendar date string.

    Args:
        utc: Timestamp in seconds, e.g. a PRAW `created_utc` value.

    Returns:
        The UTC date formatted with DATE_FORMAT, e.g. 0 -> "1970-01-01".
    """
    # Local import: the surrounding cell only imports `datetime` from the datetime module.
    from datetime import timezone

    # datetime.utcfromtimestamp() is deprecated since Python 3.12; the timezone-aware
    # equivalent yields the same calendar date.
    return datetime.fromtimestamp(utc, tz=timezone.utc).strftime(DATE_FORMAT)

def include_comment(comment):
    """Return whether a comment should be rendered: not a placeholder, not stickied, score above zero."""
    if isinstance(comment, MoreComments):
        # A MoreComments entry is not an actual comment.
        return False
    if comment.stickied:
        return False
    return comment.score > 0

def format_comment_forest(comment, level=1, parent_id=None, max_depth=4):
    """
    Recursively format a Reddit comment tree into a markdown-like text with basic filtering and depth control.

    Args:
        comment: The comment at the root of this subtree; its `replies` are rendered recursively.
        level: Heading depth of this comment (number of leading '#' characters).
        parent_id: ID of the parent post or comment, mentioned in the header when given.
        max_depth: Deepest `level` that is still rendered; anything deeper is dropped.

    Returns:
        The formatted subtree, or "" when the comment is filtered out or too deep.
    """
    if not include_comment(comment) or level > max_depth:
        return ""

    parent_header = f" (in reply to {parent_id})" if parent_id else ""
    text = f"{'#' * level} Comment {comment.id} by {comment.author} on {utc_to_date(comment.created_utc)} [{comment.score:+d} votes]{parent_header}:\n"
    text += f"{comment.body}\n\n"

    # Pass max_depth down explicitly: previously the recursive call fell back to the
    # default, so a caller-supplied limit only applied to the root call.
    formatted_replies = (
        format_comment_forest(reply, level + 1, parent_id=comment.id, max_depth=max_depth)
        for reply in comment.replies
    )
    # Skip replies that were filtered out so they don't leave runs of blank lines behind.
    text += "\n\n".join(formatted for formatted in formatted_replies if formatted)

    return text

def format_reddit_thread(submission, pagination_limit=10):
    """
    Render a submission and its comment tree as markdown-like text.

    Calls `replace_more(limit=pagination_limit)` on the comment forest first, then emits
    the post header and body followed by each top-level comment subtree (heading level 2).
    """
    submission.comments.replace_more(limit=pagination_limit)

    post_section = f"""
# Post {submission.id}:  {submission.title} by {submission.author} on {utc_to_date(submission.created_utc)} [{submission.score:+d} votes]
{submission.selftext}

"""

    comment_sections = [
        format_comment_forest(top_level_comment, 2, parent_id=submission.id)
        for top_level_comment in submission.comments
    ]
    return post_section + "\n\n".join(comment_sections)

# Try the formatter on one thread, fetched through the `reddit` client.
submission = reddit.submission(url="https://www.reddit.com/r/ChatGPT/comments/11twe7z/prompt_to_summarize/")
print(format_reddit_thread(submission))


# Post 11twe7z:  Prompt to summarize by AlarmedJury7058 on 2023-03-17 [+26 votes]
Hey, what’s the best prompt to ask Chatgpt to summarize a text with details ?



## Comment jcm7h9z by SpiritualCopy4288 on 2023-03-17 [+32 votes] (in reply to 11twe7z):
Here’s a few

“Could you please provide a concise and comprehensive summary of the given text? The summary should capture the main points and key details of the text while conveying the author's intended meaning accurately. Please ensure that the summary is well-organized and easy to read, with clear headings and subheadings to guide the reader through each section. The length of the summary should be appropriate to capture the main points and key details of the text, without including unnecessary information or becoming overly long.”

“Can you provide a comprehensive summary of the given text? The summary should cover all the key points and main ideas presented in the original text, while also condensing the information into a concise a

# Summarize with LangChain

In [84]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# See also
# https://www.reddit.com/r/ChatGPT/comments/11twe7z/prompt_to_summarize/
# https://www.reddit.com/r/ChatGPT/comments/13na8yp/highly_effective_prompt_for_summarizing_gpt4/

# Prompt for summarizing a single Reddit thread ("map" step).
# Template variables: {json_instructions}, {company}, {product}, {text}.
# Wording fixes: "evididence-based" -> "evidence-based" and the missing verb in
# "it should three sections" -> "it should have three sections".
thread_summary_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Please read the following Reddit thread and write an evidence-based summary of the key points relating to the COMPANY and PRODUCT specified.
            The summary should begin with a brief 1-2 sentence summary of the thread.
            Then it should have three sections summarizing key facts and opinions from different perspectives:
            1. User experience perspective: The key strengths and weaknesses of the PRODUCT from the perspective of current users.
            2. Prospective employee perspective: The key strengths and weaknesses of the COMPANY from the perspective of employees. For example this could include information about the benefits, company culture, work-life balance, or other relevant information.
            3. Prospective investor perspective: Any key information about the COMPANY from the perspective of a prospective investor, such as fundraising, valuation, layoffs, partnerships, or other information indicating that the company is improving or worsening. 

            Provide a clear and concise summary of the key points, avoiding unnecessary details.
            Do not make speculations, simply summarize the key facts and opinions stated in the thread.
            
            Limit the response to 5000 tokens.
            Format the results as Json in the following format:
            {json_instructions}
            """,
        ),
        (
            "human", 
            """
            COMPANY: {company}
            PRODUCT: {product}
            
            Reddit thread: 
            {text}
            """
            ),
    ]
)

# Prompt for combining several per-thread summaries into one ("reduce" step).
# Template variables: {json_instructions}, {company}, {product}, {text}.
# Wording fixes: added the missing verb in "it should have three sections", and the
# no-speculation rule now refers to "the summaries", which is what this prompt receives.
aggregation_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Please read the following summaries of Reddit threads and write a comprehensive summary of the key points relating to the COMPANY and PRODUCT specified.

            The summary should begin with an overview paragraph.

            Then it should have three sections summarizing key facts and opinions from different perspectives:
            1. User experience perspective: The key strengths and weaknesses of the PRODUCT from the perspective of current users.
            2. Prospective employee perspective: The key strengths and weaknesses of the COMPANY from the perspective of employees. For example this could include information about the benefits, company culture, work-life balance, or other relevant information. 
            3. Prospective investor perspective: Any key information about the COMPANY from the perspective of a prospective investor, such as fundraising, valuation, layoffs, partnerships, or other information indicating that the company is improving or worsening. 

            Do not make speculations, simply summarize the key facts and opinions stated in the summaries.
            Limit the response to 5000 tokens.
            Format the results as Json in the following format:
            {json_instructions}
            """,
        ),
        (
            "human", 
            """
            COMPANY: {company}
            PRODUCT: {product}
            
            Summaries: 
            {text}
            """
            ),
    ]
)

# Chat model shared by the per-thread and the aggregation summarization steps.
# temperature=0: presumably chosen for more repeatable output — confirm.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)


# Google Search on Reddit for sources

In [5]:

from googlesearch import search
from functools import lru_cache
from typing import Iterable
import re

# Matches URLs that have at least one character after "/comments/";
# reddit_search uses it to filter the search results it returns.
url_pattern = re.compile(r".*/comments/.+")

@lru_cache(1000)
def reddit_search(target: CompanyProduct, num=10, stop=10, pause=2) -> Iterable[str]:
    """
    Google-search reddit.com for threads mentioning the target and return their URLs.

    Args:
        target: Company/product pair. The company is always searched as a quoted phrase;
            the product is added as a second quoted phrase when it differs from the company.
        num, stop, pause: Passed straight through to googlesearch's `search`.

    Returns:
        A list of the result URLs that match `url_pattern`. Results are memoized per
        argument combination by `lru_cache`, and the same list object is handed back on
        a cache hit, so callers should not mutate it.
    """
    # Fixed: the company phrase used to end with a stray second double quote, which
    # left the quotes in the query unbalanced.
    query = f'site:reddit.com "{target.company}"'
    if target.product != target.company:
        query += f' "{target.product}"'

    return [url for url in search(query, num=num, stop=stop, pause=pause) if url_pattern.match(url)]

def test_search():
    """Smoke-test reddit_search by printing each URL found for Singularity 6 / Palia."""
    palia = CompanyProduct("Singularity 6", "Palia")
    found_urls = reddit_search(palia, stop=20, pause=2)
    for thread_url in found_urls:
        print(thread_url)

# Run the smoke test; it calls reddit_search, which performs a Google search.
test_search()

https://www.reddit.com/r/MMORPG/comments/1bz2e0z/palia_developers_singularity_6_axes_35_of_staff/
https://www.reddit.com/r/MMORPG/comments/1dtp97n/daybreak_acquires_singularity_6_palia_developer/
https://www.reddit.com/r/pcgaming/comments/1bwiuin/cozy_mmo_palia_developer_singularity_6_has/
https://www.reddit.com/r/CozyGamers/comments/1dt80wx/daybreak_acquires_singularity_6_developer_of_palia/
https://www.reddit.com/r/Palia/comments/15ss9gg/singularity_6_is_not_an_indie_company/
https://www.reddit.com/r/Palia/comments/1dt8dbs/palia_studio_singularity_6_acquired_by_daybreak/
https://www.reddit.com/r/Palia/comments/19bqfoy/cant_even_give_singularity_6_my_money/
https://www.reddit.com/r/MMORPG/comments/obkdoi/singularity_6_raises_30m_to_fund_debut_mmo_game/
https://www.reddit.com/r/Palia/comments/1dt7ujt/daybreak_game_company/
https://www.reddit.com/r/Games/comments/1cu1f7i/palia_studio_singularity_6_confirms_36_workers/
https://www.reddit.com/r/Palia/comments/18zqxpx/please_singularity_6/

In [83]:
# Output schema
from typing import Optional, List

from langchain_core.pydantic_v1 import BaseModel, Field


# One piece of quoted evidence plus the ID of the comment it was taken from.
# NOTE: the Field descriptions (and, as the ThreadSummary.schema() output in this
# notebook shows for ThreadSummary, the class docstring) end up in the generated
# schema, so treat them as part of the model-facing contract rather than free-form docs.
class Claim(BaseModel):
    """A claim made in a Reddit thread"""

    quote: str = Field(description="A short quote from the source representing the key claim")
    comment_id: str = Field(description="The comment ID of the quote")

# Output schema shared by the per-thread and the aggregated summaries.
# The class docstring and Field descriptions appear in ThreadSummary.schema()
# (see the output further down), so changing them changes the schema.
# NOTE(review): saved outputs in this notebook show
# "ValidationError ... value is not a valid dict" for items of the claim lists,
# which suggests the model sometimes returns list items that are not objects — confirm.
class ThreadSummary(BaseModel):
    """A structured summary of a Reddit thread or threads about a company or product"""

    # The only required field.
    thread_summary: str = Field(description="An overview of the content")

    # Every claim list below is optional and defaults to None when omitted.
    user_experience_strengths: Optional[List[Claim]] = Field(
        default=None, description="Key positive themes in user feedback about the product"
    )
    user_experience_weaknesses: Optional[List[Claim]] = Field(
        default=None, description="Key negative themes in user feedback about the product"
    )

    employee_experience_strengths: Optional[List[Claim]] = Field(
        default=None, description="The key strengths of the company from the employee perspective"
    )
    employee_experience_weaknesses: Optional[List[Claim]] = Field(
        default=None, description="The key weaknesses of the company from the employee perspective"
    )

    investor_perspective: Optional[List[Claim]] = Field(
        default=None, description="Key information about the company from the perspective of a prospective investor"
    )

# Plain-text description of the ThreadSummary JSON layout. It is passed as the
# {json_instructions} template variable to both prompts.
# It repeats the ThreadSummary / Claim field names by hand, so keep the two in sync.
# NOTE(review): this text never shows a literal example of a Claim object; given the
# "value is not a valid dict" validation errors in the saved outputs, an explicit
# example may help — confirm before changing, since this text is sent to the model.
json_instructions = """
The JSON object should have these top-level keys:

Required:
thread_summary (string): An overview of the content discussed in the Reddit thread(s).

Optional (should be omitted if no information is available):
user_experience_strengths (list of Claim objects): Key positive themes in user feedback about the product.
user_experience_weaknesses (list of Claim objects): Key negative themes in user feedback about the product.
employee_experience_strengths (list of Claim objects): Key strengths of the company from the employee perspective.
employee_experience_weaknesses (list of Claim objects): Key weaknesses of the company from the employee perspective.
investor_perspective (list of Claim objects): Key information about the company from the perspective of a prospective investor.
Each Claim object should match this format:

Claim Object:
quote (string): A short quote from the source representing the key claim.
comment_id (string): The comment ID of the quote.
"""

In [76]:
# Inspect the schema generated from ThreadSummary, as a JSON string.
ThreadSummary.schema_json()

'{"title": "ThreadSummary", "description": "A structured summary of a Reddit thread or threads about a company or product", "type": "object", "properties": {"thread_summary": {"title": "Thread Summary", "description": "An overview of the content", "type": "string"}, "user_experience_strengths": {"title": "User Experience Strengths", "description": "Key positive themes in user feedback about the product", "type": "array", "items": {"$ref": "#/definitions/Claim"}}, "user_experience_weaknesses": {"title": "User Experience Weaknesses", "description": "Key negative themes in user feedback about the product", "type": "array", "items": {"$ref": "#/definitions/Claim"}}, "employee_experience_strengths": {"title": "Employee Experience Strengths", "description": "The key strengths of the company from the employee perspective", "type": "array", "items": {"$ref": "#/definitions/Claim"}}, "employee_experience_weaknesses": {"title": "Employee Experience Weaknesses", "description": "The key weaknesses

In [65]:
# Same schema as a Python dict, which the notebook pretty-prints.
ThreadSummary.schema()

{'title': 'ThreadSummary',
 'description': 'A structured summary of a Reddit thread or threads about a company or product',
 'type': 'object',
 'properties': {'thread_summary': {'title': 'Thread Summary',
   'description': 'An overview of the content',
   'type': 'string'},
  'user_experience_strengths': {'title': 'User Experience Strengths',
   'description': 'Key positive themes in user feedback about the product',
   'type': 'array',
   'items': {'$ref': '#/definitions/Claim'}},
  'user_experience_weaknesses': {'title': 'User Experience Weaknesses',
   'description': 'Key negative themes in user feedback about the product',
   'type': 'array',
   'items': {'$ref': '#/definitions/Claim'}},
  'employee_experience_strengths': {'title': 'Employee Experience Strengths',
   'description': 'The key strengths of the company from the employee perspective',
   'type': 'array',
   'items': {'$ref': '#/definitions/Claim'}},
  'employee_experience_weaknesses': {'title': 'Employee Experience Weak

In [85]:
from typing import NamedTuple
from langchain_core.messages.ai import AIMessage
import markdown

def wrap_html(content: str):
    """Wrap an HTML fragment in a minimal <html><body> document and return it as a string."""
    document_lines = [
        "",
        "<html>",
        "<body>",
        f"    {content}",
        "</body>",
        "</html>",
        "",
    ]
    return "\n".join(document_lines)

def claims_to_html(claims: Optional[List[Claim]]) -> str:
    """
    Render claims as an HTML bullet list.

    Args:
        claims: Claims to render; None or an empty list produces no markup.

    Returns:
        A `<ul>` element with one `<li>` per claim (quote plus source comment ID),
        or "" when there is nothing to show.
    """
    # Local import keeps this block self-contained; `html` is part of the standard library.
    from html import escape

    if not claims:
        return ""

    # Quotes originate from Reddit comments (via the model), so escape them before
    # embedding in markup; otherwise a quote containing "<" or "&" breaks or injects HTML.
    items = "\n".join(
        f'<li>"{escape(str(claim.quote))}" (source: {escape(str(claim.comment_id))})</li>'
        for claim in claims
    )
    return f"<ul>{items}</ul>"


class ThreadResult(NamedTuple):
    """The outcome of summarizing one Reddit thread."""

    # The thread that was summarized.
    submission: praw.models.Submission
    # The formatted thread text that was sent to the model.
    text: str
    # Structured summary produced by the chain. This was annotated as AIMessage, but
    # to_html() reads ThreadSummary fields (thread_summary, user_experience_strengths, ...).
    summary_result: ThreadSummary

    def to_html(self):
        """
        Render the thread header, its structured summary and the original thread text as an HTML fragment.

        Title, author, URL and the summary overview are HTML-escaped because they come
        from Reddit or from the model. The original thread is converted with `markdown.markdown`.
        """
        # Local import keeps this block self-contained; `html` is part of the standard library.
        from html import escape

        summary_content = self.summary_result

        title = escape(str(self.submission.title))
        author = escape(str(self.submission.author))
        # NOTE(review): for link posts PRAW's `url` may be the linked page rather than
        # the thread itself — confirm whether the permalink is intended here.
        url = escape(str(self.submission.url))
        overview = escape(str(summary_content.thread_summary))

        # NOTE(review): the markdown conversion below receives the raw thread text;
        # presumably any HTML inside comments is passed through unescaped — confirm.
        return f"""
<h1>{title} by {author} on {utc_to_date(self.submission.created_utc)}</h1>
<a href="{url}">{url}</a>

{overview}

<h2>User Experience</h2>

<h3>Strengths</h3>

{claims_to_html(summary_content.user_experience_strengths)}

<h3>Weaknesses</h3>

{claims_to_html(summary_content.user_experience_weaknesses)}

<h2>Employee Experience</h2>

<h3>Strengths</h3>

{claims_to_html(summary_content.employee_experience_strengths)}

<h3>Weaknesses</h3>

{claims_to_html(summary_content.employee_experience_weaknesses)}

<h2>Investor Perspective</h2>

{claims_to_html(summary_content.investor_perspective)}

<h2>Original Thread</h2>
<p>{markdown.markdown(self.text)}</p>
        """
    
class AggregationResult(NamedTuple):
    """The outcome of aggregating several thread summaries for one target."""

    # inputs
    target: CompanyProduct
    summaries: List[ThreadResult]
    # The markdown text that was sent to the aggregation prompt.
    aggregation_prompt_context: str

    # outputs
    # Structured summary produced by the chain. This was annotated as AIMessage, but
    # to_html() reads ThreadSummary fields (thread_summary, user_experience_strengths, ...).
    summary_result: ThreadSummary

    def to_html(self):
        """
        Render the aggregated summary as an HTML fragment headed by "company / product".

        The company, product and overview text are HTML-escaped before being embedded.
        """
        # Local import keeps this block self-contained; `html` is part of the standard library.
        from html import escape

        summary_content = self.summary_result

        company = escape(str(self.target.company))
        product = escape(str(self.target.product))
        overview = escape(str(summary_content.thread_summary))

        return f"""
<h1>{company} / {product}</h1>

{overview}

<h2>User Experience</h2>

<h3>Strengths</h3>

{claims_to_html(summary_content.user_experience_strengths)}

<h3>Weaknesses</h3>

{claims_to_html(summary_content.user_experience_weaknesses)}

<h2>Employee Experience</h2>

<h3>Strengths</h3>

{claims_to_html(summary_content.employee_experience_strengths)}

<h3>Weaknesses</h3>

{claims_to_html(summary_content.employee_experience_weaknesses)}

<h2>Investor Perspective</h2>

{claims_to_html(summary_content.investor_perspective)}
        """




In [86]:
def summarize_thread(target: CompanyProduct, url: str, text_max_chars=40000) -> ThreadResult:
    """
    Fetch one Reddit thread and summarize it with the LLM.

    The formatted thread is cut to `text_max_chars` characters (with a printed notice)
    before it is sent through `thread_summary_prompt` and the structured-output model.
    """
    submission = reddit.submission(url=url)
    thread_text = format_reddit_thread(submission)

    actual_chars = len(thread_text)
    if actual_chars > text_max_chars:
        print(f"Text too long: {actual_chars} > {text_max_chars}. Truncating.")
        thread_text = thread_text[:text_max_chars]

    structured_llm = llm.with_structured_output(schema=ThreadSummary, method="json_mode")
    chain = thread_summary_prompt | structured_llm
    prompt_inputs = {
        "text": thread_text,
        "company": target.company,
        "product": target.product,
        "json_instructions": json_instructions,
    }
    return ThreadResult(submission=submission, text=thread_text, summary_result=chain.invoke(prompt_inputs))

def claims_to_markdown(claims: Optional[List[Claim]]) -> str:
    """Render claims as markdown bullets, or "Not applicable" when there are none."""
    if claims:
        bullets = [f'- "{claim.quote}" (source: {claim.comment_id})' for claim in claims]
        return "\n".join(bullets)

    return "Not applicable"

def summary_to_markdown(summary_result: ThreadResult, debug=False) -> str:
    """
    Render one thread's structured summary as markdown.

    With `debug=True` the formatted original thread text is appended under a
    "Debug" heading.
    """
    submission = summary_result.submission
    summary = summary_result.summary_result

    report = f"""
# Summary: {submission.title} (thread id: {submission.id})

{summary.thread_summary}

## User Experience

### Strengths

{claims_to_markdown(summary.user_experience_strengths)}

### Weaknesses

{claims_to_markdown(summary.user_experience_weaknesses)}

## Employee Experience

### Strengths

{claims_to_markdown(summary.employee_experience_strengths)}

### Weaknesses

{claims_to_markdown(summary.employee_experience_weaknesses)}

## Investor Perspective

{claims_to_markdown(summary.investor_perspective)}
    """

    if not debug:
        return report

    return report + f"""
## Debug

### Original Thread
{summary_result.text}
        """

def summarize_summaries(target: CompanyProduct, summaries: List[ThreadResult]) -> AggregationResult:
    """
    Combine per-thread summaries into a single aggregate summary.

    Each ThreadResult is rendered to markdown, the pieces are joined with blank lines,
    and the result is sent through `aggregation_prompt` to the structured-output model.
    """
    rendered_summaries = [summary_to_markdown(thread_result) for thread_result in summaries]
    aggregation_input = "\n\n".join(rendered_summaries)

    chain = aggregation_prompt | llm.with_structured_output(schema=ThreadSummary)
    aggregated = chain.invoke(
        {
            "text": aggregation_input,
            "company": target.company,
            "product": target.product,
            "json_instructions": json_instructions,
        }
    )

    return AggregationResult(
        target=target,
        summaries=summaries,
        aggregation_prompt_context=aggregation_input,
        summary_result=aggregated,
    )


# Summarize one known thread and keep the result in `summary_result` for the next cell.
summary_result = summarize_thread(CompanyProduct("Singularity 6", "Palia"), "https://www.reddit.com/r/MMORPG/comments/1bz2e0z/palia_developers_singularity_6_axes_35_of_staff/")
summary_result


ThreadResult(submission=Submission(id='1bz2e0z'), text='\n# Post 1bz2e0z:  Palia developers, Singularity 6, axes 35% of staff just after Steam launch by Hexdro on 2024-04-08 [+220 votes]\n\n\n## Comment kyn4d7g by generalmasandra on 2024-04-08 [+34 votes] (in reply to 1bz2e0z):\nI got hopeful when I read former Riot, a non pvp/combat-ish mmo years ago.\n\nBut the question needs to be asked - why play this over a survival base builder or a city builder or a Sim City type game? They all provide a more unique and detailed experience in their respective areas.\n\nFrom what I remember in testing - you\'re basically handed a plot of land at the beginning after some quests and you can start building/questing to progress. But these plots of land were away from the main village tucked away and I thought it was an instanced zone you could see with others so they could recycle the same handful of plots.\n\nAnd at that point why not play Valheim or V Rising or Palworld or something where you can b

In [87]:


# Relies on `summary_result` from the previous cell; debug=True appends the original thread text.
print(summary_to_markdown(summary_result, debug=True))


# Summary: Palia developers, Singularity 6, axes 35% of staff just after Steam launch (thread id: 1bz2e0z)

The Reddit thread discusses the recent layoffs at Singularity 6, the developers of the game Palia, following a mixed reception after its Steam launch. Users express disappointment with the game's limitations and monetization strategies, while employees and investors share concerns about the company's future and management decisions.

## User Experience

### Strengths

- "I enjoy the characters and setting." (source: kyppu7o)
- "The story and characters are great, if you like those kind of stories." (source: kypy0pe)
- "The foundations are 100% amazing. The mining, the hunting, the way you can place furniture and items, even the character designs are great." (source: kyqhodd)

### Weaknesses

- "Palia felt restrictive... you had to have had more freedom for players to build solo." (source: kyn4d7g)
- "The gameplay loop was so boring, only two zones and max. 24 other Players with 

In [20]:
# Test the structured summary on a second thread and print it,
# including the original thread text (debug=True).
url = "https://www.reddit.com/r/Games/comments/nribi0/singularity_6s_palia_is_a_relaxing_mmo/"
result = summarize_thread(CompanyProduct("Singularity 6", "Palia"), url)
print(summary_to_markdown(result, debug=True))


# Summary: Singularity 6’s Palia Is A Relaxing MMO (thread id: nribi0)

Users are discussing Singularity 6's Palia, an MMO game, and its relaxing nature compared to other games like Stardew Valley and Animal Crossing.

## User Experience

### Strengths

- "Solely based on the trailer? Star dew valley/harvest moon relaxing." (source: h0j6jbn)

### Weaknesses

Not applicable

## Employee Experience

### Strengths

Not applicable

### Weaknesses

Not applicable

## Investor Perspective

Not applicable
    
## Debug

### Original Thread

# Post nribi0:  Singularity 6’s Palia Is A Relaxing MMO by onesidedcircle on 2021-06-03 [+15 votes]


## Comment h0jaa2x by quetiapinenapper on 2021-06-04 [+5 votes] (in reply to nribi0):
I wish we could get shit like this on consoles. I like the social aspect. PlayStation home was great because it was about socializing. I don’t need combat or pvp in everything anymore. Kind of want something that’s fun to just make my own space in and idly chat doing do 

In [21]:
from IPython.display import HTML

target = CompanyProduct("Singularity 6", "Palia")

thread_urls = reddit_search(target, stop=10, pause=2)
# Despite the name, this always takes the second search result rather than a random one.
random_url = thread_urls[1]
random_thread_result = summarize_thread(target, random_url)
# `display` is not imported in this notebook; presumably supplied by the IPython/Jupyter session — confirm.
display(HTML(random_thread_result.to_html()))

Text too long: 30769 > 20000. Truncating.


ValidationError: 2 validation errors for ThreadSummary
employee_experience_strengths -> 0
  value is not a valid dict (type=type_error.dict)
employee_experience_weaknesses -> 0
  value is not a valid dict (type=type_error.dict)

In [27]:
def summarize_prompt(prompt):
    """
    Render a two-message prompt as an HTML fragment for debugging.

    Shows the template text of `prompt.messages[0]` (system) and `prompt.messages[1]`
    (human), each inside its own <pre> block.
    """
    system_template = prompt.messages[0].prompt.template
    human_template = prompt.messages[1].prompt.template

    return f"""
<h1>Prompt</h1>

<h2>System prompt</h2>
<pre>
{system_template}
</pre>

<h2>Prompt</h2>
<pre>
{human_template}
</pre>
    """

# Show the per-thread summarization prompt as rendered HTML.
display(HTML(summarize_prompt(thread_summary_prompt)))

In [90]:
import hashlib
import os

def short_evaluation(target: CompanyProduct, num_threads=2):
    """
    Summarize the first `num_threads` search results for a target and save an HTML report.

    The report holds the aggregated summary, the prompts, the aggregation input and the
    individual thread summaries. It is written to
    `evaluation/test_<id>/<timestamp>.html`, where `<id>` is derived from the list of
    thread URLs, and the saved path is printed.

    Args:
        target: Company/product pair to evaluate.
        num_threads: How many of the search results to summarize.
    """
    # reddit_search is memoized, so repeat calls with the same arguments are quick
    thread_urls = reddit_search(target, stop=10, pause=2)[:num_threads]

    # The ID of the test is the last 4 chars of the sha of the url list
    test_id = hashlib.sha256("".join(thread_urls).encode()).hexdigest()[-4:]

    folder = f"evaluation/test_{test_id}"
    os.makedirs(folder, exist_ok=True)

    # individual thread results
    results = [summarize_thread(target, url) for url in thread_urls]

    # aggregation result
    aggregation_result = summarize_summaries(target, results)

    # make a unified page
    result_htmls = "\n".join(r.to_html() for r in results)
    html_result = wrap_html(f"""
{aggregation_result.to_html()}

<hr/>

<h1>Debugging the aggregation</h1>

<h2>Aggregation prompt</h2>
{summarize_prompt(aggregation_prompt)}

<h2>Aggregation input (converted markdown to HTML)</h2>
{markdown.markdown(aggregation_result.aggregation_prompt_context)}

<hr/>

<h1>Debugging the mapping</h1>

<h2>Mapping prompt</h2>

{summarize_prompt(thread_summary_prompt)}


<h2>Individual summaries</h2>
{result_htmls}
""")

    # Create the filename using the current timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{folder}/{timestamp}.html"

    # Write as UTF-8 explicitly: the report contains Reddit text (curly quotes, emoji, ...)
    # that the platform default encoding may be unable to encode.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html_result)

    print(f"Results for {target} saved to {filename}")

# Evaluate one target using up to 5 of its search results.
short_evaluation(CompanyProduct("Rad AI", "Omni"), 5)


Results for CompanyProduct(company='Rad AI', product='Omni') saved to evaluation/test_158f/20240728_182343.html


In [42]:
from typing import List

def long_evaluation(targets: List[CompanyProduct], num_threads: int = 4):
    """
    Run `short_evaluation` for each target in turn.

    Args:
        targets: Company/product pairs to evaluate, processed in order.
        num_threads: Number of Reddit threads to summarize per target (default 4,
            the value that used to be hard-coded).

    An exception raised for one target propagates and stops the remaining targets.
    """
    for target in targets:
        short_evaluation(target, num_threads)

# TODO: Update this once we have company-product pairs instead of reusing the
# company name as the product (CompanyProduct.same) for most targets.
long_evaluation([CompanyProduct.same("98point6"), CompanyProduct.same("Rad AI"), CompanyProduct("Singularity 6", "Palia"), CompanyProduct.same("Instacart")])

Results for CompanyProduct(company='98point6', product='98point6') saved to evaluation/test_a64d/20240728_173337.html


ValidationError: 1 validation error for ThreadSummary
investor_perspective -> 0
  value is not a valid dict (type=type_error.dict)

CompanyProduct(company='98point6', product='98point6')