# Reddit

In [48]:
from typing import NamedTuple

class CompanyProduct(NamedTuple):
    """A (company, product) pair naming what to look up on Reddit."""

    # Name of the company that makes the product.
    company: str
    # Name of the product itself; may equal the company name.
    product: str

    @classmethod
    def same(cls, name: str):
        """Build a pair for a company whose product carries the same name."""
        return cls(name, name)
    
# Quick check: `same` fills both the company and the product with one name.
CompanyProduct.same("98point6")

CompanyProduct(company='98point6', product='98point6')

In [1]:
import praw
from dotenv import load_dotenv
import os

# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Reddit API client. Credentials are read from the environment so they are
# never hardcoded in the notebook; a missing variable raises KeyError here.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="Comment Extraction (by u/trnka)",
)


# Design notes

1. Transform the product name into an appropriate query
2. Run a Google search restricted to reddit.com with the query (possibly over multiple time windows: all time, 1 year, 1 month?)
3. Process each post:
    - Pull the top post
    - Pull any comments
    - Identify whether the post is primarily about the product, jobs at the company, or bizdev
4. Summarize the product-related posts:
    - Timeline of posts with titles and one key excerpt, linking to the original. Also include the amount of activity on the post and the score

In [10]:
# Helper to convert a Reddit thread to text
from datetime import datetime
from praw.models import MoreComments

# strftime pattern used for every date shown in the generated text/HTML.
DATE_FORMAT = "%Y-%m-%d"
def utc_to_date(utc: float) -> str:
    """Format a Unix timestamp as a ``YYYY-MM-DD`` date in UTC.

    Args:
        utc: Seconds since the Unix epoch (e.g. a PRAW ``created_utc`` value).

    Returns:
        The UTC calendar date formatted with ``DATE_FORMAT``.
    """
    # Imported locally so this cell does not depend on a wider import elsewhere.
    from datetime import timezone

    # `datetime.utcfromtimestamp` is deprecated since Python 3.12; the
    # timezone-aware form below yields the same UTC calendar date.
    return datetime.fromtimestamp(utc, tz=timezone.utc).strftime(DATE_FORMAT)

def reddit_thread_to_text(submission):
    """Render a Reddit submission and its top-level comments as plain text.

    The result starts with the post (title, author, date, signed score and
    self text) followed by one section per top-level comment in the same
    layout. Any `MoreComments` placeholders still present after expansion
    are skipped.
    """
    # Expand "load more comments" placeholders (at most 10 of them).
    submission.comments.replace_more(limit=10)

    post_section = (
        f"\n{submission.title} by {submission.author} on "
        f"{utc_to_date(submission.created_utc)} [{submission.score:+d} votes]:\n"
        f"{submission.selftext}\n"
    )

    comment_sections = [
        (
            f"\nComment by {comment.author} on "
            f"{utc_to_date(comment.created_utc)} [{comment.score:+d} votes]:\n"
            f"{comment.body}\n"
        )
        for comment in submission.comments
        if not isinstance(comment, MoreComments)
    ]

    return post_section + "".join(comment_sections)

# print(reddit_thread_to_text(submission))

# Summarize with LangChain

In [55]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# See also
# https://www.reddit.com/r/ChatGPT/comments/11twe7z/prompt_to_summarize/
# https://www.reddit.com/r/ChatGPT/comments/13na8yp/highly_effective_prompt_for_summarizing_gpt4/

# Two-message chat prompt: the system message holds the summarization
# instructions, and the human message carries the {company}, {product} and
# {text} placeholders that are filled in when the chain is invoked.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Please read the following Reddit thread and summarize the key points relating to the COMPANY and PRODUCT specified.
            The summary should list the key strengths and weaknesses of the user experience of the PRODUCT, any key strengths and weaknesses of employee experience of the COMPANY, and any key strengths and weaknesses of the business of the COMPANY. 
            The summary should be evidence-based, for example referencing any specific usernames or quotes from the Reddit thread as appropriate.
            The length of the summary should be appropriate for the length and complexity of the original text, providing a clear and accurate overview without omitting any important information.
            Format the results as HTML.
            """,
        ),
        (
            "human", 
            """
            COMPANY: {company}
            PRODUCT: {product}
            Reddit thread: 
            {text}
            """
            ),
    ]
)

# Chat model used for the summaries.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
# Chain the prompt into the model; callers pass the prompt variables to `runnable.invoke`.
runnable = prompt | llm

# Google Search on Reddit for sources

In [50]:

from googlesearch import search
from functools import lru_cache
from typing import Iterable
import re

# Only URLs containing "/comments/<something>" are kept, which filters out
# results such as subreddit or user pages.
url_pattern = re.compile(r".*/comments/.+")

@lru_cache(1000)
def reddit_search(target: CompanyProduct, num=10, stop=10, pause=2) -> Iterable[str]:
    """Google-search reddit.com for threads mentioning the target.

    The query quotes the company name and, when it differs, the product name
    as well. Results are memoized per distinct argument combination (up to
    1000 entries), so repeated calls with the same arguments do not search
    again. The returned list is the cached object itself; slice or copy it
    rather than mutating it in place.

    Args:
        target: Company/product pair to search for.
        num, stop, pause: Forwarded unchanged to `googlesearch.search`.

    Returns:
        A list of result URLs that match `url_pattern`.
    """
    # Exact-phrase match on the company name. The previous version emitted a
    # stray trailing double quote here, leaving the query with unbalanced quotes.
    query = f'site:reddit.com "{target.company}"'
    if target.product != target.company:
        query += f' "{target.product}"'

    return [url for url in search(query, num=num, stop=stop, pause=pause) if url_pattern.match(url)]

def test_search():
    """Smoke-test `reddit_search` by printing every thread URL found for Palia."""
    palia = CompanyProduct("Singularity 6", "Palia")
    matching_urls = reddit_search(palia, stop=20, pause=2)
    for matching_url in matching_urls:
        print(matching_url)

test_search()

https://www.reddit.com/r/MMORPG/comments/1bz2e0z/palia_developers_singularity_6_axes_35_of_staff/
https://www.reddit.com/r/MMORPG/comments/1dtp97n/daybreak_acquires_singularity_6_palia_developer/
https://www.reddit.com/r/pcgaming/comments/1bwiuin/cozy_mmo_palia_developer_singularity_6_has/
https://www.reddit.com/r/Palia/comments/15ss9gg/singularity_6_is_not_an_indie_company/
https://www.reddit.com/r/CozyGamers/comments/1dt80wx/daybreak_acquires_singularity_6_developer_of_palia/
https://www.reddit.com/r/Palia/comments/1dt8dbs/palia_studio_singularity_6_acquired_by_daybreak/
https://www.reddit.com/r/Palia/comments/19bqfoy/cant_even_give_singularity_6_my_money/
https://www.reddit.com/r/Games/comments/1cu1f7i/palia_studio_singularity_6_confirms_36_workers/
https://www.reddit.com/r/Palia/comments/1dt7ujt/daybreak_game_company/
https://www.reddit.com/r/MMORPG/comments/obkdoi/singularity_6_raises_30m_to_fund_debut_mmo_game/
https://www.reddit.com/r/Palia/comments/18zqxpx/please_singularity_6/

In [56]:
from typing import NamedTuple
from langchain_core.messages.ai import AIMessage

def wrap_html(content: str):
    """Wrap an HTML fragment in minimal ``<html>``/``<body>`` tags.

    The fragment is inserted verbatim, indented by four spaces on its first
    line; the result begins and ends with a newline.
    """
    document_lines = [
        "",
        "<html>",
        "<body>",
        f"    {content}",
        "</body>",
        "</html>",
        "",
    ]
    return "\n".join(document_lines)

class ThreadResult(NamedTuple):
    """The outcome of summarizing one Reddit thread."""

    # The PRAW submission the thread was loaded from.
    submission: praw.models.Submission
    # Plain-text rendering of the thread that was sent to the model.
    text: str
    # Model response; its `.content` is used as the summary HTML.
    summary_result: AIMessage

    def to_html(self):
        """Render the thread header, link, summary and original text as HTML.

        Title, author, URL and thread text come from Reddit and are
        HTML-escaped so that markup in user posts cannot break (or inject
        into) the generated page. The summary is inserted as-is because the
        prompt asks the model to format it as HTML.

        Returns:
            An HTML fragment (not a full document) as a string.
        """
        # Imported locally so this cell does not depend on imports elsewhere.
        from html import escape

        title = escape(str(self.submission.title))
        author = escape(str(self.submission.author))
        url = escape(str(self.submission.url))
        # Escape first, then turn newlines into <br> so the tags survive.
        text = escape(self.text).replace("\n", "<br>")
        summary_html = self.summary_result.content
        return f"""
        <h1>{title} by {author} on {utc_to_date(self.submission.created_utc)}</h1>
        <a href="{url}">{url}</a>
        <h2>Summary</h2>
        {summary_html}

        <h2>Original Thread</h2>
        <p>{text}</p>
        """


def process_url(target: CompanyProduct, url: str, text_max_chars=30000) -> ThreadResult:
    """Fetch one Reddit thread and summarize it for the given target.

    The thread text is cut to `text_max_chars` characters (with a printed
    notice) before being sent to the summarization chain.

    Args:
        target: Company/product pair the summary should focus on.
        url: URL of the Reddit thread.
        text_max_chars: Maximum number of characters of thread text to send.

    Returns:
        A `ThreadResult` holding the submission, the (possibly truncated)
        text and the model response.
    """
    submission = reddit.submission(url=url)
    thread_text = reddit_thread_to_text(submission)

    original_length = len(thread_text)
    if original_length > text_max_chars:
        print(f"Text too long: {original_length} > {text_max_chars}. Truncating.")
        thread_text = thread_text[:text_max_chars]

    prompt_inputs = {
        "text": thread_text,
        "company": target.company,
        "product": target.product,
    }
    summary = runnable.invoke(prompt_inputs)
    return ThreadResult(submission=submission, text=thread_text, summary_result=summary)


In [57]:
from IPython.display import HTML

# Example target whose company and product names differ.
target = CompanyProduct("Singularity 6", "Palia")

# Search once, then summarize a single thread as a spot check.
thread_urls = reddit_search(target, stop=10, pause=2)
# NOTE: despite the name, this is always the second search result (index 1),
# not a random pick; it raises IndexError if fewer than two URLs came back.
random_url = thread_urls[1]
random_thread_result = process_url(target, random_url)
display(HTML(random_thread_result.to_html()))

In [8]:
def summarize_prompt(prompt):
    """Render the two prompt templates as an HTML fragment for review.

    Reads `.prompt.template` from the first (system) and second (human)
    entries of `prompt.messages` and shows each inside a ``<pre>`` block.
    The templates are inserted verbatim.
    """
    system_message, human_message = prompt.messages[0], prompt.messages[1]
    system_template = system_message.prompt.template
    human_template = human_message.prompt.template
    return (
        "\n<h1>Prompt</h1>\n"
        "\n<h2>System prompt</h2>\n"
        f"<pre>\n{system_template}\n</pre>\n"
        "\n<h2>Prompt</h2>\n"
        f"<pre>\n{human_template}\n</pre>\n"
        "    "
    )

# Render the current prompt templates inline so they can be reviewed next to the results.
display(HTML(summarize_prompt(prompt)))

In [58]:
import hashlib
import os

def short_evaluation(target: CompanyProduct):
    """Summarize the first two threads found for `target` and save an HTML report.

    The report contains the current prompt followed by each thread's
    rendering, wrapped as a full HTML document. It is written to
    ``evaluation/test_<id>/<timestamp>.html``, where ``<id>`` is derived from
    the thread URLs so runs over the same threads share a folder. The path is
    printed once the file is written.

    Args:
        target: Company/product pair to evaluate.
    """
    # This is cached so it should be quick
    thread_urls = reddit_search(target, stop=10, pause=2)[:2]

    # The ID of the test is the last 4 chars of the sha of the url list
    test_id = hashlib.sha256("".join(thread_urls).encode()).hexdigest()[-4:]

    folder = f"evaluation/test_{test_id}"
    os.makedirs(folder, exist_ok=True)

    results = [process_url(target, url) for url in thread_urls]
    html_result = wrap_html(summarize_prompt(prompt) + "\n".join(r.to_html() for r in results))

    # Create the filename using the current timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{folder}/{timestamp}.html"

    # Reddit text routinely contains non-ASCII characters (emoji, curly
    # quotes); an explicit UTF-8 encoding avoids platform-dependent failures.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html_result)

    print(f"Results saved to {filename}")

# Run the short evaluation for a target whose company and product share a name.
short_evaluation(CompanyProduct.same("98point6"))


Results saved to evaluation/test_67bf/20240726_150511.html


In [47]:
from typing import List

def long_evaluation(targets: List[CompanyProduct]):
    """Summarize up to three threads per target and save one combined HTML report.

    The report starts with the current prompt, then has one section per
    target containing each thread's rendering. Like `short_evaluation`, the
    result is wrapped as a full HTML document and written to
    ``evaluation/test_<id>/<timestamp>.html``; ``<id>`` is derived from the
    targets and their thread URLs. The path is printed once the file is
    written.

    Args:
        targets: Company/product pairs to evaluate.
    """
    thread_lists = [reddit_search(target, stop=10, pause=2)[:3] for target in targets]

    # Last 4 hex chars of the sha256 over the targets and their URL lists.
    test_id = hashlib.sha256((str(targets) + str(thread_lists)).encode()).hexdigest()[-4:]

    folder = f"evaluation/test_{test_id}"
    os.makedirs(folder, exist_ok=True)

    html_result = summarize_prompt(prompt)

    # Separate names for the URL list and the processed results; the earlier
    # version rebound a single `results` name for both.
    for target, thread_urls in zip(targets, thread_lists):
        html_result += f"<h1>{target.company}: {target.product}</h1>"
        results = [process_url(target, url) for url in thread_urls]
        html_result += "\n".join(r.to_html() for r in results)

    # Create the filename using the current timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{folder}/{timestamp}.html"

    # Explicit UTF-8: Reddit text routinely contains non-ASCII characters.
    with open(filename, "w", encoding="utf-8") as f:
        # Wrap as a complete document, matching short_evaluation's output.
        f.write(wrap_html(html_result))

    print(f"Results saved to {filename}")

# TODO: Update this once we have company-product pairs instead
# NOTE(review): one explicit pair ("Singularity 6" / "Palia") is already passed
# below, so this TODO may be partly stale — confirm which targets still need pairs.
long_evaluation([CompanyProduct.same("98point6"), CompanyProduct.same("Rad AI"), CompanyProduct("Singularity 6", "Palia"), CompanyProduct.same("Instacart")])

TypeError: process_url() missing 2 required positional arguments: 'product' and 'url'

CompanyProduct(company='98point6', product='98point6')