# load_summarize_chain experiment

If I can get load_summarize_chain to work it may simplify my pipeline considerably. Reddit is the main source in which there's too much content.

In [34]:
from core import CompanyProduct, make_experiment_dir

target = CompanyProduct.same("98point6")

experiment_dir = make_experiment_dir(target)
experiment_dir

'experiments/98point6_98point6/20240810_132055'

In [35]:
# Step 1: Load a bunch of Reddit data
import reddit.summarizer
import reddit.search
import reddit.fetch

from search import SearchResult

from dataclasses import dataclass
from typing import List, Mapping, Optional

@dataclass
class RedditSummary:
    sources: List[SearchResult]

    overall_summary: reddit.summarizer.AggregatedSummaryResult
    summaries: List[reddit.summarizer.ThreadSummaryResult]
    permalinks: Mapping[str, str]

def load_submissions(target: CompanyProduct, num_threads=2, min_comments=2) -> Optional[RedditSummary]:
    reddit_client = reddit.fetch.init()

    # Search for URLs
    search_results = reddit.search.find_submissions(target, num_results=num_threads)

    # Fetch the Submissions from Reddit
    post_submissions = [reddit_client.submission(url=result.link) for result in search_results]

    # Filter Submissions to only those with enough comments
    post_submissions = [submission for submission in post_submissions if submission.num_comments >= min_comments]

    if len(post_submissions) == 0:
        print(f"No posts with enough comments found for {target}")
        return None

    # Limit the number of threads
    return post_submissions[:num_threads]

threads = load_submissions(target, num_threads=10)
threads

[Submission(id='bg7ip2'),
 Submission(id='nqxfli'),
 Submission(id='rgxxbw'),
 Submission(id='l5bbt9'),
 Submission(id='ipmklh'),
 Submission(id='14n48uy'),
 Submission(id='lx0zjb'),
 Submission(id='tz9sws'),
 Submission(id='u7mkpr')]

In [36]:
# Copy of the markdown formatter so I cacn iterate on it
from reddit.fetch import Comment, Submission, utc_to_date, include_comment

def comment_forest_to_markdown_v2(comment: Comment, level=1, parent_id=None, max_depth=4):
    """
    Recursively format a Reddit comment tree into a markdown-like text with basic filtering and depth control.
    """
    if not include_comment(comment) or level > max_depth:
        return ""

    optional_parent_header = f" (in reply to ID {parent_id})" if parent_id else ""
    text = f"{'#' * level} Comment ID {comment.id} with {comment.score:+d} score by [{comment.author} on {utc_to_date(comment.created_utc)}](https://www.reddit.com{comment.permalink}){optional_parent_header}:\n"
    text += f"{comment.body.strip()}\n\n"

    text += "\n\n".join(
        comment_forest_to_markdown_v2(reply, level + 1, parent_id=comment.id)
        for reply in comment.replies
    )

    return text.strip()


def submission_to_markdown_v2(submission: Submission, pagination_limit=10) -> str:
    """
    Format a Reddit thread into a markdown-like text with basic filtering and depth control.
    """
    submission.comments.replace_more(limit=pagination_limit)

    text = f"""
# Post ID {submission.id}: {submission.title} with {submission.score:+d} score by [{submission.author} on {utc_to_date(submission.created_utc)}](https://www.reddit.com{submission.permalink})
{submission.selftext}

"""

    text += "\n\n".join(
        comment_forest_to_markdown_v2(reply, 2, parent_id=submission.id)
        for reply in submission.comments
    )
    return text.strip()


thread_markdowns = [submission_to_markdown_v2(thread) for thread in threads]

markdowns = '\n\n'.join(thread_markdowns)

with open(f"{experiment_dir}/reddit_sources.md", "w") as f:
    f.write(markdowns)

print(markdowns)

# Post ID bg7ip2: Internet medicine is awesome, 98point6 was so so helpful for me with +54 score by [FrugalChef13 on 2019-04-22](https://www.reddit.com/r/povertyfinance/comments/bg7ip2/internet_medicine_is_awesome_98point6_was_so_so/)
**TL;DR- $20 got me an awesome appointment with a nice doctor and a prescription for a medication I could afford that solved my issue.**

*Disclaimer: This particular thing worked well for me so I'm going to tell you about it. Everyone is different, so it might not work as well (or at all) for you.  Take what you find useful from this post and ignore the rest.  I'm not compensated or connected to the website I'm discussing.*

So like a lot of people on here I'm usually either uninsured or underinsured.  Right now it's underinsured with a high deductible, so when I messed my back up badly enough that I could barely move I freaked.  I've got scoliosis, a fucked up spine, bad knees, and muscles that love to spasm uncontrollably for days on end.  I'd run out 

In [37]:
combined_markdown = "\n\n".join(thread_markdowns)

In [38]:
# about the input
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

for i, document in enumerate(thread_markdowns):
    print(f"Document {i}: {len(document):,} chars, {llm.get_num_tokens(document):,} tokens")

Document 0: 10,107 chars, 2,540 tokens
Document 1: 2,455 chars, 799 tokens
Document 2: 963 chars, 290 tokens
Document 3: 1,546 chars, 454 tokens
Document 4: 1,739 chars, 467 tokens
Document 5: 1,091 chars, 343 tokens
Document 6: 6,429 chars, 2,035 tokens
Document 7: 1,678 chars, 496 tokens
Document 8: 104,679 chars, 33,319 tokens


In [39]:
from langchain_openai import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain_core.documents import Document

from dotenv import load_dotenv
load_dotenv()

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# llm.get_num_tokens(combined_markdown)

summary_chain = load_summarize_chain(
    llm=llm, 
    chain_type='map_reduce',
    verbose=True
)

documents = [Document(page_content=thread_markdown) for thread_markdown in thread_markdowns]
output = summary_chain.run(documents)

with open(f"{experiment_dir}/map_reduce_baseline.md", "w") as f:
    f.write(output)

output



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a concise summary of the following:


"# Post ID bg7ip2: Internet medicine is awesome, 98point6 was so so helpful for me with +54 score by [FrugalChef13 on 2019-04-22](https://www.reddit.com/r/povertyfinance/comments/bg7ip2/internet_medicine_is_awesome_98point6_was_so_so/)
**TL;DR- $20 got me an awesome appointment with a nice doctor and a prescription for a medication I could afford that solved my issue.**

*Disclaimer: This particular thing worked well for me so I'm going to tell you about it. Everyone is different, so it might not work as well (or at all) for you.  Take what you find useful from this post and ignore the rest.  I'm not compensated or connected to the website I'm discussing.*

So like a lot of people on here I'm usually either uninsured or underinsured.  Right now it's underinsured with a high deductible, so when I messed my 

"In a Reddit discussion, user FrugalChef13 shared a positive experience with the online medical service 98point6, highlighting its affordability and convenience for underinsured individuals. They paid $20 for a one-year membership, which allowed unlimited online consultations, and received helpful medical advice for severe back pain. Other users discussed alternatives to 98point6, shared experiences with various telehealth services, and inquired about obtaining doctor's notes and mental health support. The conversation also touched on Costco membership benefits, including free tire checks and discounts on various services, emphasizing the value and convenience of being a member."

In [40]:
from langchain import PromptTemplate

map_prompt = """
Review the following Reddit thread and extract quotes that express positive or negative experiences with 98point6.

REDDIT THREAD:
{text}

KEY QUOTES:
"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

combine_prompt = """
Write a summary of the user experience of 98point6 based on the following extracts from Reddit threads.
Separately list out the strengths and weaknesses of 98piont6.

SUMMARISED THREADS:
{text}

SUMMARY IN MARKDOWN FORMAT:
"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])


summary_chain = load_summarize_chain(
    llm=llm, 
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
    token_max=30000,
    verbose=True
)

documents = [Document(page_content=thread_markdown) for thread_markdown in thread_markdowns]
output = summary_chain.run(documents)

with open(f"{experiment_dir}/map_reduce_directed_30k.md", "w") as f:
    f.write(output)


output



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Review the following Reddit thread and extract quotes that express positive or negative experiences with 98point6.

REDDIT THREAD:
# Post ID bg7ip2: Internet medicine is awesome, 98point6 was so so helpful for me with +54 score by [FrugalChef13 on 2019-04-22](https://www.reddit.com/r/povertyfinance/comments/bg7ip2/internet_medicine_is_awesome_98point6_was_so_so/)
**TL;DR- $20 got me an awesome appointment with a nice doctor and a prescription for a medication I could afford that solved my issue.**

*Disclaimer: This particular thing worked well for me so I'm going to tell you about it. Everyone is different, so it might not work as well (or at all) for you.  Take what you find useful from this post and ignore the rest.  I'm not compensated or connected to the website I'm discussing.*

So like a lot of people on here I'm usually either uninsured or 

'# Summary of User Experience with 98point6\n\nThe user experience with 98point6, as reflected in various Reddit threads, showcases a blend of positive and negative feedback. Many users appreciate the accessibility, affordability, and efficiency of the service, particularly in urgent situations. The ability to consult with doctors quickly and receive prescriptions without the hassle of traditional appointments is frequently highlighted. However, some users express dissatisfaction with the service, particularly regarding the prescription process and the necessity of seeing a family doctor for certain medications.\n\n## Strengths of 98point6\n- **Accessibility**: Users find the service easy to use, allowing for quick consultations without long waits.\n- **Affordability**: The subscription model offers unlimited visits for a low annual fee, making it a cost-effective option for many.\n- **Efficiency**: Many users report receiving prescriptions quickly, even for urgent needs like COVID-19 

In [41]:
# A map_reduce version that adapts our custom prompts
from langchain import PromptTemplate

map_prompt = """
Please read the following Reddit thread and write an evidence-based summary of the key points relating to the COMPANY {company} and PRODUCT {product}.

The summary should begin with a brief 1-2 sentence summary of the thread.
Then write three sections summarizing key facts and opinions about the COMPANY or PRODUCT from different perspectives:
1. User experience perspective: The key strengths and weaknesses of the PRODUCT from the perspective of current users.
2. Prospective employee perspective: The key strengths and weaknesses of the COMPANY from the perspective of employees. For example this could include information about the benefits, company culture, work-life balance, or other relevant information.
3. Prospective investor perspective: Any key information about the COMPANY from the perspective of a prospective investor, such as fundraising, valuation, layoffs, partnerships, or other information indicating that the company is improving or worsening. 

Provide a clear and concise summary of the key points, avoiding unnecessary details.
Do not make speculations, simply summarize the key facts and opinions stated in the thread.

Only include information about the COMPANY {company} and PRODUCT {product}. Do not extract information about other companies or products.

Reddit thread: 
{text}

SUMMARY:
"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

combine_prompt = """
Please read the following summaries of Reddit threads and write a comprehensive summary of the key points relating to the COMPANY {company} and PRODUCT {product}.

The summary should begin with an overview paragraph.

Then it should three sections summarizing key facts and opinions from different perspectives:
1. User experience perspective: The key strengths and weaknesses of the PRODUCT from the perspective of current users.
2. Prospective employee perspective: The key strengths and weaknesses of the COMPANY from the perspective of employees. For example this could include information about the benefits, company culture, work-life balance, or other relevant information. 
3. Prospective investor perspective: Any key information about the COMPANY from the perspective of a prospective investor, such as fundraising, valuation, layoffs, partnerships, or other information indicating that the company is improving or worsening. 

Do not make speculations, simply summarize the key facts and opinions stated in the thread.

Summaries: 
{text}


OVERALL SUMMARY IN MARKDOWN FORMAT:
"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])


summary_chain = load_summarize_chain(
    llm=llm, 
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
    token_max=30000,
    verbose=True
)

documents = [Document(page_content=thread_markdown) for thread_markdown in thread_markdowns]
output = summary_chain.run({
    "company": target.company,
    "product": target.product,
    "input_documents": documents
})

with open(f"{experiment_dir}/map_reduce_specialized_prompts.md", "w") as f:
    f.write(output)


output



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Please read the following Reddit thread and write an evidence-based summary of the key points relating to the COMPANY 98point6 and PRODUCT 98point6.

The summary should begin with a brief 1-2 sentence summary of the thread.
Then write three sections summarizing key facts and opinions about the COMPANY or PRODUCT from different perspectives:
1. User experience perspective: The key strengths and weaknesses of the PRODUCT from the perspective of current users.
2. Prospective employee perspective: The key strengths and weaknesses of the COMPANY from the perspective of employees. For example this could include information about the benefits, company culture, work-life balance, or other relevant information.
3. Prospective investor perspective: Any key information about the COMPANY from the perspective of a prospective investor, such as fundraising, valu

"# Summary of 98point6 and Its Product\n\n98point6 is a telehealth service that provides affordable online medical consultations and prescriptions, primarily through a mobile app. Users have shared their experiences with the service, highlighting its accessibility and convenience, while also noting some limitations. The company operates in a growing market, and perspectives from users, employees, and potential investors reveal both strengths and weaknesses.\n\n## User Experience Perspective\n**Strengths:**\n- **Affordability:** Users appreciate the low membership cost of $20 for unlimited consultations, making it accessible for uninsured or underinsured individuals.\n- **Convenience:** The ability to consult with doctors from home is a significant advantage, especially for those with mobility issues or time constraints.\n- **Empathetic Care:** Positive interactions with doctors who are understanding and willing to help users find affordable medication options enhance the overall experi

In [47]:
# A map_reduce version that adapts our custom prompts
from langchain import PromptTemplate

map_prompt = """
Please read the following Reddit thread and extract key opinions and facts relating to the user experience of the PRODUCT {product} by the COMPANY {company} from the perspective of current users.
Only include information about the COMPANY {company} and PRODUCT {product}. 
Do not extract information about other companies or products.
If the text does not contain any relevant information about the COMPANY or PRODUCT, please write "No relevant information found."

Format the results as a Markdown list of quotes, each with a permalink to the source of the quote like so:
- "quote" [Author on Date](permalink)

For example:

Input comment:
## Comment ID hrmpl3t with +3 score by [MarketWorldly9908 on 2022-01-07](https://www.reddit.com/r/povertyfinance/comments/bg7ip2/internet_medicine_is_awesome_98point6_was_so_so/hrmpl3t/) (in reply to ID bg7ip2):
My husband and I have used 98.6 three times. All three times they did not prescribe the needed antibiotic to get better. I had an ear infection, my husband had an ear infection, then I had a sinus infection. We had to wait and get into our family doctor, so we paid 98.6 and our family doctor. I would not recommend them!

Example output:
- "All three times they did not prescribe the needed antibiotic to get better." [MarketWorldly9908 on 2022-01-07](https://www.reddit.com/r/povertyfinance/comments/bg7ip2/internet_medicine_is_awesome_98point6_was_so_so/hrmpl3t/)

Each quote should be a short, concise statement that captures the essence of the sentiment or information.
Be sure to extract a comprehensive sample of both positive and negative opinions, as well as any factual statements about the product.

Reddit thread: 
{text}

MARKDOWN LIST OF QUOTES ABOUT THE COMPANY AND PRODUCT:
"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

combine_prompt = """
Please cluster all of the quotes below, organizing them into thematic topics of feedback about the COMPANY {company} and PRODUCT {product}.
Use the following top-level headings:
# Positive Sentiments
# Negative Sentiments
# Statements of Fact

If there are many quotes under a heading, please subdivide into headings to group similar quotes together.

Summaries: 
{text}


GROUPED QUOTES IN MARKDOWN FORMAT:
"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])


summary_chain = load_summarize_chain(
    llm=llm, 
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
    token_max=30000,
    verbose=True
)

documents = [Document(page_content=thread_markdown) for thread_markdown in thread_markdowns]
output = summary_chain.run({
    "company": target.company,
    "product": target.product,
    "input_documents": documents
})

with open(f"{experiment_dir}/map_reduce_specialized_prompts_v8.md", "w") as f:
    f.write(output)


output



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Please read the following Reddit thread and extract key opinions and facts relating to the user experience of the PRODUCT 98point6 by the COMPANY 98point6 from the perspective of current users.
Only include information about the COMPANY 98point6 and PRODUCT 98point6. 
Do not extract information about other companies or products.
If the text does not contain any relevant information about the COMPANY or PRODUCT, please write "No relevant information found."

Format the results as a Markdown list of quotes, each with a permalink to the source of the quote like so:
- "quote" [Author on Date](permalink)

For example:

Input comment:
## Comment ID hrmpl3t with +3 score by [MarketWorldly9908 on 2022-01-07](https://www.reddit.com/r/povertyfinance/comments/bg7ip2/internet_medicine_is_awesome_98point6_was_so_so/hrmpl3t/) (in reply to ID bg7ip2):
My husband 

'# Positive Sentiments\n\n## Accessibility and Convenience\n- "It was amazing... I chatted with some weird robot for 10 minutes or so and answered questions about what my problem was, then I got connected to this nice doctor lady." [FrugalChef13 on 2019-04-22](https://www.reddit.com/r/povertyfinance/comments/bg7ip2/internet_medicine_is_awesome_98point6_was_so_so/)\n- "I really appreciate the accessibility of it." [Somethingtacos on 2021-01-27](https://www.reddit.com/r/Chipotle/comments/l5bbt9/has_anyone_used_the_98point6_primary_care/gkwtoqb/)\n- "You can talk to a Dr for almost any condition 24/7!" [shortninja29 on 2020-09-09](https://www.reddit.com/r/TTC_PCOS/comments/ipmklh/98point6_pcos_appointment_experience/)\n- "Overall it was an awesome experience and I got to avoid long waits and my copay." [shortninja29 on 2020-09-09](https://www.reddit.com/r/TTC_PCOS/comments/ipmklh/98point6_pcos_appointment_experience/)\n\n## Positive Experiences with Doctors\n- "She was so kind, and didn\'

In [43]:
# Baseline: The existing summary
from collections import ChainMap
from praw.models import Submission
from typing import List

def process_reddit(target: CompanyProduct, post_submissions: List[Submission]) -> Optional[RedditSummary]:
    # Summarize each
    post_summaries = [reddit.summarizer.summarize_submission(target, submission) for submission in post_submissions]

    # Filter out any with over 1 hallucinated comment id
    post_summaries = [summary for summary in post_summaries if summary.is_under_max_hallucinations(1, debug=True)]

    if len(post_summaries) == 0:
        print(f"No valid summaries found for {target}")
        return None
    
    # Index permalinks
    permalinks = ChainMap(*[reddit.fetch.index_permalinks(summary.submission) for summary in post_summaries])

    # Aggregate the summaries
    aggregate_summary = reddit.summarizer.summarize_summaries(target, post_summaries)

    return RedditSummary(
        sources=None,
        overall_summary=aggregate_summary,
        summaries=post_summaries,
        permalinks=permalinks
    )

reddit_result = process_reddit(target, threads)

with open(f"{experiment_dir}/handcrafted_baseline.md", "w") as f:
    f.write(reddit_result.overall_summary.to_markdown())

reddit_result


Text too long: 72307 > 40000. Truncating.


RedditSummary(sources=None, overall_summary=AggregatedSummaryResult(target=CompanyProduct(company='98point6', product='98point6'), summaries=[ThreadSummaryResult(submission=Submission(id='bg7ip2'), text='\n# Post ID bg7ip2:  Internet medicine is awesome, 98point6 was so so helpful for me by FrugalChef13 on 2019-04-22 [+54 votes]\n**TL;DR- $20 got me an awesome appointment with a nice doctor and a prescription for a medication I could afford that solved my issue.**\n\n*Disclaimer: This particular thing worked well for me so I\'m going to tell you about it. Everyone is different, so it might not work as well (or at all) for you.  Take what you find useful from this post and ignore the rest.  I\'m not compensated or connected to the website I\'m discussing.*\n\nSo like a lot of people on here I\'m usually either uninsured or underinsured.  Right now it\'s underinsured with a high deductible, so when I messed my back up badly enough that I could barely move I freaked.  I\'ve got scoliosis,

In [44]:
# TODO: Implement a basic version of OpinionDigest