# load_summarize_chain experiment

If I can get `load_summarize_chain` to work, it may simplify my pipeline considerably. Reddit is the main source for which there is too much content.

In [1]:
# Step 1: Load a bunch of Reddit data
import reddit.summarizer
import reddit.search
import reddit.fetch

from core import CompanyProduct
from search import SearchResult

from dataclasses import dataclass
from typing import List, Mapping, Optional

@dataclass
class RedditSummary:
    """Bundle of everything produced when summarizing Reddit threads for one target."""

    # Search results the threads came from; process_reddit in this notebook passes None here.
    sources: Optional[List[SearchResult]]

    # Aggregate summary built from the per-thread summaries (reddit.summarizer.summarize_summaries).
    overall_summary: reddit.summarizer.AggregatedSummaryResult
    # Per-thread summaries that were kept after filtering.
    summaries: List[reddit.summarizer.ThreadSummaryResult]
    # Merged result of reddit.fetch.index_permalinks over the kept threads.
    # NOTE(review): presumably maps comment/post id -> permalink URL — confirm in reddit.fetch.
    permalinks: Mapping[str, str]

def load_submissions(target: CompanyProduct, num_threads=2, min_comments=2) -> Optional[List["praw.models.Submission"]]:
    """Search for Reddit threads about ``target`` and fetch the ones worth summarizing.

    Args:
        target: The company/product to search Reddit for.
        num_threads: Number of search results requested, and also the cap on
            how many submissions are returned.
        min_comments: Submissions with fewer comments than this are dropped.

    Returns:
        A list of at most ``num_threads`` submissions, in search-result order,
        or ``None`` (after printing a message) when no submission has enough
        comments. Filtering happens after the search, so fewer than
        ``num_threads`` submissions may come back.
    """
    reddit_client = reddit.fetch.init()

    # Search for URLs
    search_results = reddit.search.find_submissions(target, num_results=num_threads)

    # Fetch the Submissions from Reddit
    fetched = [reddit_client.submission(url=result.link) for result in search_results]

    # Keep only Submissions with enough comments
    post_submissions = [submission for submission in fetched if submission.num_comments >= min_comments]

    if not post_submissions:
        print(f"No posts with enough comments found for {target}")
        return None

    # Limit the number of threads
    return post_submissions[:num_threads]

# Build the target; its repr in this notebook's later output shows company and product both set to "98point6".
target = CompanyProduct.same("98point6")
# Request up to 10 threads. Fewer come back when some fail the min_comments filter,
# and load_submissions returns None when none qualify.
threads = load_submissions(target, num_threads=10)
threads

[Submission(id='bg7ip2'),
 Submission(id='nqxfli'),
 Submission(id='rgxxbw'),
 Submission(id='l5bbt9'),
 Submission(id='ipmklh'),
 Submission(id='14n48uy'),
 Submission(id='lx0zjb'),
 Submission(id='tz9sws'),
 Submission(id='u7mkpr')]

In [2]:
# Render every fetched thread as a Markdown string, keeping the order of `threads`
thread_markdowns = list(map(reddit.fetch.submission_to_markdown, threads))
thread_markdowns

['\n# Post ID bg7ip2:  Internet medicine is awesome, 98point6 was so so helpful for me by FrugalChef13 on 2019-04-22 [+54 votes]\n**TL;DR- $20 got me an awesome appointment with a nice doctor and a prescription for a medication I could afford that solved my issue.**\n\n*Disclaimer: This particular thing worked well for me so I\'m going to tell you about it. Everyone is different, so it might not work as well (or at all) for you.  Take what you find useful from this post and ignore the rest.  I\'m not compensated or connected to the website I\'m discussing.*\n\nSo like a lot of people on here I\'m usually either uninsured or underinsured.  Right now it\'s underinsured with a high deductible, so when I messed my back up badly enough that I could barely move I freaked.  I\'ve got scoliosis, a fucked up spine, bad knees, and muscles that love to spasm uncontrollably for days on end.  I\'d run out of my prescription muscle relaxants last fall and hadn\'t been able to afford another appointm

In [3]:
# Join all thread Markdown into a single string, with a blank line between threads.
combined_markdown = "\n\n".join(thread_markdowns)

In [4]:
# Inspect the input: report each thread's size in characters and in tokens
# (token counts come from llm.get_num_tokens).
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

for i, document in enumerate(thread_markdowns):
    print(f"Document {i}: {len(document):,} chars, {llm.get_num_tokens(document):,} tokens")

Document 0: 8,672 chars, 2,071 tokens
Document 1: 1,443 chars, 474 tokens
Document 2: 630 chars, 179 tokens
Document 3: 1,125 chars, 311 tokens
Document 4: 1,439 chars, 371 tokens
Document 5: 744 chars, 231 tokens
Document 6: 3,845 chars, 1,173 tokens
Document 7: 1,249 chars, 353 tokens
Document 8: 72,307 chars, 21,352 tokens


In [5]:
# Directory that this run's output files are written into by the cells below.
# NOTE(review): presumably make_experiment_dir also creates the directory and names it
# <company>_<product>/<timestamp> — confirm in core.
from core import make_experiment_dir

experiment_dir = make_experiment_dir(target)
experiment_dir

'experiments/98point6_98point6/20240809_171540'

In [6]:
# Baseline: map_reduce summarization with load_summarize_chain's default prompts
from langchain_openai import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain_core.documents import Document

from dotenv import load_dotenv
load_dotenv()

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# llm.get_num_tokens(combined_markdown)

summary_chain = load_summarize_chain(
    llm=llm,
    chain_type='map_reduce',
    verbose=True
)

# One Document per thread so the map step processes each thread separately
documents = [Document(page_content=thread_markdown) for thread_markdown in thread_markdowns]

# Chain.run is deprecated; invoke returns a dict with the final summary under "output_text"
output = summary_chain.invoke({"input_documents": documents})["output_text"]

# Explicit UTF-8: Reddit text and model output may contain characters that the
# platform-default encoding cannot represent
with open(f"{experiment_dir}/map_reduce_baseline.md", "w", encoding="utf-8") as f:
    f.write(output)

output

  warn_deprecated(




[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a concise summary of the following:


"
# Post ID bg7ip2:  Internet medicine is awesome, 98point6 was so so helpful for me by FrugalChef13 on 2019-04-22 [+54 votes]
**TL;DR- $20 got me an awesome appointment with a nice doctor and a prescription for a medication I could afford that solved my issue.**

*Disclaimer: This particular thing worked well for me so I'm going to tell you about it. Everyone is different, so it might not work as well (or at all) for you.  Take what you find useful from this post and ignore the rest.  I'm not compensated or connected to the website I'm discussing.*

So like a lot of people on here I'm usually either uninsured or underinsured.  Right now it's underinsured with a high deductible, so when I messed my back up badly enough that I could barely move I freaked.  I've got scoliosis, a fucked up spine, bad knees, a

"FrugalChef13 shares a positive experience with the online medical service 98point6, highlighting its $20 membership for unlimited consultations, which helped them manage back pain affordably. They received a prescription for anti-inflammatory medication and cost assistance through GoodRx. The author encourages others to consider online services for non-emergency health issues, despite their limitations for serious conditions. \n\nIn a Reddit thread, users discuss various virtual doctor apps, including Doctor on Demand and CVS Minute Clinic, and share experiences with 98point6 for conditions like PCOS and mental health. Some users express concerns about the service's mental health capabilities and prescription practices. Others share positive experiences with telehealth services for skincare and mental health resources. \n\nAdditionally, a separate discussion highlights the benefits of Costco membership, including various services and savings, with users sharing personal anecdotes abou

In [9]:
# A map_reduce version with directed prompts: extract quotes per thread, then summarize them
# (PromptTemplate is imported from langchain_core; the top-level `langchain` path is deprecated)
from langchain_core.prompts import PromptTemplate

map_prompt = """
Review the following Reddit thread and extract quotes that express positive or negative experiences with 98point6.

REDDIT THREAD:
{text}

KEY QUOTES:
"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

# Product name spelled correctly here (was "98piont6") so the model is asked about the right product
combine_prompt = """
Write a summary of the user experience of 98point6 based on the following extracts from Reddit threads.
Separately list out the strengths and weaknesses of 98point6.

SUMMARISED THREADS:
{text}

SUMMARY IN MARKDOWN FORMAT:
"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])


summary_chain = load_summarize_chain(
    llm=llm,
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
    token_max=30000,
    verbose=True
)

# One Document per thread so the map step processes each thread separately
documents = [Document(page_content=thread_markdown) for thread_markdown in thread_markdowns]

# Chain.run is deprecated; invoke returns a dict with the final summary under "output_text"
output = summary_chain.invoke({"input_documents": documents})["output_text"]

# Explicit UTF-8: Reddit text and model output may contain characters that the
# platform-default encoding cannot represent
with open(f"{experiment_dir}/map_reduce_directed_30k.md", "w", encoding="utf-8") as f:
    f.write(output)


output



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Review the following Reddit thread and extract quotes that express positive or negative experiences with 98point6.

REDDIT THREAD:

# Post ID bg7ip2:  Internet medicine is awesome, 98point6 was so so helpful for me by FrugalChef13 on 2019-04-22 [+54 votes]
**TL;DR- $20 got me an awesome appointment with a nice doctor and a prescription for a medication I could afford that solved my issue.**

*Disclaimer: This particular thing worked well for me so I'm going to tell you about it. Everyone is different, so it might not work as well (or at all) for you.  Take what you find useful from this post and ignore the rest.  I'm not compensated or connected to the website I'm discussing.*

So like a lot of people on here I'm usually either uninsured or underinsured.  Right now it's underinsured with a high deductible, so when I messed my back up badly enough t

'# Summary of User Experience with 98point6\n\nThe user experience with 98point6, as reflected in various Reddit threads, showcases a blend of positive and negative feedback. Many users appreciate the accessibility, affordability, and efficiency of the service, particularly in obtaining prescriptions and avoiding long waits typically associated with traditional healthcare visits. Users have reported quick responses from doctors, often receiving prescriptions within a short time frame, which has been especially beneficial during urgent health situations like COVID-19.\n\nHowever, there are notable criticisms regarding the limitations of the service. Some users expressed dissatisfaction with the inability to receive necessary prescriptions, leading them to seek care from their family doctors instead. Additionally, there are concerns about the specificity required for medical notes and the overall effectiveness of the service in addressing certain health issues.\n\n## Strengths of 98point

In [14]:
# A map_reduce version that adapts our custom prompts
# (PromptTemplate is imported from langchain_core; the top-level `langchain` path is deprecated)
from langchain_core.prompts import PromptTemplate

map_prompt = """
Please read the following Reddit thread and write an evidence-based summary of the key points relating to the COMPANY {company} and PRODUCT {product}.

The summary should begin with a brief 1-2 sentence summary of the thread.
Then write three sections summarizing key facts and opinions about the COMPANY or PRODUCT from different perspectives:
1. User experience perspective: The key strengths and weaknesses of the PRODUCT from the perspective of current users.
2. Prospective employee perspective: The key strengths and weaknesses of the COMPANY from the perspective of employees. For example this could include information about the benefits, company culture, work-life balance, or other relevant information.
3. Prospective investor perspective: Any key information about the COMPANY from the perspective of a prospective investor, such as fundraising, valuation, layoffs, partnerships, or other information indicating that the company is improving or worsening. 

Provide a clear and concise summary of the key points, avoiding unnecessary details.
Do not make speculations, simply summarize the key facts and opinions stated in the thread.

Only include information about the COMPANY {company} and PRODUCT {product}. Do not extract information about other companies or products.

Reddit thread: 
{text}

SUMMARY:
"""
# Declare every placeholder the template uses, not just {text}
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["company", "product", "text"])

combine_prompt = """
Please read the following summaries of Reddit threads and write a comprehensive summary of the key points relating to the COMPANY {company} and PRODUCT {product}.

The summary should begin with an overview paragraph.

Then it should have three sections summarizing key facts and opinions from different perspectives:
1. User experience perspective: The key strengths and weaknesses of the PRODUCT from the perspective of current users.
2. Prospective employee perspective: The key strengths and weaknesses of the COMPANY from the perspective of employees. For example this could include information about the benefits, company culture, work-life balance, or other relevant information. 
3. Prospective investor perspective: Any key information about the COMPANY from the perspective of a prospective investor, such as fundraising, valuation, layoffs, partnerships, or other information indicating that the company is improving or worsening. 

Do not make speculations, simply summarize the key facts and opinions stated in the thread.

Summaries: 
{text}


OVERALL SUMMARY IN MARKDOWN FORMAT:
"""
# Declare every placeholder the template uses, not just {text}
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["company", "product", "text"])


summary_chain = load_summarize_chain(
    llm=llm,
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
    token_max=30000,
    verbose=True
)

# One Document per thread so the map step processes each thread separately
documents = [Document(page_content=thread_markdown) for thread_markdown in thread_markdowns]

# Chain.run is deprecated; invoke returns a dict with the final summary under "output_text".
# company/product fill the extra placeholders in both prompts.
output = summary_chain.invoke({
    "company": target.company,
    "product": target.product,
    "input_documents": documents
})["output_text"]

# Explicit UTF-8: Reddit text and model output may contain characters that the
# platform-default encoding cannot represent
with open(f"{experiment_dir}/map_reduce_specialized_prompts.md", "w", encoding="utf-8") as f:
    f.write(output)


output



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Please read the following Reddit thread and write an evidence-based summary of the key points relating to the COMPANY 98point6 and PRODUCT 98point6.

The summary should begin with a brief 1-2 sentence summary of the thread.
Then write three sections summarizing key facts and opinions about the COMPANY or PRODUCT from different perspectives:
1. User experience perspective: The key strengths and weaknesses of the PRODUCT from the perspective of current users.
2. Prospective employee perspective: The key strengths and weaknesses of the COMPANY from the perspective of employees. For example this could include information about the benefits, company culture, work-life balance, or other relevant information.
3. Prospective investor perspective: Any key information about the COMPANY from the perspective of a prospective investor, such as fundraising, valu

"# Summary of 98point6 and Its Product\n\n98point6 is an online healthcare service that provides virtual doctor consultations for a low annual fee, making healthcare more accessible and convenient for users. The service has garnered a mix of positive and critical feedback from users, employees, and potential investors, highlighting its strengths and weaknesses across various perspectives.\n\n## User Experience Perspective\n**Strengths:**\n- **Affordability:** Users appreciate the low cost of $20 for a year of unlimited consultations, making healthcare accessible for uninsured or underinsured individuals.\n- **Convenience:** The ability to consult with a doctor from home is a significant advantage, especially for those with mobility issues or time constraints.\n- **Empathetic Care:** Positive interactions with doctors who are understanding and willing to help find affordable medication options enhance the overall experience.\n- **Accessibility:** Users value the 24/7 access to healthcar

In [18]:
# A map_reduce version that extracts key quotes from each thread, then clusters them by sentiment
# (PromptTemplate is imported from langchain_core; the top-level `langchain` path is deprecated)
from langchain_core.prompts import PromptTemplate

map_prompt = """
Please read the following Reddit thread and extract key opinions and facts relating to the user experience of the PRODUCT {product} by the COMPANY {company} from the perspective of current users.
Only include information about the COMPANY {company} and PRODUCT {product}. Do not extract information about other companies or products.

Format the results as a Markdown list of quotes like this:
- "quote" [comment_id](comment_id)

Each quote should be a short, concise statement that captures the essence of the sentiment or information.
Be sure to extract a representative sample of both positive and negative opinions, as well as any factual statements about the product.

Reddit thread: 
{text}

MARKDOWN LIST OF KEY QUOTES ABOUT THE COMPANY AND PRODUCT:
"""
# Declare every placeholder the template uses, not just {text}
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["company", "product", "text"])

combine_prompt = """
Please cluster all of the quotes below, organizing them into thematic topics of feedback about the COMPANY {company} and PRODUCT {product}.
Use the following top-level headings:
# Positive Sentiments
# Negative Sentiments
# Statements of Fact

If there are many quotes under a heading, please subdivide into headings to group similar quotes together.

Summaries: 
{text}


GROUPED KEY QUOTES IN MARKDOWN FORMAT:
"""
# Declare every placeholder the template uses, not just {text}
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["company", "product", "text"])


summary_chain = load_summarize_chain(
    llm=llm,
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
    token_max=30000,
    verbose=True
)

# One Document per thread so the map step processes each thread separately
documents = [Document(page_content=thread_markdown) for thread_markdown in thread_markdowns]

# Chain.run is deprecated; invoke returns a dict with the final text under "output_text".
# company/product fill the extra placeholders in both prompts.
output = summary_chain.invoke({
    "company": target.company,
    "product": target.product,
    "input_documents": documents
})["output_text"]

# Explicit UTF-8: Reddit text and model output may contain characters that the
# platform-default encoding cannot represent
with open(f"{experiment_dir}/map_reduce_specialized_prompts_v5.md", "w", encoding="utf-8") as f:
    f.write(output)


output



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Please read the following Reddit thread and extract key opinions and facts relating to the user experience of the PRODUCT 98point6 by the COMPANY 98point6 from the perspective of current users.
Only include information about the COMPANY 98point6 and PRODUCT 98point6. Do not extract information about other companies or products.

Format the results as a Markdown list of quotes like this:
- "quote" [comment_id](comment_id)

Each quote should be a short, concise statement that captures the essence of the sentiment or information.
Be sure to extract a representative sample of both positive and negative opinions, as well as any factual statements about the product.

Reddit thread: 

# Post ID bg7ip2:  Internet medicine is awesome, 98point6 was so so helpful for me by FrugalChef13 on 2019-04-22 [+54 votes]
**TL;DR- $20 got me an awesome appointment with 

'# Positive Sentiments\n\n## General Positive Experiences\n- "It was amazing. I chatted with some weird robot for 10 minutes or so and then I got connected to this nice doctor lady." [elj1h0o](elj1h0o)\n- "I would 100% use 98point6 again, especially since my $20 gets me unlimited visits for a year." [bg7ip2](bg7ip2)\n- "Helped me get paxlovid for covid more quickly than I otherwise would have been able to." [idl52u3](idl52u3)\n- "Overall it was an awesome experience and I got to avoid long waits and my copay." [ipmklh](ipmklh)\n- "I had a great experience with 98point6; they made it easy to get care when I needed it." [i5i3faa](i5i3faa)\n\n## Cost and Accessibility\n- "It\'s $20 for a one year membership with unlimited online appointments." [bg7ip2](bg7ip2)\n- "Even people who do have a local PCP use their service because it\'s faster and cheaper and easier than an office visit." [bg7ip2](bg7ip2)\n- "I really appreciate the accessibility of it." [gkwtoqb](gkwtoqb)\n- "98point6 offers a

In [11]:
# Baseline: The existing summary
from collections import ChainMap
from praw.models import Submission
from typing import List

def process_reddit(target: CompanyProduct, post_submissions: List[Submission]) -> Optional[RedditSummary]:
    """Run the hand-written summarization pipeline over already-fetched submissions.

    Every submission is summarized first; summaries are then filtered on their
    hallucinated comment ids, and the survivors are aggregated into one
    overall summary.

    Returns:
        A RedditSummary (with ``sources`` left as None), or None, after
        printing a message, when no summary survives the filter.
    """
    # Summarize every thread before any filtering takes place
    thread_summaries = []
    for submission in post_submissions:
        thread_summaries.append(reddit.summarizer.summarize_submission(target, submission))

    # Drop any summary with over 1 hallucinated comment id
    valid_summaries = list(
        filter(lambda summary: summary.is_under_max_hallucinations(1, debug=True), thread_summaries)
    )

    if not valid_summaries:
        print(f"No valid summaries found for {target}")
        return None

    # One permalink index per surviving thread, exposed as a single mapping
    permalink_indexes = (reddit.fetch.index_permalinks(summary.submission) for summary in valid_summaries)
    merged_permalinks = ChainMap(*permalink_indexes)

    # Combine the per-thread summaries into the overall summary
    overall = reddit.summarizer.summarize_summaries(target, valid_summaries)

    return RedditSummary(
        sources=None,
        overall_summary=overall,
        summaries=valid_summaries,
        permalinks=merged_permalinks,
    )

reddit_result = process_reddit(target, threads)

# process_reddit returns None (after printing why) when no summary survives filtering;
# in that case there is nothing to write
if reddit_result is not None:
    # Explicit UTF-8: Reddit text and model output may contain characters that the
    # platform-default encoding cannot represent
    with open(f"{experiment_dir}/handcrafted_baseline.md", "w", encoding="utf-8") as f:
        f.write(reddit_result.overall_summary.to_markdown())

reddit_result


Text too long: 72307 > 40000. Truncating.


RedditSummary(sources=None, overall_summary=AggregatedSummaryResult(target=CompanyProduct(company='98point6', product='98point6'), summaries=[ThreadSummaryResult(submission=Submission(id='bg7ip2'), text='\n# Post ID bg7ip2:  Internet medicine is awesome, 98point6 was so so helpful for me by FrugalChef13 on 2019-04-22 [+54 votes]\n**TL;DR- $20 got me an awesome appointment with a nice doctor and a prescription for a medication I could afford that solved my issue.**\n\n*Disclaimer: This particular thing worked well for me so I\'m going to tell you about it. Everyone is different, so it might not work as well (or at all) for you.  Take what you find useful from this post and ignore the rest.  I\'m not compensated or connected to the website I\'m discussing.*\n\nSo like a lot of people on here I\'m usually either uninsured or underinsured.  Right now it\'s underinsured with a high deductible, so when I messed my back up badly enough that I could barely move I freaked.  I\'ve got scoliosis,

In [8]:
# TODO: Implement a basic version of OpinionDigest