In [5]:
import os
import csv
import json
import time
import openai
from dotenv import load_dotenv
from langchain_community.tools.reddit_search.tool import RedditSearchRun
from langchain_community.utilities.reddit_search import RedditSearchAPIWrapper
from langchain_community.tools.reddit_search.tool import RedditSearchSchema
from langchain_openai.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

In [6]:
# Load environment variables
# The .env location can be overridden with the EDS_DOTENV_PATH environment
# variable so the notebook is not tied to a single machine's home directory;
# when it is unset, the original path is used unchanged.
dotenv_path = os.environ.get(
    'EDS_DOTENV_PATH',
    os.path.join('/Users/mariacatalinavilloutareyes/dev', '.eds.env'),
)
load_dotenv(dotenv_path)

# Reddit API credentials; each is None when the variable is not defined.
client_id = os.environ.get('client_id')
client_secret = os.environ.get('client_secret')
user_agent = os.environ.get('user_agent')

In [7]:
# Chat model shared by the summarisation and question/answer generation helpers
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
model = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)

In [8]:
# Build the Reddit API wrapper from the loaded credentials, then hand it to the search tool
_reddit_api_wrapper = RedditSearchAPIWrapper(
    reddit_client_id=client_id,
    reddit_client_secret=client_secret,
    reddit_user_agent=user_agent,
)
search = RedditSearchRun(api_wrapper=_reddit_api_wrapper)

In [9]:
def search_reddit(query, sort_by="relevance", time_filter="all", subreddit="EhlersDanlos+ChronicPain+Hypermobility", limit=10000):
    """
    Search Reddit for a given query and parse the tool's text output into posts.
    Args:
        query (str): The search query to use.
        sort_by (str, optional): The sorting method for the search results. Accepted values are "relevance", "hot", "top", "new", and "comments".
        time_filter (str, optional): The time filter for the search results. Accepted values are "all", "year", "month", "week", "day", and "hour".
        subreddit (str, optional): The subreddit to search in.
        limit (int, optional): The maximum number of search results to return.
    Returns:
        list: A list of dictionaries, one per post. Each dictionary has a
              "Post Title" and a "Text body" key plus one key per metadata
              line found in the output. The list is empty when the tool
              returns something other than a string or a string that
              contains no post.
    """
    search_params = RedditSearchSchema(
        query=query,
        sort=sort_by,
        time_filter=time_filter,
        subreddit=subreddit,
        limit=limit
    )
    result = search.run(tool_input=search_params.dict())

    if not isinstance(result, str):
        return []
    return _parse_reddit_posts(result)


# Text that introduces every post in the search tool's output.
_POST_TITLE_MARKER = "Post Title: '"

# Metadata labels expected on the lines of a post. NOTE(review): taken from the
# output format of langchain_community's Reddit search wrapper; confirm against
# the installed version.
_POST_FIELD_LABELS = frozenset(
    {"User", "Subreddit", "Text body", "Post URL", "Post Category", "Score"}
)


def _parse_reddit_posts(result):
    """Split the raw tool output into per-post chunks and parse each one."""
    _header, marker, remainder = result.partition(_POST_TITLE_MARKER)
    if not marker:
        # No post marker at all (e.g. a "did not find any posts" message or an
        # empty string): report zero posts instead of a phantom one.
        return []
    return [
        _parse_reddit_post(chunk)
        for chunk in remainder.split("\n\n" + _POST_TITLE_MARKER)
    ]


def _parse_reddit_post(chunk):
    """Parse one post's text (starting right after the title marker) into a dict."""
    lines = chunk.strip().split("\n")

    title = lines[0]
    # The marker already consumed the opening quote, so only the closing quote
    # is left; drop exactly one so quotes that are part of the title survive.
    if title.endswith("'"):
        title = title[:-1]
    post = {"Post Title": title}

    body_parts = []
    in_body = False
    for raw_line in lines[1:]:
        line = raw_line.strip()
        label, separator, value = line.partition(": ")
        label = label.strip()
        if separator and label == "Text body":
            in_body = True
            body_parts.append(value.strip())
        elif separator and (label in _POST_FIELD_LABELS or not in_body):
            # A recognised label ends the body. Unrecognised "key: value"
            # lines are kept as metadata only outside the body, so body lines
            # such as "Edit: ..." are no longer lost from the text.
            in_body = False
            post[label] = value.strip()
        else:
            body_parts.append(line)

    post["Text body"] = " ".join(body_parts).strip()
    return post

In [27]:
def summarize_output(detailed_output):
    """
    Ask the chat model to condense a detailed answer into a short statement.
    Args:
        detailed_output (str): The long-form text to be condensed.
    Returns:
        str: The model's reply with surrounding whitespace removed.
    """
    prompt = f"""
    Summarize the following detailed answer into a concise statement:
    {detailed_output}
    """
    # The reply object exposes its text through `.content`.
    reply = model.invoke(prompt)
    return reply.content.strip()

In [38]:
def generate_question_answer(post, previous_question=None):
    """
    Produce one question/answer record from a Reddit post about Ehlers-Danlos Syndrome (EDS).

    Three model calls are made in sequence: one to write a community-level
    question about the post, one to answer that question from the post, and
    one (through `summarize_output`) to condense the answer.

    Args:
        post (dict): Parsed Reddit post; its "Text body" entry supplies the text for both prompts.
        previous_question (str, optional): A question generated earlier. When given (and non-empty),
            the question prompt is extended with it and asks the model for a different question.

    Returns:
        dict: A record with three keys:
              - "input": the concise summary of the generated answer.
              - "instruction": the generated question.
              - "output": the generated answer.
    """
    post_text = post["Text body"]

    # --- Step 1: question -------------------------------------------------
    question_prompt = f"""
    Based on the following Reddit post about Ehlers-Danlos Syndrome (EDS), please generate a question that explores the experiences, challenges, or insights related to living with EDS. The question should be broad and applicable to the EDS community as a whole, rather than directed at any specific individual. Focus on the general themes or topics discussed in the post.

    Please ensure that the question is phrased in a general manner without using "you" or referring to specific individuals. Instead, use inclusive language that applies to people with EDS collectively, such as "individuals with EDS," "those living with EDS," or "the EDS community."

    Do not start the question with phrases like "General question:" or "Question:". Simply provide the question itself.
    
    Please use standard characters only, without any special formatting or encoded characters like slashes or escaped quotes. 


    Reddit post:
    {post_text}
    """

    if previous_question:
        # Steer the model away from repeating the earlier question.
        question_prompt = question_prompt + f"""
        \n\nPreviously generated question: {previous_question}
        
        Please generate a question that is different from the previous one, while still following the guidelines and formatting mentioned above. Ensure that the new question does not use "you" or refer to specific individuals, and instead uses inclusive language that applies to the EDS community as a whole.
        """

    question_reply = model.invoke(question_prompt)
    generated_question = question_reply.content.strip()

    # --- Step 2: answer ---------------------------------------------------
    answer_prompt = f"""
    Please provide a thoughtful and informative answer to the following question, based on the experiences and insights shared in the Reddit post about Ehlers-Danlos Syndrome (EDS). Use language that is sensitive and respectful to all individuals with EDS, and avoid making direct references to the post's author or any specific individuals mentioned.

    When generating the answer, please ensure that the response is formatted as a single, coherent paragraph without any special characters or escape sequences like "\\n\\n". Use only standard characters without any special formatting or encoded characters like slashes (/) or escaped quotes (\"). The answer should be easy to read and understand.
    
    Question: {generated_question}
    
    Reddit post:
    {post_text}
    """
    answer_reply = model.invoke(answer_prompt)
    generated_answer = answer_reply.content.strip()

    # --- Step 3: summary + record ----------------------------------------
    record = {
        "input": summarize_output(generated_answer),
        "instruction": generated_question,
        "output": generated_answer,
    }
    return record

In [39]:
def generate_questions_answers(posts):
    """
    Build two question/answer records for every Reddit post in `posts`.

    Args:
        posts (list): Parsed Reddit posts (dictionaries) as accepted by `generate_question_answer`.

    Returns:
        list: The records returned by `generate_question_answer`, in post order, two per post.
              Each record carries the keys "input", "instruction" and "output".

    For each post the first record is generated with no prior context; the second
    call receives the first record's "instruction" as `previous_question` so the
    model is asked for a different question about the same post.
    """
    qa_records = []
    for reddit_post in posts:
        first_record = generate_question_answer(reddit_post)
        second_record = generate_question_answer(
            reddit_post, previous_question=first_record["instruction"]
        )
        qa_records.extend((first_record, second_record))
    return qa_records

In [40]:
def save_results_to_json(data, file_path):
    """
    Save the processed results to a JSON file.

    Missing parent directories of `file_path` are created first, so results are
    not lost to a FileNotFoundError after generation has finished.

    Args:
        data (list): The data to save to JSON.
        file_path (str): Path to the output JSON file.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        # dirname is empty for a bare file name; makedirs('') would raise.
        os.makedirs(parent_dir, exist_ok=True)

    # json.dump escapes non-ASCII by default, so the bytes written are the same
    # as before; the explicit encoding just removes the platform dependency.
    with open(file_path, 'w', encoding='utf-8') as jsonfile:
        json.dump(data, jsonfile, indent=4)

In [44]:
# Fetch posts matching "EDS", relying on search_reddit's default subreddits, sort order and limit
posts = search_reddit("EDS")

In [47]:
# Build two question/answer records for every retrieved post
generated_data = generate_questions_answers(posts=posts)

In [50]:
# Persist the generated question/answer records as JSON
save_results_to_json(
    data=generated_data,
    file_path='../../eds_data/reddit_data/reddit_generated_questions.json',
)

In [49]:
# The two counts measure different things: generated records vs. source posts.
print(f"Generated {len(generated_data)} questions from {len(posts)} posts")

484 out of 242 questions
