In [28]:
import os
import csv
import json
import time
import openai
from dotenv import load_dotenv
from langchain_community.tools.reddit_search.tool import RedditSearchRun
from langchain_community.utilities.reddit_search import RedditSearchAPIWrapper
from langchain_community.tools.reddit_search.tool import RedditSearchSchema
from langchain_openai.chat_models import ChatOpenAI

In [29]:
def load_questions(file_path):
    """
    Load questions from a CSV file.

    The file is decoded as UTF-8 and a leading byte-order mark (as written by
    some spreadsheet exports) is ignored, so the header is read as
    ``question`` rather than ``\\ufeffquestion``.

    Args:
        file_path (str): Path to the CSV file containing questions. The file
            must have a header row with a ``question`` column.

    Returns:
        list: The value of the ``question`` column for every data row, in
            file order.

    Raises:
        KeyError: If the CSV has no ``question`` column.
    """
    # 'utf-8-sig' reads plain UTF-8 and BOM-prefixed UTF-8 alike, and removes
    # the dependence on the platform's default encoding.
    with open(file_path, newline='', encoding='utf-8-sig') as csvfile:
        return [row['question'] for row in csv.DictReader(csvfile)]

In [30]:
# Read the question list from the LLM-generated questions CSV
questions_csv_path = '../../eds_data/llm_generated_questions/eds_questions_llm_generated.csv'
questions = load_questions(questions_csv_path)

In [31]:
# Load environment variables
# The .env location can be overridden through the EDS_DOTENV_PATH environment
# variable so the notebook is not tied to one machine's home directory; the
# previously hard-coded location is kept as the fallback.
dotenv_path = os.environ.get(
    'EDS_DOTENV_PATH',
    os.path.join('/Users/mariacatalinavilloutareyes/dev', '.eds.env'),
)
load_dotenv(dotenv_path)

# Reddit API credentials; each one is None when its variable is not set.
client_id = os.environ.get('client_id')
client_secret = os.environ.get('client_secret')
user_agent = os.environ.get('user_agent')

In [32]:
# Chat model used by the LLM helper functions in this notebook.
# The API key comes from the environment (None when OPENAI_API_KEY is unset).
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
model = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)

In [33]:
# Build the Reddit API wrapper from the loaded credentials, then wrap it in
# the search tool that search_reddit() calls.
reddit_api_wrapper = RedditSearchAPIWrapper(
    reddit_client_id=client_id,
    reddit_client_secret=client_secret,
    reddit_user_agent=user_agent,
)
search = RedditSearchRun(api_wrapper=reddit_api_wrapper)

In [34]:
def search_reddit(query, sort_by="relevance", time_filter="all", subreddit="EhlersDanlos", limit=3):
    """
    Search Reddit for a given query.

    Args:
        query (str): The search query to use.
        sort_by (str, optional): The sorting method for the search results.
            Accepted values are "relevance", "hot", "top", "new", and "comments".
        time_filter (str, optional): The time filter for the search results.
            Accepted values are "all", "year", "month", "week", "day", and "hour".
        subreddit (str, optional): The subreddit to search in.
        limit (int, optional): The maximum number of search results to return.

    Returns:
        list: One dictionary per post found in the tool output. Each has a
            "Post Title" and a "Text body" entry, plus one entry per
            "Key: value" metadata line (for example "User", "Post URL",
            "Score"). The list is empty when the tool output is not a string
            or contains no "Post Title: '" marker (i.e. no posts).
    """
    search_params = RedditSearchSchema(
        query=query,
        sort=sort_by,
        time_filter=time_filter,
        subreddit=subreddit,
        limit=limit
    )
    result = search.run(tool_input=search_params.dict())

    if not isinstance(result, str):
        return []
    return _parse_reddit_posts(result)


def _parse_reddit_posts(result):
    """Convert the search tool's text output into a list of post dictionaries."""
    title_marker = "Post Title: '"
    # Metadata lines that end a post's text body. Any other "Key: value" line
    # seen after "Text body:" is body prose (e.g. "Edit: ...") and is kept in
    # the body rather than being stored as a bogus metadata key.
    # NOTE(review): assumes the tool lists these fields after the text body —
    # confirm against the installed langchain_community version.
    body_terminators = ("Post URL", "Post Category", "Score")

    # Everything before the first marker is the tool's status line. Without a
    # marker there are no posts, so no placeholder entry must be produced.
    _, marker_found, listing = result.partition(title_marker)
    if not marker_found:
        return []

    posts = []
    for chunk in listing.split("\n\n" + title_marker):
        lines = chunk.strip().split("\n")
        # The first line is the title followed by its closing quote.
        post = {"Post Title": lines[0].strip("'")}

        text_body = []
        in_body = False
        for raw_line in lines[1:]:
            line = raw_line.strip()
            key, separator, value = line.partition(": ")
            key = key.strip()
            if not separator:
                # Lines without "Key: value" shape are body text.
                text_body.append(line)
            elif key == "Text body":
                in_body = True
                text_body.append(value.strip())
            elif in_body and key not in body_terminators:
                text_body.append(line)
            else:
                in_body = False
                post[key] = value.strip()

        post["Text body"] = " ".join(text_body).strip()
        posts.append(post)

    return posts

In [35]:
def process_post(question, post):
    """
    Confirm relevance of a post and generate structured output using LLM.

    Args:
        question (str): The question to confirm relevance against.
        post (dict): The Reddit post to process. Only its "Text body" entry
            is read.

    Returns:
        str or None: The model's summary of the relevant content, with the
        leading "Yes, the post is related to the question" confirmation
        removed. None when the post has no text body (the model is not
        called), when the model does not confirm relevance, or when the
        reply contains nothing besides the confirmation.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    input_text = post.get("Text body", "") or ""
    if not input_text.strip():
        # Nothing to assess; avoid spending an LLM call on an empty input.
        return None

    instruction = f"Please analyze the following Reddit post and determine if it is related to the question: '{question}'. If it is related, respond with 'Yes, the post is related to the question' and provide a concise summary of the relevant information, focusing on the general experiences or insights shared rather than attributing them to specific individuals. When summarizing, use language that is sensitive and respectful to all people, and avoid making direct references to the post's author or any individuals mentioned. If the post is not related to the question, simply respond with 'No, the post is not related to the question'."
    prompt = f"Instruction: {instruction}\n\nInput: {input_text}\n\nOutput:"

    response = model.invoke(prompt)
    response_text = response.content

    confirmation = "yes, the post is related to the question"
    if confirmation not in response_text.lower():
        return None

    # Keep only what follows an explicit label when the model emitted one.
    if "Summarized answer:" in response_text:
        summary = response_text.split("Summarized answer:")[-1]
    else:
        summary = response_text.split("Output:")[-1]

    # The prompt asks the model to open with the confirmation sentence; drop
    # it (and trailing punctuation) so it does not leak into the combined
    # answer built from these summaries.
    leading_confirmation = re.match(
        r"\s*['\"]?" + re.escape(confirmation) + r"['\"]?[\s.,:;!\-]*",
        summary,
        flags=re.IGNORECASE,
    )
    if leading_confirmation:
        summary = summary[leading_confirmation.end():]

    summary = summary.strip()
    return summary or None

In [36]:
def combine_responses(question, responses):
    """
    Merge several per-post summaries into one answer by asking the LLM.

    Args:
        question (str): The question the summaries answer.
        responses (list): Individual summary strings; they are joined with
            single spaces to form the prompt input.

    Returns:
        str: The model's reply with surrounding whitespace removed.
    """
    joined_responses = " ".join(responses)
    prompt = (
        f"Instruction: Based on discussions on online platforms like Reddit, summarize the following responses to the question: '{question}'. In your summary, avoid using phrases like 'the author of the post or individual reported'."
        f"\n\nInput: {joined_responses}\n\nOutput:"
    )
    return model.invoke(prompt).content.strip()

In [37]:
def summarize_output(output):
    """
    Ask the LLM for a short restatement of a longer text.

    Args:
        output (str): The detailed text to condense.

    Returns:
        str: The model's reply, trimmed of surrounding whitespace. If the
            reply contains "Output:", only the text after its last
            occurrence is returned.
    """
    prompt = (
        "Instruction: Summarize the following text into a concise statement."
        f"\n\nInput: {output}\n\nOutput:"
    )
    reply = model.invoke(prompt)
    # rpartition leaves the whole reply in the last slot when the label is absent.
    _, _, after_label = reply.content.rpartition("Output:")
    return after_label.strip()

In [38]:
def search_and_process_questions(questions, delay=1):
    """
    Run the search -> filter -> combine -> summarize pipeline for each question.

    Args:
        questions (list): Questions to look up on Reddit.
        delay (int): Seconds to pause after each question to avoid rate limiting.

    Returns:
        list: One dictionary per question that had at least one relevant
            post, with keys "input" (short summary of the combined answer),
            "instruction" (the question) and "output" (the combined answer).
    """
    records = []

    for question in questions:
        # Evaluate every returned post, keeping only those the LLM judged relevant.
        post_summaries = [process_post(question, post) for post in search_reddit(question)]
        kept_summaries = [summary for summary in post_summaries if summary]

        if kept_summaries:
            combined_answer = combine_responses(question, kept_summaries)
            short_summary = summarize_output(combined_answer)
            records.append({
                "input": short_summary,
                "instruction": question,
                "output": combined_answer
            })

        # Pause after each question so the APIs are not hit too quickly
        time.sleep(delay)

    return records

In [39]:
def save_results_to_json(data, file_path):
    """
    Write the processed results to disk as indented JSON.

    Args:
        data (list): The JSON-serializable results to store.
        file_path (str): Destination path; an existing file is overwritten.
    """
    with open(file_path, mode='w') as out_handle:
        json.dump(data, fp=out_handle, indent=4)

In [40]:
# Run the full pipeline over every loaded question. This is the slow cell:
# each question triggers a Reddit search, LLM calls for the returned posts,
# and a pause before the next question.
final_results = search_and_process_questions(questions)

In [None]:
# Save final output to JSON file:
# NOTE(review): the filename spells "ouput" (not "output"); left unchanged
# here because other code may read this exact path — confirm before renaming.
save_results_to_json(final_results, '../../eds_data/reddit_data/reddit_ouput_generation.json')

In [45]:
# Report how many questions produced an answer; the total is taken from the
# loaded question list so it stays correct if the CSV changes.
print(f"{len(final_results)} out of {len(questions)} questions")

257 out of 602 questions
