<a href="https://colab.research.google.com/github/ktynski/Marketing_Automations_Notebooks_With_GPT/blob/main/Automatic_Reddit_Trend_Research_with_GPT3_(Public).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Install dependencies

In [None]:
!pip install praw
!pip install openai
!pip install nltk

2. Run the cell below to scrape Reddit for top posts and generate a readout analysis of those posts. Both are saved as CSVs (available in the file folder on the left after running the cell). You will need an OpenAI API key and credentials from a Reddit app, which you can get here: https://www.reddit.com/prefs/apps

In [None]:
import praw
import pandas as pd
import openai
import time
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import concurrent.futures

# OpenAI API key: replace the placeholder string with your own key before running.
openai.api_key = "Your OpenAI Api Key"

# Search term passed to the Reddit search.
keyword = "Elon Musk"
# Maximum number of search results to request from Reddit
# (posts outside the date range are dropped afterwards, so fewer may be kept).
num_results = 500
# Date range to keep. Recognized values: "1 day", "1 week", "1 month",
# "6 months", "1 year". Any other value (e.g. "all time") means no lower
# bound on post age.
date_range = "6 months"


# Instructions sent to the model, one completion request per entry. Each
# prompt is combined with the corpus of scraped post titles, and the prompt
# text is also written to the "Prompt" column of the readout CSV.
prompts = [
    "Please summarize the main topic discussed in the Reddit posts in my corpus as a short essay.",
    "Please extract the 20 most common key points or highlights from the corpus that summarize the main ideas discussed.",
    "Please identify the 20 most common words or phrases in the corpus and their frequency of use.",
    "Please find the 20 most frequently asked questions in the corpus.",
    "Please identify 20 important or noteworthy patterns or trends in the corpus.",
    "Please generate a comprehensive summary of the positive parts of the corpus in the form of bullet points.",
    "Please generate a comprehensive summary of the negative parts of the corpus in the form of bullet points.",
    "Please generate a list of 20 common questions found in the corpus, covering the main topics and issues discussed.",
    "Please create a hierarchy of topics/subtopics/subsubtopics encountered most frequently in the corpus.",
    "Please list all of the entities mentioned in the corpus, sorted by most to least mentioned.",
    "Please suggest 10 newsworthy and clickable data-journalism story idea titles based on what you know was popular in the corpus.",
]

def get_posts(keyword, num_results, date_rangea):
    """Search r/all for posts matching *keyword* and keep those in the date range.

    Args:
        keyword: Query string passed to the Reddit search.
        num_results: Maximum number of search results to request from Reddit.
            Fewer rows may be returned because results outside the date
            range are discarded.
        date_rangea: Date-range label: "1 day", "1 week", "1 month",
            "6 months" or "1 year". Any other value means "all time".
            (The parameter name is kept as-is for backward compatibility.)

    Returns:
        A pandas DataFrame with the columns "Title", "Timestamp" (the post's
        ``created_utc``) and "Permalink". The same data is also written to
        ``redditposts.csv`` in the current working directory.
    """
    # Window length in seconds and the narrowest Reddit search time filter
    # that still contains that window. Months are counted as 30 days, so
    # "1 year" is 12 x 30 days.
    windows = {
        "1 day": (86400, "day"),
        "1 week": (604800, "week"),
        "1 month": (2592000, "month"),
        "6 months": (15552000, "year"),
        "1 year": (31104000, "year"),
    }

    # Connect to Reddit API using PRAW, use the credentials you get from
    # reddit after creating an app.
    reddit = praw.Reddit(client_id='your client id',
                         client_secret='your client secret',
                         user_agent='name of your reddit app')

    # Use the range that was passed in, not the module-level global.
    end_date = int(time.time())
    window = windows.get(date_rangea)
    if window is None:
        start_date, time_filter = 0, "all"
    else:
        start_date, time_filter = end_date - window[0], window[1]

    # Let Reddit pre-filter by time so the `limit` budget is not spent on
    # posts that would be discarded; the exact window is still enforced
    # client-side because the server-side filters are coarser.
    search_results = reddit.subreddit("all").search(
        keyword, limit=num_results, time_filter=time_filter
    )
    posts = [
        [post.title, post.created_utc, post.permalink]
        for post in search_results
        if start_date <= post.created_utc <= end_date
    ]

    # Create a Pandas DataFrame from the list of posts
    df = pd.DataFrame(posts, columns=["Title", "Timestamp", "Permalink"])
    df.to_csv("redditposts.csv")
    return df

def GPT3_evaluation(posts_df, prompt, max_corpus_tokens=2800,
                    engine="text-davinci-003", max_tokens=500,
                    temperature=0.5):
    """Ask the OpenAI completion API to analyze the post titles in *posts_df*.

    Args:
        posts_df: DataFrame with a "Title" column holding the post titles.
        prompt: Instruction text placed before the corpus.
        max_corpus_tokens: Maximum number of NLTK word tokens of the joined
            titles to include in the request.
        engine: Name of the completion model to query.
        max_tokens: Maximum number of tokens the model may generate.
        temperature: Sampling temperature for the completion.

    Returns:
        The text of the first completion choice. It is also printed.
    """
    # Join all titles into one string; str() guards against non-string
    # entries (e.g. missing values) that would make join() raise.
    post_titles = ' '.join(str(title) for title in posts_df['Title'].tolist())

    # Tokenize the titles and keep at most `max_corpus_tokens` tokens so the
    # request stays within the model's context window.
    # NOTE(review): NLTK word tokens are not the model's own tokens, so this
    # cap only approximates the real context budget - confirm the margin.
    tokens = word_tokenize(post_titles)[:max_corpus_tokens]

    post_titles = ' '.join(tokens)
    prompt = f"{prompt} \n Corpus: {post_titles} \n Readout:"

    # Send the prompt to the OpenAI API
    response = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        max_tokens=max_tokens,
        n=1,
        stop=None,
        temperature=temperature,
    )

    # Display the answer from GPT-3
    answer = response["choices"][0]["text"]
    print(answer)
    return answer


def run_prompt(prompt, posts_df=None):
    """Evaluate one prompt against the scraped Reddit posts.

    Args:
        prompt: Instruction text to send to the model.
        posts_df: Optional DataFrame of already-scraped posts. When omitted,
            the posts are fetched with ``get_posts`` using the module-level
            ``keyword``, ``num_results`` and ``date_range`` settings. Passing
            it in lets callers scrape once and reuse the data for many
            prompts instead of hitting Reddit and rewriting the posts CSV on
            every call.

    Returns:
        A ``(prompt, response_text)`` tuple.
    """
    if posts_df is None:
        posts_df = get_posts(keyword, num_results, date_range)
    return prompt, GPT3_evaluation(posts_df, prompt)

# Scrape Reddit once and share the DataFrame between all prompts. Fetching
# inside every worker would repeat the same search for each prompt and make
# several threads write the same posts CSV at the same time. A failure here
# (e.g. bad credentials) stops the run immediately instead of being reported
# once per prompt.
posts_df = get_posts(keyword, num_results, date_range)

# Only the OpenAI requests are fanned out; they are independent and
# network-bound, so a thread pool overlaps their waiting time.
results = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_prompt = {
        executor.submit(GPT3_evaluation, posts_df, prompt): prompt
        for prompt in prompts
    }
    for future in concurrent.futures.as_completed(future_to_prompt):
        prompt = future_to_prompt[future]
        try:
            answer = future.result()
        except Exception as exc:
            # Best effort: report the failed prompt and keep the others.
            print(f'{prompt} generated an exception: {exc}')
        else:
            results.append((prompt, answer))

# One row per successfully answered prompt, in completion order.
GPT3answers = pd.DataFrame(results, columns=["Prompt", "Answer from GPT-3"])
GPT3answers.to_csv("GPT3_readout.csv")
