In [None]:
import requests
import time
from datetime import datetime, timezone
import pandas as pd

In [None]:
# Request budget per minute: 60 seconds divided by a 5-second gap between calls
# (presumably the same 5-second pause the fetch helpers sleep for - confirm).
items_per_request = 100
max_requests = 60/5
max_posts = items_per_request * max_requests

# Scale the per-minute ceiling up to a full day (24 hours * 60 minutes).
posts_per_minute = max_posts
posts_per_day = max_posts * 24 * 60
print("The maximum number of posts we can get per minute is", posts_per_minute)
print("The maximum number of posts we can get per day is", posts_per_day)

The maximum number of posts we can get per minute is 1200.0
The maximum number of posts we can get per day is 1728000.0


In [None]:
def search_reddit(query, limit, sort=None, t=None, timeout=30):
    """Collect up to ``limit`` results from Reddit's public JSON search endpoint.

    Results are fetched page by page (100 per request) by following the
    ``after`` cursor of each response, pausing 5 seconds between requests.

    Args:
        query: Search string sent as the ``q`` parameter.
        limit: Maximum number of posts to return.
        sort: Value for the ``sort`` query parameter. When omitted, the
            module-level ``sort`` variable is used if one exists (this is how
            existing callers configure the search); if there is none, the
            parameter is left out of the request.
        t: Value for the ``t`` query parameter, resolved the same way as
            ``sort``.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list with at most ``limit`` raw entries taken from
        ``data.children`` of the responses. If a request fails, the error is
        printed and the posts collected so far are returned.
    """
    base_url = 'https://www.reddit.com/search.json'
    headers = {'User-agent': 'yourbot'}
    posts = []
    after = None

    # Backward compatibility: earlier versions read `sort` and `t` straight
    # from module scope, so fall back to those globals when not passed in.
    # A value of None is simply dropped from the query string by requests.
    if sort is None:
        sort = globals().get('sort')
    if t is None:
        t = globals().get('t')

    while len(posts) < limit:
        params = {
            'q': query,
            'limit': 100,  # Reddit API returns max 100 posts per request
            'after': after,
            'sort': sort,
            't': t,
        }

        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(base_url, headers=headers, params=params, timeout=timeout)
            response.raise_for_status()
            data = response.json()

            new_posts = data['data']['children']
            if not new_posts:
                print('No more posts found.')
                break
            posts.extend(new_posts)

            after = data['data']['after']
            print("after:", after)
            if after is None:
                print('No more posts to fetch.')
                break

            # Print progress
            print(f'Retrieved {len(posts)} posts so far...')

            # Sleep to avoid hitting rate limits, but only when another
            # request is actually going to follow.
            if len(posts) < limit:
                time.sleep(5)
        except requests.exceptions.HTTPError as err:
            print(f'HTTP error occurred: {err}')
            break
        except Exception as e:
            # Best effort: report the problem and keep what was collected.
            print(f'An Error Occurred: {e}')
            break

    return posts[:limit]

def fetch_comments(post_permalink, delay=5, timeout=30):
    """Download the comment bodies of one Reddit post.

    Args:
        post_permalink: Path of the post, appended to
            ``https://www.reddit.com`` and suffixed with ``.json``.
        delay: Seconds to sleep after the request, successful or not, so
            that consecutive calls stay under the rate limit.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list with the ``body`` of every entry of kind ``'t1'`` found in the
        second element of the JSON response. An empty list is returned when
        the request or the parsing fails; the error is printed, not raised.
    """
    base_url = f'https://www.reddit.com{post_permalink}.json'
    headers = {'User-agent': 'yourbot'}
    try:
        response = requests.get(base_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        return [
            comment['data']['body']
            for comment in data[1]['data']['children']
            if comment['kind'] == 't1'  # Ensure it's a comment
        ]
    except requests.exceptions.HTTPError as err:
        print(f'HTTP error occurred: {err}')
    except Exception as e:
        print(f'An Error Occurred: {e}')
    finally:
        # The pause used to sit after the `return` and never ran; a `finally`
        # clause guarantees throttling on both the success and error paths.
        time.sleep(delay)
    return []

def convert_timestamp_to_date(posts):
    """Add a ``created_date`` string to every post, derived from ``created_utc``.

    The timestamp under ``post['data']['created_utc']`` is interpreted as
    seconds since the epoch in UTC and formatted as ``YYYY/MM/DD``. The posts
    are modified in place and the same list object is returned.
    """
    for entry in posts:
        fields = entry['data']
        created_at = datetime.fromtimestamp(fields['created_utc'], tz=timezone.utc)
        fields['created_date'] = created_at.strftime('%Y/%m/%d')
    return posts

# Build the rows that are later exported to Excel
def get_post_data(posts, comments=False):
    """Flatten raw Reddit post entries into plain rows for tabular export.

    Args:
        posts: Iterable of entries whose ``'data'`` dict holds the post
            fields, including the ``created_date`` key.
        comments: When true, the comment bodies of every post are fetched via
            ``fetch_comments`` and appended to the row as a ninth element.

    Returns:
        A list of rows. Each row is
        ``[title, selftext, author_fullname, url, created_date, num_comments,
        score, subreddit_name_prefixed]``, followed by the list of comment
        bodies when ``comments`` is true. A missing ``author_fullname`` is
        reported as ``"N/A"``.
    """
    rows = []
    for post in posts:
        x = post['data']
        # Not every post carries author_fullname, so never index it directly.
        author = x.get("author_fullname", "N/A")
        row = [x['title'], x['selftext'], author,
               x['url'], x['created_date'], x['num_comments'], x['score'], x['subreddit_name_prefixed']]
        if comments:
            # Keep the fetched list in its own name: overwriting the
            # `comments` flag would turn it falsy after the first post without
            # comments and silently drop the column for every later post.
            post_comments = fetch_comments(x['permalink'])
            print(author, len(post_comments))
            row.append(post_comments)
        rows.append(row)
    return rows

In [None]:
# Subreddits to scrape; the commented-out list is the full set of candidates.
subreddits = ["careerguidance"]
# subreddits = ["careerguidance", "jobs", "antiwork", "Economics",
#               "MachineLearning", "ChatGPT", "technology", "artificial", "OpenAI",
#               "changemyview", "AskReddit", "Futurology", "Showerthoughts", "NoStupidQuestions"]

# Search settings are identical for every subreddit, so define them once.
# NOTE: search_reddit picks up `sort` and `t` from module scope.
limit = 1000
sort = 'top'  # 'relevance', 'hot', 'top', 'new', 'comments'
t = 'all'  # all, day, hour, month, week, year

# One alias per column produced by get_post_data(..., comments=True).
# 'Comments' holds the comment count (num_comments); the ninth column holds
# the fetched comment texts. With only eight aliases for nine columns,
# to_excel raises "Writing 9 cols but got 8 aliases".
column_names = ['Title', 'Body', 'Author', 'URL', 'Created', 'Comments', 'Score', 'Subreddit', 'Comment Bodies']

for subreddit in subreddits:
    query = f'(ai OR "artificial intelligence" OR chatgpt) AND (job OR jobs OR work OR career OR employment OR profession) AND (replace OR replaced OR replaces OR replacement OR affected OR affect OR affecting OR disappear OR disappearing OR disappeared OR fired OR hiring OR hire OR lose OR lost OR losing OR eliminate OR eliminates OR eliminating OR redundant OR safe OR obsolete OR threaten) subreddit:{subreddit}'  # Max 500 characters

    posts = search_reddit(query, limit)
    posts = convert_timestamp_to_date(posts)

    print(f"Retrieved {len(posts)} posts")

    if len(posts) != 0:
        data = get_post_data(posts, comments=True)
        export_query = f"long_query subreddit_{subreddit} sort_{sort} t_{t} n_{len(posts)}"
        pd.DataFrame(data).to_excel(f'../data/{export_query}_with_comments.xlsx', header=column_names, index=False)

In [None]:
## Use .py instead