In [106]:
import requests
import time
from datetime import datetime, timezone
import pandas as pd

In [107]:
# Max number of requests
max_requests = 60/5
items_per_request = 100
max_posts = max_requests * items_per_request
print("The maximum number of posts we can get per minute is", max_posts)
print("The maximum number of posts we can get per day is", max_posts * 24 * 60)

The maximum number of posts we can get per minute is 1200.0
The maximum number of posts we can get per day is 1728000.0


In [108]:
sort = 'relevance'  # 'relevance', 'hot', 'top', 'new', 'comments'
t = 'all'  # all, day, hour, month, week, year

def search_reddit(query, limit):
    base_url = 'https://www.reddit.com/search.json'
    headers = {'User-agent': 'yourbot'}
    posts = []
    params = {
        'q': query,
        'limit': 100,  # Reddit API returns max 100 posts per request
        'after': None,
        'sort': sort,
        't': t
            }
    while len(posts) < limit:
        try:
            response = requests.get(base_url, headers=headers, params=params)
            response.raise_for_status()
            data = response.json()

            new_posts = data['data']['children']
            if not new_posts:
                print('No more posts found.')
                break
            posts.extend(new_posts)
            after = data['data']['after']
            if after is None:
                print('No more posts to fetch.')
                break
            
            # Print progress
            print(f'Retrieved {len(posts)} posts so far...')
            
            # Sleep to avoid hitting rate limits (50 calls per minute -> 1.2 seconds per call)
            time.sleep(5)
        except requests.exceptions.HTTPError as err:
            print(f'HTTP error occurred: {err}')
            break
        except Exception as e:
            print(f'An Error Occurred: {e}')
            break

    return posts[:limit]

def convert_timestamp_to_date(posts):
    for post in posts:
        timestamp = post['data']['created_utc']
        date = datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y/%m/%d')
        post['data']['created_date'] = date
    return posts

# export to Excel
def get_post_data(posts):
    data = []
    for post in posts:
        x = post['data']
        data.append([x['title'], x['selftext'], x["author_fullname"], x['url'], x['created_date'], x['num_comments'], x['score']])
    return data

In [109]:
for subreddit in ["antiwork", "jobs", "careerguidance"]:
    query = f'(ai OR chatgpt OR "artificial intelligence") subreddit:{subreddit}'
    limit = 2000

    posts = search_reddit(query, limit)
    posts = convert_timestamp_to_date(posts)

    print(f"Retrieved {len(posts)} posts")

    data = get_post_data(posts)

    export_query = query.replace(":", "_").replace('"', ' ') + f" sort_{sort} t_{t}"
    pd.DataFrame(data).to_excel(f'../data/{export_query}.xlsx', header=['Title', 'Body', 'Author', 'URL', 'Created', 'Comments', 'Score'], index=False)

Retrieved 100 posts so far...
Retrieved 200 posts so far...
Retrieved 300 posts so far...
Retrieved 400 posts so far...
Retrieved 500 posts so far...
Retrieved 600 posts so far...
Retrieved 700 posts so far...
Retrieved 800 posts so far...
Retrieved 900 posts so far...
Retrieved 1000 posts so far...
Retrieved 1100 posts so far...
Retrieved 1200 posts so far...
Retrieved 1300 posts so far...
Retrieved 1400 posts so far...
Retrieved 1500 posts so far...
Retrieved 1600 posts so far...
Retrieved 1700 posts so far...
Retrieved 1800 posts so far...
Retrieved 1900 posts so far...
Retrieved 2000 posts so far...
Retrieved 2000 posts
Retrieved 100 posts so far...
Retrieved 200 posts so far...
Retrieved 300 posts so far...
Retrieved 400 posts so far...
Retrieved 500 posts so far...
Retrieved 600 posts so far...
Retrieved 700 posts so far...
Retrieved 800 posts so far...
Retrieved 900 posts so far...
Retrieved 1000 posts so far...
Retrieved 1100 posts so far...
Retrieved 1200 posts so far...
Retri