In [34]:
import time
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd
import requests

In [35]:
# Request budget: one request every `seconds_between_requests` seconds,
# each returning at most `items_per_request` posts.
seconds_between_requests = 5
# Integer division keeps the counts as whole numbers (60/5 would give 12.0
# and make the messages below print "1200.0").
max_requests = 60 // seconds_between_requests  # requests per minute
items_per_request = 100
max_posts = max_requests * items_per_request  # posts per minute
print("The maximum number of posts we can get per minute is", max_posts)
print("The maximum number of posts we can get per day is", max_posts * 24 * 60)

The maximum number of posts we can get per minute is 1200.0
The maximum number of posts we can get per day is 1728000.0


In [36]:
def search_reddit(query, limit, sort=None, t=None, delay=5):
    """Collect up to `limit` posts from Reddit's public search endpoint.

    Requests https://www.reddit.com/search.json page by page (100 posts per
    request) and feeds the `after` cursor of each response into the next
    request, so every page contains new posts.

    Args:
        query: Search string, sent as the `q` parameter.
        limit: Maximum number of posts to return.
        sort: Value for the `sort` parameter. When None, the module-level
            variable `sort` is used if it exists, otherwise 'relevance'.
        t: Value for the `t` (time window) parameter. When None, the
            module-level variable `t` is used if it exists, otherwise 'all'.
        delay: Seconds to sleep after each page that has a successor.
            The default of 5 seconds corresponds to 12 requests per minute.

    Returns:
        A list with at most `limit` items taken from the responses'
        `data.children` lists, in the order they were received.

    Errors are not raised to the caller: on an HTTP error or any other
    exception a message is printed and the posts collected so far are
    returned.
    """
    # Backward compatibility: earlier callers set `sort` and `t` as
    # notebook-level variables instead of passing them in.
    if sort is None:
        sort = globals().get('sort', 'relevance')
    if t is None:
        t = globals().get('t', 'all')

    base_url = 'https://www.reddit.com/search.json'
    headers = {'User-agent': 'yourbot'}
    posts = []
    params = {
        'q': query,
        'limit': 100,  # Reddit API returns max 100 posts per request
        'after': None,  # no cursor on the first request
        'sort': sort,
        't': t,
    }
    while len(posts) < limit:
        try:
            # The timeout keeps a stalled connection from blocking the notebook forever.
            response = requests.get(base_url, headers=headers, params=params, timeout=30)
            response.raise_for_status()
            listing = response.json()['data']

            new_posts = listing['children']
            if not new_posts:
                print('No more posts found.')
                break
            posts.extend(new_posts)

            after = listing['after']
            if after is None:
                print('No more posts to fetch.')
                break
            # Move the cursor forward; without this every request would
            # return the first page again and `posts` would fill with duplicates.
            params['after'] = after

            # Print progress
            print(f'Retrieved {len(posts)} posts so far...')

            # Pause between requests to stay under the rate limit.
            time.sleep(delay)
        except requests.exceptions.HTTPError as err:
            print(f'HTTP error occurred: {err}')
            break
        except Exception as e:
            print(f'An Error Occurred: {e}')
            break

    return posts[:limit]

def convert_timestamp_to_date(posts):
    """Add a 'created_date' string to every post, derived from 'created_utc'.

    Each post's `data['created_utc']` is interpreted as a POSIX timestamp,
    converted to a UTC datetime and stored as 'YYYY/MM/DD' under
    `data['created_date']`. The posts are modified in place and the same
    list is returned.
    """
    for entry in posts:
        fields = entry['data']
        created = datetime.fromtimestamp(fields['created_utc'], tz=timezone.utc)
        fields['created_date'] = created.strftime('%Y/%m/%d')
    return posts

# Flatten posts into rows for the Excel export
def get_post_data(posts):
    """Turn raw posts into a list of rows, one per post.

    Each row holds, in this order: title, selftext, author_fullname, url,
    created_date, num_comments, score, subreddit_name_prefixed, all read
    from the post's 'data' dict.

    `author_fullname` can be absent from a post (for example when the
    author's account no longer exists); such posts get None in that column
    instead of aborting the whole export with a KeyError. The remaining
    keys are still required, and 'created_date' must have been added
    beforehand.

    Args:
        posts: Iterable of dicts that carry the post fields under 'data'.

    Returns:
        A list of 8-element lists.
    """
    rows = []
    for post in posts:
        x = post['data']
        rows.append([
            x['title'],
            x['selftext'],
            x.get('author_fullname'),  # optional key, None when missing
            x['url'],
            x['created_date'],
            x['num_comments'],
            x['score'],
            x['subreddit_name_prefixed'],
        ])
    return rows

In [37]:
# for subreddit in ["antiwork", "jobs", "careerguidance"]:
#     query = f'(ai OR chatgpt OR "artificial intelligence") subreddit:{subreddit}'
#     limit = 2000
#     sort = 'relevance'  # 'relevance', 'hot', 'top', 'new', 'comments'
#     t = 'all'  # all, day, hour, month, week, year
#     posts = search_reddit(query, limit)
#     posts = convert_timestamp_to_date(posts)

#     print(f"Retrieved {len(posts)} posts")

#     data = get_post_data(posts)

#     export_query = query.replace(":", "_").replace('"', '-') + f" sort_{sort} t_{t}"
#     pd.DataFrame(data).to_excel(f'../data/{export_query}.xlsx', header=['Title', 'Body', 'Author', 'URL', 'Created', 'Comments', 'Score', 'Subreddit'], index=False)

In [40]:
# Run the same keyword search in each subreddit and write one Excel file per
# subreddit that returned at least one post.
subreddits = ["careerguidance", "jobs", "antiwork", "Economics", "business",
              "MachineLearning", "ChatGPT", "technology", "artificial", "OpenAI",
              "changemyview", "AskReddit", "Futurology", "Showerthoughts", "NoStupidQuestions"]

# Search settings, identical for every subreddit, so they are set once.
# `sort` and `t` are deliberately module-level names: search_reddit(query, limit)
# reads them from the notebook namespace.
limit = 1000
sort = 'new'  # 'relevance', 'hot', 'top', 'new', 'comments'
t = 'all'  # all, day, hour, month, week, year

# Keyword part of the query; only the trailing subreddit filter changes per
# iteration. Original note: the full query may be at most 500 characters.
query_terms = (
    '(ai OR "artificial intelligence" OR chatgpt)'
    ' AND (job OR jobs OR work OR career OR employment OR profession)'
    ' AND (replace OR replaced OR replaces OR replacement OR take OR taking OR takes'
    ' OR affected OR affect OR affecting OR disappear OR disappearing OR disappeared'
    ' OR fired OR hiring OR hire OR exist OR lose OR lost OR losing'
    ' OR eliminate OR eliminates OR eliminating OR redundant OR safe OR obsolete'
    ' OR threaten OR impact OR impacting)'
)

# Column titles, in the order of the rows produced by get_post_data.
export_columns = ['Title', 'Body', 'Author', 'URL', 'Created', 'Comments', 'Score', 'Subreddit']

# Create the target folder up front so the first export cannot fail on a
# missing directory.
output_dir = Path('../data')
output_dir.mkdir(parents=True, exist_ok=True)

for subreddit in subreddits:
    query = f'{query_terms} subreddit:{subreddit}'

    posts = search_reddit(query, limit)
    posts = convert_timestamp_to_date(posts)

    print(f"Retrieved {len(posts)} posts")

    # Skip the export when the search returned nothing.
    if posts:
        data = get_post_data(posts)
        export_query = f"long_query subreddit_{subreddit} sort_{sort} t_{t} n_{len(posts)}"
        pd.DataFrame(data, columns=export_columns).to_excel(output_dir / f'{export_query}.xlsx', index=False)

Retrieved 100 posts so far...
Retrieved 200 posts so far...
Retrieved 300 posts so far...
Retrieved 400 posts so far...
Retrieved 500 posts so far...
Retrieved 600 posts so far...
Retrieved 700 posts so far...
Retrieved 800 posts so far...
Retrieved 900 posts so far...
Retrieved 1000 posts so far...
Retrieved 1000 posts
Retrieved 100 posts so far...
Retrieved 200 posts so far...
Retrieved 300 posts so far...
Retrieved 400 posts so far...
Retrieved 500 posts so far...
Retrieved 600 posts so far...
Retrieved 700 posts so far...
Retrieved 800 posts so far...
Retrieved 900 posts so far...
Retrieved 1000 posts so far...
Retrieved 1000 posts
Retrieved 100 posts so far...
Retrieved 200 posts so far...
Retrieved 300 posts so far...
Retrieved 400 posts so far...
Retrieved 500 posts so far...
Retrieved 600 posts so far...
Retrieved 700 posts so far...
Retrieved 800 posts so far...
Retrieved 900 posts so far...
Retrieved 1000 posts so far...
Retrieved 1000 posts
No more posts to fetch.
Retrieved 

In [39]:
# Display the first raw post held in `posts`.
# NOTE: `posts` is whatever the export loop cell last assigned, so this cell
# only works after that cell has been run in the same kernel.
posts[0]

{'kind': 't3',
 'data': {'approved_at_utc': None,
  'subreddit': 'careerguidance',
  'selftext': "Hi everyone,\n\nI’m a last-year medical student, and in addition to med school, I currently work in a wet lab on an Alzheimer’s project involving rat brains. For my final thesis, however, I’m working on a pathology project with a focus on using AI. This experience made me realise I don't want to go into clinical work and really want to stay in research, and while I love lab work, I’ve developed an interest in coding and AI and see it as maybe having more perspectives in the future.\n\nRight now, I feel like I only know the basics of coding—enough to write research models, but not much beyond that. My long-term goal is to work in labs, ideally at the intersection of AI, robotics, and bioinformatics. However, I’m lost on about how to proceed once I graduate.\n\nHere are my main concerns:\n\n1. **Further Education vs. Industry**: I’m contemplating whether to apply for a second master's degree