In [1]:
import requests
import json
import pandas as pd
import time
from datetime import datetime

In [2]:
# Endpoint for the front-page "hot" listing. HTTPS is used so this request
# matches the per-subreddit endpoints in this notebook and is not sent in
# plaintext.
URL = "https://www.reddit.com/hot.json"

In [3]:
# Fetch the first page of the front-page hot listing and start the DataFrame.
# The timeout (seconds) keeps a stalled connection from hanging the kernel.
res = requests.get(URL, headers={'User-agent': 'JPH Bot 0.1'}, timeout=30)
# Fail here on an HTTP error status rather than later, when the response body
# is parsed and indexed as if it were a valid listing.
res.raise_for_status()
time_fetched = pd.Timestamp.now(tz='UTC')  # tz-aware UTC time of this fetch
data = res.json()
# Each child of the listing wraps one post; keep only the post payload.
posts = [child['data'] for child in data['data']['children']]
big_df = pd.DataFrame.from_dict(posts)
# Stamp every row of this page with the moment it was retrieved.
big_df['time_fetched'] = time_fetched

In [4]:
# Fetch up to 40 follow-up pages (the "1000 more rows" of the original note),
# following the listing's `after` cursor from page to page.
next_page = data['data']['after']

# Collect the per-page frames and concatenate once at the end instead of
# re-copying the growing DataFrame on every iteration.
page_frames = [big_df]
for _ in range(40):
    # No cursor means there is no further page to ask for; concatenating None
    # into the URL would raise a TypeError.
    if not next_page:
        break
    temp_res = requests.get(
        URL,
        params={'after': next_page},
        headers={'User-agent': 'JPH Bot 0.1'},
        timeout=30,
    )
    # Stop on an HTTP error status instead of parsing an error body.
    temp_res.raise_for_status()
    temp_time_fetched = pd.Timestamp.now(tz='UTC')
    temp_data = temp_res.json()
    temp_posts = [child['data'] for child in temp_data['data']['children']]
    temp_df = pd.DataFrame.from_dict(temp_posts)
    # Each page keeps its own retrieval time.
    temp_df['time_fetched'] = temp_time_fetched
    page_frames.append(temp_df)
    next_page = temp_data['data']['after']

big_df = pd.concat(page_frames, ignore_index=True)

In [5]:
# Write this run's rows to a CSV named after the current UTC time,
# truncated to the minute (format: YYYY-MM-DDTHH:MM).
now = pd.Timestamp.utcnow().strftime('%Y-%m-%dT%H:%M')
big_df.to_csv(f"{now}.csv", index=False)

In [6]:
# Save a cumulative version: append this run's rows to reddit-cumulative.csv.
try:
    cumulative_df = pd.read_csv("reddit-cumulative.csv")
except FileNotFoundError:
    # First run: nothing has been accumulated yet, so this scrape becomes the
    # initial content of the cumulative file.
    cumulative_df = big_df
else:
    cumulative_df = pd.concat([cumulative_df, big_df], ignore_index=True)
cumulative_df.to_csv("reddit-cumulative.csv", index=False)

In [3]:
def _fetch_hot_page(url, after=None):
    """Fetch one listing page; return (page_df, next_cursor, fetched_at)."""
    params = {'after': after} if after else None
    res = requests.get(
        url,
        headers={'User-agent': 'JPH Bot 0.1'},
        params=params,
        timeout=30,  # seconds; keeps a stalled connection from hanging the run
    )
    # Fail on an HTTP error status instead of parsing an error body as a listing.
    res.raise_for_status()
    fetched_at = pd.Timestamp.now(tz='UTC')
    listing = res.json()['data']
    page_df = pd.DataFrame.from_dict([child['data'] for child in listing['children']])
    # Stamp every row of the page with the moment it was retrieved.
    page_df['time_fetched'] = fetched_at
    return page_df, listing['after'], fetched_at


def get_subreddit_hot_posts(subreddit, number):
    """Scrape a subreddit's hot listing and persist it as CSV.

    Fetches the first page of ``/r/{subreddit}/hot.json`` and then up to
    ``number`` follow-up pages, following the listing's ``after`` cursor.
    Paging stops early once the cursor is empty, so no request is made
    without a valid cursor.

    Two files are written to the current working directory:

    * ``{subreddit}-{time_fetched}.csv`` -- this run only, where
      ``time_fetched`` is the UTC timestamp of the first page.
    * ``{subreddit}-cumulative.csv`` -- previous content (if the file exists)
      plus this run; it is created when missing.

    Parameters
    ----------
    subreddit : str
        Subreddit name as it appears in the URL.
    number : int
        Maximum number of pages to request after the first one.

    Raises
    ------
    requests.HTTPError
        If any page request returns an HTTP error status.
    """
    URL = f'https://www.reddit.com/r/{subreddit}/hot.json'

    first_df, next_page, time_fetched = _fetch_hot_page(URL)
    # Gather page frames and concatenate once, rather than re-copying the
    # growing DataFrame on every iteration.
    page_frames = [first_df]
    for _ in range(number):
        if not next_page:
            break
        page_df, next_page, _fetched_at = _fetch_hot_page(URL, after=next_page)
        page_frames.append(page_df)
    big_df = pd.concat(page_frames, ignore_index=True)

    # Record our dataframe using current time and subreddit
    big_df.to_csv(f"{subreddit}-{time_fetched}.csv", index=False)

    # Merge with cumulative subreddit data
    cumulative_path = f"{subreddit}-cumulative.csv"
    try:
        previous_df = pd.read_csv(cumulative_path)
    except FileNotFoundError:
        # Nothing accumulated yet for this subreddit: start from this run.
        cumulative_subreddit_df = big_df
    else:
        cumulative_subreddit_df = pd.concat([previous_df, big_df], ignore_index=True)
    cumulative_subreddit_df.to_csv(cumulative_path, index=False)

In [4]:
def initialize_cumulative_csvs(subreddit_list):
    """Create the starting CSV files for each subreddit from one page of posts.

    For every name in ``subreddit_list`` the first page of
    ``/r/{subreddit}/hot.json`` is fetched and written twice to the current
    working directory: as ``{subreddit}-cumulative.csv`` and as
    ``{subreddit}-{time_fetched}.csv``.

    Note: an existing ``{subreddit}-cumulative.csv`` is overwritten, so
    calling this again for a subreddit discards what was accumulated for it.

    Parameters
    ----------
    subreddit_list : iterable of str
        Subreddit names as they appear in the URL.

    Raises
    ------
    requests.HTTPError
        If a request returns an HTTP error status.
    """
    for subreddit in subreddit_list:
        URL = f'https://www.reddit.com/r/{subreddit}/hot.json'
        # The timeout (seconds) keeps a stalled connection from hanging the run.
        res = requests.get(URL, headers={'User-agent': 'JPH Bot 0.1'}, timeout=30)
        # Do not seed the cumulative file from an error response.
        res.raise_for_status()
        time_fetched = pd.Timestamp.now(tz='UTC')
        data = res.json()
        posts = [child['data'] for child in data['data']['children']]
        big_df = pd.DataFrame.from_dict(posts)
        big_df['time_fetched'] = time_fetched
        big_df.to_csv(f"{subreddit}-cumulative.csv", index=False)
        big_df.to_csv(f"{subreddit}-{time_fetched}.csv", index=False)

In [5]:
def scrape_subreddits(subreddit_list, num):
    """Call ``get_subreddit_hot_posts`` once per subreddit, in list order.

    ``num`` is forwarded unchanged as the second argument of every call.
    """
    for name in subreddit_list:
        get_subreddit_hot_posts(name, num)

In [6]:
# Subreddits whose hot listings this notebook scrapes.
subreddit_list = [
    'news',
    'AskReddit',
    'worldnews',
    'pics',
    'funny',
    'videos',
    'gaming',
    'gifs',
    'mildlyinteresting',
    'MemeEconomy',
]

In [11]:
# Run this only once per new subreddit: it (re)creates that subreddit's
# cumulative CSV, overwriting anything already accumulated in it.
#initialize_cumulative_csvs(subreddit_list)

In [7]:
# Scrape every configured subreddit; `num` is forwarded to
# get_subreddit_hot_posts as the number of follow-up pages to request.
scrape_subreddits(subreddit_list=subreddit_list, num=40)