In [1]:
#Using the PushShift Reddit API, get post data from the last 3 months and export it to a CSV file

In [10]:
import requests
import pandas as pd
import json
import datetime
from dateutil.relativedelta import relativedelta

In [34]:
# pushshift helper function
def get_posts(post_type,params, limit, after):
    #if a limit was specified by the user, set the size variable
    if limit != -1:
        if limit >= 100:
            #pushshift caps requests at 100 so if the limit is more than 100, we'll have to do multiple passes
            size = 100
        else:
            size = limit
    else:
        size = 100
    last = int(datetime.datetime.now().timestamp())
    got = 0
    posts = []
    while True:
        human_date = datetime.datetime.fromtimestamp(last).strftime("%B %d, %Y, %H:%M:%S")
        print(f"Fetching posts made before {human_date}")
        req_params = {
                **params,
                'size':size,
                'before':last
                }
        req_headers = {
                'User-Agent':'Python requests - Redditstat.py'
                }
        res = requests.get(f'https://api.pushshift.io/reddit/{post_type}/search', params=req_params, headers=req_headers)
        res.raise_for_status()
        data = res.json()["data"]
        posts += data
        last_date_fetched = data[-1]["created_utc"]
        #stop fetching posts if we've passed the start date or if we've hit the limit
        if last_date_fetched < after or (limit != -1 and got >= limit):
            got += len(data)
            print(f"Total of {got} posts fetched from r/{params['subreddit']}")
            return posts
        else:
            last = last_date_fetched
            got += 100

In [35]:
#Fetch posts from the last 3 months from the hardwareswap subreddit
subreddit = 'hardwareswap'
limit = -1
#farthest back posts should be fetched from
n_months = 3
after = (datetime.datetime.now() - relativedelta(months=n_months)).timestamp()

#save the posts
posts = get_posts('submission', {'subreddit':subreddit}, limit, int(after))

Fetching posts made before March 20, 2022, 10:27:03
Fetching posts made before March 19, 2022, 18:25:16
Fetching posts made before March 19, 2022, 14:48:45
Fetching posts made before March 19, 2022, 10:29:26
Fetching posts made before March 18, 2022, 22:04:05
Fetching posts made before March 18, 2022, 18:00:33
Fetching posts made before March 18, 2022, 13:41:51
Fetching posts made before March 18, 2022, 07:51:04
Fetching posts made before March 17, 2022, 20:09:30
Fetching posts made before March 17, 2022, 15:54:07
Fetching posts made before March 17, 2022, 11:23:18
Fetching posts made before March 16, 2022, 22:48:28
Fetching posts made before March 16, 2022, 19:17:44
Fetching posts made before March 16, 2022, 15:31:40
Fetching posts made before March 16, 2022, 11:33:35
Fetching posts made before March 16, 2022, 00:11:46
Fetching posts made before March 15, 2022, 18:58:49
Fetching posts made before March 15, 2022, 14:59:30
Fetching posts made before March 15, 2022, 10:17:20
Fetching pos

Fetching posts made before February 09, 2022, 11:36:21
Fetching posts made before February 09, 2022, 00:48:01
Fetching posts made before February 08, 2022, 20:15:32
Fetching posts made before February 08, 2022, 16:46:34
Fetching posts made before February 08, 2022, 13:35:50
Fetching posts made before February 08, 2022, 06:42:35
Fetching posts made before February 07, 2022, 21:46:50
Fetching posts made before February 07, 2022, 18:47:55
Fetching posts made before February 07, 2022, 15:03:09
Fetching posts made before February 07, 2022, 11:11:02
Fetching posts made before February 06, 2022, 23:52:47
Fetching posts made before February 06, 2022, 19:22:13
Fetching posts made before February 06, 2022, 15:46:35
Fetching posts made before February 06, 2022, 12:54:23
Fetching posts made before February 06, 2022, 08:14:41
Fetching posts made before February 05, 2022, 21:25:05
Fetching posts made before February 05, 2022, 17:04:22
Fetching posts made before February 05, 2022, 13:58:01
Fetching p

Fetching posts made before January 08, 2022, 14:13:57
Fetching posts made before January 08, 2022, 11:23:53
Fetching posts made before January 08, 2022, 00:16:51
Fetching posts made before January 07, 2022, 20:24:29
Fetching posts made before January 07, 2022, 16:49:23
Fetching posts made before January 07, 2022, 12:59:59
Fetching posts made before January 07, 2022, 08:39:59
Fetching posts made before January 06, 2022, 22:06:57
Fetching posts made before January 06, 2022, 18:19:22
Fetching posts made before January 06, 2022, 15:21:25
Fetching posts made before January 06, 2022, 11:25:01
Fetching posts made before January 05, 2022, 22:58:53
Fetching posts made before January 05, 2022, 18:52:47
Fetching posts made before January 05, 2022, 15:43:48
Fetching posts made before January 05, 2022, 12:08:48
Fetching posts made before January 05, 2022, 00:32:23
Fetching posts made before January 04, 2022, 21:20:23
Fetching posts made before January 04, 2022, 18:03:07
Fetching posts made before J

In [48]:
#preview of post data
posts[-1]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'AlternateWitness',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_attayt20',
 'author_is_blocked': False,
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1639972982,
 'domain': 'self.hardwareswap',
 'full_link': 'https://www.reddit.com/r/hardwareswap/comments/rke5o6/usain_h_asus_tuf_rtx_3080_oc_nonlhr_w_3080_ti/',
 'gildings': {},
 'id': 'rke5o6',
 'is_created_from_ads_ui': False,
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '#d73a00',
 'link_flair_css_class': 'trading',
 'link_flair_richtext': [{'e': 'text', 't': 'TRADING'}],
 'link_flair_template_id': '7f3c370e-347c-11e3-8

In [46]:
#columns to save to CSV
export_columns = [
    'author',
    'subreddit',
    'link_flair_text',
    'created_utc',
    'full_link',
    'title',
    'selftext',
    'upvote_ratio',
    'author_fullname',
    'pinned'
]
#create dataframe for the array of posts
exportable_df = pd.DataFrame.from_records(posts)[export_columns]

In [47]:
#export the submissions from the past 3 months to a CSV
exportable_df.to_csv('hwswap_posts.csv')