# Reddit get-comment tool, covid-19 sentiment analysis

## Set-up

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import datetime as dt
import time
import requests
from bs4 import BeautifulSoup

In [20]:
sub_url = 'https://api.pushshift.io/reddit/search/submission'
comment_url = 'https://api.pushshift.io/reddit/search/comment'

#### Define subreddits, fields gathered

In [21]:
# Play with different localities here
subreddits = ['nyc', 'houston']

In [22]:
submission_fields = ['id','title', 'created_utc','num_comments','subreddit']
comment_fields = ['link_id','body','created_utc', 'subreddit', 'score']

#### Set key terms; Name data

In [23]:
# Batch name (suffix to add to all saved data)
prefix = '2019'

In [24]:
# Search terms
keywords = 'covid|quarantine|pandemic|coronavirus'

#### Set Time

In [63]:
# time flags (search starts at t2 and goes back by 'span_days' find t1)
# 86400 utc = 1 day

# Start Time
# t2 = round(time.time()) # Now
t2 = round((dt.datetime.now() - dt.timedelta(days=365)).timestamp()) #5/10/2019, 12 am

# Search Span
span_days = 80

t1 = str(int(t2) - span_days*86400)

## Submissions

In [64]:
# get submissions
submissions = pd.DataFrame(columns = submission_fields)
df_list = []

for subreddit in subreddits:
    start_time = previous_date
    # start_time = round(time.time())
    res = requests.get(
        sub_url,
        params={
            'subreddit' : subreddit,
            # 'q' : keywords,
            'fields': submission_fields,
            'size' : 400,
            'sort_type' : 'num_comments',
            'sort' : 'desc',
            'before': start_time,  
            'after': t1,
        })
    # Make sure we got a 2xx response
    res.raise_for_status()

    df = pd.DataFrame(res.json()['data'])
    
    # Filter out non-commented; could also set 'sort_type' parameter to get most commented
    df = df[df['num_comments'] >0]
    
    df_list.append(df)

start_time = df.created_utc.min()
submissions = pd.concat(df_list, axis=0)
submissions['date'] = [dt.date.fromtimestamp(x).isoformat() for x in submissions['created_utc']]

## Comments

In [65]:
# Generate id dictionary for cross-referencing submissions with comments
link_ids = {sub: submissions[submissions["subreddit"] == sub]["id"] for sub in subreddits}

In [66]:
# get comments
df_list = []

for subreddit in subreddits:
    start_time = previous_date
    c = 0
    while c < submissions[submissions['subreddit'] == subreddit]['num_comments'].sum():
        time.sleep(2)
        res = requests.get(
            comment_url,
            params={
                'subreddit' : subreddit,
                'fields': comment_fields,
                'link_id' : (['t3_' + n for n in link_ids[subreddit]]),  #/comment?link_id : /submission?ids
                'size' : 1000,
                'before' : start_time,
            })
        # Make sure we got a 2xx response
        res.raise_for_status()

        # Don't parse data unless we got at least one post
        if len(res.json()['data']) == 0:
            break
        
        df = pd.DataFrame(res.json()['data'])

        # raise counter by number of rows in df
        c += df.shape[0]

        print(f"Fetched {c} comments from r/{subreddit} since {dt.datetime.fromtimestamp(start_time).isoformat()}")

        df_list.append(df)
        start_time = df['created_utc'].min()
        
comments = pd.concat(df_list, axis=0)
comments['date'] = [dt.date.fromtimestamp(x).isoformat() for x in comments['created_utc']]

Fetched 1000 comments from r/nyc since 2019-05-14T18:38:24
Fetched 2000 comments from r/nyc since 2019-05-13T14:13:35
Fetched 3000 comments from r/nyc since 2019-05-12T11:52:44
Fetched 4000 comments from r/nyc since 2019-05-10T13:52:11
Fetched 5000 comments from r/nyc since 2019-05-08T17:20:52
Fetched 6000 comments from r/nyc since 2019-05-07T20:30:22
Fetched 7000 comments from r/nyc since 2019-05-06T15:47:36
Fetched 8000 comments from r/nyc since 2019-05-04T10:51:02
Fetched 9000 comments from r/nyc since 2019-05-02T00:35:05
Fetched 10000 comments from r/nyc since 2019-04-29T21:21:24
Fetched 11000 comments from r/nyc since 2019-04-28T00:55:55
Fetched 12000 comments from r/nyc since 2019-04-26T09:27:37
Fetched 13000 comments from r/nyc since 2019-04-24T20:00:21
Fetched 14000 comments from r/nyc since 2019-04-23T13:49:55
Fetched 15000 comments from r/nyc since 2019-04-21T21:25:06
Fetched 16000 comments from r/nyc since 2019-04-19T12:22:18
Fetched 17000 comments from r/nyc since 2019-04-1

## Sample/Save Data

In [67]:
# Fewest posts we got from a subreddit
smallest = comments['subreddit'].value_counts().min()

# Pare every subreddit down to this number by random sampling
comments_sampled = pd.concat([
        comments[comments['subreddit'] == subreddit].sample(smallest, random_state=101)
        for subreddit in subreddits
    ])

In [68]:
# verify we now have equal classes
comments_sampled['subreddit'].value_counts()

houston    45381
nyc        45381
Name: subreddit, dtype: int64

In [69]:
# Save to local hard drive with prefix
today = dt.date.today().isoformat()

comments.to_csv(f'../data/{prefix}_reddit-comments_all-{today}.csv.bz2', index=False, compression='bz2')
comments_sampled.to_csv(f'../data/{prefix}_reddit-comments_sampled-{today}.csv.bz2', index=False, compression='bz2')
submissions.to_csv(f'../data/{prefix}_reddit-submissions-{today}.csv.bz2', index=False, compression='bz2')