# Reddit get-comment tool, covid-19 sentiment analysis

## Set-up

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import datetime as dt
import time
import requests
from bs4 import BeautifulSoup

In [2]:
# Pushshift search endpoints: one for submissions (posts), one for comments
_pushshift_search = 'https://api.pushshift.io/reddit/search'
sub_url = f'{_pushshift_search}/submission'
comment_url = f'{_pushshift_search}/comment'

#### Define subreddits, fields gathered

In [5]:
# Subreddits to pull from; swap in other localities here to compare them
subreddits = [
    'nyc',
    'houston',
]

In [6]:
# Fields requested from the API for each submission
submission_fields = [
    'id',
    'title',
    'created_utc',
    'num_comments',
    'subreddit',
]
# Fields requested from the API for each comment
comment_fields = [
    'link_id',
    'body',
    'created_utc',
    'subreddit',
]

#### Set key terms; Name data

In [50]:
# Batch name: used as the filename prefix of every CSV written in the save step
# NOTE(review): the value says "2019"/"nokeywords" — confirm it matches the time window and keywords of the current run
prefix = '2019_nokeywords'

In [10]:
# Search terms, joined with '|' into the single string sent as the 'q' parameter
keywords = '|'.join(['covid', 'quarantine', 'pandemic', 'coronavirus'])

#### Set Time

In [39]:
# Search window: it ends at t2 and reaches back 'span_days' days to t1.
# Timestamps are epoch seconds; one day is 86400 seconds.
_seconds_per_day = 86400

# End of the window (the most recent moment searched)
t2 = round(time.time())  # Now
# t2 = '1557446400' #5/10/2019, 12 am

# Length of the window, in days
span_days = 80

# Start of the window, stored as a string
t1 = str(int(t2) - span_days * _seconds_per_day)

## Submissions

In [43]:
# Fetch keyword-matching submissions for each subreddit, most-commented first,
# restricted to the window between t1 and t2.
df_list = []

for subreddit in subreddits:
    res = requests.get(
        sub_url,
        params={
            'subreddit': subreddit,
            'q': keywords,
            'fields': submission_fields,
            'size': 400,
            'sort_type': 'num_comments',
            'sort': 'desc',
            'before': t2,
            'after': t1,
        },
        # Without a timeout a stalled connection would hang the cell indefinitely
        timeout=60,
    )
    # Make sure we got a 2xx response
    res.raise_for_status()

    df = pd.DataFrame(res.json()['data'])

    # An empty result produces a frame with no columns, so filtering on
    # 'num_comments' below would raise KeyError; skip this subreddit instead
    if df.empty:
        continue

    # Keep only submissions that have at least one comment
    df = df[df['num_comments'] > 0]

    df_list.append(df)

# pd.concat raises ValueError on an empty list; fall back to an empty frame
# that still carries the expected columns
if df_list:
    submissions = pd.concat(df_list, axis=0)
else:
    submissions = pd.DataFrame(columns=submission_fields)

# NOTE(review): date.fromtimestamp converts using the machine's local timezone,
# so dates near midnight can differ between machines — confirm this is intended
submissions['date'] = [dt.date.fromtimestamp(x).isoformat() for x in submissions['created_utc']]

## Comments

In [45]:
# Map each subreddit to the ids of its fetched submissions, so comments can be
# requested per submission later on
link_ids = {
    name: submissions.loc[submissions['subreddit'] == name, 'id']
    for name in subreddits
}

In [46]:
# Fetch the comments on the collected submissions, paging backwards in time
# from t2 until the expected number of comments has been gathered.
df_list = []

for subreddit in subreddits:
    # Both values are constant for a subreddit, so compute them once instead of
    # on every page request
    expected = submissions.loc[submissions['subreddit'] == subreddit, 'num_comments'].sum()
    parent_ids = ['t3_' + n for n in link_ids[subreddit]]  #/comment?link_id : /submission?ids

    start_time = t2
    c = 0
    while c < expected:
        res = requests.get(
            comment_url,
            params={
                'subreddit': subreddit,
                'fields': comment_fields,
                'link_id': parent_ids,
                'size': 1000,
                'before': start_time,
            },
            # Without a timeout a stalled connection would hang the cell indefinitely
            timeout=60,
        )
        # Make sure we got a 2xx response
        res.raise_for_status()

        # Parse the response body once; an empty page means nothing is left to fetch
        batch = res.json()['data']
        if not batch:
            break

        df = pd.DataFrame(batch)

        # Raise the counter by the number of rows in this page
        c += df.shape[0]

        print(f"Fetched {c} comments from r/{subreddit}")

        df_list.append(df)
        # The next request asks only for comments older than the oldest one seen
        start_time = df['created_utc'].min()

# pd.concat raises ValueError on an empty list; fall back to an empty frame
# that still carries the expected columns
if df_list:
    comments = pd.concat(df_list, axis=0)
else:
    comments = pd.DataFrame(columns=comment_fields)

# NOTE(review): date.fromtimestamp converts using the machine's local timezone,
# so dates near midnight can differ between machines — confirm this is intended
comments['date'] = [dt.date.fromtimestamp(x).isoformat() for x in comments['created_utc']]

<Response [200]>
1000
<Response [200]>
2000
<Response [200]>
3000
<Response [200]>
4000
<Response [200]>
5000
<Response [200]>
6000
<Response [200]>
7000
<Response [200]>
8000
<Response [200]>
9000
<Response [200]>
10000
<Response [200]>
11000
<Response [200]>
12000
<Response [200]>
13000
<Response [200]>
14000
<Response [200]>
15000
<Response [200]>
16000
<Response [200]>
17000
<Response [200]>
18000
<Response [200]>
19000
<Response [200]>
20000
<Response [200]>
21000
<Response [200]>
22000
<Response [200]>
23000
<Response [200]>
24000
<Response [200]>
25000
<Response [200]>
26000
<Response [200]>
27000
<Response [200]>
28000
<Response [200]>
29000
<Response [200]>
30000
<Response [200]>
31000
<Response [200]>
32000
<Response [200]>
33000
<Response [200]>
34000
<Response [200]>
35000
<Response [200]>
36000
<Response [200]>
37000
<Response [200]>
38000
<Response [200]>
39000
<Response [200]>
40000
<Response [200]>
41000
<Response [200]>
42000
<Response [200]>
43000
<Response [200]>
440

## Sample/Save Data

In [47]:
# Size of the smallest class: the lowest per-subreddit comment count
smallest = comments['subreddit'].value_counts().min()

# Randomly downsample every subreddit to that size; the fixed seed keeps the
# draw repeatable
balanced_parts = []
for subreddit in subreddits:
    of_subreddit = comments[comments['subreddit'] == subreddit]
    balanced_parts.append(of_subreddit.sample(smallest, random_state=101))
comments_sampled = pd.concat(balanced_parts)

In [48]:
# Verify the sampled set now has the same number of comments per subreddit.
# The balanced frame built by the sampling step is `comments_sampled`; the
# name `data` used here previously is not defined anywhere in this notebook.
comments_sampled['subreddit'].value_counts()

nyc        45130
houston    45130
Name: subreddit, dtype: int64

In [51]:
# Save to local hard drive, prefixing every filename with the batch name
import os

# to_csv does not create missing directories, so make sure ./data exists first
os.makedirs('./data', exist_ok=True)

# The balanced sample is `comments_sampled`; the name `data` used here
# previously is not defined anywhere in this notebook
comments_sampled.to_csv(f'./data/{prefix}_comments_sampled.csv', index=False)
comments.to_csv(f'./data/{prefix}_comments_all.csv', index=False)
submissions.to_csv(f'./data/{prefix}_submissions.csv', index=False)