# Reddit comment-fetching tool for COVID-19 sentiment analysis

## Set-up

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import datetime as dt
import time
import requests
from bs4 import BeautifulSoup

# Random seed; passed as `random_state` when sampling comments to balance classes
seed = 101

In [2]:
# Pushshift search endpoints: one for submissions (posts), one for comments
sub_url = 'https://api.pushshift.io/reddit/search/submission'
comment_url = 'https://api.pushshift.io/reddit/search/comment'

In [3]:
# Play with different localities here.
# Each entry is sent as the 'subreddit' query parameter and is later used to
# filter rows by their 'subreddit' column.
subreddits = ['nyc', 'houston']

In [4]:
# Fields to request from the Pushshift API (sent as the 'fields' query parameter).
# Submissions: 'id' builds the comment link ids, 'num_comments' bounds the comment fetch loop.
submission_fields = ['id','title', 'created_utc','num_comments','subreddit']
# Comments: 'created_utc' drives pagination, 'subreddit' is the class label used for balancing.
comment_fields = ['link_id','body','created_utc', 'subreddit']

## Submissions

In [5]:
# Fetch the most-commented COVID-related submissions from each subreddit and
# collect them into a single `submissions` data frame.
df_list = []

for subreddit in subreddits:
    start_time = round(time.time())
    res = requests.get(
        sub_url,
        params={
            'subreddit': subreddit,
            'q': 'covid|quarantine|pandemic|coronavirus',
            'fields': submission_fields,
            'size': 400,
            'sort_type': 'num_comments',
            'sort': 'desc',
            'before': start_time,  # We can also manually set this
            'after': '80d',  # With this on, the classes become even more unbalanced
        },
        timeout=60,  # Fail instead of hanging forever on a stalled connection
    )
    # Make sure we got a 2xx response
    res.raise_for_status()

    posts = res.json()['data']

    # An empty result would produce a frame without a 'num_comments' column,
    # so the filter below would raise KeyError; skip this subreddit instead.
    if not posts:
        print(f"No submissions found for r/{subreddit}")
        continue

    df = pd.DataFrame(posts)

    # Keep only submissions that actually have comments to fetch
    df = df[df['num_comments'] > 0]

    df_list.append(df)

# Put all posts in one data frame. pd.concat raises on an empty list, so fall
# back to an empty frame with the expected columns. ignore_index avoids
# duplicate row labels across subreddits.
if df_list:
    submissions = pd.concat(df_list, axis=0, ignore_index=True)
else:
    submissions = pd.DataFrame(columns=submission_fields)

# Convert date to YYYY-MM-DD format. 'created_utc' is an epoch timestamp, so
# convert it in UTC rather than in the local timezone of whichever machine runs
# this; otherwise the same post can land on different dates.
submissions['date'] = [
    dt.datetime.fromtimestamp(x, tz=dt.timezone.utc).date().isoformat()
    for x in submissions['created_utc']
]

## Comments

In [6]:
# Map each subreddit name to the Series of submission ids fetched for it
link_ids = {
    name: submissions.loc[submissions["subreddit"] == name, "id"]
    for name in subreddits
}

In [9]:
# Get comments: for each subreddit, page backwards in time through the comments
# attached to the fetched submissions and collect them into `comments`.
df_list = []

for subreddit in subreddits:
    start_time = round(time.time())

    # Loop invariants: total comments reported by this subreddit's submissions
    # (the stop condition) and the submission ids in 't3_<id>' form.
    expected_total = submissions.loc[
        submissions['subreddit'] == subreddit, 'num_comments'
    ].sum()
    link_id_filter = ['t3_' + n for n in link_ids[subreddit]]

    c = 0
    while c < expected_total:
        time.sleep(10)  # ~6 requests/minute
        res = requests.get(
            comment_url,
            params={
                'subreddit': subreddit,
                'fields': comment_fields,
                'link_id': link_id_filter,
                'size': 1000,
                'before': start_time,
            },
            timeout=60,  # Fail instead of hanging forever on a stalled connection
        )
        # Make sure we got a 2xx response
        res.raise_for_status()

        # Parse the body once; stop paging when the API has nothing more
        batch = res.json()['data']
        if not batch:
            break

        df = pd.DataFrame(batch)

        # Raise counter by number of rows in df
        c += df.shape[0]

        # Add these new comments to our list of frames
        df_list.append(df)

        # Reset start time so we're getting earlier comments next iteration.
        # NOTE(review): this assumes the API returns newest comments first - confirm.
        start_time = int(df['created_utc'].min())

        # Show status message
        print(f"Fetched {c} comments from r/{subreddit}")

# Put all comments in one data frame. pd.concat raises on an empty list, so
# fall back to an empty frame with the expected columns. ignore_index avoids
# duplicate row labels across pages.
if df_list:
    comments = pd.concat(df_list, axis=0, ignore_index=True)
else:
    comments = pd.DataFrame(columns=comment_fields)

# Convert date to YYYY-MM-DD format. 'created_utc' is an epoch timestamp, so
# convert it in UTC rather than in the local timezone of whichever machine runs
# this; otherwise the same comment can land on different dates.
comments['date'] = [
    dt.datetime.fromtimestamp(x, tz=dt.timezone.utc).date().isoformat()
    for x in comments['created_utc']
]

Fetched 1000 comments from r/nyc
Fetched 2000 comments from r/nyc
Fetched 3000 comments from r/nyc
Fetched 4000 comments from r/nyc
Fetched 5000 comments from r/nyc
Fetched 6000 comments from r/nyc
Fetched 7000 comments from r/nyc
Fetched 8000 comments from r/nyc
Fetched 9000 comments from r/nyc
Fetched 10000 comments from r/nyc
Fetched 11000 comments from r/nyc
Fetched 12000 comments from r/nyc
Fetched 13000 comments from r/nyc
Fetched 14000 comments from r/nyc
Fetched 15000 comments from r/nyc
Fetched 16000 comments from r/nyc
Fetched 17000 comments from r/nyc
Fetched 18000 comments from r/nyc
Fetched 19000 comments from r/nyc
Fetched 20000 comments from r/nyc
Fetched 21000 comments from r/nyc
Fetched 22000 comments from r/nyc
Fetched 23000 comments from r/nyc
Fetched 24000 comments from r/nyc
Fetched 25000 comments from r/nyc
Fetched 26000 comments from r/nyc
Fetched 27000 comments from r/nyc
Fetched 28000 comments from r/nyc
Fetched 29000 comments from r/nyc
Fetched 30000 comments 

## Balance classes

In [10]:
# Size of the least-represented subreddit among the fetched comments
per_subreddit_counts = comments['subreddit'].value_counts()
smallest = per_subreddit_counts.min()

# Randomly downsample every subreddit to that size (seeded for reproducibility),
# then stack the equally sized samples in the order given by `subreddits`
balanced_parts = []
for subreddit in subreddits:
    pool = comments.loc[comments['subreddit'] == subreddit]
    balanced_parts.append(pool.sample(n=smallest, random_state=seed))

comments_sampled = pd.concat(balanced_parts)

## Write out data

In [11]:
# Persist the balanced sample, the full comment set, and the submissions as
# CSV files, leaving out the row index
csv_targets = [
    ('../data/comments_sampled.csv', comments_sampled),
    ('../data/comments_all.csv', comments),
    ('../data/submissions.csv', submissions),
]
for csv_path, frame in csv_targets:
    frame.to_csv(csv_path, index=False)