# Reddit comment-collection tool for COVID-19 sentiment analysis

## Set-up

In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import datetime as dt
import time
import requests
from bs4 import BeautifulSoup 


In [2]:
# Pushshift search endpoint for Reddit submissions; query parameters are appended by `requests`.
sub_url = "https://api.pushshift.io/reddit/search/submission?"

In [3]:
# Pushshift search endpoint for Reddit comments; query parameters are appended by `requests`.
comment_url = "https://api.pushshift.io/reddit/search/comment?"

In [4]:
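# Endpoint parameter mapping: the comment endpoint's `link_id` refers to a submission,
# i.e. /comment?link_id=<submission id> corresponds to /submission?ids=<submission id>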

In [5]:
# Subreddits (one per locality) to compare; edit this list to try other localities.
subreddits = [
    "nyc",
    "houston",
]

In [6]:
# Fields requested from the submission endpoint (also used as the submissions frame columns).
submission_fields = [
    "id",
    "title",
    "created_utc",
    "num_comments",
    "subreddit",
]

In [9]:
# Fields requested from the comment endpoint (also used as the comments frame columns).
comment_fields = [
    "link_id",
    "body",
    "created_utc",
    "subreddit",
]

## Submissions

In [180]:
# Fetch the most-commented COVID-related submissions for every subreddit in `subreddits`
# and collect them in `submissions`, with an ISO `date` column derived from `created_utc`.

# Empty frame with the expected columns; kept as the result if the API returns nothing.
submissions = pd.DataFrame(columns = submission_fields)
df_list = []

for subreddit in subreddits:
    start_time = round(time.time())
    res = requests.get(
        sub_url,
        params={
            'subreddit' : subreddit,
            'q' : 'covid|quarantine|pandemic|coronavirus',
            'fields': submission_fields,
            'size' : 400,
            'sort_type' : 'num_comments',
            'sort' : 'desc',
            'before': start_time,  # Upper time bound; can also be set manually
            'after': '80d',        # Author's note: with this window on, the class imbalance gets worse
        })
    # Surface HTTP errors here instead of failing later while decoding an error body.
    res.raise_for_status()

    df = pd.DataFrame(res.json()['data'])

    # An empty result has no columns, so filtering on 'num_comments' would raise KeyError.
    if df.empty:
        continue

    # Keep only submissions that actually have comments to collect.
    df = df[df['num_comments'] > 0]

    df_list.append(df)

# pd.concat raises on an empty list, so fall back to the empty frame created above.
if df_list:
    submissions = pd.concat(df_list, axis=0)

# created_utc is a UTC epoch; convert explicitly in UTC so the date does not depend
# on the timezone of the machine running the notebook.
submissions['date'] = [
    dt.datetime.fromtimestamp(x, tz=dt.timezone.utc).date().isoformat()
    for x in submissions['created_utc']
]


## Comments

In [98]:
# Map each subreddit name to the Series of submission ids collected for it.
link_ids = {}
for sub in subreddits:
    in_subreddit = submissions["subreddit"] == sub
    link_ids[sub] = submissions.loc[in_subreddit, "id"]

In [188]:
# Fetch the comments belonging to the collected submissions, one subreddit at a time,
# paging backwards in time, and collect them in `comments` with an ISO `date` column.

# Empty frame with the expected columns; kept as the result if the API returns nothing.
comments = pd.DataFrame(columns = comment_fields)
df_list = []

for subreddit in subreddits:
    start_time = round(time.time())

    # Loop invariants: the number of comments the submissions report (upper bound for
    # paging) and the submission ids with the 't3_' prefix added.
    expected_total = submissions[submissions['subreddit'] == subreddit]['num_comments'].sum()
    link_id_filter = ['t3_' + n for n in link_ids[subreddit]]

    c = 0
    while c < expected_total:
        res = requests.get(
            comment_url,
            params={
                'subreddit' : subreddit,
                'fields': comment_fields,
                'link_id' : link_id_filter,
                'size' : 1000,
                'before' : start_time,
            })
        print(res)
        # Surface HTTP errors (e.g. rate limiting) instead of failing while decoding the body.
        res.raise_for_status()

        # Parse the body once; an empty page means there is nothing older left to fetch.
        batch = res.json()['data']
        if len(batch) == 0:
            break
        df = pd.DataFrame(batch)

        # raise counter by number of rows in df
        c += df.shape[0]
        print(c)

        df_list.append(df)

        # Next page: only comments older than the oldest one seen so far.
        # NOTE(review): if `before` is exclusive, comments sharing this exact second
        # that did not fit on the current page would be skipped - confirm against the API.
        start_time = df['created_utc'].min()

# pd.concat raises on an empty list, so fall back to the empty frame created above.
if df_list:
    comments = pd.concat(df_list, axis=0)

# created_utc is a UTC epoch; convert explicitly in UTC so the date does not depend
# on the timezone of the machine running the notebook.
comments['date'] = [
    dt.datetime.fromtimestamp(x, tz=dt.timezone.utc).date().isoformat()
    for x in comments['created_utc']
]

<Response [200]>
1000
<Response [200]>
2000
<Response [200]>
3000
<Response [200]>
4000
<Response [200]>
5000
<Response [200]>
6000
<Response [200]>
7000
<Response [200]>
8000
<Response [200]>
9000
<Response [200]>
10000
<Response [200]>
11000
<Response [200]>
12000
<Response [200]>
13000
<Response [200]>
14000
<Response [200]>
15000
<Response [200]>
16000
<Response [200]>
17000
<Response [200]>
18000
<Response [200]>
19000
<Response [200]>
20000
<Response [200]>
21000
<Response [200]>
22000
<Response [200]>
23000
<Response [200]>
24000
<Response [200]>
25000
<Response [200]>
26000
<Response [200]>
27000
<Response [200]>
28000
<Response [200]>
29000
<Response [200]>
30000
<Response [200]>
31000
<Response [200]>
32000
<Response [200]>
33000
<Response [200]>
34000
<Response [200]>
35000
<Response [200]>
36000
<Response [200]>
37000
<Response [200]>
38000
<Response [200]>
39000
<Response [200]>
40000
<Response [200]>
41000
<Response [200]>
42000
<Response [200]>
43000
<Response [200]>
440

In [195]:
# Number of comments in the smallest subreddit class.
per_subreddit_counts = comments['subreddit'].value_counts()
per_subreddit_counts.min()

14136

In [197]:
# Balance the classes: draw the same number of comments from every subreddit,
# equal to the size of the smallest class, with a fixed seed for reproducibility.
n_per_subreddit = comments['subreddit'].value_counts().min()

balanced_parts = []
for subreddit in subreddits:
    subreddit_comments = comments[comments['subreddit'] == subreddit]
    balanced_parts.append(subreddit_comments.sample(n_per_subreddit, random_state = 101))

data = pd.concat(balanced_parts)

In [200]:
# Check that the sampled data holds the same number of comments per subreddit.
data.loc[:, 'subreddit'].value_counts()

houston    14136
nyc        14136
Name: subreddit, dtype: int64

In [204]:
# Persist the class-balanced comment sample.
data.to_csv(path_or_buf='./data/sampled_comments.csv')

In [205]:
# Persist every fetched comment (unbalanced).
comments.to_csv(path_or_buf='./data/all_comments.csv')

In [206]:
# Persist the fetched submissions.
submissions.to_csv(path_or_buf='./data/submissions.csv')