In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import time
from datetime import datetime

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, plot_roc_curve, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [3]:
def get_posts(subreddit, n, pause=5, session=None):
    """Download up to ``n`` submissions from a subreddit via Pushshift.

    Pushshift returns at most 100 posts per request, so larger pulls are
    paged: each request asks for posts older than the oldest post received
    so far.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to search.
    n : int
        Number of posts wanted.
    pause : int or float, default 5
        Seconds to wait between consecutive requests so the API is not
        hit too fast.
    session : object, optional
        Anything exposing ``get(url, params=..., timeout=...)`` (for
        example a ``requests.Session``). Defaults to the ``requests``
        module itself.

    Returns
    -------
    pandas.DataFrame
        One row per post, at most ``n`` rows. Fewer rows come back when
        the API runs out of posts before ``n`` is reached.

    Raises
    ------
    Whatever ``raise_for_status()`` raises for a non-success HTTP response
    (``requests.HTTPError`` with the default client).
    """
    url = 'https://api.pushshift.io/reddit/search/submission'
    page_size = 100  # Pushshift's hard per-request limit
    http = requests if session is None else session

    if n <= 0:
        return pd.DataFrame([])

    if n < page_size:
        # Small pulls fit in a single request; no paging needed.
        return pd.DataFrame(_fetch_page(http, url, {'subreddit': subreddit, 'size': n}))

    # Start paging from midnight today, expressed as an epoch timestamp.
    midnight = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    params = {
        'subreddit': subreddit,
        'size': page_size,
        'before': int(midnight.timestamp()),
    }

    posts = []
    while len(posts) < n:
        batch = _fetch_page(http, url, params)
        if not batch:
            # Nothing older is available: stop instead of looping forever.
            break
        posts.extend(batch)

        # Index the last element rather than [99]: a page may be short.
        oldest = batch[-1]['created_utc']
        print(oldest)
        print(len(posts))
        params['before'] = oldest

        if len(posts) < n:
            # Only pause when another request is actually coming.
            time.sleep(pause)

    # Whole pages may overshoot the request; trim to exactly n.
    return pd.DataFrame(posts[:n])


def _fetch_page(http, url, params, timeout=30):
    """Send one search request and return the list under the reply's 'data' key."""
    res = http.get(url, params=params, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error body as JSON.
    res.raise_for_status()
    return res.json()['data']

In [4]:
# Pull 2,500 submissions from r/coronavirus (paged 100 at a time by get_posts).
reddit_coronavirus_posts = get_posts(subreddit='coronavirus', n=2_500)

1600971276
100
1600951109
200
1600917845
300
1600894509
400
1600876783
500
1600861216
600
1600825421
700
1600802468
800
1600745658
900
1600716276
1000
1600692610
1100
1600649803
1200
1600625979
1300
1600588448
1400
1600551005
1500
1600523384
1600
1600486554
1700
1600458074
1800
1600438346
1900
1600382746
2000
1600361450
2100
1600346397
2200
1600319134
2300
1600300535
2400
1600280035
2500


In [5]:
# Independent copy of just the post titles (note: this is a Series, despite the name).
reddit_df = reddit_coronavirus_posts.loc[:, 'title'].copy()

In [6]:
# Save the titles to disk; index=False leaves the row index out of the file.
reddit_df.to_csv(path_or_buf='./datasets/reddit_coronavirus_titles.csv', index=False)

In [7]:
# Show every column in the downloaded frame to see which fields are available.
reddit_coronavirus_posts.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext',
       'link_flair_template_id', 'link_flair_text', 'link_flair_text_color',
       'link_flair_type', 'locked', 'media_only', 'no_follow', 'num_comments',
       'num_crossposts', 'over_18', 'parent_whitelist_status', 'permalink',
       'pinned', 'post_hint', 'preview', 'pwls', 'retrieved_on', 'score',
       'selftext', 'send_replies', 'spoiler', 'stickied', 'subreddit',
       'subreddit_id', 'subreddit_subscribers', 'subreddit_type',
       'suggested_

In [8]:
def rule_5_broken(cell):
    """Return True when ``cell`` contains the text "Rule 5:", else False.

    Rule 5 in the r/coronavirus subreddit is 'keep information quality high'.
    The match is an exact, case-sensitive substring test.
    """
    return "Rule 5:" in cell

In [9]:
# Replace missing flair values with an empty string so the substring test in
# rule_5_broken always receives a str (a float NaN would raise TypeError).
reddit_coronavirus_posts['link_flair_text'] = (
    reddit_coronavirus_posts['link_flair_text'].fillna(value="")
)
reddit_coronavirus_posts['link_flair_text']

0                                                Europe
1                               Central &amp; East Asia
2                                                Europe
3                                         Latin America
4       Removed - Rule 5: Keep information quality high
                             ...                       
2495                                                USA
2496                                    Academic Report
2497                                                USA
2498                                              World
2499                                                USA
Name: link_flair_text, Length: 2500, dtype: object

In [10]:
# Titles of the posts whose flair text contains "Rule 5:".
# The mapped result is already a boolean mask, so no `== True` comparison is
# needed, and a single .loc call selects rows and column in one step.
reddit_coronavirus_posts.loc[
    reddit_coronavirus_posts['link_flair_text'].map(rule_5_broken), 'title'
]

4                         Meanwhile in The Netherlands...
5       Japanese firm develops first UV lamp that safe...
43              The Overwhelming Racism Of COVID Coverage
55               The Second COVID-19 Wave is Already Here
71      Unilever, Consumer Giants Push Suppliers to Re...
                              ...                        
2472    Opinion: Big Ten's decision to play football s...
2476    Racebaiter Hilary Brueck and Business Insider ...
2477    Echo from the past: China has been telling us ...
2488    CDC Director: Masks Are 'The Most Important, P...
2490          Quick News: Trump Calls for Fiscal Stimulus
Name: title, Length: 173, dtype: object