In [149]:
import sys
import time
import requests
import datetime
import json
import fasttext
import re

# Page size requested from Pushshift; retrieve_subreddit_data() treats a page
# shorter than this as the end of the available data.
MAX_RETRIEVED_ELEMENTS = 1000
# Base search URL; the retrieval type ('submission' or 'comment') is appended to it.
PUSHSHIFT_ENDPOINT = 'https://api.pushshift.io/reddit/search/'
# Minimum probability returned by the language model for a text to be kept.
THRESHOLD = 0.6
# Matches one quoted line: an HTML-escaped '>' ("&gt; "), the rest of that line,
# and the blank line that follows it. Used by preprocess() to strip quotes.
PATTERN = re.compile("&gt; (.*)\n\n")

In [50]:
# Path (relative to the working directory) of the fastText model file.
# NOTE(review): the file name suggests the pretrained lid.176 language-identification
# model — confirm the file is present before running.
PRETRAINED_MODEL_PATH = 'lid.176.bin'
# Loaded once at module level; valid_text() calls model.predict() on every candidate text.
model = fasttext.load_model(PRETRAINED_MODEL_PATH)




In [150]:
def none_or_empty(text):
    """Tell whether `text` carries no usable content.

    Returns True when `text` is None, has length zero, or is one of the
    placeholder strings "[removed]" / "[deleted]"; False otherwise.
    """
    if text is None:
        return True
    if len(text) == 0:
        return True
    # Placeholder strings that stand in for content that is no longer available.
    return text in ("[removed]", "[deleted]")

In [219]:
def preprocess(text):
    """Clean a raw post/comment body before language detection.

    Strips every span matched by the module-level PATTERN (quoted lines that
    start with "&gt; " and are followed by a blank line), then turns each
    remaining newline into a single space. Returns the cleaned string.
    """
    # Quotes come through as HTML-escaped "&gt; ..." lines; drop them.
    without_quotes = PATTERN.sub("", text)

    # Newlines have to go for fasttext to work, so flatten to one line.
    return without_quotes.replace("\n", " ")

In [220]:
def valid_text(text):
    """Decide whether `text` should be kept, and return its cleaned form.

    Returns a `(is_valid, cleaned_text)` pair:
      * `(False, "")` when none_or_empty() rejects `text`, when the top
        predicted label does not end with 'en', or when its probability is
        below THRESHOLD;
      * `(True, cleaned_text)` otherwise, where `cleaned_text` is the output
        of preprocess().
    """
    rejected = (False, "")

    if none_or_empty(text):
        return rejected

    cleaned = preprocess(text)
    predicted_labels, probabilities = model.predict(cleaned)

    # Only the top prediction is inspected; the probability is checked only
    # when the label already looks like English.
    looks_english = predicted_labels[0].endswith('en')
    if looks_english and probabilities[0] >= THRESHOLD:
        return True, cleaned
    return rejected
    

In [233]:
def retrieve_subreddit_data(subreddit, retrieval_type, dirname, fields, keywords,
                            request_timeout=60, retry_delay=5):
    """Page backwards through Pushshift results for one subreddit and save the valid ones.

    Each request asks for up to MAX_RETRIEVED_ELEMENTS items older than the
    current `before` cursor (newest first). Every returned element that has a
    'created_utc' field moves the cursor; elements whose text passes
    valid_text() are written, one JSON object per line, to
    `dirname + 'r.<subreddit>.<retrieval_type>.json'` with the text field
    replaced by its preprocessed form. Paging stops at the first page that
    contains fewer than MAX_RETRIEVED_ELEMENTS items.

    Args:
        subreddit: Subreddit name, inserted into the query as-is.
        retrieval_type: 'comment' reads the 'body' field; any other value
            (e.g. 'submission') reads 'selftext'.
        dirname: Output directory prefix; concatenated directly with the file
            name, so it must end with a path separator.
        fields: Iterable of field names, joined with ',' for the `fields` parameter.
        keywords: Iterable of search terms, joined with '|' for the `q` parameter.
        request_timeout: Seconds to wait for each HTTP request before it is
            treated as failed and retried.
        retry_delay: Seconds to sleep before retrying after a failed request
            or a non-200 response.

    Returns:
        The number of elements written to the output file.

    Note:
        Failed requests and non-200 responses are retried indefinitely.
    """
    # Start the cursor at "now" (seconds since the Unix epoch) and walk backwards.
    before_date = int(time.time())

    text_tag = 'body' if retrieval_type == 'comment' else 'selftext'

    # Only the `before` cursor changes between pages; build the rest once.
    query_prefix = PUSHSHIFT_ENDPOINT + retrieval_type + '/?subreddit=' + subreddit + \
        '&sort=desc&size=' + str(MAX_RETRIEVED_ELEMENTS) + '&before='
    query_suffix = '&fields=' + ",".join(fields) + "&q=" + "|".join(keywords)

    count = 0
    total_valid = 0
    done = False
    output_filename = 'r.' + subreddit + '.' + retrieval_type + '.json'
    with open(dirname + output_filename, 'w') as fout:
        while not done:
            count += 1
            query = query_prefix + str(before_date) + query_suffix

            print(query, 'request #', count)
            sys.stdout.flush()
            try:
                r = requests.get(query, timeout=request_timeout)
            except requests.exceptions.RequestException as exc:
                # Catch only request failures so Ctrl-C / SystemExit still stop the loop.
                print('exception thrown...', exc)
                time.sleep(retry_delay)
                continue

            if r.status_code != 200:
                print('bad response code:', r.status_code)
                before_date -= 1  # slight change to the query
                time.sleep(retry_delay)  # back off instead of hammering the server
                continue  # retry

            # Parse the payload once; it is needed for both the loop and the end check.
            data = r.json()['data']

            # record a submission/comment only if it's not empty
            for element in data:
                if 'created_utc' not in element:
                    continue
                before_date = element['created_utc']
                # A missing text field is passed as None, which valid_text()
                # rejects, so no result from a previous element can leak through.
                is_valid, formatted = valid_text(element.get(text_tag))
                if is_valid:
                    element[text_tag] = formatted
                    json.dump(element, fout)
                    fout.write('\n')
                    total_valid += 1

            if len(data) < MAX_RETRIEVED_ELEMENTS:  # end of data
                done = True

    print("SUBREDDIT: r/{}".format(subreddit))
    print(retrieval_type)
    print(text_tag)
    print(total_valid)
    return total_valid

In [244]:
# Post kinds to download for every subreddit, in this order.
retrieval_types = ['submission', 'comment']
subreddits = [
    'europe', 'spain', 'italy', 'unitedkingdom', 'germany', 'iran', 'japan', 'china', 'singapore',
    'india', 'korea', 'france', 'unitedstatesofamerica', 'canada', 'onguardforthee',
    'coronavirusus', 'canadacoronavirus',
]  # and many more
dirname = 'data/'
# Fields requested from the API, keyed by retrieval type.
fields = {
    'submission': ['author', 'id', 'selftext', 'permalink', 'title', 'created_utc'],
    'comment': ['author', 'body', 'created_utc', 'permalink'],
}
keywords = ['covid', 'covid-19', 'coronavirus', 'corona', 'covid19']
#optionally: pandemic?

if __name__ == '__main__':
    # Download everything and record one summary row per (subreddit, type) pair.
    output_filename = 'overall_stats.csv'
    with open(dirname + output_filename, 'w') as fout:
        fout.write('Subreddit,Content Type,Sample Size\n')
        # Subreddit-major order: every retrieval type for one subreddit, then the next.
        for sub, r_type in ((s, t) for s in subreddits for t in retrieval_types):
            total_valid = retrieve_subreddit_data(sub, r_type, dirname, fields[r_type], keywords)
            fout.write(",".join((sub, r_type, str(total_valid))) + "\n")

https://api.pushshift.io/reddit/search/submission/?subreddit=europe&sort=desc&size=1000&before=1587341780&fields=author,id,selftext,permalink,title,created_utc&q=covid|covid-19|coronavirus|corona|covid19 request # 1
https://api.pushshift.io/reddit/search/submission/?subreddit=europe&sort=desc&size=1000&before=1585947098&fields=author,id,selftext,permalink,title,created_utc&q=covid|covid-19|coronavirus|corona|covid19 request # 2
https://api.pushshift.io/reddit/search/submission/?subreddit=europe&sort=desc&size=1000&before=1585306079&fields=author,id,selftext,permalink,title,created_utc&q=covid|covid-19|coronavirus|corona|covid19 request # 3
https://api.pushshift.io/reddit/search/submission/?subreddit=europe&sort=desc&size=1000&before=1584515780&fields=author,id,selftext,permalink,title,created_utc&q=covid|covid-19|coronavirus|corona|covid19 request # 4
https://api.pushshift.io/reddit/search/submission/?subreddit=europe&sort=desc&size=1000&before=1583531138&fields=author,id,selftext,perm

https://api.pushshift.io/reddit/search/comment/?subreddit=italy&sort=desc&size=1000&before=1443453110&fields=author,body,created_utc,permalink&q=covid|covid-19|coronavirus|corona|covid19 request # 17
SUBREDDIT: r/italy
comment
body
977
https://api.pushshift.io/reddit/search/submission/?subreddit=unitedkingdom&sort=desc&size=1000&before=1587341831&fields=author,id,selftext,permalink,title,created_utc&q=covid|covid-19|coronavirus|corona|covid19 request # 1
https://api.pushshift.io/reddit/search/submission/?subreddit=unitedkingdom&sort=desc&size=1000&before=1585730004&fields=author,id,selftext,permalink,title,created_utc&q=covid|covid-19|coronavirus|corona|covid19 request # 2
https://api.pushshift.io/reddit/search/submission/?subreddit=unitedkingdom&sort=desc&size=1000&before=1584448869&fields=author,id,selftext,permalink,title,created_utc&q=covid|covid-19|coronavirus|corona|covid19 request # 3
SUBREDDIT: r/unitedkingdom
submission
selftext
326
https://api.pushshift.io/reddit/search/comme

SUBREDDIT: r/singapore
submission
selftext
641
https://api.pushshift.io/reddit/search/comment/?subreddit=singapore&sort=desc&size=1000&before=1587341882&fields=author,body,created_utc,permalink&q=covid|covid-19|coronavirus|corona|covid19 request # 1
https://api.pushshift.io/reddit/search/comment/?subreddit=singapore&sort=desc&size=1000&before=1586843040&fields=author,body,created_utc,permalink&q=covid|covid-19|coronavirus|corona|covid19 request # 2
https://api.pushshift.io/reddit/search/comment/?subreddit=singapore&sort=desc&size=1000&before=1586323343&fields=author,body,created_utc,permalink&q=covid|covid-19|coronavirus|corona|covid19 request # 3
https://api.pushshift.io/reddit/search/comment/?subreddit=singapore&sort=desc&size=1000&before=1585883615&fields=author,body,created_utc,permalink&q=covid|covid-19|coronavirus|corona|covid19 request # 4
https://api.pushshift.io/reddit/search/comment/?subreddit=singapore&sort=desc&size=1000&before=1585281269&fields=author,body,created_utc,perm

https://api.pushshift.io/reddit/search/submission/?subreddit=france&sort=desc&size=1000&before=1581156244&fields=author,id,selftext,permalink,title,created_utc&q=covid|covid-19|coronavirus|corona|covid19 request # 4
SUBREDDIT: r/france
submission
selftext
60
https://api.pushshift.io/reddit/search/comment/?subreddit=france&sort=desc&size=1000&before=1587341928&fields=author,body,created_utc,permalink&q=covid|covid-19|coronavirus|corona|covid19 request # 1
https://api.pushshift.io/reddit/search/comment/?subreddit=france&sort=desc&size=1000&before=1586802713&fields=author,body,created_utc,permalink&q=covid|covid-19|coronavirus|corona|covid19 request # 2
https://api.pushshift.io/reddit/search/comment/?subreddit=france&sort=desc&size=1000&before=1586261347&fields=author,body,created_utc,permalink&q=covid|covid-19|coronavirus|corona|covid19 request # 3
https://api.pushshift.io/reddit/search/comment/?subreddit=france&sort=desc&size=1000&before=1585750605&fields=author,body,created_utc,permali

SUBREDDIT: r/canada
comment
body
18383
https://api.pushshift.io/reddit/search/submission/?subreddit=onguardforthee&sort=desc&size=1000&before=1587341979&fields=author,id,selftext,permalink,title,created_utc&q=covid|covid-19|coronavirus|corona|covid19 request # 1
SUBREDDIT: r/onguardforthee
submission
selftext
72
https://api.pushshift.io/reddit/search/comment/?subreddit=onguardforthee&sort=desc&size=1000&before=1587341980&fields=author,body,created_utc,permalink&q=covid|covid-19|coronavirus|corona|covid19 request # 1
https://api.pushshift.io/reddit/search/comment/?subreddit=onguardforthee&sort=desc&size=1000&before=1585532943&fields=author,body,created_utc,permalink&q=covid|covid-19|coronavirus|corona|covid19 request # 2
SUBREDDIT: r/onguardforthee
comment
body
1773
https://api.pushshift.io/reddit/search/submission/?subreddit=coronavirusus&sort=desc&size=1000&before=1587341983&fields=author,id,selftext,permalink,title,created_utc&q=covid|covid-19|coronavirus|corona|covid19 request # 1
h

https://api.pushshift.io/reddit/search/comment/?subreddit=canadacoronavirus&sort=desc&size=1000&before=1584652114&fields=author,body,created_utc,permalink&q=covid|covid-19|coronavirus|corona|covid19 request # 4
https://api.pushshift.io/reddit/search/comment/?subreddit=canadacoronavirus&sort=desc&size=1000&before=1584065968&fields=author,body,created_utc,permalink&q=covid|covid-19|coronavirus|corona|covid19 request # 5
SUBREDDIT: r/canadacoronavirus
comment
body
4644
