Using the PSAW wrapper for Pushshift API to retrieve original posts in CovidET

Created on Wed Oct 26, 2022

@author: Sarah Seraj, Hongli Zhan, John Henry Cruz

In [1]:
import re
import csv
import json
import string
import requests
import numpy as np
import pandas as pd
import datetime as dt
from time import sleep
from psaw import PushshiftAPI
from matplotlib import pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [3]:
# Extract reddit ids
dataset_df = pd.read_json("./data/CovidET_anonymized.json", orient="index")
reddit_ids = list(dataset_df["Reddit ID"])
print (len(reddit_ids))
reddit_ids_chunks = chunks(reddit_ids, 20)

1883


In [4]:
#EDIT: List subreddits separated by commas
subreddit_name = ['COVID19_support']

In [5]:
#EDIT: beginning & end dates of the period for which you want data
start_epoch=int(dt.datetime(2020, 2, 1).timestamp())
end_epoch=int(dt.datetime(2022, 3, 1).timestamp())
print (start_epoch)
print (end_epoch)

1580536800
1646114400


In [6]:
api = PushshiftAPI()
cache = []
max_response_cache = 1000000

In [7]:
for reddit_id_chunk in reddit_ids_chunks:
    try:
        for name in subreddit_name:
            gen = api.search_submissions(ids=reddit_id_chunk, subreddit=name,
                                         filter=['author', 'created_utc', 'subreddit', 'selftext',
                                                 'id','parent_id', 'score', 'author_flair_css_class',
                                                 'author_flair_text', 'metadata'], after=start_epoch, before=end_epoch)
            for c in gen:
                cache.append(c)

            # Omit this test to actually return all results. Could take a while
            if len(cache) >= max_response_cache:
                break

    except ConnectionAbortedError:
        #sleep(20)
        print("ConnectionAbortedError occurred")
    except:
        #print("other exception occurred")
        #sleep(60)
        pass



In [95]:
cleaned_cache = [c.d_ for c in cache]

df = pd.DataFrame(cleaned_cache)
df = df.drop(columns="author_flair_text")
df = df.drop(columns="created")
df = df.drop(columns="score")
print (len(df))

1883


In [96]:
df.head()

Unnamed: 0,author,created_utc,id,selftext,subreddit
0,ibalbalu,1624481483,o6lpwn,I don’t even know how to speak of this grief. ...,COVID19_support
1,PKNinja69,1624639614,o7riyw,"Hello,\n\nI am about 19 and it's been about 2 ...",COVID19_support
2,Hotzendorf1918,1624619535,o7lkru,"Recently, the Israeli government reinstituted ...",COVID19_support
3,tp151234,1624656076,o7wv0a,This makes me really just not want to go out a...,COVID19_support
4,Mikoaimi,1624675779,o82alq,I have a question about the delta variant. I’v...,COVID19_support


In [97]:
selftext_list = df['selftext'].tolist()
selftext_length = []
punctuations = list(string.punctuation)

In [98]:
for i in range(len(selftext_list)):
    file_content = selftext_list[i]
    file_content = file_content.encode('ascii', 'ignore').decode('ascii')
    file_content = re.sub("\s+"," ", file_content)
    selftext_list[i] = file_content
    
    file_content_no_punct = re.sub(r'(?<=[.,!?:])(?=[^\s])', r' ', file_content)
    file_content_no_punct = re.sub(r'\s([?.!,:"](?:\s|$))', r'\1', file_content_no_punct)
    selftext_tokenized = word_tokenize(file_content_no_punct)
    selftext_tokenized = [i for i in selftext_tokenized if i not in punctuations]
    selftext_length.append(len(selftext_tokenized))

In [99]:
df['selftext_cleaned'] = selftext_list
df['selftext_length'] = selftext_length

In [100]:
df['selftext_cleaned'] = df['selftext_cleaned'].str.replace(r'http\S+', '<URL>', regex=True).str.strip()
df['selftext_cleaned'] = df['selftext_cleaned'].str.replace(
    r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|'''\
    '''(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', '<URL>',
    regex=True).str.strip()

In [101]:
# Anonymize

ORG_lst = [
    'Pfizer',
    'Pfizers',
    'J&J',
    'Johnson & Johnson',
    "Johnson & Johnson's",
    'AstraZeneca',
    'SINOVAC',
    'Ivermectin',
    'Novavax',
    'Astrazeneca',
    'the New York Times',
    'the NY Times',
    'CNBC',
    'NBC News',
    'Gottlieb',
    'Royal Mail',
    'the Mayo Clinic',
    'People.com',
    'Fox News',
    'Roche',
    'QuickVue',
    'Amtrak',
    "Moderma",
    'Moderna',
    'regencov',
]

PERSON_lst = [
    'Trump',
    'Fauci',
    'Modi',
    'Cyrus Shahpar',
    "Cyrus Shahpar's",
    'Vin Gupta',
    'Anna',
    'Ostenholm',
    'Osterholm',
    'Aaron Astor',
    'Steve',
    'Janssen',
    'jiu jitsu',
    'Jiu Jitsu',
    'Biden',
    'Astra Zeneca',
    'Zedd',
    'Alex Jones',
    'Bill de Blasio',
    'Angier',
    'Laurel Bristow',
    'Laurel Bastrow',
    'Jessica Wildfire',
    'Herman Cain',
    'JTurner',
    'Hotez',
]

In [102]:
for i in ORG_lst:
    df['selftext_cleaned'] = df['selftext_cleaned'].str.replace(i, '<ORG>', regex=False).str.strip()
for i in PERSON_lst:
    df['selftext_cleaned'] = df['selftext_cleaned'].str.replace(i, '<PERSON>', regex=False).str.strip()

In [103]:
df = df.drop(columns="selftext")
df

Unnamed: 0,author,created_utc,id,subreddit,selftext_cleaned,selftext_length
0,ibalbalu,1624481483,o6lpwn,COVID19_support,I dont even know how to speak of this grief. I...,271
1,PKNinja69,1624639614,o7riyw,COVID19_support,"Hello, I am about 19 and it's been about 2 day...",107
2,Hotzendorf1918,1624619535,o7lkru,COVID19_support,"Recently, the Israeli government reinstituted ...",203
3,tp151234,1624656076,o7wv0a,COVID19_support,This makes me really just not want to go out a...,97
4,Mikoaimi,1624675779,o82alq,COVID19_support,I have a question about the delta variant. Ive...,60
...,...,...,...,...,...,...
1878,lostmelater,1642353271,s5gelw,COVID19_support,"I ate a very large meal Friday, felt really na...",101
1879,ConsciousEconomist53,1642004130,s2a0rb,COVID19_support,"In my opinion, it shouldn't be forever. When t...",73
1880,bivalverights,1642465022,s6jcqk,COVID19_support,"I took the above test twice, roughly 24 hours ...",59
1881,procrast1natrix,1642477603,s6nr73,COVID19_support,Anybody want to read about something happy abo...,152
