https://medium.com/swlh/how-to-scrape-large-amounts-of-reddit-data-using-pushshift-1d33bde9286

In [1]:
import os
import sys
# //*** Imports and Load Data
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import hashlib

from pmaw import PushshiftAPI


#//*** Use the whole window in the IPYNB editor
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#//*** Maximize columns and rows displayed by pandas
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)


api = PushshiftAPI()

import datetime as dt
before = int(dt.datetime(2021,6,26,0,0).timestamp())
after = int(dt.datetime(2020,12,1,0,0).timestamp())

In [7]:
#//*** Update a target subreddit with new data
#//*** subreddit = subbreddit name
#//*** Method = 'before' or 'after' indicating if the records to be retrieved are before or after the target_utc. Defaults to After
#//*** limit is the number or records to grab
def update_subreddit(subreddit,method,limit):
    import time
    filename = f".\\data\\{subreddit}_comments.csv.zip"
    print(f"Reading csv: {filename}")
    start_time = time.time()
    update_df = pd.read_csv(filename, compression = 'zip')
    
    print(f"csv loaded: {round(time.time()-start_time,2)}s")
    
    print(f"csv Record count: {len(update_df)}")

    #//*** If before, get the largest (latest) utc
    if method == 'before':
        #//*** Get the Before utc from the stored csv
        before = update_df['created_utc'].min() 
        
        print(f"Getting {limit} records before {before} utc")
        start_time = time.time()
        
        #//*** Download comments
        comments = api.search_comments(subreddit=subreddit, limit=limit, before=before)
        
        print(f"Download Time: {round(time.time()-start_time,2)}s")
        
    elif method == 'after':
        after = update_df['created_utc'].max() 
        
        print(f"Getting {limit} records after {after} utc")
        start_time = time.time()

        #//*** Download comments
        comments = api.search_comments(subreddit=subreddit, limit=limit, after=after)
        print(f"Download Time: {round(time.time()-start_time,2)}s")

    else:
        print(f"Method needs to be 'before' or 'after': [{method}] is invalid")
        return
    
    
    #//***************************************************************************
    #//*** Download Complete
    #//***************************************************************************

    #//*** Convert comments to Dataframe
    raw_df = pd.DataFrame(comments)
    
    #//*** Columns to keep
    keep_cols = ["score","total_awards_received","created_utc","is_submitter","author_fullname","body","id","link_id","parent_id","stickied","permalink","retrieved_on","subreddit","subreddit_id"]
    
    #//*** Not all columns appear. This usually happens with small samples used for testing.
    #//*** Only use the keep_cols that are actually in the sample. The missing columns will be added during concat later
    actual_cols = []
    
    #//*** Loop through each column we want to keep
    for col in keep_cols:
        #//*** Add col to actual_cols if it exists
        if col in raw_df.columns:
            actual_cols.append(col)

    #//*** Keep the important columns
    raw_df = raw_df[actual_cols]

    print(f"Checking For Duplicates - Length Before: {len(raw_df)}")
    
    #//*** Hash the body, will use to check for duplicates
    #//*** Hash the body using sha-256
    #Sha256: Reference https://www.pythonpool.com/python-sha256/

    raw_df['hash'] = raw_df['body'].apply(lambda x:hashlib.sha256(x.encode()).hexdigest())


    # dropping Duplicates First. No sense in processing these
    raw_df.drop_duplicates(subset ="hash",keep = False, inplace = True)
    
    print(f"Checking For Duplicates - Length After: {len(raw_df)}")

    #print("Begin Cleaning")

    #//*** Clean text, tokenize and remove stop words
    #raw_df['clean'] = remove_stop_words(tokenize_series(mr_clean_text(raw_df['body'],{"remove_empty":False})))
    
    #//*** encode the comments
    #//*** Breaking this out into a separate function for readability and possible future flexibility
    #raw_df = encode_comments(raw_df)
    
    #//*** Combining existing dataframe with raw_df
    update_df = pd.concat([update_df,raw_df])
    print(f"Combined Dataframe Size:{len(update_df)}")

    # Check for Duplicates. 
    update_df.drop_duplicates(subset ="hash",keep = False, inplace = True)
    print(f"Dropping Duplicates - New Size:{len(update_df)}")

    #print("Replace NaN with Zeros")
    update_df = update_df.fillna(0)
    
    #//*** Sort the Dataframe by UTC date. This keeps the time series chronological. 
    #//*** No need to reindex, since index will be ignored at csv read/write
    update_df = update_df.sort_values('created_utc')

    #//*** Convert is Submitter,Stickied field to Boolean.
    #//*** Some early values are Integers and Strings
    update_df['is_submitter'] = update_df['is_submitter'].astype('bool')
    update_df['stickied'] = update_df['stickied'].astype('bool')
    update_df['author_fullname'] = update_df['author_fullname'].astype('str')
    
    print(f"Writing {filename}")

    start_time = time.time()

    update_df.to_csv(filename,compression="zip",index=False)    
    
    print(f"File Written: {round(time.time()-start_time,2)}s")
    
    print(f"update_subreddit() Complete: {len(update_df)} records")
    
    del update_df
    del raw_df
    





In [8]:
#//*** Add 100k Comments to wallstreetbets
update_subreddit("wallstreetbets","before",100)



Reading csv: .\data\wallstreetbets_comments.csv.zip
csv loaded: 2.77s
csv Record count: 399786
Getting 100 records before 1368383569 utc
Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
Download Time: 1.47s
Checking For Duplicates - Length Before: 100
Checking For Duplicates - Length After: 82
Combined Dataframe Size:399868
Dropping Duplicates - New Size:399868
Writing .\data\wallstreetbets_comments.csv.zip
File Written: 9.0s
update_subreddit() Complete: 399868 records


In [4]:
#print(len(raw_df['hash'].unique()))
#print(len(raw_df.tail()))
#raw_df

In [5]:
#//*** Reference to manually read & write to Data Frame
#filename = f".\\data\\wallstreetbets_comments.csv.zip"
#update_df = pd.read_csv(filename, compression = 'zip')
#print(len(update_df))
#update_df
#filename = f".\\data\\wallstreetbets_comments_comments.csv"
#update_df.to_csv(filename, compression = 'zip',index=False)    

In [None]:
#raw_df.to_csv("reddit_comments.csv.zip",compression="zip",index=False)

In [None]:
#//***************************************
#//*** Apply Common Cleanup operations
#//***************************************
#//*** In anticpation that I'll be re-using text cleanup code. I'm adding some robustness to the function.
#//*** Adding kwargs to disable features that default to true.
#//*** Whether an action is skipped or executed is based on a boolean value stored in action_dict.
#//*** Key values will default to true. If code needs to be defaulted to False, a default_false list can be added later
#//*** All Boolean kwarg keya are stored in kwarg list. This speeds up the coding of the action_dict.
#//*** As Kwargs are added 
def mr_clean_text(input_series, input_options={}):
    
    #//*** import time library
    try:
        type(time)
    except:
        import time
    
    #//*** Start Timing the process
    start_time = time.time()
    
    #//*** Add some data validation. I'm preparing this function for additional use. I'm checking if future users (ie future me)
    #//*** may throw some garbage at this function. Experience has taught me to fail safely wherever possible.

    #//*** All kwargs are listed here. These initialize TRUE by default.
    key_list = [ "lower", "newline", "html", "remove_empty", "punctuation" ]
    
    #//*** Build Action Dictionary
    action_dict = { } 
    
    #//*** Build the keys from kwarg_list and default them to TRUE
    for key in key_list:
        action_dict[key] = True
        
    #//*** Loop through the input kwargs (if any). Assign the action_dict values based on the kwargs:
    for key,value in input_options.items():
        print(key,value)
        action_dict[key] = value
    
    
    #//*************************************************************************
    #//*** The Cleanup/Processing code is a straight lift from DSC550 - Week02
    #//*************************************************************************
    #//*** Convert to Lower Case, Default to True
    if action_dict["lower"]:
        input_series = input_series.str.lower()
    
   
    #//*** Remove New Lines
    if action_dict["newline"]:
        #//*** Rmove \r\n
        input_series = input_series.str.replace(r'\r?\n',"")

        #//*** Remove \n new lines
        input_series = input_series.str.replace(r'\n',"")

    #//*** Remove html entities, observed entities are &gt; and &lt;. All HTML entities begin with & and end with ;.
    #//*** Let's use regex to remove html entities
    if action_dict["html"]:
        input_series = input_series.str.replace(r'&.*;',"")

    #//*** Remove the empty lines
    if action_dict["remove_empty"]:
        input_series = input_series[ input_series.str.len() > 0]

    #//*** Remove punctuation
    if action_dict["punctuation"]:
        #//*** Load libraries for punctuation if not already loaded.
        #//*** Wrapping these in a try, no sense in importing libraries that already exist.
        #//*** Unsure of the cost of reimporting libraries (if any). But testing if library is already loaded feels
        #//*** like a good practice
        try:
            type(sys)
        except:
            import sys

        try:
            type(unicodedata)
        except:
            import unicodedata
        
        #//*** replace Comma and Period with a space.
        for punct in [",",".","$"]:
            input_series = input_series.str.replace(punct," ")

        #//*** Remove punctuation using the example from the book
        punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P') )
        input_series = input_series.str.translate(punctuation)

    print(f"Text Cleaning Time: {time.time() - start_time}")

    return input_series
#//*** Remove Stop words from the input list
def remove_stop_words(input_series):
    
    #//*** This function removes stop_words from a series.
    #//*** Works with series.apply()
    def apply_stop_words(input_list):

        #//*** Load Stopwords   
        for word in input_list:
            if word in stop_words:
                input_list.remove(word)
        return input_list

    #//*** import nltk if needed
    try:
        type(nltk)
    except:
        import nltk
        
    stopwords = nltk.corpus.stopwords

    #//*** Stopwords requires an additional download
    try:
        type(stopwords)
    except:
        nltk.download('stopwords')


    #//*** import time library
    try:
        type(time)
    except:
        import time

    #//*** Start Timing the process
    start_time = time.time()


    #//*** The stop_words include punctuation. Stop Word Contractions will not be filtered out.
    stop_words = []

    #//*** Remove apostrophies from the stop_words
    for stop in stopwords.words('english'):
        stop_words.append(stop.replace("'",""))

    
    #//*** Remove Stop words from the tokenized strings in the 'process' column
    #input_series = input_series.apply(remove_stop_words,stop_words)
    
    input_series = input_series.apply(apply_stop_words)

    print(f"Stop Words Time: {time.time() - start_time}")
    
    return input_series
#//*** Tokenize a Series containing Strings.
#//*** Breaking this out into it's own function for later reuse.
#//*** Not a lot of code here, but it helps to keep the libraries localized. This creates standarization for future
#//*** Stoneburner projects. Also has the ability to add functionality as needed.

def tokenize_series(input_series):
    
    try:
        type(nltk)
    except:
        import nltk
    
    word_tokenize = nltk.tokenize.word_tokenize 
    
    #//*** import time library
    try:
        type(time)
    except:
        import time
    
    #//*** Start Timing the process
    start_time = time.time()
    
    input_series = input_series.apply(word_tokenize)
    
    print(f"Tokenize Time: {time.time() - start_time}")
    
    return input_series

In [None]:
#//*** Encodes the dataframe with a count of Ticker symbols in each comment.
#//*** Called from update_subreddit(). This is broken out since we will likely need to adjust encoding parameters
def encode_comments(raw_df):
    import time
    
    print("Begin dataframe ticker symbol coding")
    start_time = time.time()
    
    #//*** Build list of nasdaq and NYSE ticker symbols
    #//*** Reads from Excel file.
    #//*** Gets the Symbol column, and converts to lower case, 
    nyse = pd.read_csv("NYSE_20210625.csv",header=None)[0].str.lower()
    nasdaq = pd.read_csv("NASDAQ_20210625.csv",header=None)[0].str.lower()

    #//*** Removes symbols with 1 and 2 character listings
    nyse = list(nyse[nyse.apply(lambda x: len(x)>2) ])
    nasdaq = list(nasdaq[nasdaq.apply(lambda x: len(x)>2) ])

    #//*** Combines both lists
    symbols = nyse + nasdaq
    

    #//*** Count each Stock mention add it to a dictionary of lists. Each list is filled with 0s. The Specific row index is updated with the relevant count. 
    #//*** This Generates a word count matrix
    stock_dict = {}

    #//*** Keep Track of Rows
    index = 0

    for row in raw_df.iterrows():

        #//*** Get the cleaned body text
        body = row[1]['clean']

        #//*** For Each Stock Symbol
        for stock in symbols:

            #//*** Check if Stock exists in Body
            if stock in body:

                #//*** Reset the stock counter
                count = 0

                #//*** Loop through body and county ticker mentions
                for word in body:
                    #//*** If word found increment count
                    if stock == word:
                        count += 1

                #//*** Check if symbol is in stock_dict
                if stock not in stock_dict.keys():    

                    #//*** If not, then build it
                    stock_dict[stock] = np.zeros(len(raw_df))

                #//*** Update the stock value at the 
                stock_dict[stock][index] = count

        #//*** Increment Index to keep with row index
        index +=1   

    #//*** Loop through the dictionary key and lists
    for col,values in stock_dict.items():

        #//*** Add each key (which is a stock ticker symbol) as a column using the list of ticker counts for Data
        raw_df[col] = values.astype('int')

    print(f"Encoding Time: {round(time.time()-start_time,2)}s")
    
    return raw_df

In [None]:
#//*** Push shift scraper reference
"""
#//*** Download the First 100,000 Comments from reddit pushsift
subreddit="wallstreetbets"
limit=100
#comments = api.search_comments(subreddit=subreddit, limit=limit, before=before after=after)
comments = api.search_comments(subreddit=subreddit, limit=limit, after=after)
print(f'Retrieved {len(comments)} comments from Pushshift')

#//*** Convert comments to Dataframe
comments_df = pd.DataFrame(comments)

#//*** Save DataFrame to a file for processing
comments_df.to_csv(f"{subreddit}_raw_comments.csv.zip",compression="zip",index=False)
"""
print()

In [None]:
#//*** Reference to create Stock ticker count matrix
"""

#//*** Build list of ticker symbols from NYSE and NASDAQ
#//*** Reads from Excel file.
#//*** Gets the Symbol column, and converts to lower case, 
nyse = pd.read_csv("NYSE_20210625.csv",header=None)[0].str.lower()
nasdaq = pd.read_csv("NASDAQ_20210625.csv",header=None)[0].str.lower()

#//*** Removes symbols with 1 and 2 character listings
nyse = list(nyse[nyse.apply(lambda x: len(x)>2) ])
nasdaq = list(nasdaq[nasdaq.apply(lambda x: len(x)>2) ])

symbols = nyse + nasdaq

#//*** Count each Stock mention add it to a dictionary of lists. Each list is filled with 0s. The Specific row index is updated with the relevant count. 
#//*** This Generates a word count matrix
stock_dict = {}

#//*** Keep Track of Rows
index = 0

for row in raw_df.iterrows():
    
    #//*** Get the cleaned body text
    body = row[1]['clean']
    
    #//*** For Each Stock Symbol
    for stock in symbols:
        
        #//*** Check if Stock exists in Body
        if stock in body:
            
            #//*** Reset the stock counter
            count = 0
            
            #//*** Loop through body and county ticker mentions
            for word in body:
                #//*** If word found increment count
                if stock == word:
                    count += 1
                    
            #//*** Check if symbol is in stock_dict
            if stock not in stock_dict.keys():    

                #//*** If not, then build it
                stock_dict[stock] = np.zeros(len(raw_df))
            
            #//*** Update the stock value at the 
            stock_dict[stock][index] = count

    #//*** Increment Index to keep with row index
    index +=1   

#//*** Loop through the dictionary key and lists
for col,values in stock_dict.items():
    
    #//*** Add each key (which is a stock ticker symbol) as a column using the list of ticker counts for Data
    raw_df[col] = values.astype('int')
    
"""
print()