In [1]:
import os
import sys
import time
from datetime import date
from datetime import datetime
import time

# //*** Imports and Load Data
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#//*** Use the whole window in the IPYNB editor
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#//*** Maximize columns and rows displayed by pandas
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

In [2]:
#//***************************************
#//*** Apply Common Cleanup operations
#//***************************************
#//*** In anticpation that I'll be re-using text cleanup code. I'm adding some robustness to the function.
#//*** Adding kwargs to disable features that default to true.
#//*** Whether an action is skipped or executed is based on a boolean value stored in action_dict.
#//*** Key values will default to true. If code needs to be defaulted to False, a default_false list can be added later
#//*** All Boolean kwarg keya are stored in kwarg list. This speeds up the coding of the action_dict.
#//*** As Kwargs are added 
def mr_clean_text(input_series, input_options={}):
    
    #//*** import time library
    try:
        type(time)
    except:
        import time
    
    #//*** Start Timing the process
    start_time = time.time()
    
    #//*** Add some data validation. I'm preparing this function for additional use. I'm checking if future users (ie future me)
    #//*** may throw some garbage at this function. Experience has taught me to fail safely wherever possible.

    #//*** All kwargs are listed here. These initialize TRUE by default.
    key_list = [ "lower", "newline", "html", "remove_empty", "punctuation" ]
    
    #//*** Build Action Dictionary
    action_dict = { } 
    
    #//*** Build the keys from kwarg_list and default them to TRUE
    for key in key_list:
        action_dict[key] = True
        
    #//*** Loop through the input kwargs (if any). Assign the action_dict values based on the kwargs:
    for key,value in input_options.items():
        print(key,value)
        action_dict[key] = value
    
    
    #//*************************************************************************
    #//*** The Cleanup/Processing code is a straight lift from DSC550 - Week02
    #//*************************************************************************
    #//*** Convert to Lower Case, Default to True
    if action_dict["lower"]:
        input_series = input_series.str.lower()
    
   
    #//*** Remove New Lines
    if action_dict["newline"]:
        #//*** Rmove \r\n
        input_series = input_series.str.replace(r'\r?\n',"")

        #//*** Remove \n new lines
        input_series = input_series.str.replace(r'\n',"")

    #//*** Remove html entities, observed entities are &gt; and &lt;. All HTML entities begin with & and end with ;.
    #//*** Let's use regex to remove html entities
    if action_dict["html"]:
        input_series = input_series.str.replace(r'&.*;',"")

    #//*** Remove the empty lines
    if action_dict["remove_empty"]:
        input_series = input_series[ input_series.str.len() > 0]

    #//*** Remove punctuation
    if action_dict["punctuation"]:
        #//*** Load libraries for punctuation if not already loaded.
        #//*** Wrapping these in a try, no sense in importing libraries that already exist.
        #//*** Unsure of the cost of reimporting libraries (if any). But testing if library is already loaded feels
        #//*** like a good practice
        try:
            type(sys)
        except:
            import sys

        try:
            type(unicodedata)
        except:
            import unicodedata
        
        #//*** replace Comma and Period with a space.
        for punct in [",",".","$"]:
            input_series = input_series.str.replace(punct," ")

        #//*** Remove punctuation using the example from the book
        punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P') )
        input_series = input_series.str.translate(punctuation)

    print(f"Text Cleaning Time: {time.time() - start_time}")

    return input_series
#//*** Remove Stop words from the input list
def remove_stop_words(input_series):
    
    #//*** This function removes stop_words from a series.
    #//*** Works with series.apply()
    def apply_stop_words(input_list):

        #//*** Load Stopwords   
        for word in input_list:
            if word in stop_words:
                input_list.remove(word)
        return input_list

    #//*** import nltk if needed
    try:
        type(nltk)
    except:
        import nltk
        
    stopwords = nltk.corpus.stopwords

    #//*** Stopwords requires an additional download
    try:
        type(stopwords)
    except:
        nltk.download('stopwords')


    #//*** import time library
    try:
        type(time)
    except:
        import time

    #//*** Start Timing the process
    start_time = time.time()


    #//*** The stop_words include punctuation. Stop Word Contractions will not be filtered out.
    stop_words = []

    #//*** Remove apostrophies from the stop_words
    for stop in stopwords.words('english'):
        stop_words.append(stop.replace("'",""))

    
    #//*** Remove Stop words from the tokenized strings in the 'process' column
    #input_series = input_series.apply(remove_stop_words,stop_words)
    
    input_series = input_series.apply(apply_stop_words)

    print(f"Stop Words Time: {time.time() - start_time}")
    
    return input_series
#//*** Tokenize a Series containing Strings.
#//*** Breaking this out into it's own function for later reuse.
#//*** Not a lot of code here, but it helps to keep the libraries localized. This creates standarization for future
#//*** Stoneburner projects. Also has the ability to add functionality as needed.

def tokenize_series(input_series):
    
    try:
        type(nltk)
    except:
        import nltk
    
    word_tokenize = nltk.tokenize.word_tokenize 
    
    #//*** import time library
    try:
        type(time)
    except:
        import time
    
    #//*** Start Timing the process
    start_time = time.time()
    
    input_series = input_series.apply(word_tokenize)
    
    print(f"Tokenize Time: {time.time() - start_time}")
    
    return input_series

In [7]:
#//*** Load Clean and Prepare data for aggregation
start_time = time.time()
print("Reading Compressed CSV")
raw_df = pd.read_csv(f".\\data\\wallstreetbets_comments.csv.zip", )
print(f"File Loaded: {round(time.time()-start_time,2)}s")

#//*** Convert UTC to date (not datetime)
#//** Second pass goes from 12-21 to 4-19
try:
    raw_df['created_utc'] = raw_df['created_utc'].apply(lambda x: date.fromtimestamp(x))
except:
    print()


raw_df['clean'] = remove_stop_words(tokenize_series(mr_clean_text(raw_df['body'],{"remove_empty":False})))

raw_df

Reading Compressed CSV
File Loaded: 2.97s
remove_empty False
Text Cleaning Time: 4.415989637374878
Tokenize Time: 48.00304889678955
Stop Words Time: 7.168112754821777


Unnamed: 0,score,total_awards_received,created_utc,is_submitter,author_fullname,body,id,link_id,parent_id,stickied,permalink,retrieved_on,subreddit,subreddit_id,hash,clean
0,1,0.0,2012-08-24,False,0,I will accept payments for my research...what'...,c5y9v5z,t3_yqtpn,t1_c5y49wz,False,0,1.429727e+09,wallstreetbets,t5_2th52,77c0f32dcf506571815f3d4839454f2b3e550f0e1efecd...,"[will, accept, payments, my, research, whats, ..."
1,3,0.0,2012-08-24,False,t2_4p1mf,"Because previously (until this post), when som...",c5yaaki,t3_yqtpn,t1_c5y9v5z,False,0,1.429728e+09,wallstreetbets,t5_2th52,b61949552c3a5d559111ba44d17a71113e408126dd198d...,"[previously, post, someone, claimed, looking, ..."
2,-2,0.0,2012-08-24,False,0,So you thought I was just going to give all of...,c5yahuo,t3_yqtpn,t1_c5yaaki,False,0,1.429728e+09,wallstreetbets,t5_2th52,97bf3c55f77337de3d6b83b05fc5601e2b346845b7b778...,"[thought, was, going, give, research, time, aw..."
3,2,0.0,2012-08-24,False,t2_3o5bc,I would also see some proof to back up your cl...,c5yaloj,t3_yqtpn,t1_c5y7jem,False,0,1.429728e+09,wallstreetbets,t5_2th52,5a3885658ca462a1fc0dd3b0af8858e183b05f4f474b84...,"[would, also, see, proof, back, claims, your, ..."
4,2,0.0,2012-08-24,False,t2_4p1mf,&gt; So you thought I was just going to give a...,c5yamfs,t3_yqtpn,t1_c5yahuo,False,0,1.429728e+09,wallstreetbets,t5_2th52,405314aad1e814100e822f7ac89274b17d66c4e11e5521...,"[because, post, chart, their, site, do, includ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399863,3,0.0,2021-06-25,False,t2_9z0opsh3,Nice,h32dw7m,t3_o7jx7p,t1_h329ycg,False,/r/wallstreetbets/comments/o7jx7p/fraternal_as...,1.624958e+09,wallstreetbets,t5_2th52,fdc96ffbf256523aec8846ae56321053c7ab751c99eb76...,[nice]
399864,3,0.0,2021-06-25,False,t2_qc6iq,Ah so horoscopes ARE real,h32dwab,t3_o7z71d,t3_o7z71d,False,/r/wallstreetbets/comments/o7z71d/amc_what_hap...,1.624958e+09,wallstreetbets,t5_2th52,20c5bee31dc23c8937aea64670a4157c6547621f763ff4...,"[ah, horoscopes, real]"
399865,3,0.0,2021-06-25,False,t2_bl7b6dkh,If one did this I would tell them Fuck Outta Here,h32dwip,t3_o7vagy,t1_h32daow,False,/r/wallstreetbets/comments/o7vagy/weekend_disc...,1.624958e+09,wallstreetbets,t5_2th52,96ac9390a4f9c8c36aeabf6d3f0358478ce7b3a295c9b6...,"[one, this, would, tell, fuck, outta]"
399866,4,0.0,2021-06-25,False,t2_5b0a37kn,Canada has still those stupid Covid restrictio...,h32dwhz,t3_o7vagy,t3_o7vagy,False,/r/wallstreetbets/comments/o7vagy/weekend_disc...,1.624958e+09,wallstreetbets,t5_2th52,6fa28767c5b75399225dd168a981c8570422fab812afab...,"[canada, still, stupid, covid, restrictions, p..."


In [5]:
#//*** Encodes the dataframe with a count of Ticker symbols in each comment.
#//*** Called from update_subreddit(). This is broken out since we will likely need to adjust encoding parameters
def aggregate_comments(raw_df):
    import time

    #print("Begin Cleaning")

    #//*** Clean text, tokenize and remove stop words
    #raw_df['clean'] = remove_stop_words(tokenize_series(mr_clean_text(raw_df['body'],{"remove_empty":False})))


    #//*** Stock Ticker Symbols to track
    symbols = ["CLOV","SOFI","WKHS","AMD","GME","X","AMC","CLNE","NIO","MU","SPCE","BB"]

    #//*** Count each Stock mention add it to a dictionary of lists. Each list is filled with 0s. The Specific row index is updated with the relevant count. 
    #//*** This Generates a word count matrix
    stock_dict = {}
    




    #//*** Group 
    for group in raw_df.groupby('created_utc'):
        print(len(group[1]))
        loop_df = group[1].copy()
        
        #//*** Initialze Stock Dict 
        for symbol in symbols:
            stock_dict[symbol] = 0
        


        break

#//*** Initialize Agregated DataFrame
df = aggregate_comments(raw_df)
df


Begin Cleaning
remove_empty False
Text Cleaning Time: 4.3419764041900635
Tokenize Time: 47.86150026321411
Stop Words Time: 7.036121129989624


Unnamed: 0,score,total_awards_received,created_utc,is_submitter,author_fullname,body,id,link_id,parent_id,stickied,permalink,retrieved_on,subreddit,subreddit_id,hash,clean
0,1,0.0,2012-08-24,False,0,I will accept payments for my research...what'...,c5y9v5z,t3_yqtpn,t1_c5y49wz,False,0,1.429727e+09,wallstreetbets,t5_2th52,77c0f32dcf506571815f3d4839454f2b3e550f0e1efecd...,"[will, accept, payments, my, research, whats, ..."
1,3,0.0,2012-08-24,False,t2_4p1mf,"Because previously (until this post), when som...",c5yaaki,t3_yqtpn,t1_c5y9v5z,False,0,1.429728e+09,wallstreetbets,t5_2th52,b61949552c3a5d559111ba44d17a71113e408126dd198d...,"[previously, post, someone, claimed, looking, ..."
2,-2,0.0,2012-08-24,False,0,So you thought I was just going to give all of...,c5yahuo,t3_yqtpn,t1_c5yaaki,False,0,1.429728e+09,wallstreetbets,t5_2th52,97bf3c55f77337de3d6b83b05fc5601e2b346845b7b778...,"[thought, was, going, give, research, time, aw..."
3,2,0.0,2012-08-24,False,t2_3o5bc,I would also see some proof to back up your cl...,c5yaloj,t3_yqtpn,t1_c5y7jem,False,0,1.429728e+09,wallstreetbets,t5_2th52,5a3885658ca462a1fc0dd3b0af8858e183b05f4f474b84...,"[would, also, see, proof, back, claims, your, ..."
4,2,0.0,2012-08-24,False,t2_4p1mf,&gt; So you thought I was just going to give a...,c5yamfs,t3_yqtpn,t1_c5yahuo,False,0,1.429728e+09,wallstreetbets,t5_2th52,405314aad1e814100e822f7ac89274b17d66c4e11e5521...,"[because, post, chart, their, site, do, includ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399863,3,0.0,2021-06-25,False,t2_9z0opsh3,Nice,h32dw7m,t3_o7jx7p,t1_h329ycg,False,/r/wallstreetbets/comments/o7jx7p/fraternal_as...,1.624958e+09,wallstreetbets,t5_2th52,fdc96ffbf256523aec8846ae56321053c7ab751c99eb76...,[nice]
399864,3,0.0,2021-06-25,False,t2_qc6iq,Ah so horoscopes ARE real,h32dwab,t3_o7z71d,t3_o7z71d,False,/r/wallstreetbets/comments/o7z71d/amc_what_hap...,1.624958e+09,wallstreetbets,t5_2th52,20c5bee31dc23c8937aea64670a4157c6547621f763ff4...,"[ah, horoscopes, real]"
399865,3,0.0,2021-06-25,False,t2_bl7b6dkh,If one did this I would tell them Fuck Outta Here,h32dwip,t3_o7vagy,t1_h32daow,False,/r/wallstreetbets/comments/o7vagy/weekend_disc...,1.624958e+09,wallstreetbets,t5_2th52,96ac9390a4f9c8c36aeabf6d3f0358478ce7b3a295c9b6...,"[one, this, would, tell, fuck, outta]"
399866,4,0.0,2021-06-25,False,t2_5b0a37kn,Canada has still those stupid Covid restrictio...,h32dwhz,t3_o7vagy,t3_o7vagy,False,/r/wallstreetbets/comments/o7vagy/weekend_disc...,1.624958e+09,wallstreetbets,t5_2th52,6fa28767c5b75399225dd168a981c8570422fab812afab...,"[canada, still, stupid, covid, restrictions, p..."


In [6]:
raw_df['clean'] = remove_stop_words(tokenize_series(mr_clean_text(raw_df['body'],{"remove_empty":False})))

remove_empty False
Text Cleaning Time: 4.485838174819946
Tokenize Time: 49.1232545375824
Stop Words Time: 7.268815994262695


In [3]:


"""
Original Encode Comments, Keeping for reference
#//*** Encodes the dataframe with a count of Ticker symbols in each comment.
#//*** Called from update_subreddit(). This is broken out since we will likely need to adjust encoding parameters
def encode_comments(raw_df):
    import time
    
    print("Begin dataframe ticker symbol coding")
    start_time = time.time()
    
    #//*** Build list of nasdaq and NYSE ticker symbols
    #//*** Reads from Excel file.
    #//*** Gets the Symbol column, and converts to lower case, 
    nyse = pd.read_csv("NYSE_20210625.csv",header=None)[0].str.lower()
    nasdaq = pd.read_csv("NASDAQ_20210625.csv",header=None)[0].str.lower()

    #//*** Removes symbols with 1 and 2 character listings
    nyse = list(nyse[nyse.apply(lambda x: len(x)>2) ])
    nasdaq = list(nasdaq[nasdaq.apply(lambda x: len(x)>2) ])

    #//*** Combines both lists
    symbols = nyse + nasdaq
    

    #//*** Count each Stock mention add it to a dictionary of lists. Each list is filled with 0s. The Specific row index is updated with the relevant count. 
    #//*** This Generates a word count matrix
    stock_dict = {}

    #//*** Keep Track of Rows
    index = 0

    for row in raw_df.iterrows():

        #//*** Get the cleaned body text
        body = row[1]['clean']

        #//*** For Each Stock Symbol
        for stock in symbols:

            #//*** Check if Stock exists in Body
            if stock in body:

                #//*** Reset the stock counter
                count = 0

                #//*** Loop through body and county ticker mentions
                for word in body:
                    #//*** If word found increment count
                    if stock == word:
                        count += 1

                #//*** Check if symbol is in stock_dict
                if stock not in stock_dict.keys():    

                    #//*** If not, then build it
                    stock_dict[stock] = np.zeros(len(raw_df))

                #//*** Update the stock value at the 
                stock_dict[stock][index] = count

        #//*** Increment Index to keep with row index
        index +=1   

    #//*** Loop through the dictionary key and lists
    for col,values in stock_dict.items():

        #//*** Add each key (which is a stock ticker symbol) as a column using the list of ticker counts for Data
        raw_df[col] = values.astype('int')

    print(f"Encoding Time: {round(time.time()-start_time,2)}s")
    
    return raw_df

#//*** Initialize Agregated DataFrame
df = pd.DataFrame()

#//*** Group 
for group in raw_df.groupby('created_utc'):
    print(len(group[1]))
    loop_df = group[1].copy()
    
    #//*** Clean text, tokenize and remove stop words
    loop_df['clean'] = remove_stop_words(tokenize_series(mr_clean_text(loop_df['body'],{"remove_empty":False})))
    
    #//*** encode the comments
    #//*** Breaking this out into a separate function for readability and possible future flexibility
    loop_df = encode_comments(loop_df)
    
    break
    
"""