In [8]:
import os
import sys
import time
from datetime import date
from datetime import datetime
import time
import json
import platform

import stoneburner
#//*** Custom Functions:
#//*** mr_clean_text(input_series)
#//*** tokenize_series(input_series)
#//*** remove_stop_words(input_series)

# //*** Imports and Load Data
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#//*** Use the whole window in the IPYNB editor
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#//*** Maximize columns and rows displayed by pandas
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

In [5]:
#//*** Input_filename: Comments to Process.
#//*** This will eventually be a list of files
input_filename  =".\\data\\wallstreetbets_comments.csv.zip"

#//*** Path to processed files
output_filename = ".\\data\\processed_reddit_basic_v2.csv.zip"

#//*** Path to the stock ticker JSON file
stock_ticker_filename = ".\\data\\stock_tickers.json"

#//*** Convert Path to Mac formatting if needed
if platform.system() == 'Darwin':
    input_filename = input_filename.replace("\\","/")
    output_filename = output_filename.replace("\\","/")
    stock_ticker_filename = stock_ticker_filename.replace("\\","/")

#//*** Load the Stock Tickers
f = open(stock_ticker_filename, "r")
symbols = json.loads(f.read())['symbols']
f.close()

print(symbols)
#//*** Convert symbols to lower case
symbols = [x.lower() for x in symbols]

    
#//*** Load Clean and Prepare data for aggregation
start_time = time.time()
print("Reading Compressed CSV")
raw_df = pd.read_csv(input_filename,compression='zip' )
print(f"File Loaded: {round(time.time()-start_time,2)}s")

#//*** Convert UTC to date (not datetime)
#//** Second pass goes from 12-21 to 4-19
try:
    raw_df['created_utc'] = raw_df['created_utc'].apply(lambda x: date.fromtimestamp(x))
except:
    print()

#//*************************************************************************
#//*** Clean the Body Text, Tokenize and Remove Stop Words.
#//*************************************************************************
raw_df['clean'] = stoneburner.remove_stop_words(stoneburner.tokenize_series(stoneburner.mr_clean_text(raw_df['body'],{"remove_empty":False})))


raw_df

['clov', 'sofi', 'wkhs', 'amd', 'gme', 'x', 'amc', 'clne', 'nio', 'mu', 'spce', 'bb']
Reading Compressed CSV
File Loaded: 3.38s
remove_empty False
Text Cleaning Time: 4.486855745315552
Tokenize Time: 49.75190496444702
Stop Words Time: 7.277637481689453


Unnamed: 0,score,total_awards_received,created_utc,is_submitter,author_fullname,body,id,link_id,parent_id,stickied,permalink,retrieved_on,subreddit,subreddit_id,hash,clean
0,1,0.0,2012-08-24,False,0,I will accept payments for my research...what'...,c5y9v5z,t3_yqtpn,t1_c5y49wz,False,0,1.429727e+09,wallstreetbets,t5_2th52,77c0f32dcf506571815f3d4839454f2b3e550f0e1efecd...,"[will, accept, payments, my, research, whats, ..."
1,3,0.0,2012-08-24,False,t2_4p1mf,"Because previously (until this post), when som...",c5yaaki,t3_yqtpn,t1_c5y9v5z,False,0,1.429728e+09,wallstreetbets,t5_2th52,b61949552c3a5d559111ba44d17a71113e408126dd198d...,"[previously, post, someone, claimed, looking, ..."
2,-2,0.0,2012-08-24,False,0,So you thought I was just going to give all of...,c5yahuo,t3_yqtpn,t1_c5yaaki,False,0,1.429728e+09,wallstreetbets,t5_2th52,97bf3c55f77337de3d6b83b05fc5601e2b346845b7b778...,"[thought, was, going, give, research, time, aw..."
3,2,0.0,2012-08-24,False,t2_3o5bc,I would also see some proof to back up your cl...,c5yaloj,t3_yqtpn,t1_c5y7jem,False,0,1.429728e+09,wallstreetbets,t5_2th52,5a3885658ca462a1fc0dd3b0af8858e183b05f4f474b84...,"[would, also, see, proof, back, claims, your, ..."
4,2,0.0,2012-08-24,False,t2_4p1mf,&gt; So you thought I was just going to give a...,c5yamfs,t3_yqtpn,t1_c5yahuo,False,0,1.429728e+09,wallstreetbets,t5_2th52,405314aad1e814100e822f7ac89274b17d66c4e11e5521...,"[because, post, chart, their, site, do, includ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399863,3,0.0,2021-06-25,False,t2_9z0opsh3,Nice,h32dw7m,t3_o7jx7p,t1_h329ycg,False,/r/wallstreetbets/comments/o7jx7p/fraternal_as...,1.624958e+09,wallstreetbets,t5_2th52,fdc96ffbf256523aec8846ae56321053c7ab751c99eb76...,[nice]
399864,3,0.0,2021-06-25,False,t2_qc6iq,Ah so horoscopes ARE real,h32dwab,t3_o7z71d,t3_o7z71d,False,/r/wallstreetbets/comments/o7z71d/amc_what_hap...,1.624958e+09,wallstreetbets,t5_2th52,20c5bee31dc23c8937aea64670a4157c6547621f763ff4...,"[ah, horoscopes, real]"
399865,3,0.0,2021-06-25,False,t2_bl7b6dkh,If one did this I would tell them Fuck Outta Here,h32dwip,t3_o7vagy,t1_h32daow,False,/r/wallstreetbets/comments/o7vagy/weekend_disc...,1.624958e+09,wallstreetbets,t5_2th52,96ac9390a4f9c8c36aeabf6d3f0358478ce7b3a295c9b6...,"[one, this, would, tell, fuck, outta]"
399866,4,0.0,2021-06-25,False,t2_5b0a37kn,Canada has still those stupid Covid restrictio...,h32dwhz,t3_o7vagy,t3_o7vagy,False,/r/wallstreetbets/comments/o7vagy/weekend_disc...,1.624958e+09,wallstreetbets,t5_2th52,6fa28767c5b75399225dd168a981c8570422fab812afab...,"[canada, still, stupid, covid, restrictions, p..."


In [10]:
#//*************************************************************
#//*** Load the Encode_comments Function
#//*** Counts the Stock mentions in each Post.
#//*** Adds the stock as a column to the Dataframe
#//*************************************************************

def encode_comments(input_df):
    import time
    
    print("Begin dataframe ticker symbol coding")
    start_time = time.time()
       
    
    
    #//*** Count each Stock mention add it to a dictionary of lists. Each list is filled with 0s. The Specific row index is updated with the relevant count. 
    #//*** This Generates a word count matrix
    stock_dict = {}

    #//*** Keep Track of Rows
    index = 0

    for row in input_df.iterrows():

        #//*** Get the cleaned body text
        body = row[1]['clean']

        #//*** For Each Stock Symbol
        for stock in symbols:
            
            #//*** Check if Stock exists in Body
            if stock in body:

                #//*** Reset the stock counter
                count = 0

                #//*** Loop through body and county ticker mentions
                for word in body:
                    #//*** If word found increment count
                    if stock == word:
                        count += 1

                #//*** Check if symbol is in stock_dict
                if stock not in stock_dict.keys():    

                    #//*** If not, then build it
                    stock_dict[stock] = np.zeros(len(raw_df))

                #//*** Update the stock value at the 
                stock_dict[stock][index] = count

        #//*** Increment Index to keep with row index
        index +=1   

    #//*** Loop through the dictionary key and lists
    for col,values in stock_dict.items():

        #//*** Add each key (which is a stock ticker symbol) as a column using the list of ticker counts for Data
        raw_df[col] = values.astype('int')

    print(f"Encoding Time: {round(time.time()-start_time,2)}s")
    
    return input_df

In [24]:
#//*** Encodes the dataframe with a count of Ticker symbols in each comment.
#//*** Called from update_subreddit(). This is broken out since we will likely need to adjust encoding parameters
def aggregate_comments(input_df):
    
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    to_sum_cols = ['score','total_awards_received']
    to_count_col = ['author_fullname','link_id']
    
    
    df_cols = ['date','total_posts','tfidf']
    
    rename_cols = {
        'total_awards_received' : 'awards',
        'author_fullname' : 'authors',
        'link_id' : 'threads'
    }
    
    #//*** Build the OUtput Dataframe Column names from the Columns to sum, the columns to count, and the stock ticker columns
    #//*** Loop through each list
    for cols in [ to_sum_cols, to_count_col, symbols ]:
        
        #//*** Get individual column name from each column list
        for col in cols:
            print
            #//*** Rename the column if in rename_col
            #//*** Add col to df_cols....The out_df column names
            if col in rename_cols.keys():
                df_cols.append(rename_cols[col])
            else:
                df_cols.append(col)
                
    print(df_cols)
    
    out_df = pd.DataFrame(columns = df_cols)
    
   
    #//*** Group 
    for group in input_df.groupby('created_utc'):
        
        #//*** Start Timing the process
        start_time = time.time()

        loop_df = group[1].copy()
        
        loop_list = []
        
        #//*** Build the aggregated row for the Dataframe.
        #//*** 5 Parts: 
        #//******** 1.) Date & Total Posts
        #//******** 2.) tfidf - Bag of Words for the Day
        #//******** 2.) Columns to sum
        #//******** 3.) Columns to count
        #//******** 4.) Stock Ticker columns to sum
        
        #//********************************************
        #//******** 1.) Date & Total Posts
        #//********************************************
        #//*** Add the Date
        loop_list.append(group[0])
        
        #//*** Add Total number of posts
        loop_list.append(len(loop_df))
        
        #//********************************************
        #//******** 2.) Build tfidf
        #//********************************************
        
        
        
        #//*** Initialize the Vectorizer
        tfidf = TfidfVectorizer()

        #//*** Build the feature matrix, which is a weighted sparse matrix
        loop_list.append(tfidf.fit_transform(input_df['tfidf']))
        
        #//********************************************
        #//******** 2.) Columns to sum
        #//********************************************
        for col in to_sum_cols:
            loop_list.append(loop_df[col].sum())

            
        #//********************************************
        #//******** 3.) Columns to count
        #//********************************************
        for col in to_count_col:
            loop_list.append(len(loop_df[col].unique()))
    
        
        #//********************************************
        #//******** 4.) Stock Ticker columns to sum
        #//********************************************
        for col in symbols:
            loop_list.append(loop_df[col].sum())

        #print(len(out_df.columns),len(loop_list))
        #print(out_df.columns)
        out_df.loc[len(out_df.index)] = loop_list 
        
        print(f"{group[0]} {len(loop_df)} Comments in {round(time.time() - start_time,2)}s")
    print("Aggregation Complete!")
    return out_df

#for col in df.columns[16:]:
#    print(df[df[col] > 0 ].iloc[0]['created_utc'],col)

In [23]:
#//*** Encode Comments
df = encode_comments(raw_df)

#//*** Aggregate and Process Comments
ag_df = aggregate_comments(df)
ag_df

Begin dataframe ticker symbol coding
Encoding Time: 32.1s
['date', 'total_posts', 'tfidf', 'score', 'awards', 'authors', 'threads', 'clov', 'sofi', 'wkhs', 'amd', 'gme', 'x', 'amc', 'clne', 'nio', 'mu', 'spce', 'bb']
2012-08-24 29 Comments in 5.65s
2012-08-25 19 Comments in 5.55s
2012-08-26 13 Comments in 5.71s
2012-08-27 11 Comments in 5.47s
2012-08-28 5 Comments in 5.49s
2012-08-29 5 Comments in 5.98s
2013-05-12 6 Comments in 5.57s
2013-05-13 22 Comments in 5.55s
2013-05-14 30 Comments in 5.55s
2013-05-15 20 Comments in 5.57s
2013-05-16 8 Comments in 5.6s
2013-05-17 3 Comments in 5.61s
2014-04-10 19 Comments in 5.51s
2014-04-11 65 Comments in 5.51s
2014-07-30 20 Comments in 5.5s
2014-07-31 25 Comments in 5.56s
2014-08-01 16 Comments in 5.61s
2014-08-02 4 Comments in 6.17s
2014-08-03 1 Comments in 5.47s
2014-08-04 13 Comments in 5.43s
2014-08-05 2 Comments in 5.49s
2014-08-06 4 Comments in 5.53s
2015-04-18 4 Comments in 5.54s
2015-04-19 89 Comments in 5.6s
2016-02-05 84 Comments in 5.

2020-11-09 1099 Comments in 6.23s
2020-11-10 1183 Comments in 6.31s
2020-11-11 821 Comments in 6.43s
2020-11-12 725 Comments in 6.29s
2020-11-13 1087 Comments in 5.92s
2020-11-14 909 Comments in 6.34s
2020-11-15 820 Comments in 6.53s
2020-11-16 866 Comments in 6.29s
2020-11-17 874 Comments in 6.34s
2020-11-18 1332 Comments in 6.43s
2020-11-19 1138 Comments in 6.35s
2020-11-20 791 Comments in 6.42s
2020-11-21 880 Comments in 6.42s
2020-11-22 1084 Comments in 6.32s
2020-11-23 834 Comments in 6.43s
2020-11-24 743 Comments in 6.26s
2020-11-25 748 Comments in 6.39s
2020-11-26 912 Comments in 6.38s
2020-11-27 987 Comments in 6.3s
2020-11-28 695 Comments in 6.45s
2020-11-29 994 Comments in 6.39s
2020-11-30 808 Comments in 6.31s
2020-12-01 1780 Comments in 6.59s
2020-12-02 1603 Comments in 6.23s
2020-12-03 1279 Comments in 6.41s
2020-12-04 1475 Comments in 6.63s
2020-12-05 1567 Comments in 6.26s
2020-12-06 2259 Comments in 6.53s
2020-12-07 1883 Comments in 6.34s
2020-12-08 2228 Comments in 6.4

Unnamed: 0,date,total_posts,tfidf,score,awards,authors,threads,clov,sofi,wkhs,amd,gme,x,amc,clne,nio,mu,spce,bb
0,2012-08-24,29,"(0, 74752)\t0.20781542118106996\n (0, 15154...",80,0.0,13,6,0,0,0,0,0,0,0,0,0,0,0,0
1,2012-08-25,19,"(0, 74752)\t0.20781542118106996\n (0, 15154...",18,0.0,7,3,0,0,0,0,0,0,0,0,0,0,0,0
2,2012-08-26,13,"(0, 74752)\t0.20781542118106996\n (0, 15154...",11,0.0,9,5,0,0,0,0,0,0,0,0,0,0,0,0
3,2012-08-27,11,"(0, 74752)\t0.20781542118106996\n (0, 15154...",21,0.0,6,6,0,0,0,0,0,0,0,0,0,0,0,0
4,2012-08-28,5,"(0, 74752)\t0.20781542118106996\n (0, 15154...",9,0.0,4,3,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,2021-06-21,567,"(0, 74752)\t0.20781542118106996\n (0, 15154...",2878,8.0,429,89,10,0,7,4,9,1,10,7,5,3,0,12
447,2021-06-22,557,"(0, 74752)\t0.20781542118106996\n (0, 15154...",3076,2.0,400,113,14,2,2,5,8,1,10,7,1,0,0,10
448,2021-06-23,694,"(0, 74752)\t0.20781542118106996\n (0, 15154...",4543,7.0,488,118,21,0,11,3,12,0,9,3,6,0,3,8
449,2021-06-24,513,"(0, 74752)\t0.20781542118106996\n (0, 15154...",2857,5.0,397,72,18,0,12,4,1,0,6,3,0,1,1,5


In [25]:
output_filename = ".\\data\\processed_reddit_basic_v2.csv.zip"

In [26]:
#//*** Write File to disk
ag_df.to_csv(output_filename,compression="zip",index=False) 