In [1]:
import os
import sys
import time
from datetime import date
from datetime import datetime
import time
import json
import platform

import stoneburner
#//*** Custom Functions:
#//*** mr_clean_text(input_series)
#//*** tokenize_series(input_series)
#//*** remove_stop_words(input_series)

# //*** Imports and Load Data
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#//*** Use the whole window in the IPYNB editor
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
subreddits = ["wallstreetbets", "stocks", "wallstreetbetsOGs", "spacs", "investing", "pennystocks", "stockmarket", "options", "robinhoodpennystocks", "wallstreetbetsnew", "smallstreetbets"]
filepath = "./data/"
filename_suffix = "_comments.csv.zip"
#//*** Maximize columns and rows displayed by pandas
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

In [2]:
def build_all_comments(subreddit,filepath,filename_suffix,keep_cols=['created_utc','score','total_awards_received','is_submitter','author_fullname','body']):

    import time
    
    start_time = time.time()

    #//*** Initialize Output dataframe
    output_df = pd.DataFrame()

    #//*** Load each Subreddit for Aggregation
    for subreddit in subreddits:
        #//*** Filepath + subreddit name + csv.zip
        input_filename = filepath+subreddit+filename_suffix

        print(f"Reading Compressed CSV: {input_filename}")

        #//*** Read Each DataFrame and combine with output_df
        output_df = pd.concat([output_df,pd.read_csv(input_filename,compression='zip' )])

    #//*** All Files read
    #//*** Reset the output_df index, since multiple indexes have been combined
    output_df.reset_index(drop=True, inplace=True)

    print(f"Files Loaded: {round(time.time()-start_time,2)}s")
    print(f"Total Records: {len(output_df)}")
    
    #//*** Ensure the body column is a string
    output_df['body'] = output_df['body'].astype('str')

    #//*** Convert UTC to date (not datetime)
    #//** Second pass goes from 12-21 to 4-19
    try:
        output_df['created_utc'] = output_df['created_utc'].apply(lambda x: datetime.fromtimestamp(x))
    except:
        print()

    try:
        #//*** Keep just date and body fields
        output_df = output_df[keep_cols]
    except:
        print("Skipping Keep Cols")
        
    return output_df

def build_stocks(symbol,interval,targets):
    
    stock_df = pd.read_csv(f"./stocks/{symbol}_{interval}.csv.zip")
    stock_df
    
    if 'date' in stock_df.columns:
        stock_df['date'] = pd.to_datetime(stock_df['date'])
        stock_df = stock_df.rename(columns={'date':'time'})
    else:
        stock_df['time'] = pd.to_datetime(stock_df['time'])
    
    offset_targets = [1,2,3,4,5,6,7,8,9,10]

    for offset in targets:
        #//*** create a list of nan values of x length
        nan_list = list(np.empty( offset )* np.nan )

        #//*** Create target variable Price which is stocks + x columns in advance
        #//*** Takes the closing price starting at x and gets the remainder, this generates the offset
        #//*** nan_list fills the missing x values with nans
        stock_df[f'target_{offset}'] = list(stock_df['close'][offset:]) + nan_list 

    stock_df = stock_df[:offset*-1]
    
    #//*** Keeping this cool chunk of code as a reference
    #stock_df['time'] = stock_df['time'].apply(lambda x: x.timestamp())
    
    #//*** Remove Comments older than the first stock price
    #raw_df = raw_df[ raw_df['created_utc'] >= stock_df['time'].min() ]
    
    #//*** Remove Stock Prices older than the first Comments price
    #stock_df = stock_df[ stock_df['time'] >= raw_df['created_utc'].min() ]
    

    
    #//*** Reorder Comments by date
    raw_df.sort_values('created_utc',inplace=True,ignore_index=True)
    
    #//*** Reorder Stocks by date
    stock_df.sort_values('time',inplace=True, ignore_index=True)
    
    return stock_df
  
def group_dataframe_by_time(stock_df,cdf,symbol=None):
    groups = stock_df.groupby('time')

    #https://www.geeksforgeeks.org/how-to-iterate-over-dataframe-groups-in-python-pandas/
    key_list = list(groups.groups.keys())

    out_df = pd.DataFrame()

    start_time = time.time()
    comment_min_time = raw_df['created_utc'].min()
    comment_max_time = raw_df['created_utc'].max()
    min_index = 0
    start_time = time.time()
    print("Processing...")
    for x in range(len(key_list)-1):

        #//*** Get a single line of stocks as a dataframe
        loop_stocks_df = groups.get_group((key_list)[x]).copy()  

        t1 = groups.get_group((key_list)[x])['time'].iloc[0]
        t2 = groups.get_group((key_list)[x+1])['time'].iloc[0]

        #search through cdf to find comments that are between t1 and t2
        #if len(cdf[ (cdf['created_utc'] > t1) & (cdf['created_utc'] < t2) ]) > 0:
        #temp_df = cdf.iloc[min_index:]
        temp_df = cdf[(cdf['created_utc'] >= t1) & (cdf['created_utc'] < t2) ]

        if len(temp_df) == 0:
                #//*** No COmments on this Date
                loop_stocks_df['body'] = " "
                loop_stocks_df['comment_count'] = 0
                loop_stocks_df['is_submitter'] = 0
                loop_stocks_df['score'] = 0 
                loop_stocks_df['total_awards_received'] = 0 
                loop_stocks_df['author_fullname'] = pd.Series(dtype=object)
        else:
            #print(temp_df.index[0])
            #print(temp_df.index[-1])
            #print(cdf.iloc[temp_df.index[0]:temp_df.index[-1]])
            temp_df = cdf.iloc[temp_df.index[0]:temp_df.index[-1]]
            #//*** Get all the body comments and combine them
            
            #//*** If symbol is NOT specified take all the comments
            if symbol == None:
                loop_stocks_df['body'] = " ".join(list(temp_df['body']))
            else:
                #//*** Symbol specified, only use comments that include the symbol
                
                if len(temp_df[temp_df['body'].str.contains(symbol)]) > 0:
                    symbol_body_df = temp_df[temp_df['body'].str.contains(symbol)]
                    #print(symbol_body_df)
                    loop_stocks_df['body'] = " ".join(list(symbol_body_df['body']))
                    #print(loop_stocks_df)
                    #if len(symbol_body_df[symbol_body_df['body'].str.find('symbol') > -1]):
                    #    print(symbol_body_df[symbol_body_df['body'].str.find('symbol') > -1])
                    #    break
                
            
            

            #//*** Get a comment count, BC IDK Y
            loop_stocks_df['comment_count'] = len(temp_df['body']) 
            
            loop_stocks_df['is_submitter'] = temp_df['is_submitter'].sum()
            
            loop_stocks_df['score'] = temp_df['score'].sum()
            
            loop_stocks_df['total_awards_received'] = temp_df['total_awards_received'].sum()
            
            loop_stocks_df['author_fullname'] = (temp_df['author_fullname'])
        #//*** Secret to speeding up algorithm. Get the Index value of the last item found -1. 
        #//*** When Searching above, start the search from this index. Seems to speed things up. Since we're skipping past elements that we've already found
        if len(temp_df) > 0: 
            min_index = temp_df.index[-1]
            
        #print(temp_df)
        #print(temp_df.score.sum())
        #print(loop_stocks_df)
        #break

        #//*** Add the single line of loop_stocks_df to bin_df    
        out_df = pd.concat([out_df,loop_stocks_df])
            #print(loop_df.index)
            #print(loop_df)
            #cdf = cdf.drop(index=loop_df.index)
    print(f"Merge Built: {round(time.time()-start_time,2)}s")   
    return out_df

#model_df = group_dataframe_by_time(stock_df,raw_df,symbol)


In [3]:
#//*** Get all Reddit Comments and merge into a single Collection

#process_tfidf = False

#//**** Aggregate comments into a single dataframe
raw_df = build_all_comments(subreddits,filepath,filename_suffix)

raw_df

#//*** Trim dataframe to start in January 20202-01-02
start_trim_date = pd.to_datetime("2020-01-02")
raw_df = raw_df[raw_df['created_utc'] >= start_trim_date]


Reading Compressed CSV: ./data/wallstreetbets_comments.csv.zip
Reading Compressed CSV: ./data/stocks_comments.csv.zip
Reading Compressed CSV: ./data/wallstreetbetsOGs_comments.csv.zip
Reading Compressed CSV: ./data/spacs_comments.csv.zip
Reading Compressed CSV: ./data/investing_comments.csv.zip
Reading Compressed CSV: ./data/pennystocks_comments.csv.zip
Reading Compressed CSV: ./data/stockmarket_comments.csv.zip
Reading Compressed CSV: ./data/options_comments.csv.zip
Reading Compressed CSV: ./data/robinhoodpennystocks_comments.csv.zip
Reading Compressed CSV: ./data/wallstreetbetsnew_comments.csv.zip
Reading Compressed CSV: ./data/smallstreetbets_comments.csv.zip
Files Loaded: 40.96s
Total Records: 4432533


In [4]:
raw_df

Unnamed: 0,created_utc,score,total_awards_received,is_submitter,author_fullname,body
11770,2020-01-31 15:56:13,1,0.0,True,t2_3yo4vdyj,I thought he didn‚Äôt kill people ü§î
11771,2020-01-31 15:56:17,1,0.0,True,t2_127kom,Looks like 4pm is where your chart ends at the...
11772,2020-01-31 15:56:19,1,0.0,True,t2_49f6b0hp,Notice how I said Trump‚Äôs America
11773,2020-01-31 15:56:21,1,0.0,True,t2_56mvp9vo,If trump is exhonerated this weekend üöÄüöÄüöÄ
11774,2020-01-31 15:56:22,1,0.0,True,t2_15yfqh,Lol after yesterday I'm going monthly
...,...,...,...,...,...,...
4432528,2021-06-30 20:34:33,1,0.0,True,t2_8kmwyads,"Jun 30, 9.30pm EST.\n\n \nI just saw this pos..."
4432529,2021-06-30 21:08:36,1,0.0,False,t2_ah5dhj6a,Good run through. The mill is running and we‚Äôr...
4432530,2021-06-30 22:11:10,0,0.0,False,t2_6ypqa,Back to normal not there yet
4432531,2021-06-30 22:16:51,1,0.0,False,t2_9ypzayi0,üíéüëêüöÄüöÄüöÄüöÄ


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

#//**** Get all Stocks to Model
interval="daily"

#//*** Path to processed files

#//*** Path to the stock ticker JSON file
stock_ticker_filename = "./data/stock_tickers.json"

#//*** Convert Path to Mac formatting if needed
#if platform.system() == 'Darwin':
#    output_filename = output_filename.replace("\\","/")
#    stock_ticker_filename = stock_ticker_filename.replace("\\","/")

#//*** Load the Stock Tickers
f = open(stock_ticker_filename, "r")
symbols = json.loads(f.read())['symbols']

#symbols = ["amc"]
f.close()

#//*** Convert symbols to lower case
symbols = [x.lower() for x in symbols]

start_process_time = time.time()

for symbol in symbols:
    start_symbol_time = time.time()
    
    print("==========")
    print("==========")
    print("==========")
    print(f"Building {symbol} Targets...")
    #//*** Build stock price and target columns to predict
    stock_df = build_stocks(symbol,'daily',[1])  

    print(f"Dropping {symbol} dates before {start_trim_date}")
    stock_df = stock_df[stock_df['time'] >= start_trim_date]

    print(f"Dropping {symbol} dates after last comment {raw_df['created_utc'].iloc[-1]}")
    #//*** Remove stocks that are older than comments
    stock_df = stock_df[stock_df['time'] <= raw_df['created_utc'].iloc[-1]]
    
    print(f"Merging {symbol} with comments")
    #//*** Combine
    model_df = group_dataframe_by_time(stock_df,raw_df,symbol)
    
    
    model_df['body'] = model_df['body'].replace(np.nan, " ")
    
    tfidf = TfidfVectorizer()

    print(f"Begin {symbol} tfidf....")
    start_time = time.time()
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(model_df['body'])
    print(f"tfidf Built: {round(time.time()-start_time,2)}s")
    
    print(f"Pickling {symbol} tfidf...")
    outfile = open(f"./ignore_folder/model_ready_{symbol}_{interval}_tfidf_.pkl","wb")
    pickle.dump(tfidf,outfile)
    outfile.close()
    print(f"Pickling {symbol} tfidf_matrix")
    outfile = open(f"./ignore_folder/model_ready_{symbol}_{interval}_tfidf_matrix.pkl","wb")
    pickle.dump(tfidf_matrix,outfile)
    outfile.close()
    
    print(f"Saving {symbol} stocks with target(s)")
    del model_df['body']
    del model_df['author_fullname']
    #//*** Drop the comments and comment count and save for modeling
    model_df.to_csv(f"./ignore_folder/model_ready_{symbol}_{interval}.csv.zip",compression="zip",index=False)
    
    print(f"{symbol} Processing Complete: {round(time.time()-start_symbol_time,2)}")
    

print (f"All Processing Complete : {round(time.time()-start_process_time,2)}s")

Building clov Targets...
Dropping clov dates before 2020-01-02 00:00:00
Dropping clov dates after last comment 2021-06-30 22:40:35
Merging clov with comments
Processing...
Merge Built: 6.85s
Begin clov tfidf....
tfidf Built: 0.02s
Pickling clov tfidf...
Pickling clov tfidf_matrix
Saving clov stocks with target(s)
clov Processing Complete: 7.29
Building sofi Targets...
Dropping sofi dates before 2020-01-02 00:00:00
Dropping sofi dates after last comment 2021-06-30 22:40:35
Merging sofi with comments
Processing...
Merge Built: 4.6s
Begin sofi tfidf....
tfidf Built: 0.02s
Pickling sofi tfidf...
Pickling sofi tfidf_matrix
Saving sofi stocks with target(s)
sofi Processing Complete: 4.89
Building wkhs Targets...
Dropping wkhs dates before 2020-01-02 00:00:00
Dropping wkhs dates after last comment 2021-06-30 22:40:35
Merging wkhs with comments
Processing...
Merge Built: 8.55s
Begin wkhs tfidf....
tfidf Built: 0.01s
Pickling wkhs tfidf...
Pickling wkhs tfidf_matrix
Saving wkhs stocks with targ

In [6]:
model_df['body']

KeyError: 'body'

In [None]:

gg = pd.read_csv(f"./data/model_ready_{symbol}_{interval}.csv.zip")
gg


In [None]:
#//*** Group the stock values by time, this is essentially like itterrows()



In [None]:
fig,ax = plt.subplots()

ax.plot(model_df['time'],model_df['comment_count'] )



    #plt.legend(loc='upper right',bbox_to_anchor=(1.35, 1.2))
plt.show()

fig,ax = plt.subplots()

ax.plot(model_df['time'],model_df['comment_count'] )



    #plt.legend(loc='upper right',bbox_to_anchor=(1.35, 1.2))
plt.show()



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from scipy.stats import pearsonr

import time
from sklearn.neural_network import MLPRegressor
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

In [None]:
results_df = pd.DataFrame(columns=['target','rmse','r2','start','mid','end','actual','predict'])

In [None]:
predict_df = pd.DataFrame(columns=['time','actual','predict'])

In [None]:
#//*** Lengthy K-Means Run. Runs a model on every day after 60+ and predicts the next day. Uses the previous 60 days to predict the next day. 
#//*** It may or may not be helpful to run k-means across the whole tfidf instead of the previous 60.

predict_df = pd.DataFrame(columns=['time','actual','predict','best_r'])

#//*** Loop through model_df. Model each day individually. Train 60 days, Predict 61.
training_days = 60
train_start_slice = 0
train_end_slice = training_days
cluster_iterations = 50
predict_col = [f'target_1']
train_cols = ['cluster_1','close','volume','open','high','low']
#Loop through combined 
for row_int in range(0,len(tdf)-training_days-1):
    
    
    #//*** Build the slice start index
    start_dex = train_start_slice+row_int
    
    #//*** Build the slice end index
    end_dex = train_end_slice+row_int
    
    dc = f"{start_dex}/{len(model_df)}"

    #//*** Build the sliced training df
    train_df = model_df[start_dex:end_dex].copy()

    tfidf = TfidfVectorizer()

    #print(dc, " Starting tfidf....")
    start_time = time.time()
    tfidf = TfidfVectorizer()
    train_tfidf = tfidf.fit_transform(train_df['body'])
    print(dc,f"tfidf Built: {round(time.time()-start_time,2)}s")
    
    
    #//*** Build the row index on the prediction (train_end_slice+1)
    predict_dex = row_int+train_end_slice+1
    
    #print("Starting K-Means...")
    offset=1
    best_r = 0
    best_cluster = []
    for x in range(1,cluster_iterations):

        start_time=time.time()
        kmeans = KMeans(n_clusters=5,init='random').fit(train_tfidf)
        cluster = kmeans.predict(train_tfidf)
        r,p = pearsonr(train_df[f'target_{offset}'],cluster)
        r = abs(r)
        #print(r,round(p,6))

        if p < .05:
            if r > best_r:
                best_r = r
                best_cluster = cluster
            #plt.legend(loc='upper right',bbox_to_anchor=(1.35, 1.2))
        if x == 1:
            print(dc,f"Estimated Cluster Run Time: {round( (time.time()-start_time)*cluster_iterations-1,2) }s")
    #print(f"Best R:{best_r}")
     
    train_df[f'cluster_{offset}'] = best_cluster

    
    x_train = np.array(train_df[train_cols])
    y_train = np.array(train_df[predict_col])

    x_test = pd.DataFrame(model_df.iloc[end_dex+1].copy()).transpose()
    
    x_test[f'cluster_{offset}'] = best_cluster[0]
    x_test = np.array(x_test[train_cols])
    

    y_test = model_df[predict_col].iloc[end_dex:end_dex+1]

    #regr_iter = 20

    start_time = time.time()

    # Create linear regression object
    regr = linear_model.LinearRegression()

    # Train the model using the training sets
    regr.fit(x_train, y_train)
    result = regr.predict(x_test)



    print(dc,f"[{result[0][0]}]  Actual: {model_df.iloc[predict_dex]['close']} - Best R2: {best_r}")
    
    
    #//*** Add Results to the predict_df
    predict_df.loc[len(predict_df)] = [model_df.iloc[predict_dex]['time'],model_df.iloc[predict_dex]['close'],result[0][0],best_r]

print("Done!")
predict_df

In [None]:
predict_df.to_csv("./results/amc_daily_60_to_1_model.csv",index=False)

In [None]:
predict_df
fig,ax = plt.subplots()

fig.set_figheight(10)
fig.set_figwidth(20)
plot_x = np.arange(len(y_test))
ax.plot(predict_df['time'],predict_df['actual'] )
ax.scatter(predict_df['time'],predict_df['predict'],color='red' )
plt.suptitle(f"AMC Model Train 60: Predict 1")
plt.show()

In [None]:
x_test = (model_df.iloc[end_dex+1].copy())
x_test = pd.DataFrame(model_df.iloc[end_dex+1].copy()).transpose()
x_test[f'cluster_{offset}'] = best_cluster[0]
#x_test = x_test.append( pd.Series(best_cluster[0], index = [f'cluster_{offset}']) )
x_test
x_train
x_train = np.array(train_df[train_cols].iloc[start_dex:end_dex])
y_train = np.array(train_df.iloc[start_dex:end_dex][predict_col],)
train_df


In [None]:
#//*** Train 60
#//*** Model 30
#//*** The Original K-Means Model


#//*** Divide up the data in 60 day intervals to predict the next 30 days
days = 30 
train_cols = ['cluster_1','close','volume','open','high','low']
cluster_iterations = 50


for index in range(2,int(len(tdf)/days)):
    
    min_slice = ((index-2)*days)
    mid_slice = index*days
    max_slice = (index*days)+days
    print(index,"Train Range:",min_slice,mid_slice," - Test Range:", mid_slice, max_slice  )
    
    train_df = tdf[min_slice:max_slice].copy()

    tfidf = TfidfVectorizer()

    print("Starting tfidf....")
    start_time = time.time()
    tfidf = TfidfVectorizer()
    train_tfidf = tfidf.fit_transform(train_df['body'])
    print(f"Built: {round(time.time()-start_time,2)}")
    
    print("Starting K-Means...")
    offset=1
    best_r = 0
    best_cluster = []
    for x in range(1,cluster_iterations):

        start_time=time.time()
        kmeans = KMeans(n_clusters=5,init='random').fit(train_tfidf)
        cluster = kmeans.predict(train_tfidf)
        r,p = pearsonr(train_df[f'target_{offset}'],cluster)
        r = abs(r)
        #print(r,round(p,6))

        if p < .05:
            if r > best_r:
                best_r = r
                best_cluster = cluster
            #plt.legend(loc='upper right',bbox_to_anchor=(1.35, 1.2))
        if x == 1:
            print(f"Estimated Cluster Run Time: {round( (time.time()-start_time)*cluster_iterations-1,2) }s")
    print(f"Best R:{best_r}")
    train_df[f'cluster_{offset}'] = best_cluster
    
     
    for tgt in [1,2,3,4,5,6,7,8,9,10]:
        predict_col = [f'target_{tgt}']
        x_train = np.array(train_df[train_cols].iloc[0:days*2*-1])
        y_train = np.array(train_df.iloc[0:days*2*-1][predict_col],)

        x_test = train_df[train_cols].iloc[days*2*-1:]
        y_test = train_df[predict_col].iloc[days*2*-1:]

        regr_iter = 20

        start_time = time.time()

        # Create linear regression object
        regr = linear_model.LinearRegression()

        # Train the model using the training sets
        regr.fit(x_train, y_train)
        result = regr.predict(x_test)

        mse = mean_squared_error(y_test, result)
        r2 = r2_score(y_test,result)
        
        #//*** Root Mean squared Error
        rmse = sqrt(mse)

        # Plot outputs
        display_size = 40

        fig,ax = plt.subplots()
        plot_x = np.arange(len(y_test))
        ax.plot(plot_x,y_test )
        ax.scatter(plot_x,result,color='red' )
        plt.suptitle(f"{predict_col}\nRmse: {rmse} - r2 {r2}")
        plt.show()
        
        

        #print(rmse)
        #print(result)

        #//*** Update Results_df
        results_df.loc[len(results_df)] = [predict_col[0],rmse,r2,min_slice,mid_slice,max_slice,np.array(y_test),result]
print('done')

In [None]:
results_df

In [None]:
#results_df.to_pickle("./results_amc_daily_kmeans_ols.pkl")


In [None]:
print(model_df)
tfidf = TfidfVectorizer()
whole_tfidf = tfidf.fit_transform(model_df['body'])
whole_tfidf

In [None]:
t = whole_tfidf.toarray()


In [None]:
display_size = 40

fig,ax = plt.subplots()
fig.set_figheight(10)
fig.set_figwidth(20)
plot_x = np.arange(len(t[0]))
for x in t:
    ax.scatter(plot_x,x,s=1 )
plt.show()

In [None]:
vocab_df = pd.DataFrame()
vocab_df['word'] = tfidf.vocabulary_
vocab_df['score'] = tfidf.idf_

In [None]:
tfidf_df = pd.DataFrame(whole_tfidf.toarray())
tfidf_df.columns = tfidf.vocabulary_

#tfidf_df[tfidf_df.columns[:10]]


list(vocab_df.sort_values('score',ascending=True)['word'])
#tfidf_df.columns = (range(len(tfidf_df.columns)))
tfidf_df[tfidf_df.columns[:10]]

In [None]:
vocab_df = pd.DataFrame()
vocab_df['word'] = tfidf.vocabulary_
vocab_df['score'] = tfidf.idf_
tfidf_df = pd.DataFrame(whole_tfidf.toarray())
ascending_vals = list(tfidf_df.sum().sort_values(ascending=True).index)
descending_vals = list(tfidf_df.sum().sort_values(ascending=False).index)
vocab_vals = list(vocab_df.sort_values('score',ascending=True)['word'])

In [None]:
from sklearn.decomposition import TruncatedSVD
tsvd = TruncatedSVD(3)
tsvd_df = pd.DataFrame(tsvd.fit_transform(tfidf_df[descending_vals[:100]]))
print(tsvd.explained_variance_ratio_.sum())
tsvd_df**2

In [None]:
from sklearn.model_selection import train_test_split

tfidf_df[descending_vals[:1000]]

train_stock_cols = ['close','volume','open','high','low']
train_stock_cols = ['close']
model_df.reset_index(drop=True,inplace=True)

#//*** Combine the model columns for training with the tfidf columns
X = np.array(pd.concat([model_df[train_stock_cols],tfidf_df[descending_vals[:100]]],ignore_index=True,axis=1))
X = np.array(pd.concat([model_df[train_stock_cols],tsvd_df],ignore_index=True,axis=1))
X = np.array(pd.concat([tsvd_df**2,model_df[train_stock_cols]],ignore_index=True,axis=1))
#X = model_df[train_stock_cols]
y = model_df['target_1']

x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2, shuffle=False)

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(x_train, y_train)
result = regr.predict(x_test)

mse = mean_squared_error(y_test, result)
r2 = r2_score(y_test,result)

#//*** Root Mean squared Error
rmse = sqrt(mse)



fig,ax = plt.subplots()

fig.set_figheight(10)
fig.set_figwidth(20)
plot_x = np.arange(len(y_test))
ax.plot(plot_x,y_test )
ax.scatter(plot_x,result,color='red' )
#ax.scatter(predict_df['time'],predict_df['predict'],color='red' )
plt.suptitle(f"r2: {r2} rmse: {rmse}")
plt.show()

In [None]:

x_test = pd.DataFrame(model_df.iloc[end_dex+1].copy()).transpose()

x_test[f'cluster_{offset}'] = best_cluster[0]
x_test = np.array(x_test[train_cols])





y_test = model_df[predict_col].iloc[end_dex:end_dex+1]

#regr_iter = 20

start_time = time.time()




In [None]:
plot_df = tfidf_df[descending_vals]
display_size = 40

fig,ax = plt.subplots()
fig.set_figheight(10)
fig.set_figwidth(20)
plot_x = np.arange(len(plot_df.columns))
for x in range(len(plot_df)):
    ax.scatter(plot_x,plot_df.iloc[x],s=1 )
plt.show()


fig,ax = plt.subplots()
fig.set_figheight(10)
fig.set_figwidth(20)
plot_x = np.arange(len(plot_df.columns[:1000]))
for x in range(len(plot_df)):
    ax.scatter(plot_x,plot_df[plot_df.columns[:1000]].iloc[x],s=1 )
plt.show()

In [None]:
for x in plot_df[plot_df.columns[:100]]:
    print(x)

In [None]:
display_size = 40

fig,ax = plt.subplots()
fig.set_figheight(10)
fig.set_figwidth(20)
plot_x = np.arange(len(plot_df.columns))
ax.scatter(plot_df.index,plot_df[plot_df.columns[0]])
plt.show()

In [None]:
plot_df = tfidf_df[vocab_vals]
display_size = 40

fig,ax = plt.subplots()
fig.set_figheight(10)
fig.set_figwidth(20)
plot_x = np.arange(len(plot_df.columns))
for x in range(len(plot_df)):
    ax.scatter(plot_x,plot_df.iloc[x],s=1 )
plt.show()

In [None]:
display_size = 40

fig,ax = plt.subplots()
fig.set_figheight(10)
fig.set_figwidth(20)
plot_x = np.arange(len(tfidf_df.columns))

#for x in range(len(tfidf_df)):
ax.scatter(plot_x,tfidf_df.iloc[0][descending_vals],s=5,color='green' )
#tfidf_df.columns = tfidf_df.sum().sort_values(ascending=True).index
ax.scatter(plot_x,tfidf_df.iloc[0][ascending_vals],s=2,color='black' )
plt.show()

In [None]:
display_size = 40

fig,ax = plt.subplots()
fig.set_figheight(100)
fig.set_figwidth(200)
plot_x = np.arange(len(tfidf_df.columns))
for x in range(len(tfidf_df)):
    ax.scatter(plot_x,tfidf_df.iloc[x],s=1 )
plt.show()

In [None]:
"""

print("Finding best cluster for classification")

train_slice = (270,300)
train_interval = 10

cluster_iterations = 50

train_df = tdf[train_slice[0]:train_slice[1]+train_interval].copy()

tfidf = TfidfVectorizer()

print("Starting tfidf....")
start_time = time.time()
tfidf = TfidfVectorizer()
train_tfidf = tfidf.fit_transform(train_df['body'])


print(f"Built: {round(time.time()-start_time,2)}")
print("Starting K-Means...")
offset=1
best_r = 0
best_cluster = []
for x in range(1,cluster_iterations):
    
    start_time=time.time()
    kmeans = KMeans(n_clusters=5,init='random').fit(train_tfidf)
    cluster = kmeans.predict(train_tfidf)
    r,p = pearsonr(train_df[f'target_{offset}'],cluster)
    r = abs(r)
    #print(r,round(p,6))
    
    if p < .05:
        if r > best_r:
            best_r = r
            best_cluster = cluster
        #plt.legend(loc='upper right',bbox_to_anchor=(1.35, 1.2))
    if x == 1:
        print(f"Estimated Cluster Run Time: {round( (time.time()-start_time)*cluster_iterations-1,2) }s")
print(best_r,best_cluster)
train_df[f'cluster_{offset}'] = best_cluster
train_df
   
   """
print()

In [None]:
 
"""

train_cols = ['cluster_1','close','volume','open','high','low']
predict_col = ['target_1']
x_train = np.array(train_df[train_cols].iloc[0:train_interval*-1])
y_train = np.array(train_df.iloc[0:train_interval*-1][predict_col],)

x_test = train_df[train_cols].iloc[train_interval*-1:]
y_test = train_df[predict_col].iloc[train_interval*-1:]


import time
from sklearn.neural_network import MLPRegressor
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

regr_iter = 20

start_time = time.time()

print("Regressing")


#regr = MLPRegressor(max_iter=50000).fit(x_train,y_train)
#scores = cross_val_score(regr, x_train, y_train, cv=5)
print(f"Complete: {round(time.time()-start_time,2)}" )
#//*** Score the model
#score = regr.score(x_train, y_train)
#result = regr.predict(x_test)


# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(x_train, y_train)
result = regr.predict(x_test)
# Plot outputs
display_size = 40

fig,ax = plt.subplots()
plot_x = np.arange(len(y_test))
ax.plot(plot_x,y_test )
ax.scatter(plot_x,result,color='red' )

plt.show()
mse = mean_squared_error(y_test, result)

#//*** Root Mean squared Error
rmse = sqrt(mse)

print(rmse)
print(result)

"""

In [None]:
#//***Aggreate Comments for Training
#//*** Build tfidf
from sklearn.feature_extraction.text import TfidfVectorizer


#loop_list.append(tfidf.fit_transform(input_df['tfidf']))
tfidf_matrix = []
tfidf_list = []
tfidf = TfidfVectorizer()

print("Starting tfidf....")
start_time = time.time()
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(tdf['body'])


print(f"Built: {round(time.time()-start_time,2)}")

print(tfidf_matrix)
print

In [None]:
"""
#//*** Build TruncatedSVD

#from sklearn.decomposition import PCA
#from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import TruncatedSVD

start_time = time.time()
print(f"Begin Truncated SVD ")

start_time=time.time()
#//*** Set the number of components to 6000. This is generating 98% variance capture
#//*** 60min data set took around 25minutes to build
tsvd = TruncatedSVD(250)
tsvd_df = pd.DataFrame(tsvd.fit_transform(tfidf_matrix))
print(tsvd.explained_variance_ratio_.sum())

print (f"Truncated SVD Done: {round(time.time()-start_time,2)}s")
#output_filename = './ignore_folder/tsvd_model_ready_daily.csv.zip'
#//*** Write Truncated SVD to disk
#tsvd_df.to_csv(output_filename, compression='zip', index=False)
"""
print("Truncated SVD Code")

LTSM: https://www.datacamp.com/community/tutorials/lstm-python-stock-market


In [None]:
import time
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

In [None]:
tsvd_df

In [None]:
fig,ax = plt.subplots()

for col in tsvd_df.columns:

    ax.scatter(tsvd_df[col],tsvd_df.index,label=col )


    #plt.legend(loc='upper right',bbox_to_anchor=(1.35, 1.2))
plt.show()

In [None]:
from sklearn.cluster import KMeans

# function returns WSS score for k values from 1 to kmax
def calculate_WSS(points, kmax):
  sse = []
  for k in range(1, kmax+1):
    kmeans = KMeans(n_clusters = k).fit(points)
    centroids = kmeans.cluster_centers_
    pred_clusters = kmeans.predict(points)
    curr_sse = 0
    
    # calculate square of Euclidean distance of each point from its cluster center and add to current WSS
    for i in range(len(points)):
        curr_center = centroids[pred_clusters[i]]
        curr_sse += (points[i, 0] - curr_center[0]) ** 2 + (points[i, 1] - curr_center[1]) ** 2
      
    sse.append(curr_sse)
    return sse

dir(tfidf_matrix)
#kmeans = KMeans(n_clusters=2).fit(tfidf_matrix)
dir(kmeans)
kmeans.score(tfidf_matrix)
k_scores = []
for x in [5,10,15,20,25,50]:
    start_time=time.time()
    kmeans = KMeans(n_clusters=x).fit(tfidf_matrix)
    loop_score = kmeans.score(tfidf_matrix)
    print(f"{x} - {loop_score} - {time.time()-start_time}s")
    k_scores.append(loop_score)
print(k_scores)

In [None]:
fig,ax = plt.subplots()

ax.plot(range(len(k_scores)),k_scores )


    #plt.legend(loc='upper right',bbox_to_anchor=(1.35, 1.2))
plt.show()

Kmeans + Mean Shift: https://jamesxli.blogspot.com/2012/03/on-mean-shift-and-k-means-clustering.html


In [None]:
dir(kmeans)