# Naive Machine Learning Models for WSB Stock Prediction

I'll be using decision tree and random forest regressors (and, at the end, XGBoost) to predict the price of Tesla (TSLA) stock. I anticipate that my results won't be as good as the LSTM model's, but I wanted to see how the models we learned in class compare to it.

In [1]:
!pip install yfinance



In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
#from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import pickle
from pprint import pprint

import yfinance as yf

import datetime
import time

In [3]:
def min_max_date(df):
    """
    Get the min and max date of the ticker being mentioned in our WSB dataset

    df : DataFrame with a 'datetime' column whose values provide `.date()`

    Returns a tuple (min_date, max_date) holding the calendar dates of the
    earliest and latest 'datetime' values.

    Raises IndexError when df has no rows.
    """
    if len(df) == 0:
        raise IndexError("cannot take the min/max date of an empty DataFrame")

    timestamps = df["datetime"]
    # min()/max() scan the column once each; sorting the whole frame twice
    # just to read its first row is unnecessary work
    return (timestamps.min().date(), timestamps.max().date())

def explode_action_column(df):
    """
    Replace the 'sentiment' column, which holds one dict per row shaped like
    dict(puts: [., ., .,]
         calls: [., ., .,]
         sell: [., ., .,]
         buy: [., ., .,])

    with one column per dict key ...
    |puts|calls|sell|buy|

    The remaining columns are kept as they are; a new DataFrame is returned.
    """
    other_columns = df.drop(columns=['sentiment'])
    # Each dict becomes a row whose columns are the dict's keys
    action_columns = df['sentiment'].apply(pd.Series)
    return pd.concat([other_columns, action_columns], axis=1)

def date_from_datetime(df):
    """
    Return a copy of df with a 'Date' column holding the calendar date
    of each value in the 'datetime' column
    """
    return df.assign(Date=df["datetime"].dt.date)

def flatten_filter(df, ticker, filter = True):
    """
    Give every entry of each row's 'tickers' list its own row and, unless
    `filter` is False, keep only the rows whose entry equals `ticker`.
    The input frame is left untouched.
    """
    exploded = df.copy().explode('tickers')
    if not filter:
        return exploded
    return exploded[exploded['tickers'] == ticker]

def indicator_actions(df, ticker):
    """
    Make indicator variables for ticker instead of list of tickers

    df     : DataFrame with a 'sentiment' column of dicts whose 'puts',
             'calls', 'buy' and 'sell' entries are lists of ticker symbols
    ticker : symbol to look for

    Returns a new DataFrame in which 'sentiment' is replaced by the columns
    'puts', 'calls', 'buy' and 'sell', each 1 when `ticker` is one of the
    symbols listed for that action on that row and 0 otherwise.
    """
    df_copy = explode_action_column(df.copy())
    for action in ("puts", "calls", "buy", "sell"):
        # Exact membership in the list: a regex/substring search over the
        # stringified list would also fire for longer symbols that merely
        # contain `ticker` (e.g. 'T' inside 'TSLA')
        df_copy[action] = df_copy[action].apply(lambda mentioned: ticker in mentioned).astype(int)
    return df_copy

def plot_actions(df, ticker, min_date, max_date, hide_score = True):
    """
    Plot the aggregate sum of sell, buy, calls ,puts for each day within range for a specific ticker

    df         : DataFrame with a 'Date' column plus 'sell', 'buy', 'calls' and 'puts'
                 (and 'score' and 'ups' when hide_score is False)
    ticker     : symbol, used only in the plot title
    min_date   : lower bound of the plotted range (exclusive); must be comparable
                 with the values of the 'Date' column
    max_date   : upper bound of the plotted range (inclusive)
    hide_score : when False, the daily sums of 'score' and 'ups' are plotted as well
    """
    columns = ['Date', 'sell', 'buy', 'calls', 'puts']
    if not hide_score:
        columns += ['score', 'ups']

    # Aggregate financial action frequency per day
    agg_sell = df[columns].groupby('Date').agg('sum')

    # Filter for range specified (the requested min_date, so that the plotted
    # range agrees with the range stated in the title)
    mask = (agg_sell.index > min_date) & (agg_sell.index <= max_date)

    # Label through the Axes that DataFrame.plot returns, so no separately
    # imported pyplot module is needed
    ax = agg_sell.loc[mask].plot(figsize=(12, 10), linewidth=2.5)
    ax.set_xlabel("Date", labelpad=15)
    ax.set_ylabel("Movement Mentions", labelpad=15)
    ax.set_title("Movement mentions for {0} from {1} to {2}".format(ticker, str(min_date), str(max_date)), y=1.02, fontsize=22)

def plot_stock_vs_wsb(df,ticker, min_date, max_date, stock_col, action_col):
    """
    Plot a specific financial action mention in WSB comments vs historical stock data

    df         : comments DataFrame with 'datetime', 'Date', 'sell', 'buy', 'calls',
                 'puts', 'score' and 'ups' columns
    ticker     : symbol to download from Yahoo Finance; also used in the title
    min_date   : lower bound of the plotted range (exclusive)
    max_date   : upper bound of the plotted range (inclusive)
    stock_col  : column of the downloaded price data to plot
    action_col : column of the daily aggregated WSB data to plot

    NOTE(review): `plt` and `plot_multi` are not imported or defined in the
    visible part of this file -- confirm they are in scope before calling.
    """
    df_copy = df.copy()
    
    # Aggregate financial action frequency per day
    agg_sell = df_copy[['Date','sell','buy','calls','puts','score','ups']].groupby('Date').agg('sum')
    
    
    # Get the minimum and maximum date of that ticker mentioned on our WSB comment
    MIN_DATE, MAX_DATE = min_max_date(df)

    # Pull the data from yahoo finance api
    # (the download window comes from the comment dates in df, not from min_date/max_date)
    stock_data = yf.download(ticker, start = MIN_DATE, end = MAX_DATE)
    
    # Filter for range specified
    mask = (stock_data.index > np.datetime64(str(min_date))) & (stock_data.index <= np.datetime64(max_date))

    # Attach the daily WSB aggregates to the price rows by index and plot the two requested columns
    # NOTE(review): presumably plot_multi draws each column on its own y-axis -- confirm
    plot_multi(stock_data.loc[mask].join(other = agg_sell)[[stock_col,action_col]], figsize=(10,5))

    plt.xlabel("Date", labelpad=15)
    plt.ylabel("Daily {0} mentions".format(action_col), labelpad=15)
    plt.title("Daily {1} mentions vs {2} from {3} to {4} for {0}".format(ticker, action_col, stock_col, str(min_date), str(max_date)), y=1.02, fontsize=22);

In [4]:
# Load the pickled WSB comments dataset; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code -- only load a wsb.pkl from a trusted source.
with open("wsb.pkl", "rb") as wsb_file:
    data = pickle.load(wsb_file)

In [5]:
# One row per (comment, mentioned ticker), restricted to TSLA, with a 'Date' column added
tesla_df = date_from_datetime(flatten_filter(data, 'TSLA'))

In [6]:
ticker = "TSLA"

# Get the minimum and maximum date of that ticker mentioned on our WSB comment
MIN_DATE, MAX_DATE = min_max_date(tesla_df)

# add 5 days to the max date because we will be aggregating the prices of the
# days that follow each comment date, so prices past the last comment are needed
MAX_DATE += datetime.timedelta(days=5)

# Pull the data from yahoo finance api
stock_data = yf.download(ticker, start = MIN_DATE, end = MAX_DATE)

[*********************100%***********************]  1 of 1 completed


In [7]:
# Preview the first rows of the downloaded price data
stock_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-04-09,41.799999,41.830002,40.330002,40.5,40.5,1696100
2013-04-10,40.700001,42.009998,40.610001,41.860001,41.860001,2121100
2013-04-11,42.060001,44.549999,41.75,43.59,43.59,3447400
2013-04-12,43.25,45.139999,43.049999,43.75,43.75,3149400
2013-04-15,43.5,43.799999,42.509998,43.299999,43.299999,1681400


In [8]:
tesla_sent = indicator_actions(tesla_df, 'TSLA')
# Strip newline characters from the comment text. Matching the real newline
# character literally (regex=False) behaves the same on every pandas version,
# whereas the pattern '\\n' only means "newline" when it is treated as a regex.
tesla_sent["body"] = tesla_sent["body"].str.replace('\n', '', regex=False)

Diverges from here

In [9]:
# Keep the comment text, the four action indicators and the score, indexed by comment date
tesla_df2 = tesla_sent[['body','Date','sell','buy','calls','puts','score']].set_index('Date')
tesla_df2.head()

Unnamed: 0_level_0,body,sell,buy,calls,puts,score
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-08-07,TSLA is trading on news. Everytime some bad ne...,0,1,0,0,1
2014-04-26,Lots of big companies dipped today. Held onto ...,0,1,0,0,1
2014-05-08,Unless TSLA pulls a TRIP and rallies back to g...,0,1,0,0,1
2014-02-26,If you have TSLA stock your already making mon...,0,1,0,0,0
2014-05-08,&gt; Trash of a stock.Hardly. They're doing a ...,0,1,0,0,2


In [10]:
# for each day, calculate the average mid price ((high + low) / 2) for the next 3 days
# (near the end of the data fewer than 3 later days exist, so the average uses those that remain)
mid_price = (stock_data["High"] + stock_data["Low"]) / 2

# Column k holds the mid price k trading days ahead; the row-wise mean skips the
# NaN values that shifting introduces at the end of the series
next_three_mids = pd.concat([mid_price.shift(-ahead) for ahead in (1, 2, 3)], axis=1)
stock_data["mid_price_next_3_days"] = next_three_mids.mean(axis=1)

# the last date of stock prices will be NA because there is no next day mid price to average
stock_data = stock_data.dropna()

# we only want the mid price (to predict)
stock_mid_data = stock_data["mid_price_next_3_days"]

In [11]:
# not every day of comments has an associated mid_price_next_3_days
# so NaN will be replaced with the preceding mid price
# Forward-fill with .ffill(): same result as fillna(method='ffill'), whose
# `method` argument is deprecated in recent pandas releases
tesla_stock_df = tesla_df2.join(stock_mid_data).ffill()
tesla_stock_df.head()

Unnamed: 0_level_0,body,sell,buy,calls,puts,score,mid_price_next_3_days
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-04-09,"TSLA, SSYS, SCTY, OHI, RVBD",0,1,0,0,1,42.851666
2013-05-08,"*Zing!*JROD teach me your ways. I am noob, the...",0,1,0,0,2,76.143333
2013-05-09,I like the move esp in the short run although ...,1,0,0,0,1,82.611668
2013-05-10,I actually did play a oom strangle on TSLA bef...,0,1,0,0,1,85.068334
2013-05-10,I'm very curious what you paid here. On my bid...,0,1,0,0,2,85.068334


In [12]:
# Count missing vs. present values in every feature column (everything except the target)
for column in tesla_stock_df.drop(["mid_price_next_3_days"], axis=1).columns:
    print(tesla_stock_df[column].isna().value_counts())

False    12079
Name: body, dtype: int64
False    12079
Name: sell, dtype: int64
False    12079
Name: buy, dtype: int64
False    12079
Name: calls, dtype: int64
False    12079
Name: puts, dtype: int64
False    12079
Name: score, dtype: int64


# Decision Tree
Decision trees were one of the first models we learned in KDD; let's see how one does on this data.

In [13]:
# TF-IDF-vectorize the 'body' text; every other column is passed through unchanged
ct = make_column_transformer(
    (TfidfVectorizer(), 'body'),
    remainder='passthrough'
)

In [14]:
# Single regression tree with default settings; the seed is fixed so runs are repeatable
model = DecisionTreeRegressor(random_state=42)

In [15]:
# Column transformer -> scaling without centering -> decision tree
# NOTE(review): with_mean=False is presumably because the TF-IDF output is sparse -- confirm
pipeline = make_pipeline(ct, StandardScaler(with_mean=False), model)

In [16]:
# 10-fold cross-validation; the scorer reports negated MSE, so the sign is flipped
# to obtain one (positive) MSE per fold
scores = -cross_val_score(pipeline, tesla_stock_df.drop(["mid_price_next_3_days"], axis=1), tesla_stock_df["mid_price_next_3_days"], 
                     cv=10, scoring="neg_mean_squared_error")

In [17]:
# RMSE to predict the average price for the next 3 days is about 66 (not good)
# (square root of the mean of the per-fold MSEs)
np.sqrt(np.mean(scores))

66.39727589124422

### Conclusions

Ouch. Not well, apparently. An RMSE of about $66 is pretty terrible, given that Tesla stock does not move this much.

# Random Forest
Now, what if we combine many weak decision tree learners, i.e. a random forest?

In [38]:
# Forest of 20 trees, each limited to depth 10; the seed is fixed so runs are repeatable
rf_model = RandomForestRegressor(random_state=42, n_estimators=20, max_depth=10)

In [39]:
# Column transformer -> scaling without centering -> random forest
pipeline = make_pipeline(ct, StandardScaler(with_mean=False), rf_model)

In [40]:
# This takes a very long time to run
# Wall-clock timing around the 10-fold cross-validation of the random forest pipeline
start_time = time.time()
# The scorer reports negated MSE, so the sign is flipped to obtain one MSE per fold
scores = -cross_val_score(pipeline, tesla_stock_df.drop(["mid_price_next_3_days"], axis=1), tesla_stock_df["mid_price_next_3_days"], 
                     cv=10, scoring="neg_mean_squared_error")
print("Time take to finish: ", time.time() - start_time, "seconds.")

Time take to finish:  72.28455924987793 seconds.


In [41]:
# RMSE of the random forest: square root of the mean of the per-fold MSEs
np.sqrt(np.mean(scores))

50.39798349757181

# XGBoost
We will now use XGBoost (Extreme Gradient Boosting) to see how it compares to the other naive methods out of the box (with little to no tuning).

In [26]:
!pip install xgboost
import xgboost as xgb



In [34]:
# Gradient-boosted ensemble of 20 trees, each limited to depth 10; the seed is fixed so runs are repeatable
xgb_model = xgb.XGBRegressor(random_state=42, n_estimators=20, max_depth=10)

In [35]:
# Column transformer -> scaling without centering -> XGBoost regressor
pipeline = make_pipeline(ct, StandardScaler(with_mean=False), xgb_model)

In [36]:
# 10-fold cross-validation; the scorer reports negated MSE, so the sign is flipped
# to obtain one (positive) MSE per fold
scores = -cross_val_score(pipeline, tesla_stock_df.drop(["mid_price_next_3_days"], axis=1), tesla_stock_df["mid_price_next_3_days"], 
                     cv=10, scoring="neg_mean_squared_error")

In [37]:
# RMSE to predict the average price for the next 3 days is about 51
# (on par with the random forest, still not good)
np.sqrt(np.mean(scores))

50.78977170368405