<a href="https://colab.research.google.com/github/limshaocong/analyticsEdge/blob/main/Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Libraries & Data Import Functions**

In [43]:
suppressMessages(library(tidyverse)) # generic must have package
library(dplyr)
library(ggplot2) # plotting package
library(lubridate) # easy comprehension of dates from string to correct datetime format
library(data.table)
library(purrr) # reduce
#if("caret" %in% rownames(installed.packages()) == FALSE) {install.packages("caret")}
#library(caret)
#if("psych" %in% rownames(installed.packages()) == FALSE) {install.packages("psych")}
#library(psych) # unscaling
#if("padr" %in% rownames(installed.packages()) == FALSE) {install.packages("padr")}
#library(padr)
#if("janitor" %in% rownames(installed.packages()) == FALSE) {install.packages("janitor")}
#library(janitor)
if("anytime" %in% rownames(installed.packages()) == FALSE) {install.packages("anytime")}
library(anytime)

options(repr.plot.width = 9,
        repr.plot.height = 6,
        repr.plot.pointsize = 20)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependency ‘BH’




Using the following functions to extract the daily trading data, twitter sentiment analysis, r/wsb sentiment analysis and news sentiment analysis. Thereafter, left-joining the data to daily trading data by ticker-date pairs to reduce the entire dataset down to trading days only.

In [68]:
git.path = "https://raw.githubusercontent.com/limshaocong/analyticsEdge/main/Datasets/"

# Prices
# Source: CRSP

price.path = "CRSP_WRDS/crsp_daily_stock_price_2020.csv"

get.prices <- function(tickerlist) {

  path = paste0(git.path, price.path)

  df = read.csv(path) %>%
    filter(ticker %in% tickerlist) %>%
    #mutate(as.Date(date, format = "%m/%d/%y")) %>%
    mutate(date = as.Date(date, format = "%m/%d/%y")) %>%
    rename(low = BIDLO,
          high = ASKHI,
          open = OPENPRC,
          close = PRC,
          vol = VOL,
          Ntrades = NUMTRD) %>%
    select(ticker, date, open, close, high, low) %>%
    mutate(change = close - open,
          dayspread = high - low)
}

# Twitter Sentiment Analysis
# Source: Open source
# Scrapped by snscrape
# Sentiment Analysis by BERT - zer0-shot, multilingual, sentiment model

twtr.path = "Imputed/twitter_sa.csv"

get.twitter.sa <- function (tickerlist){
  path = paste0(git.path, twtr.path)
  
  df = read.csv(path) %>%
    filter(ticker %in% tickerlist) %>%
    #mutate(date = as.Date(date, format = "%m/%d/%y"))
    mutate(date = anydate(date))

}

# r/wsb Sentiment Analysis
# Source: Quiver Quant
# Sentiment based on VADER sentiment

wsb.path = "Imputed/wsb_imputed_min0.csv"

get.wsb.sa <- function(tickerlist){

  path = paste0(git.path, wsb.path)

  df = read.csv(path) %>%
    filter(Ticker %in% tickerlist) %>%
    mutate(Date = mdy(Date)) %>%
    pad(start_val = as.Date('2020-01-01'), end_val = as.Date('2020-12-31')) %>%
    rename(wsbsentiment = Sentiment,
          wsblog10mentions = log10Mentions) %>%
    select(Ticker, Date, wsblog10mentions, wsbsentiment) %>%
    rename_with(tolower) %>%
    mutate(date = anydate(date))
}

# News Sentiment - Dow Jones + Global Press

pr.path = "Imputed/rp_imputed_min500.csv"

get.news.sa <- function(tickerlist){

  path = paste0(git.path, pr.path)

  df = read.csv(path) %>%
    filter(Date >= as.Date("2020-01-01") & Date <= as.Date("2020-12-31")) %>%
    filter(ticker %in% tickerlist) %>%
    #mutate(date = as.Date(Date, format = "%m/%d/%y")) %>%
    rowwise() %>%
    mutate(newssentiment = mean(DJ_mean_ess * DJ_news_instance, PR_mean_ess * PR_news_instance)/All_news_instance) %>%
    replace(is.na(.), 0) %>%
    mutate(newssentiment = (newssentiment-50)/100) %>%
    rename(newslog10mentions = log10Allmentions) %>%
    select(ticker, Date, newssentiment, newslog10mentions) %>%
    rename_with(tolower) %>%
    mutate(date = anydate(date))
}

# Compile all data
# Left join to prices df retaining only trading days

get.all.data <- function(tickerlist) {

  prices = get.prices(tickerlist)
  print("price ok")
  prices$date = as.Date(prices$date)

  twitter.sa = get.twitter.sa(tickerlist)
  print("twtr ok")
  twitter.sa$date = as.Date(twitter.sa$date)

  wsb.sa = get.wsb.sa(tickerlist)
  print("wsb ok")
  wsb.sa$date = as.Date(wsb.sa$date)

  news.sa = get.news.sa(tickerlist)
  print("news ok")
  news.sa$date = as.Date(news.sa$date)

  df = list(prices, wsb.sa, news.sa, twitter.sa) %>%
            reduce(left_join, by = c("ticker", "date"))
}

check.missing.data <- function(df) {
  complete.row = sum(complete.cases(df))
  actual.row = dim(df)[1]

  if(complete.row == actual.row) {
    print("No missing data")
  } else {
    print("Missing data present")
  }
}

In [69]:
tickerlist = list("UBER", "AMZN", "FB")

imported_df = get.all.data(tickerlist) %>%
  arrange(ticker, date)

ticker.count = length(unique(imported_df[["ticker"]]))

check.missing.data(imported_df)

[1] "price ok"
[1] "twtr ok"


“All formats failed to parse. No formats found.”
“There are NA values in the column Date. The records with NA values are returned
in the final rows of the dataframe.”
“no non-missing arguments to min; returning Inf”
“no non-missing arguments to max; returning -Inf”
pad applied on the interval: 365 day



[1] "wsb ok"
[1] "news ok"
[1] "Missing data present"


In [72]:
missing_df = imported_df[rowSums(is.na(imported_df)) > 0,]
#missing_df
missing_df %>% pull(ticker) %>% unique()

# **(Slight) Feature Engineering**

In [440]:
# Add in target variable

df = imported_df %>%
  group_by(ticker) %>%
  #mutate_at(c(3:8), scale) %>%
  mutate(target = lead(close, n = 1, default = NA)) %>% # target using by taking the $close of the next period
  mutate(prevclose1 = lag(close, n = 1, default = NA), # additional variables by lagging earlier $close
         prevclose2 = lag(close, n = 2, default = NA), # in log2 scale, kaggle trick
         prevclose4 = lag(close, n = 4, default = NA),
         prevclose8 = lag(close, n = 8, default = NA)) %>%
  mutate(prevwsbsentiment1 = lag(wsbsentiment, n = 1, default = NA), # additional variables by lagging earlier $wsbsentiment
         prevwsbsentiment2 = lag(wsbsentiment, n = 2, default = NA), # in log2 scale, kaggle trick
         prevwsbsentiment4 = lag(wsbsentiment, n = 4, default = NA),
         prevwsbsentiment8 = lag(wsbsentiment, n = 8, default = NA)) %>%
  mutate(prevnewssentiment1 = lag(newssentiment, n = 1, default = NA), # additional variables by lagging earlier $newssentiment
         prevnewssentiment2 = lag(newssentiment, n = 2, default = NA), # in log2 scale, kaggle trick
         prevnewssentiment4 = lag(newssentiment, n = 4, default = NA),
         prevnewssentiment8 = lag(newssentiment, n = 8, default = NA)) %>%
  mutate(prevtwtrsentiment1 = lag(twtrsentiment, n = 1, default = NA), # additional variables by lagging earlier $twtrsentiment
         prevtwtrsentiment2 = lag(twtrsentiment, n = 2, default = NA), # in log2 scale, kaggle trick
         prevtwtrsentiment4 = lag(twtrsentiment, n = 4, default = NA),
         prevtwtrsentiment8 = lag(twtrsentiment, n = 8, default = NA)) %>%
  na.omit %>%
  as.data.frame() %>%
  relocate(target, .after = last_col())

In [195]:
""""
# Do scaling of all variables
# Retain means and sd for unscaling of data

scaling.mean = colMeans(df[sapply(df, is.numeric)])
scaling.sd = sapply(df[sapply(df, is.numeric)], sd)

df[map_lgl(df, is.numeric)] = df %>%
                              select(is.numeric) %>%
                              scale(center = scaling.mean, scale = scaling.sd)
"""

In [128]:
"""
# One-hot encoding for ticker

if (ticker.count > 1){
  dummy = dummyVars(" ~ .", data = df)
  df = data.frame(predict(dummy, newdata = df))
} 
"""

ERROR: ignored

Final check for missing data before model training

In [432]:
check.missing.data(df)

[1] "No missing data"


# **Train-Validate-Test Split**



In [443]:
split = unclass(as.Date("2020-09-30"))

# Train-test split
train = df %>% filter(date < split)
test = df %>% filter(date >= split)

train.days = dim(train)[1]/ticker.count
test.days = dim(test)[1]/ticker.count

train.prop = train.days/(train.days + test.days)

print("% of Training Data")
print(train.prop)
print(train.days)

[1] "% of Training Data"
[1] 0.7377049
[1] 180


This splitting of train-validate only serves as an illustration. The real splitting is embedded in the training process as per normal CV. However, as normal k-fold CV does not work on time series data due to the emphasis its temporal features, a sliding window approach is used (see Section 4.3 of https://topepo.github.io/caret/data-splitting.html#time)

In [434]:
# In this instance, with 180 days worh of data, there are 6 folds based on the chosen parameters

index = 1:train.days
folds = createTimeSlices(index, initialWindow = 95, horizon = 30, fixedWindow = TRUE, skip = 10)
lapply(folds, length)

Sample of how the folds are constructed is as follows.

In [435]:
folds$train
folds$test

# **Model Building and Selection**

In [436]:
library(rpart)
if("rpart.plot" %in% rownames(installed.packages()) == FALSE) {install.packages("rpart.plot")}
library(rpart.plot)
if("Metrics" %in% rownames(installed.packages()) == FALSE) {install.packages("Metrics")}
library(Metrics)
if("randomForest" %in% rownames(installed.packages()) == FALSE) {install.packages("randomForest")}
library(randomForest)

In [444]:
str(train)

#3 open
#14 twtrsentiment
#18 prevclose8

'data.frame':	180 obs. of  31 variables:
 $ ticker            : chr  "MSFT" "MSFT" "MSFT" "MSFT" ...
 $ date              : Date, format: "2020-01-14" "2020-01-15" ...
 $ open              : num  163 163 164 167 167 ...
 $ close             : num  162 163 166 167 166 ...
 $ high              : num  164 164 166 167 168 ...
 $ low               : num  162 163 164 165 166 ...
 $ change            : num  -1.26 0.56 1.82 -0.32 -0.18 ...
 $ dayspread         : num  1.88 1.37 2.21 2.04 1.76 ...
 $ wsblog10mentions  : num  1.34 1.26 1.74 2.06 1.72 ...
 $ wsbsentiment      : num  0.168395 0.123718 0.147098 0.073684 -0.000337 ...
 $ newssentiment     : num  -0.0783 0.14 0.024 -0.5 0.1475 ...
 $ newslog10mentions : num  0.845 0.699 0.778 0 0.699 ...
 $ twtrlog10mentions : num  2.53 2.44 2.64 2.73 2.49 ...
 $ twtrsentiment     : num  0.202 0.274 0.376 0.324 0.255 ...
 $ prevclose1        : num  163 162 163 166 167 ...
 $ prevclose2        : num  161 163 162 163 166 ...
 $ prevclose4        : num  

In [463]:
cart <- function(trainX, trainY) {
  
  train.control = trainControl(method = "timeslice",
                            initialWindow = 95,
                            horizon = 30,
                            fixedWindow = TRUE)

  cp.values = data.frame(.cp = seq(0, 0.005, by = 0.0001))

  model = train(x = trainX,
                y = trainY,
                method = "rpart",
                trControl = train.control,
                tuneGrid = cp.values)

}

randomforest <- function(trainX, trainY) {

  train.control = trainControl(method = "timeslice",
                                initialWindow = 95,
                                horizon = 30,
                                fixedWindow = TRUE)
  
  n.pred = dim(trainX)[2]
  mtry.low = round(n.pred/3, 0) - round(n.pred/6, 0)
  mtry.upp = mtry.low + (2 * round(n.pred/6, 0))
  mtry.grid = data.frame(mtry = seq(mtry.low, mtry.upp, by = 1))
      
  model = train(x = trainX,
                y = trainY,
                method = "rf",
                trControl = train.control,
                tuneGrid = mtry.grid,
                ntree = 80,
                nodesize = 15)

}

In [464]:
ticker = "MSFT"

set.seed(15071)

ticker.train = train %>% filter (ticker == ticker) %>% as.data.frame()
trainX = ticker.train[, c(3:18)]
trainY = ticker.train[["target"]]

ticker.test = test %>% filter (ticker == ticker) %>% as.data.frame()
testX = ticker.test[, c(3:18)]
testY = ticker.test[["target"]]
testC = ticker.test[["close"]]

naive.mape = smape(testC, testY)

cart.mod = cart(trainX, trainY)

rf.mod = randomforest(trainX, trainY)

“There were missing values in resampled performance measures.”
“There were missing values in resampled performance measures.”


In [465]:
naive.mape

In [467]:
cart.train.pred = predict(object = cart.mod$finalModel, newdata = trainX)
cart.test.pred = predict(object = cart.mod$finalModel, newdata = testX)

cart.train.mape = smape(cart.train.pred, trainY)
cart.test.mape = smape(cart.test.pred, testY)

cart.train.mape
cart.test.mape

In [468]:
rf.train.pred = predict(object = rf.mod$finalModel, newdata = trainX)
rf.test.pred = predict(object = rf.mod$finalModel, newdata = testX)

rf.train.mape = smape(rf.train.pred, trainY)
rf.test.mape = smape(rf.test.pred, testY)

rf.train.mape
rf.test.mape

Unnamed: 0,IncNodePurity
open,10899.73195
close,20168.44712
high,13819.51287
low,20417.24415
change,161.59573
dayspread,132.98412
wsblog10mentions,127.15054
wsbsentiment,119.54659
newssentiment,58.75977
newslog10mentions,18.18472
