<a href="https://colab.research.google.com/github/limshaocong/analyticsEdge/blob/main/Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [74]:
suppressMessages(library(tidyverse)) # generic must have package
library(dplyr)
library(ggplot2) # plotting package
library(lubridate) # easy comprehension of dates from string to correct datetime format
library(data.table)
library(purrr) # reduce
if("psych" %in% rownames(installed.packages()) == FALSE) {install.packages("psych")}
library(psych) # unscaling
if("padr" %in% rownames(installed.packages()) == FALSE) {install.packages("padr")}
library(padr)
if("janitor" %in% rownames(installed.packages()) == FALSE) {install.packages("janitor")}
library(janitor)

options(repr.plot.width = 9,
        repr.plot.height = 6,
        repr.plot.pointsize = 20)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘tmvnsim’, ‘mnormt’



Attaching package: ‘psych’


The following objects are masked from ‘package:ggplot2’:

    %+%, alpha




In [5]:
# Functions to pull data

git.path = "https://raw.githubusercontent.com/limshaocong/analyticsEdge/main/Datasets/"

# Prices
# Source: CRSP

price.path = "CRSP_WRDS/crsp_daily_stock_price_2020.csv"

get.prices <- function(tickerlist) {

  path = paste0(git.path, price.path)

  df = read.csv(path) %>%
    filter(ticker %in% tickerlist) %>%
    mutate(date = ymd(date)) %>%
    rename(low = BIDLO,
          high = ASKHI,
          open = OPENPRC,
          close = PRC,
          vol = VOL,
          Ntrades = NUMTRD) %>%
    select(ticker, date, open, close, high, low, vol, Ntrades) %>%
    mutate(change = close - open,
          dayspread = high - low,
          averagetrade = vol/Ntrades)
}

# Twitter Sentiment Analysis
# Source: Open source
# Scrapped by snscrape
# Sentiment Analysis by BERT - zer0-shot, multilingual, sentiment model

twtr.path = "Imputed/twitter_sa.csv"

get.twitter.sa <- function (tickerlist){
  path = paste0(git.path, twtr.path)
  
  df = read.csv(path) %>%
    filter(ticker %in% tickerlist) %>%
    mutate(date = as.Date(date, format = "%m/%d/%y")) %>%
    rename(twtrmentions = mentions)
}

# r/wsb Sentiment Analysis
# Source: Quiver Quant
# Sentiment based on VADER sentiment

wsb.path = "Imputed/wsb_imputed_min0.csv"

get.wsb.sa <- function(tickerlist){

  path = paste0(git.path, wsb.path)

  df = read.csv(path) %>%
    filter(Ticker %in% tickerlist) %>%
    mutate(date = ymd(Date)) %>%
    pad(start_val = as.Date('2020-01-01'), end_val = as.Date('2020-12-31')) %>%
    select(Ticker, Date, Mentions, log10Mentions, Sentiment) %>%
    rename(wsbsentiment = Sentiment,
          wsbmentions = Mentions,
          wsblog10mentions = log10Mentions) %>%
    rename_with(tolower)
}

# News Sentiment - Dow Jones + Global Press

pr.path = "Imputed/rp_imputed_min1000.csv"

get.news.sa <- function(tickerlist){

  path = paste0(git.path, pr.path)

  df = read.csv(path) %>%
    filter(Date >= as.Date("2020-01-01") & Date <= as.Date("2020-12-31")) %>%
    filter(ticker %in% tickerlist) %>%
    mutate(date = ymd(Date)) %>%
    rowwise() %>%
    mutate(newssentiment = mean(DJ_mean_ess * DJ_news_instance, PR_mean_ess * PR_news_instance)/All_news_instance) %>%
    replace(is.na(.), 0) %>%
    mutate(newssentiment = (newssentiment-50)/100) %>%
    select(ticker, Date, All_news_instance, newssentiment, log10Allmentions) %>%
    rename(newsmentions = All_news_instance,
          newslog10mentions = log10Allmentions) %>%
    rename_with(tolower)
}

# Compile all data
# Left join to prices df retaining only trading days

get.all.data <- function(tickerlist) {

  prices = get.prices(tickerlist)
  prices$date = as.Date(prices$date)

  wsb.sa = get.wsb.sa(tickerlist)
  wsb.sa$date = as.Date(wsb.sa$date)

  news.sa = get.news.sa(tickerlist)
  news.sa$date = as.Date(news.sa$date)

  twitter.sa = get.twitter.sa(tickerlist)
  twitter.sa$date = as.Date(twitter.sa$date)

  df = list(prices, wsb.sa, news.sa, twitter.sa) %>%
            reduce(left_join, by = c("ticker", "date"))
}

In [94]:
tickerlist = list("AAPL", "TSLA", "MSFT")

imported_df = get.all.data(tickerlist) %>%
  arrange(ticker, date)

pad applied on the interval: day



In [109]:
# Add in target variable

df = imported_df %>%
  group_by(ticker) %>%
  mutate(target = lead(close, n = 1, default = NA)) %>% # target using by taking the $close of the next period
  mutate(prevclose1 = lag(close, n = 1, default = NA), # additional variables by lagging earlier $close
         prevclose2 = lag(close, n = 2, default = NA), # in log2 scale, kaggle trick
         prevclose4 = lag(close, n = 4, default = NA),
         prevclose8 = lag(close, n = 8, default = NA)) %>%
  mutate(prevwsbsentiment1 = lag(wsbsentiment, n = 1, default = NA), # additional variables by lagging earlier $wsbsentiment
         prevwsbsentiment2 = lag(wsbsentiment, n = 2, default = NA), # in log2 scale, kaggle trick
         prevwsbsentiment4 = lag(wsbsentiment, n = 4, default = NA),
         prevwsbsentiment8 = lag(wsbsentiment, n = 8, default = NA)) %>%
  mutate(newssentiment1 = lag(newssentiment, n = 1, default = NA), # additional variables by lagging earlier $newssentiment
         newssentiment2 = lag(newssentiment, n = 2, default = NA), # in log2 scale, kaggle trick
         newssentiment4 = lag(newssentiment, n = 4, default = NA),
         newssentiment8 = lag(newssentiment, n = 8, default = NA)) %>%
  mutate(twtrsentiment1 = lag(twtrsentiment, n = 1, default = NA), # additional variables by lagging earlier $twtrsentiment
         twtrsentiment2 = lag(twtrsentiment, n = 2, default = NA), # in log2 scale, kaggle trick
         twtrsentiment4 = lag(twtrsentiment, n = 4, default = NA),
         twtrsentiment8 = lag(twtrsentiment, n = 8, default = NA)) %>%
  na.omit %>%
  as.data.frame()


In [111]:
df %>% filter(ticker == "AAPL")

ticker,date,open,close,high,low,vol,Ntrades,change,dayspread,⋯,prevwsbsentiment4,prevwsbsentiment8,newssentiment1,newssentiment2,newssentiment4,newssentiment8,twtrsentiment1,twtrsentiment2,twtrsentiment4,twtrsentiment8
<chr>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AAPL,2020-01-14,316.700,312.68,317.57,312.1700,40621057,345317,-4.02002,5.40000,⋯,0.0141076923,0.1285582418,-0.50000000,0.13000000,0.13000000,0.11000000,0.33421053,0.31958763,0.32267168,0.4386938
AAPL,2020-01-15,311.850,311.34,315.50,309.5500,30452727,267127,-0.51001,5.95001,⋯,0.0468870968,-0.0301795455,0.31750000,-0.50000000,0.11250000,0.14600000,0.30421456,0.33421053,0.33266399,0.2421525
AAPL,2020-01-16,313.590,315.24,315.70,312.0900,27627154,210077,1.64999,3.61001,⋯,-0.0165218750,0.0701671233,0.35500000,0.31750000,0.13000000,0.18000000,0.21131742,0.30421456,0.31958763,0.2467890
AAPL,2020-01-17,316.270,318.73,318.74,315.0000,34420045,241162,2.46002,3.73999,⋯,-0.0042350649,0.0699777778,-0.01000000,0.35500000,-0.50000000,0.13000000,0.29697624,0.21131742,0.33421053,0.2633880
AAPL,2020-01-21,317.190,316.57,319.02,316.0000,27674778,213062,-0.61999,3.01999,⋯,0.0414839286,0.0141076923,0.31000000,-0.01000000,0.31750000,0.13000000,0.39202082,0.29697624,0.30421456,0.3226717
AAPL,2020-01-22,318.580,317.70,319.99,317.3100,25432169,210390,-0.87998,2.67999,⋯,0.0361071429,0.0468870968,0.00000000,0.31000000,0.35500000,0.11250000,0.24676851,0.39202082,0.21131742,0.3326640
AAPL,2020-01-23,317.920,319.23,319.56,315.6500,26094188,199031,1.31000,3.91001,⋯,0.0719672131,-0.0165218750,-0.50000000,0.00000000,-0.01000000,0.13000000,0.26911315,0.24676851,0.29697624,0.3195876
AAPL,2020-01-24,320.250,318.31,323.33,317.5188,36599802,299346,-1.94000,5.81119,⋯,0.0105278351,-0.0042350649,0.21833333,-0.50000000,0.31000000,-0.50000000,0.31370450,0.26911315,0.39202082,0.3342105
AAPL,2020-01-27,310.060,308.95,311.77,304.8800,40414808,384807,-1.10999,6.88999,⋯,-0.0173051282,0.0414839286,0.03000000,0.21833333,0.00000000,0.31750000,0.22521138,0.31370450,0.24676851,0.3042146
AAPL,2020-01-28,312.600,317.69,318.40,312.1900,40519265,356616,5.08999,6.20999,⋯,-0.0076928571,0.0361071429,-0.01777778,0.03000000,-0.50000000,0.35500000,0.16578947,0.22521138,0.26911315,0.2113174


In [114]:
# Retain mean and 

scaling.mean = colMeans(df[sapply(df, is.numeric)])
scaling.sd = sapply(df[sapply(df, is.numeric)], sd)

In [115]:
# Scaling of all numerical columns
df[map_lgl(df, is.numeric)] = df %>%
                              select(is.numeric) %>%
                              scale(center = scaling.mean, scale = scaling.sd)
                            

In [116]:
df

ticker,date,open,close,high,low,vol,Ntrades,change,dayspread,⋯,prevwsbsentiment4,prevwsbsentiment8,newssentiment1,newssentiment2,newssentiment4,newssentiment8,twtrsentiment1,twtrsentiment2,twtrsentiment4,twtrsentiment8
<chr>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AAPL,2020-01-14,-0.2693676,-0.2814730,-0.2850848,-0.2630236,-0.16722757,-0.47422931,-0.216704388,-0.4813800,⋯,-0.44018258,1.61445458,-1.15639913,1.0813187,1.0632880,0.9672343,1.057136153,0.913327299,0.947427853,2.066266687
AAPL,2020-01-15,-0.2825247,-0.2850953,-0.2905165,-0.2703735,-0.41206490,-0.68082347,-0.059295857,-0.4635939,⋯,0.15056815,-1.23182663,1.76042788,-1.1656292,1.0008515,1.0957613,0.764546524,1.055738933,1.044788515,0.157650282
AAPL,2020-01-16,-0.2778044,-0.2745529,-0.2899917,-0.2632481,-0.48010023,-0.83156138,0.037570656,-0.5392642,⋯,-0.99218892,0.56746037,1.89422729,1.7500532,1.0632880,1.2171478,-0.141599961,0.763609769,0.917378166,0.202675664
AAPL,2020-01-17,-0.2705341,-0.2651188,-0.2820148,-0.2550847,-0.31653814,-0.74942838,0.073896943,-0.5350610,⋯,-0.77075592,0.56406527,0.59191309,1.8838001,-1.1844287,1.0386382,0.693941816,-0.141110661,1.059857297,0.363868742
AAPL,2020-01-21,-0.2680383,-0.2709576,-0.2812801,-0.2522794,-0.47895352,-0.82367440,-0.064227977,-0.5583441,⋯,0.05319221,-0.43772502,1.73366800,0.5819969,1.7322513,1.0386382,1.621034995,0.693116176,0.767589676,0.939573938
AAPL,2020-01-22,-0.2642675,-0.2679030,-0.2787349,-0.2486045,-0.53295201,-0.83073437,-0.075887386,-0.5693390,⋯,-0.04370828,0.15003297,0.62759293,1.7233038,1.8660440,0.9761598,0.204200569,1.618750335,-0.137559553,1.036609401
AAPL,2020-01-23,-0.2660579,-0.2637672,-0.2798632,-0.2532613,-0.51701163,-0.86074720,0.022323598,-0.5295629,⋯,0.60256224,-0.98693491,-1.15639913,0.6176628,0.5637954,1.0386382,0.422156887,0.204145663,0.697062672,0.909624621
AAPL,2020-01-24,-0.2597371,-0.2662541,-0.2699708,-0.2480188,-0.26405304,-0.59569421,-0.123424627,-0.4680831,⋯,-0.50469880,-0.76662368,1.40660279,-1.1656292,1.7054928,-1.2105840,0.857114216,0.421758970,1.623135542,1.051627851
AAPL,2020-01-27,-0.2873807,-0.2915558,-0.3003039,-0.2834741,-0.17219372,-0.36988855,-0.086202324,-0.4331971,⋯,-1.00630472,0.05315033,0.73463245,1.3963669,0.5994735,1.7080495,-0.006074142,0.856031782,0.207860408,0.760336446
AAPL,2020-01-28,-0.2804901,-0.2679301,-0.2829070,-0.2629675,-0.16967856,-0.44437501,0.191839546,-0.4551868,⋯,-0.83307228,-0.04325926,0.56416210,0.7246603,-1.1844287,1.8419318,-0.585693138,-0.005798126,0.425576855,-0.141789455
