<a href="https://colab.research.google.com/github/limshaocong/analyticsEdge/blob/main/Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
suppressMessages(library(tidyverse)) # generic must have package
library(dplyr)
library(ggplot2) # plotting package
library(lubridate) # easy comprehension of dates from string to correct datetime format
library(data.table)
library(purrr) # reduce
if("padr" %in% rownames(installed.packages()) == FALSE) {install.packages("padr")}
library(padr)
if("janitor" %in% rownames(installed.packages()) == FALSE) {install.packages("janitor")}
library(janitor)

options(repr.plot.width = 9,
        repr.plot.height = 6,
        repr.plot.pointsize = 20)

In [66]:
# Functions to pull data

git.path = "https://raw.githubusercontent.com/limshaocong/analyticsEdge/main/Datasets/"

# Prices
# Source: CRSP

price.path = "CRSP_WRDS/crsp_daily_stock_price_2020.csv"

get.prices <- function(tickerlist) {

  path = paste0(git.path, price.path)

  df = read.csv(path) %>%
    filter(ticker %in% tickerlist) %>%
    mutate(date = ymd(date)) %>%
    rename(low = BIDLO,
          high = ASKHI,
          open = OPENPRC,
          close = PRC,
          vol = VOL,
          Ntrades = NUMTRD) %>%
    select(ticker, date, open, close, high, low, vol, Ntrades) %>%
    mutate(change = close - open,
          dayspread = high - low,
          averagetrade = vol/Ntrades)
}

# Twitter Sentiment Analysis
# Source: Open source
# Scrapped by snscrape
# Sentiment Analysis by BERT - zer0-shot, multilingual, sentiment model

twtr.path = "Imputed/twitter_sa.csv"

get.twitter.sa <- function (tickerlist){
  path = paste0(git.path, twtr.path)
  
  df = read.csv(path) %>%
    filter(ticker %in% tickerlist) %>%
    mutate(date = as.Date(date, format = "%m/%d/%y")) %>%
    rename(twtrmentions = mentions)
}

# r/wsb Sentiment Analysis
# Source: Quiver Quant
# Sentiment based on VADER sentiment

wsb.path = "Imputed/wsb_imputed_min0.csv"

get.wsb.sa <- function(tickerlist){

  path = paste0(git.path, wsb.path)

  df = read.csv(path) %>%
    filter(Ticker %in% tickerlist) %>%
    mutate(date = ymd(Date)) %>%
    pad(start_val = as.Date('2020-01-01'), end_val = as.Date('2020-12-31')) %>%
    select(Ticker, Date, Mentions, log10Mentions, Sentiment) %>%
    rename(wsbsentiment = Sentiment,
          wsbmentions = Mentions,
          wsblog10mentions = log10Mentions) %>%
    rename_with(tolower)
}

# News Sentiment - Dow Jones + Global Press

pr.path = "Imputed/rp_imputed_min1000.csv"

get.news.sa <- function(tickerlist){

  path = paste0(git.path, pr.path)

  df = read.csv(path) %>%
    filter(Date >= as.Date("2020-01-01") & Date <= as.Date("2020-12-31")) %>%
    filter(ticker %in% tickerlist) %>%
    mutate(date = ymd(Date)) %>%
    rowwise() %>%
    mutate(newssentiment = mean(DJ_mean_ess * DJ_news_instance, PR_mean_ess * PR_news_instance)/All_news_instance) %>%
    replace(is.na(.), 0) %>%
    mutate(newssentiment = (newssentiment-50)/100) %>%
    select(ticker, Date, All_news_instance, newssentiment, log10Allmentions) %>%
    rename(newsmentions = All_news_instance,
          newslog10mentions = log10Allmentions) %>%
    rename_with(tolower)
}

get.all.data <- function(tickerlist) {

  prices = get.prices(tickerlist)
  prices$date = as.Date(prices$date)

  wsb.sa = get.wsb.sa(tickerlist)
  wsb.sa$date = as.Date(wsb.sa$date)

  news.sa = get.news.sa(tickerlist)
  news.sa$date = as.Date(news.sa$date)

  twitter.sa = get.twitter.sa(tickerlist)
  twitter.sa$date = as.Date(twitter.sa$date)

  df = list(prices, wsb.sa, news.sa, twitter.sa) %>%
            reduce(left_join, by = c("ticker", "date"))
}

In [67]:
tickerlist = list("AAPL", "TSLA", "MSFT")

df = get.all.data(tickerlist)

pad applied on the interval: day



In [68]:
head(df)

Unnamed: 0_level_0,ticker,date,open,close,high,low,vol,Ntrades,change,dayspread,averagetrade,wsbmentions,wsblog10mentions,wsbsentiment,newsmentions,newssentiment,newslog10mentions,twtrmentions,twtrsentiment
Unnamed: 0_level_1,<chr>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>
1,MSFT,2020-01-02,158.78,160.62,160.73,158.33,22610236,177688,1.84,2.4,127.2468,37,1.579784,0.058202703,0,-0.5,0.0,362,0.3895028
2,MSFT,2020-01-03,158.32,158.62,159.945,158.06,21099013,168785,0.29999,1.88501,125.0053,37,1.579784,-0.009467568,0,-0.5,0.0,235,0.4085106
3,MSFT,2020-01-06,157.08,159.03,159.1,156.51,21156101,151702,1.95,2.59002,139.4583,16,1.230449,-0.070725,2,-0.5,0.4771213,238,0.3781513
4,MSFT,2020-01-07,159.32,157.58,159.67,157.32,21844325,169629,-1.74001,2.34999,128.7771,41,1.623249,-0.090526829,2,-0.5,0.4771213,275,0.2218182
5,MSFT,2020-01-08,158.93,160.09,160.8,157.9491,27722052,201189,1.16001,2.8509,137.7911,43,1.643453,-0.111018605,2,0.35,0.4771213,345,0.4231884
6,MSFT,2020-01-09,161.835,162.09,162.215,161.03,21375474,163170,0.25499,1.185,131.0013,37,1.579784,0.06507027,11,0.02272727,1.0791812,407,0.2948403
