<a href="https://colab.research.google.com/github/limshaocong/analyticsEdge/blob/main/Data_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
library(tidyverse)
library(ggplot2)

“running command 'timedatectl' had status 1”
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.5     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



In [3]:
# Root URL of the raw dataset folder; it already ends in "/", so file names
# are appended with paste0() rather than file.path().
path <- "https://raw.githubusercontent.com/limshaocong/analyticsEdge/main/Datasets/"

# ---- Russell 3000 tickers ----
tickers.path <- paste0(path, "r3000_tickers.csv")

# ---- Quiver Quant data ----
twitter.follower.path <- paste0(path, "QuiverQuant/twitter_follower.csv")
wiki.pageviews.path <- paste0(path, "QuiverQuant/wikipedia_pageviews.csv")
wsb.sentiment.path <- paste0(path, "QuiverQuant/wsb_sentiment.csv")

# ---- RavenPack Sentiment Relevance data ----
rp.company.index.path <- paste0(path, "RavenPack_WRDS/rp_company_index.csv")

# Dow Jones files, one per year (2018-2021)
rp.dj.2018.path <- paste0(path, "RavenPack_WRDS/rp_dj_2018.csv")
rp.dj.2019.path <- paste0(path, "RavenPack_WRDS/rp_dj_2019.csv")
rp.dj.2020.path <- paste0(path, "RavenPack_WRDS/rp_dj_2020.csv")
rp.dj.2021.path <- paste0(path, "RavenPack_WRDS/rp_dj_2021.csv")

# Global Press release files, one per year (2018-2021)
rp.pr.2018.path <- paste0(path, "RavenPack_WRDS/rp_pr_2018.csv")
rp.pr.2019.path <- paste0(path, "RavenPack_WRDS/rp_pr_2019.csv")
rp.pr.2020.path <- paste0(path, "RavenPack_WRDS/rp_pr_2020.csv")
rp.pr.2021.path <- paste0(path, "RavenPack_WRDS/rp_pr_2021.csv")

**Exploring RavenPack Data**


In [4]:
# Load the company index, which lets tickers be mapped to RavenPack's
# internal entity index.
rp.company.index <- read.csv(file = rp.company.index.path)

# Load the 2021 Dow Jones sentiment data.
rp.dj.2021 <- read.csv(file = rp.dj.2021.path)

In [5]:
# Sample of the 2021 Dow Jones data for AAPL.
#
# Conclusions:
#
# 1. Data is relatively sparse, with no data on most dates. Preprocess to
# fill in non-entries.
#
# 2. There are several entries which share the same "category", "relevance",
# "ess" and "date". Preprocess to aggregate repetitions.
#
# 3. There are also several entries on the same "date" that have a range of
# "ess". Need to determine means of aggregation.
#
# 4. Many entries have neutral sentiment (i.e. ess = 50). Preprocessing
# needs to decide whether the absence of a press release and a neutral
# announcement should be encoded the same.

# Look up the RavenPack entity id(s) for the AAPL ticker. %in% never yields
# NA, so rows with a missing ticker cannot leak NA ids into the result.
aapl.index <- rp.company.index$rp_entity_id[rp.company.index$ticker %in% "AAPL"]

# Refer to the column by its bare name (data masking) rather than through
# rp.dj.2021$..., and match with %in% so the filter stays correct when the
# lookup returns zero or several ids (== would error or silently recycle).
rp.dj.2021 %>%
  filter(rp_entity_id %in% aapl.index) %>%
  head(20)

Unnamed: 0_level_0,rp_entity_id,category,relevance,ess,rpna_date_utc
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<chr>
1,D8442A,analyst-ratings-change-neutral,100,51,1/5/2021
2,D8442A,price-target-upgrade,100,91,1/5/2021
3,D8442A,executive-appointment,100,54,1/5/2021
4,D8442A,executive-appointment,100,54,1/5/2021
5,D8442A,executive-appointment,100,54,1/5/2021
6,D8442A,analyst-ratings-change-neutral,100,78,1/5/2021
7,D8442A,price-target-upgrade,100,75,1/5/2021
8,D8442A,executive-salary-increase,100,44,1/6/2021
9,D8442A,executive-salary-increase,100,44,1/6/2021
10,D8442A,stock-loss,100,40,1/6/2021


In [10]:
# Count rows per distinct "relevance" value; only 20 and 100 occur.
aggregate(rp_entity_id ~ relevance, rp.dj.2021, length)

relevance,rp_entity_id
<int>,<int>
20,41202
100,224702
