# Web Scraping 10-K Annual Reports
This notebook contains code to scrape 10-K annual reports from the SEC website and analyze sentiment to predict stock price movements.

In [None]:
# Load necessary libraries
library(tidyverse)
library(caret)  # For predictive modeling
library(keyATM)
library(quanteda)
library(pdftools)
library(tidytext)
library(rvest)
library(ldatuning)
library(topicmodels)
library(tm)
library(XML)
library(dplyr)
library(ggplot2)
library(httr)
library(RSelenium)
library(netstat)
library(wdman)
library(curl)
library(rJava)

In [None]:
# Define the URL of the SEC page (EDGAR full-text search, filtered to 10-K)
sec_site <- "https://www.sec.gov/edgar/search/#/category=form-cat1&filter_forms=10-K"

# Start a Selenium server plus a Firefox session on an unused local port.
# chromever = NULL: no chromedriver is requested, since Firefox is the browser.
driver <- rsDriver(
  browser = "firefox",
  chromever = NULL,
  verbose = FALSE,
  port = free_port()
)

# The remote-driver client is the handle used for all browser interaction.
remDr <- driver[["client"]]
remDr$navigate(sec_site)

In [None]:
# Extract URLs from the SEC page
#
# For each of the first `max_filings` search results: reload the search page,
# open the result's preview, collect every attribute of the `#open-file`
# element(s) that contains "https", then close the preview again.
max_filings <- 50L   # upper bound on the number of search results to open
wait_timeout <- 10   # seconds to wait for page content before giving up
poll_interval <- 0.5 # seconds between polls while waiting

# Call `fetch()` repeatedly until it returns something non-empty or the
# timeout elapses; returns the last result either way. The search page is
# rendered by JavaScript, so elements may not exist right after navigation.
poll_until_found <- function(fetch,
                             timeout = wait_timeout,
                             interval = poll_interval) {
  deadline <- Sys.time() + timeout
  repeat {
    result <- fetch()
    if (length(result) > 0 || Sys.time() >= deadline) {
      return(result)
    }
    Sys.sleep(interval)
  }
}

# Read the current page source and return, as an unnamed character vector,
# all attribute values of `#open-file` elements that contain "https".
extract_open_file_urls <- function() {
  attrs <- read_html(remDr$getPageSource()[[1]]) %>%
    html_elements("#open-file") %>%
    html_attrs() %>%
    unlist(use.names = FALSE) %>%
    as.character() # keeps the empty case as character(0) rather than NULL
  attrs[str_detect(attrs, "https")]
}

test <- vector("list", max_filings)

for (i in seq_len(max_filings)) {
  remDr$navigate(sec_site)
  links <- poll_until_found(function() {
    remDr$findElements(using = "css selector", ".preview-file")
  })

  # Stop cleanly when the page lists fewer results than requested instead of
  # failing with "subscript out of bounds".
  if (i > length(links)) {
    message("Only ", i - 1L, " preview link(s) available; stopping early.")
    test <- test[seq_len(i - 1L)]
    break
  }

  links[[i]]$clickElement()
  test[[i]] <- poll_until_found(extract_open_file_urls)

  # Close the preview; skip silently if no close button was rendered.
  close_buttons <- poll_until_found(function() {
    remDr$findElements(using = "css selector", ".close")
  })
  if (length(close_buttons) > 0) {
    close_buttons[[1]]$clickElement()
  }
}

print(test)

In [None]:
#' Fit a linear model of closing price on sentiment score
#'
#' Joins `stock_prices` to `sentiment_df` (matching `symbol` to `ticker`),
#' drops rows without a sentiment score, fits `close ~ sentiment_score` via
#' `caret::train(method = "lm")`, and appends the model's predictions for the
#' same rows it was fitted on (i.e. in-sample fitted values).
#'
#' @param sentiment_df Data frame joined through its `ticker` column; the
#'   joined result must contain a `sentiment_score` column.
#' @param stock_prices Data frame with `symbol`, `date` and `close` columns.
#' @return A data frame with columns `date`, `close`, `sentiment_score` and
#'   `predicted_close`.
predict_stock_movement <- function(sentiment_df, stock_prices) {
  model_data <- stock_prices %>%
    left_join(sentiment_df, by = c("symbol" = "ticker")) %>%
    filter(!is.na(sentiment_score)) %>%
    select(date, close, sentiment_score)

  # Fail with a clear message rather than an opaque error from inside train().
  if (nrow(model_data) == 0) {
    stop(
      "No rows with a sentiment score remain after joining stock prices ",
      "to sentiment data; cannot fit a model.",
      call. = FALSE
    )
  }

  model <- train(
    close ~ sentiment_score,
    data = model_data,
    method = "lm"
  )

  model_data %>%
    mutate(predicted_close = predict(model, newdata = model_data))
}

# Fit the model and show the fitted closing prices.
# NOTE(review): `sentiment_df` and `stock_prices` are not created in the
# visible cells; they must already exist in the session - confirm.
predicted_data <- predict_stock_movement(
  sentiment_df = sentiment_df,
  stock_prices = stock_prices
)
print(predicted_data)

In [None]:
# Close the browser and server when done.
# The server is stopped in `finally` so it does not keep running (and holding
# its port) if closing the browser session fails.
tryCatch(
  remDr$close(),
  error = function(e) {
    warning(
      "Failed to close the browser session: ", conditionMessage(e),
      call. = FALSE
    )
  },
  finally = driver$server$stop()
)