In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read and
# write report/stock-data folders underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive



# DATABASE BUILDING
1. Grabbing the top 100 companies
2. Downloading those to a csv to save it
3. From that list, finding each company's investor relations page and downloading the last 3-4 quarterly reports (or the full-year 2023 report)
4. Connecting and reading from the folder containing all the reports in pdf format
5. Download the top 100 companies' past-year historical stock data (2023) and store it in Drive
6. Grab data from GitHub for 2015-2016: both the stock data and the press releases

-----------------------
# MODEL - Text
1. Build a BERT model
2. Train said BERT model on the press releases from 2023 and 2015
3. Cluster on the embeddings using DBSCAN
4. Interpret clusters
5. Compare and contrast the differences?
6. Analyze & predict!

# MODEL - Stocks
1. Build time series model
2. Random forest, XGBoost, LSTM (Long Short-Term Memory) network, neural nets?
3. Analyze and optimize
4. Predict and compare

-----------------------

# My GitHub repo! (Bread & butter of this project)
https://github.com/kkhawk20/MGSC310-Final-Project

# DATABASE BUILDING

In [None]:
#1. Grabbing the top companies off Yahoo Finance
#This was run on 11/13/23, stock data came from this day!

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# Function to get most active symbols from Yahoo Finance
def get_most_active_symbols(url, timeout=30):
    """Scrape ticker symbols from a Yahoo Finance table page.

    Args:
        url: Page to fetch, e.g. 'https://finance.yahoo.com/most-active'.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list[str]: Symbols in page order, read from the link inside the first
        cell of every ``tr.simpTblRow`` row. Empty when the page has no such
        rows.

    Raises:
        requests.RequestException: On network failure, timeout, or (via
            ``raise_for_status``) a 4xx/5xx response.
    """
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    symbols = []
    for row in soup.select('tr.simpTblRow'):
        link = row.select_one('td:nth-of-type(1) a')
        # Rows without a ticker link would otherwise raise AttributeError on `.text`.
        if link is None:
            continue
        symbol = link.text.strip()
        if symbol:
            symbols.append(symbol)

    return symbols

# Scrape the tickers listed on Yahoo Finance's "most active" page; the
# resulting list is what the CSV-export cell below writes out.
most_active_symbols = get_most_active_symbols('https://finance.yahoo.com/most-active')

In [None]:
#2. Download the top companies and save it
# DO NOT RUN THIS UNLESS YOU WANT TO DOWNLOAD THE CSV

import csv

# Tickers scraped from the most-active page in the previous cell
your_list = most_active_symbols

# Persist the tickers, one per CSV row (no header)
with open('stock_tickers.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows([item] for item in your_list)

# Hand the file to the browser as a download (Google Colab only)
from google.colab import files
files.download('stock_tickers.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# 3. Grabbing all the investor relations websites and exporting them to a CSV file alongside the corresponding ticker
# I will now go through each website and download each of the files into a shared google doc

from bs4 import BeautifulSoup
import requests
import re
import csv
import pandas as pd

# Load the CSV file containing the stock tickers (hosted in the project's GitHub repo)
file_path = 'https://raw.githubusercontent.com/kkhawk20/MGSC310-Final-Project/main/Market_data.csv'
stock_tickers = pd.read_csv(file_path)

# Peek at the first few rows of the DataFrame to verify the contents.
# NOTE: this is not the last expression in the cell, so Jupyter does not render
# it; wrap it in display(...) or move it to its own cell to see the preview.
stock_tickers.head()

# Function to find investor relations website for a given stock ticker
def find_investor_relations_website(ticker):
    """Return the first Google search hit for "<ticker> investor relations site".

    Args:
        ticker: Stock symbol to search for.

    Returns:
        str | None: The target URL extracted from the first result link,
        ``None`` when no result link is found, or a string of the form
        ``"Error: <message>"`` if the request or the parsing raised.
    """
    # Constructing the Google search query
    query = f"{ticker} investor relations site"

    try:
        # Pass the query via `params` so requests URL-encodes it; the timeout
        # stops one stalled lookup from blocking the whole batch.
        search_result = requests.get(
            "https://www.google.com/search",
            params={"q": query},
            timeout=30,
        )
        # Surface blocked/rate-limited responses as an error instead of parsing them.
        search_result.raise_for_status()
        soup = BeautifulSoup(search_result.text, 'html.parser')

        for link in soup.find_all('a'):
            href = link.get('href')
            # Anchors without an href yield None; skip them rather than letting
            # the resulting TypeError abort the search for this ticker.
            if not href:
                continue
            if "url?q=" not in href or "webcache.googleusercontent.com" in href:
                continue

            # Result links look like "/url?q=<target>&..."; capture <target>.
            url = re.findall(r"url\?q=(.*?)&", href)
            if url:
                return url[0]
    except Exception as e:
        # Kept for backward compatibility: callers store this string in the CSV.
        return f"Error: {e}"

    # No usable result link on the page.
    return None

# Look up the investor relations page for every ticker in the loaded list
all_tickers = stock_tickers['Symbol']
investor_relations_websites = {
    ticker: find_investor_relations_website(ticker)
    for ticker in all_tickers
}

# Write a header row followed by one (ticker, URL) row per company
with open('investor_relations_website.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Ticker', 'Investor Relations Website'])
    writer.writerows(investor_relations_websites.items())

# Hand the file to the browser as a download (Google Colab only)
from google.colab import files
files.download('investor_relations_website.csv')

In [None]:
#4. Connecting and reading from the folder containing all the reports in pdf format

import pdfplumber
import os

# Path to your folder in Google Drive
folder_path = '/content/drive/MyDrive/Colab Notebooks/MGSC 310/Portfolio Project/Shareholder Reports'

# Extracted text per report: {pdf filename: text of all pages joined by newlines}.
# Previously each page's text overwrote the last and nothing was kept.
report_texts = {}

# Loop through all files in the folder (sorted so the order is deterministic)
for filename in sorted(os.listdir(folder_path)):
    # Case-insensitive match so files named '*.PDF' are picked up too
    if not filename.lower().endswith('.pdf'):
        continue

    file_path = os.path.join(folder_path, filename)
    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # extract_text() may return None/empty for pages without a text
            # layer (e.g. scanned images); normalise that to ''.
            text = page.extract_text() or ''
            page_texts.append(text)

    report_texts[filename] = '\n'.join(page_texts)


In [None]:
#5. Downloading the past-year historical data for each of the 100 companies
#DO NOT RUN THIS AGAIN DEAR GOD DO NOT RUN THIS EVER AGAIN
#This data was pulled on 11/14/23

# Safeguard: the download only runs when this flag is flipped to True by hand.
# Leave it False so "Run All" never re-pulls the data.
RERUN_STOCK_DOWNLOAD = False

if RERUN_STOCK_DOWNLOAD:
    import os
    import pandas as pd
    import yfinance as yf

    # Define the list of stock tickers
    stock_tickers_data = pd.read_csv('https://raw.githubusercontent.com/kkhawk20/MGSC310-Final-Project/main/Market_data.csv')
    stock_tickers = stock_tickers_data['Symbol'].tolist()

    # Path to the folder in Google Drive
    folder_path = '/content/drive/MyDrive/Colab Notebooks/MGSC 310/Portfolio Project/Yahoo Finance Stock Data'

    # Create the folder if it does not exist yet
    os.makedirs(folder_path, exist_ok=True)

    # Loop through each ticker, fetch data, and save as CSV
    for ticker in stock_tickers:
        file_path = os.path.join(folder_path, f'{ticker}_data.csv')
        # Second safeguard: never overwrite an already-downloaded snapshot;
        # delete the file manually if a fresh pull is really wanted.
        if os.path.exists(file_path):
            continue
        stock_data = yf.download(ticker, period='1y')  # Download past year data
        stock_data.to_csv(file_path)


In [None]:
#6. Grab data in github from 2015-2016, both the stock data as well as press releases

# MODEL

In [None]:
# Fixed import: the class is `BertTokenizer` (the old `BertTokenize` raised ImportError).
from transformers import BertModel, BertTokenizer
import torch

# Load the pretrained tokenizer and encoder for 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize a sample sentence into PyTorch tensors
input_text = "We really very much value AI"
encoded_input = tokenizer(input_text, return_tensors='pt')

# Inference only: skip gradient tracking for the forward pass
with torch.no_grad():
    output = model(**encoded_input)

# Per the transformers docs, last_hidden_state has shape
# (batch_size, sequence_length, hidden_size); index 0 selects the single
# input sentence, leaving one embedding vector per token.
last_hidden_states = output.last_hidden_state
embeddings = last_hidden_states[0]