In [None]:
# Mount Google Drive at /content/drive; later cells read and write project folders under
# /content/drive/MyDrive/...
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive



# DATABASE BUILDING
1. Grabbing the top 100 companies
2. Downloading those to a csv to save it
3. From that list, finding each company's investor relations page and downloading its last 3-4 quarterly reports (or its reports for the 2023 year)
4. Connecting and reading from the folder containing all the reports in pdf format
5. Download the top 100 companies' past-year historical stock data (2023) and store it in Google Drive
6. Convert those files into .txt format (extract the text)

-----------------------
#MODEL - Text
1. Build a BERT model
2. Train said BERT model on the press releases from 2023 and 2015
3. Cluster on the embeddings using DBSCAN
4. Interpret clusters
5. Compare and contrast the differences?
6. Analyze & predict!

#MODEL - Stocks
1. Build time series model
2. Random forest, XGBoost, LSTM (Long Short-Term Memory) network, neural nets?
3. Analyze and optimize
4. Predict and compare

-----------------------

#My github repo! (Bread & butter to this project)
https://github.com/kkhawk20/MGSC310-Final-Project

#My Google Drive Folder (Holds all data & such)
https://drive.google.com/drive/folders/1mHmm85q6tzB-V0usK4tMwQmzMnPX1p6P?usp=sharing


#DATABASE BUILDING

In [None]:
#1. Grabbing the top companies off Yahoo Finance
#This was run on 11/13/23, stock data came from this day!

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# Function to get most active symbols from Yahoo Finance
def get_most_active_symbols(url, timeout=10):
    """Scrape ticker symbols from a Yahoo Finance screener table.

    Parameters
    ----------
    url : str
        Page to scrape, e.g. ``https://finance.yahoo.com/most-active``.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    list of str
        Symbols in page order. Empty when the page contains no
        ``tr.simpTblRow`` rows (for example if the page layout changed).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or timeouts.
    """
    # NOTE(review): a browser-like User-Agent is sent because sites like this often reject
    # the default python-requests one — confirm it is still needed.
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of silently returning an empty list
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    symbols = []
    for row in soup.select('tr.simpTblRow'):
        link = row.select_one('td:nth-of-type(1) a')
        if link is None:
            # Row without a symbol link in its first cell: skip it rather than crash
            continue
        symbols.append(link.text.strip())

    return symbols

# Scrape Yahoo Finance's "most active" page once; the result is the ticker list that the
# following cell writes to stock_tickers.csv
most_active_symbols = get_most_active_symbols('https://finance.yahoo.com/most-active')

In [None]:
#2. Download the top companies and save it
# DO NOT RUN THIS UNLESS YOU WANT TO DOWNLOAD THE CSV

import csv

# Tickers scraped from Yahoo Finance in the previous cell
your_list = most_active_symbols

# Write one ticker per row (single column, no header)
with open('stock_tickers.csv', 'w', newline='') as csv_out:
    csv.writer(csv_out).writerows([symbol] for symbol in your_list)

# Push the file to the browser as a download (Google Colab only)
from google.colab import files
files.download('stock_tickers.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# 3. Grabbing all the investor relations websites and exporting to a CSV file with the subsequent Ticker
# I will now painfully go through each website and download each of the files into a shared google doc
# update ... I pulled about 10 companies, 3-4 reports each. top 10 companies listed in github->'MGSC310-Final-Project/investor_relations_website.csv'

from bs4 import BeautifulSoup
import requests
import re
import csv
import pandas as pd

# Load the CSV file containing the stock tickers
# (read directly from the project's GitHub repository, so this needs network access)
file_path = 'https://raw.githubusercontent.com/kkhawk20/MGSC310-Final-Project/main/Market_data.csv'
stock_tickers = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to verify the contents
# NOTE(review): this is not the last expression of the cell, so Jupyter will not render it;
# wrap it in display(...) or move it to its own cell if the preview is wanted.
stock_tickers.head()

# Function to find investor relations website for a given stock ticker
def find_investor_relations_website(ticker, timeout=10):
    """Return the first Google result URL for "<ticker> investor relations site".

    Parameters
    ----------
    ticker : str
        Stock symbol to search for.
    timeout : float, optional
        Seconds to wait for the search response.

    Returns
    -------
    str or None
        The first result URL extracted from a ``url?q=...&`` redirect link,
        ``None`` when the page contains no such link, or a string starting
        with ``"Error: "`` when the HTTP request itself failed (kept as a
        string so a bulk lookup can still be written to CSV).
    """
    # Constructing the Google search query
    query = f"{ticker} investor relations site"

    try:
        # Let requests URL-encode the query instead of splicing it into the URL by hand
        search_result = requests.get(
            "https://www.google.com/search",
            params={"q": query},
            timeout=timeout,
        )
        # Report blocked / rate-limited responses as errors instead of parsing an error page
        search_result.raise_for_status()
    except requests.RequestException as e:
        return f"Error: {e}"

    soup = BeautifulSoup(search_result.text, 'html.parser')

    # href=True skips anchors without an href. Previously such an anchor made the
    # substring test raise TypeError, which aborted the whole lookup for this ticker.
    for link in soup.find_all('a', href=True):
        href = link['href']

        # Google wraps result links as /url?q=<target>&...; ignore cached copies
        if "url?q=" in href and "webcache.googleusercontent.com" not in href:
            url = re.findall(r"url\?q=(.*?)&", href)
            if url:
                return url[0]

    return None

# Look up an investor relations URL for every ticker in the 'Symbol' column
all_tickers = stock_tickers['Symbol']
investor_relations_websites = {}
for ticker in all_tickers:
    investor_relations_websites[ticker] = find_investor_relations_website(ticker)

# Write the results as a two-column CSV: header row first, then one row per ticker
with open('investor_relations_website.csv', 'w', newline='') as csv_out:
    out = csv.writer(csv_out)
    out.writerow(['Ticker', 'Investor Relations Website'])
    out.writerows(investor_relations_websites.items())

# Push the file to the browser as a download (Google Colab only)
from google.colab import files
files.download('investor_relations_website.csv')

In [None]:
#4. Connecting and reading from the folder containing all the reports in pdf format

import pdfplumber
import os

# Folder on the mounted Drive that holds the shareholder report PDFs
folder_path = '/content/drive/MyDrive/Colab Notebooks/MGSC 310/Portfolio Project/Shareholder Reports'

# Open every PDF in the folder and run text extraction page by page.
# The extracted text is overwritten on each page and not stored anywhere.
for filename in os.listdir(folder_path):
    if not filename.endswith('.pdf'):
        continue
    file_path = os.path.join(folder_path, filename)
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # Uncomment to inspect the extracted text:
            # print(f"Contents of {filename}:")
            # print(text)


In [None]:
#5. Downloading the past-year historical data for each of the 100 companies, given the list of tickers
#DO NOT RUN THIS AGAIN DEAR GOD DO NOT RUN THIS EVER AGAIN
#This data was pulled on 11/14/23

# Safeguard: the download below only runs when this flag is flipped to True by hand.
# (Replaces wrapping the code in a string literal, which hid it from syntax checking and
# left it using `pd` without importing it.)
RUN_STOCK_DOWNLOAD = False

if RUN_STOCK_DOWNLOAD:
    import os
    import pandas as pd
    import yfinance as yf

    # Define the list of stock tickers
    stock_tickers_data = pd.read_csv('https://raw.githubusercontent.com/kkhawk20/MGSC310-Final-Project/main/Market_data.csv')
    stock_tickers = stock_tickers_data['Symbol'].tolist()

    # Path to the folder in Google Drive
    folder_path = '/content/drive/MyDrive/Colab Notebooks/MGSC 310/Portfolio Project/Yahoo Finance Stock Data'

    # Create the folder if it does not exist yet
    os.makedirs(folder_path, exist_ok=True)

    # Loop through each ticker, fetch data, and save as CSV
    for ticker in stock_tickers:
        stock_data = yf.download(ticker, period='1y')  # Download past year data
        if stock_data.empty:
            # Nothing came back for this ticker: do not write an empty CSV
            print(f"No data returned for {ticker}; skipping")
            continue
        file_path = os.path.join(folder_path, f'{ticker}_data.csv')
        stock_data.to_csv(file_path)


In [None]:
#6. Download the old 2015-2018 shareholder report press conference data and clean files
#DO NOT RUN THIS AGAIN


# Safeguard: the clone + cleanup below only runs when this flag is flipped to True by hand.
# (Replaces an opening ''' that was never closed inside this cell.)
RUN_PRESS_CLEANUP = False

if RUN_PRESS_CLEANUP:
    import os
    import shutil
    import subprocess

    repo_url = 'https://github.com/kkhawk20/MGSC310-Final-Project.git'
    repo_dir = '/content/MGSC310-Final-Project'

    # Parent directory containing one sub-folder per company
    parent_dir = '/content/MGSC310-Final-Project/HistoricalPressInfo'
    # Target directory where all CSV files are consolidated
    target_dir = '/content/MGSC310-Final-Project/cleanHistoricalPressInfo'

    # Clone the repository unless a previous run already did
    if not os.path.isdir(repo_dir):
        subprocess.run(['git', 'clone', repo_url, repo_dir], check=True)

    # The original `%cd` pointed at '.../HistoricalPressInf' (missing the final 'o')
    os.chdir(parent_dir)
    print("Contents of parent_dir:", os.listdir(parent_dir))

    # Create the target directory if it doesn't exist
    os.makedirs(target_dir, exist_ok=True)

    # Iterate over each subfolder in the parent directory
    for folder_name in os.listdir(parent_dir):
        folder_path = os.path.join(parent_dir, folder_name)

        # Only company folders are of interest; skip loose files
        if not os.path.isdir(folder_path):
            continue

        for file_name in os.listdir(folder_path):
            if not file_name.endswith('.csv'):
                continue
            file_path = os.path.join(folder_path, file_name)
            # Prefix the company (folder) name so files from different companies
            # cannot collide inside the flat target directory
            new_file_name = f"{folder_name}_{file_name}"
            new_file_path = os.path.join(target_dir, new_file_name)
            # Move and rename the file
            shutil.move(file_path, new_file_path)


In [None]:
#7. Extract text from pdf files into another folder.
#Converting the pdf files into .txt files for BERT model
#These are the shareholder reports

#DO NOT RUN AGAIN
#STORED IN GOOGLE DRIVE FOLDER -> Shareholder Reports TXT

# Safeguard: the conversion below only runs when this flag is flipped to True by hand.
# (Replaces an opening ''' that was never closed inside this cell.)
RUN_PDF_TO_TXT = False

if RUN_PDF_TO_TXT:
    import os
    import pdfplumber

    # Directory containing the PDF files
    pdf_dir = '/content/drive/MyDrive/Colab Notebooks/MGSC 310/Portfolio Project/Shareholder Reports'

    # Directory to store the text files
    text_dir = '/content/drive/MyDrive/Colab Notebooks/MGSC 310/Portfolio Project/Shareholder Reports TXT'

    # Create the text directory if it doesn't exist
    os.makedirs(text_dir, exist_ok=True)

    # Iterate over each file in the PDF directory
    for pdf_file in os.listdir(pdf_dir):
        if not pdf_file.endswith('.pdf'):
            continue

        pdf_path = os.path.join(pdf_dir, pdf_file)
        # splitext swaps only the extension; str.replace would also rewrite a '.pdf'
        # that happens to appear in the middle of the file name
        text_path = os.path.join(text_dir, os.path.splitext(pdf_file)[0] + '.txt')

        # Write UTF-8 explicitly: the cells that read these files open them as utf-8
        with pdfplumber.open(pdf_path) as pdf, open(text_path, 'w', encoding='utf-8') as text_file:
            for page in pdf.pages:
                text = page.extract_text()
                if text:  # skip pages for which extract_text() returned nothing
                    text_file.write(text)
                    text_file.write('\n')  # Add a newline between pages



#MODEL - TEXT -> this sucks and doesn't work, don't use!!

This did not work. Please ignore the results; this section is kept only as a way to follow my thought process in this project.

In [None]:
import os

folder_path = '/content/drive/MyDrive/Colab Notebooks/MGSC 310/Portfolio Project/Shareholder Reports TXT'
texts = []

# Read every .txt report in full; `texts` gets one string per file
for filename in os.listdir(folder_path):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    texts.append(text)
    # TODO: add a corresponding label for each text


In [None]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.cluster import KMeans

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # inference only

# Tokenize and encode texts.
# Each report is cut to its first 512 tokens (truncation=True, max_length=512), so only
# the opening of every document is embedded.
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
with torch.no_grad():
    outputs = model(**inputs)

# Mean pooling weighted by the attention mask, so padding tokens added by padding=True
# do not dilute the average of shorter texts
token_states = outputs.last_hidden_state
mask = inputs['attention_mask'].unsqueeze(-1).type_as(token_states)
embeddings = ((token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)).numpy()

# Apply clustering algorithm (K-Means)
num_clusters = 5  # Set the number of clusters you want
# Fixed seed and explicit n_init so the cluster assignment is reproducible across runs
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans.fit(embeddings)
cluster_labels = kmeans.labels_

# Now, 'cluster_labels' contains the cluster assignment for each text




In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Inputs: 'embeddings' (array of BERT embeddings) and 'cluster_labels' (KMeans assignments)

# Project the embeddings onto their first four principal components
pca = PCA(n_components=4)
reduced_embeddings = pca.fit_transform(embeddings)

# Scatter the first two components, coloured by cluster label
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    reduced_embeddings[:, 0],
    reduced_embeddings[:, 1],
    c=cluster_labels,
    cmap='viridis',
)
ax.set_title('PCA visualization of BERT embeddings with cluster labels')
fig.colorbar(scatter, ax=ax)
plt.show()


In [None]:


import os
import torch
from torch.nn.functional import softmax
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np

# Load model and tokenizer
# NOTE(review): nothing in this cell fine-tunes the model, so the 10-way classification head
# is presumably still randomly initialised and the predicted categories carry no meaning —
# confirm before interpreting them.
num_categories = 10
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=num_categories,
                                                      output_hidden_states=True)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load and preprocess text
folder_path = '/content/drive/MyDrive/Colab Notebooks/MGSC 310/Portfolio Project/Shareholder Reports TXT'
texts = []  # despite the name, holds one tokenizer encoding per report (first 512 tokens)

for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        inputs = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors="pt")
        texts.append(inputs)

# Predict using the model and extract embeddings
model.eval()
categories = []
embeddings = []

with torch.no_grad():
    for text in texts:
        outputs = model(**text)
        # softmax is monotonic, so the argmax of the logits is the predicted class
        predicted_class = outputs.logits.argmax(dim=1).item()
        categories.append(predicted_class)

        # Extract [CLS] token embedding (position 0 of the last hidden layer)
        embedding = outputs.hidden_states[-1][:, 0, :].squeeze(0).numpy()
        embeddings.append(embedding)

# Clustering within each classified group
num_clusters_per_category = 3  # Example: 3 subclusters per category
max_pca_components = 50
clustered_texts = [[] for _ in range(num_categories)]

for category, embedding in zip(categories, embeddings):
    clustered_texts[category].append(embedding)

# Apply clustering for each category
for i, category_embeddings in enumerate(clustered_texts):
    if len(category_embeddings) == 0:
        continue

    category_embeddings = np.array(category_embeddings)
    n_samples, n_features = category_embeddings.shape

    if n_samples < 2:
        # A single document cannot be reduced or split; it forms its own cluster
        clusters = np.zeros(n_samples, dtype=int)
    else:
        # PCA needs n_components <= min(n_samples, n_features) and KMeans needs
        # n_clusters <= n_samples; small categories used to raise ValueError here
        pca = PCA(n_components=min(max_pca_components, n_samples, n_features))
        reduced_embeddings = pca.fit_transform(category_embeddings)

        kmeans = KMeans(n_clusters=min(num_clusters_per_category, n_samples),
                        random_state=42, n_init=10)
        clusters = kmeans.fit_predict(reduced_embeddings)

    # You can now analyze 'clusters' for each category
    print(f"Category {i}: Cluster labels - {clusters}")



#Model - TEXT ... BERT, Tokenizers, DBSCAN

This is attempt 2 in my project, digging deeper into BERT and tokenizing sentences

Utilizing DBSCAN to cluster the topics present in the shareholder reports

In [None]:
#This took +30 minutes to run ... good luck LOL

import os
from transformers import BertTokenizer, BertModel
import torch
from sklearn.cluster import DBSCAN
import numpy as np

# Set up the pretrained BERT tokenizer and encoder
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # evaluation mode

folder_path = '/content/drive/MyDrive/Colab Notebooks/MGSC 310/Portfolio Project/Shareholder Reports TXT'
embeddings = []

# Embed every line of every .txt report, one forward pass per line.
# Each line is treated as a "sentence"; blank lines are embedded as well, so `embeddings`
# has exactly one entry per element of text.split('\n').
for filename in os.listdir(folder_path):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        sentences = text.split('\n')
        for sentence in sentences:
            inputs = tokenizer(
                sentence,
                return_tensors='pt',
                truncation=True,
                max_length=512,
                padding=True,
            )
            with torch.no_grad():
                outputs = model(**inputs)
            # Average the token vectors of the last layer into one vector for the line
            token_mean = outputs.last_hidden_state.mean(dim=1)
            sentence_embedding = token_mean.squeeze().numpy()
            embeddings.append(sentence_embedding)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Imported here so this cell does not depend on the scraping cells having been run
import os
import pandas as pd

folder_path = '/content/drive/MyDrive/Colab Notebooks/MGSC 310/Portfolio Project/Shareholder Reports TXT'
texts = []

# Rebuild the list of lines in the same way the embedding cell walks the files (same folder,
# same split on '\n', nothing filtered out) so that texts[i] is the line behind embeddings[i].
# NOTE(review): this relies on os.listdir returning the files in the same order both times.
for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        sentences = text.split('\n')  # Assuming each sentence is on a new line
        texts.extend(sentences)

texts_df = pd.DataFrame({"texts": texts})
texts_df.head(100)

In [None]:
#Using the elbow method to choose eps and min_samples for DBSCAN

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.pipeline import Pipeline
from plotnine import *
import pandas as pd

# k used for the k-distance curve.
# NOTE(review): the DBSCAN cell that follows uses min_samples=50, not 25 — confirm which
# value is intended so the curve and the clustering agree.
mins = 25
# +1 because each point is returned as its own nearest neighbour (distance 0)
nn = NearestNeighbors(n_neighbors = mins + 1)

# create pipeline
pipe_elbow = Pipeline([
    ("nn", nn)
])

# fit nn model
pipe_elbow.fit(embeddings)

# get neighbors
distances, neighbors = pipe_elbow.named_steps["nn"].kneighbors(embeddings)

# keep only the distance to the mins-th neighbour of every point, sorted ascending
distances = np.sort(distances[:, mins], axis = 0)

# plot the distances
distances_df = pd.DataFrame({"distances": distances,
                             "index": np.arange(len(distances))})

# Named `elbow_plot` rather than `plt` so the plotnine figure does not overwrite the
# matplotlib.pyplot alias used by the plotting cells
elbow_plot = (ggplot(distances_df, aes(x = "index", y = "distances")) +
 geom_line(size = 2) + theme_minimal() +
 labs(title = "Elbow Method for Choosing eps") +
       geom_hline(yintercept = 7, color = "red", linetype = "dashed") # our estimate for inflection point
       )

elbow_plot

In [None]:
# Stack the per-sentence vectors into one 2-D array and cluster it with DBSCAN
embeddings = np.vstack(embeddings)
dbscan = DBSCAN(eps=7, min_samples=50)
clustering = dbscan.fit(embeddings)
cluster_labels = clustering.labels_
# Next step: topic discovery — read the texts that fall in each cluster to identify
# common themes or topics

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Reduce the embeddings to 2-D with t-SNE (seeded)
tsne_model = TSNE(n_components=2, perplexity=20, random_state=42)
tsne_embeddings = tsne_model.fit_transform(embeddings)

# Scatter the 2-D points, coloured by DBSCAN cluster label
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    tsne_embeddings[:, 0],
    tsne_embeddings[:, 1],
    c=cluster_labels,
    cmap='viridis',
)
ax.set_title('t-SNE visualization of BERT embeddings with DBSCAN cluster labels')
fig.colorbar(scatter, ax=ax)
plt.show()

#MODEL - STOCKS

In [None]:
# Date	Open	High	Low	Close	Adj Close	Volume

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

def create_sequences(data, sequence_length):
    """Slice a series into overlapping windows and their next-step targets.

    Window ``i`` is ``data[i:i + sequence_length]`` and its target is
    ``data[i + sequence_length]``.

    Parameters
    ----------
    data : array-like
        Series ordered in time, indexed along the first axis.
    sequence_length : int
        Number of consecutive rows in every window; must be >= 0.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n, sequence_length, *data.shape[1:])`` with
        ``n = max(len(data) - sequence_length, 0)``.
    y : numpy.ndarray
        Shape ``(n, *data.shape[1:])``; ``y[i]`` is the row that follows window ``i``.

    Raises
    ------
    ValueError
        If ``sequence_length`` is negative.
    """
    # Local import: the cell that defines this function does not import numpy itself
    import numpy as np

    if sequence_length < 0:
        raise ValueError("sequence_length must be non-negative")

    data = np.asarray(data)
    n_windows = len(data) - sequence_length
    row_shape = data.shape[1:]

    if n_windows <= 0:
        # Series too short for a single window: return empty arrays that still have the
        # rank a model expects, instead of two shape-(0,) arrays
        return (np.empty((0, sequence_length) + row_shape, dtype=data.dtype),
                np.empty((0,) + row_shape, dtype=data.dtype))

    X = np.stack([data[i:i + sequence_length] for i in range(n_windows)])
    # Targets are simply the series shifted by one window length
    y = data[sequence_length:].copy()
    return X, y

import numpy as np  # create_sequences needs numpy; this cell's import list does not include it

folder_path = '/content/drive/MyDrive/Colab Notebooks/MGSC 310/Portfolio Project/Yahoo Finance Stock Data'
sequence_length = 60

# Per-file results, keyed by file name without extension.
# The plain names (model, scaler, X_train, X_test, y_train, y_test, pred) keep pointing at
# the last file that was trained, as before.
stock_results = {}

# Train one LSTM per CSV file; sorted so "the last file" is the same on every run
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(folder_path, filename)

    # Load data
    dataframe = pd.read_csv(file_path)
    # 'Close' is the closing price; coerce to numbers and drop gaps so the scaler and the
    # network never see NaN or stray text rows
    prices = (pd.to_numeric(dataframe['Close'], errors='coerce')
              .dropna()
              .to_numpy()
              .reshape(-1, 1))

    # 80/20 chronological split over the windows (no shuffling: this is a time series)
    n_sequences = len(prices) - sequence_length
    train_size = int(0.8 * n_sequences)
    if train_size < 1:
        print(f"Skipping {filename}: only {len(prices)} usable rows "
              f"(need more than {sequence_length + 1})")
        continue

    # Fit the scaler only on the rows the training windows and targets can see, so the
    # test period does not leak into the normalisation
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(prices[:train_size + sequence_length])
    scaled_prices = scaler.transform(prices)

    # Create sequences
    X, y = create_sequences(scaled_prices, sequence_length)
    X_train, y_train = X[:train_size], y[:train_size]
    X_test, y_test = X[train_size:], y[train_size:]

    # Build LSTM model
    model = Sequential([
        LSTM(50, return_sequences=True, input_shape=(sequence_length, 1)),
        LSTM(50),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model
    model.fit(X_train, y_train, epochs=20, batch_size=32)

    # Keras models have no predict_transform(); predict, then undo the scaling so the
    # predictions are in price units
    pred = scaler.inverse_transform(model.predict(X_test))

    stock_results[os.path.splitext(filename)[0]] = {
        'model': model,
        'scaler': scaler,
        'y_test': scaler.inverse_transform(y_test),
        'pred': pred,
    }

if not stock_results:
    print(f"No usable .csv files found in {folder_path}")


In [None]:
# Re-import so `plt` is matplotlib even if an earlier cell rebound the name
import matplotlib.pyplot as plt

# Compare the held-out targets with the model's predictions, both mapped back to price units.
# Uses the model / scaler / test split left behind by the training cell (the last file trained).
# (X_train is a 3-D array of scaled input windows and cannot be drawn as a price line.)
actual_prices = scaler.inverse_transform(y_test)
predicted_prices = scaler.inverse_transform(model.predict(X_test))

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(actual_prices, color='black', label='Actual Close Price')
ax.plot(predicted_prices, color='green', label='Predicted Close Price')
ax.set_title('Stock Price Prediction (held-out test window)')
ax.set_xlabel('Time')
ax.set_ylabel('Close Price')
ax.legend()
plt.show()