In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import requests
import json 
import os

In [50]:
"""

Extract data from the CSV files in the "Prices" folder and combine them into a single pandas DataFrame with date, close price, ticker, forward 1-month price change (%), forward 3-month price change (%), and the Result label (1, 0 or -1) derived from those two changes.

"""

# Defines path to files containing price information (just the "Prices" folder)
folder_path = "Prices"

# Look-ahead horizons, counted in rows of each price file.
# NOTE(review): 20 / 60 rows presumably approximate one / three months of trading days -- confirm.
one_month_rows = 20
three_month_rows = 60

# Minimum percentage move required over BOTH horizons for a row to be labelled +1 / -1.
# Defined once here rather than being re-assigned on every loop iteration.
one_month_threshold = 5
three_month_threshold = 10

# Initialize an empty list to store the per-file dataframes
dataframes = []

# Iterate over each file in the folder. os.listdir() returns names in arbitrary order,
# so sort them to make the row order of prices_df reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".csv"):  # Skip anything that is not a CSV
        continue

    file_path = os.path.join(folder_path, filename)  # Get the path to the file

    # Read the CSV file into a pandas dataframe
    df = pd.read_csv(file_path)

    # Add a new column containing the company ticker, which we extract from the filename
    filename_wo_extension = os.path.splitext(filename)[0]
    df['Ticker'] = filename_wo_extension

    # Forward-looking percentage change in Close. shift(-k) pulls the value from k rows
    # ahead, so the last k rows of each file have no future price and are left as NaN.
    # Every row in this frame carries the same ticker, so no extra ticker mask is needed
    # and both horizons are computed the same way.
    df['OneMonthChange'] = (df['Close'].shift(-one_month_rows) - df['Close']) / df['Close'] * 100
    df['ThreeMonthChange'] = (df['Close'].shift(-three_month_rows) - df['Close']) / df['Close'] * 100

    # Creating our y column, Result: 1 when both horizons rise past their thresholds,
    # -1 when both fall past them, 0 otherwise.
    # NOTE(review): comparisons against NaN are False, so rows without future prices
    # also end up as 0 and cannot be told apart from genuinely neutral rows --
    # consider dropping them before training.
    df['Result'] = 0  # Init the column with zeros
    rose = (df['OneMonthChange'] > one_month_threshold) & (df['ThreeMonthChange'] > three_month_threshold)
    fell = (df['OneMonthChange'] < -one_month_threshold) & (df['ThreeMonthChange'] < -three_month_threshold)
    df.loc[rose, 'Result'] = 1
    df.loc[fell, 'Result'] = -1

    # Append the DataFrame to the list
    dataframes.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list; fail with a
# message that points at the actual problem instead.
if not dataframes:
    raise ValueError(f'No .csv files found in "{folder_path}"; nothing to combine.')

# Concatenate all the DataFrames in the list into a single DataFrame
prices_df = pd.concat(dataframes, ignore_index=True)

# Be careful--the below filters out any features not specified so adding a feature above won't show up if you don't modify the below line.
prices_df = prices_df[["Date", "Close", "Ticker", "OneMonthChange", "ThreeMonthChange", "Result"]]

# prices_df has all the CSV files' contents in a single DataFrame
print(prices_df)

                            Date       Close Ticker  OneMonthChange  \
0      1999-11-18 00:00:00-05:00   26.652401      A        4.403417   
1      1999-11-19 00:00:00-05:00   24.456602      A       16.099074   
2      1999-11-22 00:00:00-05:00   26.652401      A        5.965871   
3      1999-11-23 00:00:00-05:00   24.229450      A       18.906221   
4      1999-11-24 00:00:00-05:00   24.873049      A       21.156744   
...                          ...         ...    ...             ...   
92411  2024-04-25 00:00:00-04:00  246.339996    ADP             NaN   
92412  2024-04-26 00:00:00-04:00  243.070007    ADP             NaN   
92413  2024-04-29 00:00:00-04:00  243.949997    ADP             NaN   
92414  2024-04-30 00:00:00-04:00  241.889999    ADP             NaN   
92415  2024-05-01 00:00:00-04:00  247.330002    ADP             NaN   

       ThreeMonthChange  Result  
0             86.079527       0  
1            100.619218       1  
2            120.454482       1  
3          

**TODO:** Merge the price data (`prices_df`) with the earnings transcript data here.

In [51]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Placeholder texts used only to exercise the pipeline until the transcripts are wired in
texts = ["good test", "bad test"]

# Make the binary label: 1 where Result is 1, 0 where Result is below 1 (i.e. 0 or -1).
# The mask and the assignment must both target prices_df; the earlier version wrote the
# zeros into `df` (the last per-file frame left over from the loading loop), which left
# prices_df["Label"] equal to 1 on every row.
prices_df["Label"] = 1
prices_df.loc[prices_df["Result"] < 1, "Label"] = 0

# TODO: Split data into training and validation sets

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# TODO: Tokenize texts

# TODO: Convert labels to tensors

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # For now, will assume binary classification

# TODO: Need to define training arguments here

# TODO: Need to define the trainer here

# TODO: Need to fine-tune BERT on our data here
#trainer.train()

# TODO: Need to evaluate our work here
#trainer.evaluate()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
from itertools import combinations
from sklearn.linear_model import LogisticRegression

#LR = LogisticRegression(max_iter=1000)

# Placeholder: a plain logistic regression meant to be trained on the numeric outputs produced by BERT.
#LR.fit(X_train[cols], y_train)
#print(LR.score(X_train[cols], y_train))