Train on AAPL

In [24]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import utils

In [28]:
input_directory = "../data/tickers"

# Inclusive time window kept from each day's file. The bounds are compared as
# strings against the "time" column, so this assumes zero-padded "HH:MM" values
# -- TODO confirm against the CSVs.
FIRST_FEATURE_TIME = "09:29"  # row used for the first prediction of the day
LAST_TARGET_TIME = "15:59"    # kept only so the 15:58 row gets a target, then dropped

# Days whose date string compares strictly below this go to training, all other
# days go to testing. Compared as strings, so this assumes "YYYYMMDD" dates
# -- TODO confirm what utils.read_date_string returns.
# NOTE(review): the previous comment here named 2023/07/25 as the boundary, but
# the comparison has always been against "20230726" (so 07/25 is a training day).
# The code's behaviour is kept; confirm which boundary was intended.
TEST_START_DATE = "20230726"

# Per-day frames/series are collected in lists and concatenated once afterwards.
features_train = []
features_test = []
targets_train = []
targets_test = []
for filename in utils.list_files_recursively(input_directory):
    # One CSV per day; the date is taken from the file name/path by the helper.
    features_day = pd.read_csv(filename)
    date = utils.read_date_string(filename)

    # Only keep rows inside the prediction window (both bounds inclusive).
    # The first prediction is made from the 09:29 row and the last from the
    # 15:58 row; the 15:59 row is kept for now so the 15:58 target can be computed.
    in_window = features_day["time"].between(FIRST_FEATURE_TIME, LAST_TARGET_TIME)
    features_day = features_day.loc[in_window]

    # Target is True when the NEXT row's close is higher than this row's close:
    # diff(-1) yields close[t] - close[t+1], which is negative on a price increase.
    # Assumes rows are in chronological order within the file -- TODO confirm.
    targets_day = features_day["close"].diff(-1) < 0

    # Drop the last row of features and targets: it has no following row, so its
    # diff is NaN and its target carries no information.
    features_day = features_day.iloc[:-1, :]
    targets_day = targets_day.iloc[:-1]

    # Optional feature column for minute-of-day (currently disabled).
    # features_day["minute"] = features_day["time"].apply(utils.convert_time_to_minutes)

    # The raw time string is not used as a model feature.
    features_day = features_day.drop("time", axis=1)

    # Chronological split: days before TEST_START_DATE train, the rest test.
    if date < TEST_START_DATE:
        features_train.append(features_day)
        targets_train.append(targets_day)
    else:
        features_test.append(features_day)
        targets_test.append(targets_day)


In [29]:
# Collapse each list of per-day pieces into a single object per split.
# The names are rebound from lists to concatenated pandas objects, so this cell
# is meant to run exactly once after the loading cell.
features_train, targets_train, features_test, targets_test = [
    pd.concat(daily_parts)
    for daily_parts in (features_train, targets_train, features_test, targets_test)
]

In [49]:

# Train a random forest classifier (a small ensemble of shallow trees).
# A fixed seed makes the fitted model, and therefore the reported accuracy,
# reproducible from run to run; without it every execution gives a different number.
RANDOM_SEED = 42
classifier = RandomForestClassifier(
    n_estimators=10,
    n_jobs=-1,
    max_depth=10,
    random_state=RANDOM_SEED,
)
classifier.fit(features_train, targets_train)

# Make predictions on the test set
predictions = classifier.predict(features_test)

# Evaluate the accuracy of the model.
# For a binary up/down target, compare this against the majority-class rate of
# targets_test before reading anything into a value near 0.5.
accuracy = accuracy_score(targets_test, predictions)

print("Accuracy:", accuracy)

Accuracy: 0.5044871794871795
