In [12]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from ta import add_all_ta_features
from ta.utils import dropna
import sys
import pandas as pd

def filter_companies(file_name, stocks):
    """
    Loads a company listing and keeps the first rows whose country is the US.

    Parameters:
    - file_name (str): Path of the CSV file to read; it must have a 'country' column.
    - stocks (int): Maximum number of matching rows to keep.

    Returns:
    - DataFrame: The first `stocks` rows whose 'country' value contains
      "United States", with their original row index.
    """
    companies = pd.read_csv(file_name)

    # Missing countries become '' so the substring match below never sees NaN.
    companies['country'] = companies['country'].fillna('')

    is_us = companies['country'].str.contains("United States")
    return companies.loc[is_us].head(stocks)




In [13]:
def add_indicators(df):
    """
    Appends the `ta` library's technical-indicator features to a price frame.

    Parameters:
    - df (DataFrame): Stock data with 'Open', 'High', 'Low', 'Close' and
      'Volume' columns.

    Returns:
    - DataFrame: The result of `add_all_ta_features` applied to the frame
      after it has been cleaned with `ta.utils.dropna`.
    """
    cleaned = dropna(df)
    # Map the library's OHLCV argument names onto this frame's column names.
    ohlcv_columns = dict(
        open="Open", high="High", low="Low", close="Close", volume="Volume")
    return add_all_ta_features(cleaned, **ohlcv_columns)


In [32]:
def convert_datetime_to_int(df):
    """
    Converts the 'Date' index/column to integer Unix timestamps in seconds.

    The index is first moved into the columns with `reset_index()`, so a frame
    indexed by a 'Date' index ends up with a regular 'Date' column.

    Parameters:
    - df (DataFrame): Frame whose index (or a column, after the reset) is
      named 'Date' and holds datetime-like values, naive or timezone-aware.

    Returns:
    - DataFrame: A new frame in which 'Date' holds whole seconds since
      1970-01-01 00:00:00 UTC (floored; negative before the epoch). Naive
      values are interpreted as UTC. The input frame is not modified.
      Missing dates (NaT) come out as NaN, which makes the column float.
    """
    df = df.reset_index()

    # utc=True puts naive and timezone-aware inputs on one common timeline.
    dates = pd.to_datetime(df['Date'], utc=True)

    # Dividing the offset from the epoch by a one-second Timedelta is
    # independent of the datetime storage resolution, unlike reinterpreting
    # the raw integers (Series.view) and assuming they are nanoseconds.
    epoch = pd.Timestamp("1970-01-01", tz="UTC")
    df['Date'] = (dates - epoch) // pd.Timedelta(seconds=1)
    return df



In [33]:
def normalize_columns(df, columns):
    """
    Min-max normalizes the given columns into new 'n_<name>' columns.

    Infinite values are treated as missing and every missing value in the
    frame is replaced by 0 before scaling. Each listed column is scaled on its
    own with a MinMaxScaler, stored as 'n_<column>', and the original column
    is removed from the result.

    Parameters:
    - df (DataFrame): The DataFrame to normalize. It is not modified.
    - columns (list): Names of the columns to normalize.

    Returns:
    - list: `[normalized_df, new_names]`, where `normalized_df` is a new
      DataFrame and `new_names` lists the 'n_<column>' names in input order.
    """
    # replace()/fillna() return new frames, so the caller's data stays intact;
    # the previous in-place version overwrote inf/NaN in the caller's frame
    # and left a stray 'n_<first column>' column on it.
    work = df.replace([np.inf, -np.inf], np.nan).fillna(0)

    scaler = MinMaxScaler()
    saved_col = []
    for col in columns:
        scaled_name = f'n_{col}'
        # fit_transform yields one 2-D column; flatten it for the assignment.
        work[scaled_name] = np.ravel(scaler.fit_transform(work[[col]]))
        saved_col.append(scaled_name)

    work = work.drop(columns=list(columns))
    return [work, saved_col]


In [34]:
def create_profit(df, days):
    """
    Adds look-ahead buy and profit target columns for each horizon 1..days.

    For every horizon d the frame gains two columns:
    - 'T1-Buy-<d>D': 1 when the close d rows ahead is above the current
      close, otherwise 0.
    - 'T2-Profit-<d>D': the close d rows ahead divided by the current close,
      with non-finite results (NaN, +/-inf) stored as 0.

    Parameters:
    - df (DataFrame): Frame with a 'Close' column. It is modified in place.
    - days (int): Largest look-ahead horizon, in rows.

    Returns:
    - DataFrame: The same frame object, with the target columns added and
      the rows carrying the index labels of its last `days` rows removed.
    """
    close = df['Close']

    for horizon in range(1, days + 1):
        ahead = close.shift(-horizon)

        # A missing future close compares as False, i.e. "no buy".
        df[f'T1-Buy-{horizon}D'] = ahead.gt(close).astype(int)

        gain = ahead / close
        gain = gain.where(np.isfinite(gain), 0)
        df[f'T2-Profit-{horizon}D'] = gain.astype(float)

    # The final `days` rows have no complete look-ahead window.
    incomplete = df.tail(days).index
    df.drop(index=incomplete, inplace=True)

    return df

def save_temp_csv(df, filename='temp.csv'):
    """
    Writes the DataFrame as CSV, without the index, into the working directory.

    Parameters:
    - df (DataFrame): The DataFrame to save.
    - filename (str, optional): File name, joined onto the current working
      directory. Defaults to 'temp.csv'.
    """
    destination = os.path.join(os.getcwd(), filename)
    df.to_csv(destination, index=False)

import pandas as pd
import yfinance

In [35]:


def preprocess(stocks, day, chunk_size=10000, output_path="data.csv"):
    """
    Downloads, labels and stores the price history of every ticker in one CSV.

    For each ticker the full yfinance history is fetched, the 'Dividends' and
    'Stock Splits' columns and rows with missing values are removed, dates are
    converted with `convert_datetime_to_int`, targets are added with
    `create_profit`, and remaining NaN/inf values are replaced by 0. The rows
    of all tickers are written to one file: the first chunk creates it (with
    header), every later chunk is appended.

    Parameters:
    - stocks (list): List of stock tickers.
    - day (int): The number of days to calculate profit.
    - chunk_size (int): Number of rows written per `to_csv` call.
    - output_path (str, optional): CSV file to write. Defaults to 'data.csv'.

    Returns:
    - None: The result is only written to `output_path`. If no ticker yields
      any rows, nothing is written and an existing file is left untouched.
    """
    first_chunk = True
    for t in stocks:
        hist = yfinance.Ticker(t).history(period="max")

        # Removes unused columns and incomplete rows
        hist = hist.drop(columns=['Dividends', 'Stock Splits'], errors='ignore').dropna()

        if hist.empty:
            continue

        # Technical-indicator enrichment is currently disabled:
        #hist = add_indicators(hist)
        hist = convert_datetime_to_int(hist)
        hist = create_profit(hist, day)
        hist = hist.replace([np.inf, -np.inf], np.nan).fillna(0)

        # Write the ticker's rows in chunks; only the very first chunk of the
        # whole run truncates the file and emits the header.
        for start in range(0, len(hist), chunk_size):
            chunk = hist.iloc[start:start + chunk_size]
            chunk.to_csv(
                output_path,
                mode='w' if first_chunk else 'a',
                header=first_chunk,
                index=False,
            )
            first_chunk = False

    print("Preprocessing Done")
    return


In [36]:
def prepare_dataset(data, cols):
    """
    Loads a preprocessed CSV and splits it into features and targets.

    Parameters:
    - data (str): Path of the CSV file to read.
    - cols (list): Names of the columns to normalize and use as features.

    Returns:
    - tuple: `(X, y)` where `X` is a DataFrame holding the normalized feature
      columns returned by `normalize_columns`, and `y` is a NumPy array built
      from every column whose name matches the regex 'T1'.
    """
    raw = pd.read_csv(data)
    normalized, feature_names = normalize_columns(raw, cols)

    features = normalized[feature_names]
    targets = normalized.filter(regex='T1').to_numpy()
    return features, targets

In [37]:
"""
Evaluation Methods
"""
import numpy as np
import torch
from sklearn.metrics import hamming_loss, mean_squared_error, accuracy_score, mean_absolute_error



"""
sklearn eval method: 
Uses predict to get test results
"""
def _as_numpy(values):
    """Returns `values` as a NumPy array, detaching tensor-like objects first."""
    # Duck-typed so the check does not depend on torch: anything exposing
    # detach() is expected to offer the .detach().cpu().numpy() chain.
    if hasattr(values, "detach"):
        values = values.detach().cpu().numpy()
    return np.asarray(values)


def evaluate_model(net, X_test, y_test):
    """
    Scores a fitted model on a test set using its `predict` method.

    Predictions are thresholded at 0.5 to obtain binary labels for the
    accuracy metrics; the mean squared error uses the raw predictions.

    Parameters:
    - net: Model object exposing `predict(X)`.
    - X_test: Test features, passed to `net.predict` unchanged.
    - y_test: True binary targets, 1-D or 2-D; array-like or a tensor.

    Returns:
    - list: `[out_acc, ham_acc, mse]`, each rounded to 4 decimals, where
      `out_acc` is `accuracy_score`, `ham_acc` is `1 - hamming_loss` and
      `mse` is `mean_squared_error` of the raw predictions.
    """
    # Targets and predictions may arrive as tensors; the metrics need arrays.
    y_true = _as_numpy(y_test)
    prediction = _as_numpy(net.predict(X_test))

    # Binarize predictions (vectorized, so 1-D outputs work as well as 2-D)
    prediction_bin = (prediction > 0.5).astype(int)

    # Calculating metrics
    ham_loss = hamming_loss(y_true, prediction_bin)
    out_acc = round(accuracy_score(y_true, prediction_bin), 4)
    ham_acc = round(1 - ham_loss, 4)
    mse = round(mean_squared_error(y_true, prediction), 4)

    return [out_acc, ham_acc, mse]

   
"""prints results"""
def printResults(ham_acc, out_acc, mse):
    """
    Prints the three evaluation metrics, one labelled line each.

    Parameters:
    - ham_acc: Value printed as "Individual Accuracy".
    - out_acc: Value printed as "Output Accuracy".
    - mse: Value printed as "Mean Squared Error".
    """
    report = (
        ("Output Accuracy:", out_acc),
        ("Individual Accuracy:", ham_acc),
        ("Mean Squared Error:", mse),
    )
    for label, value in report:
        print(label, value)


In [39]:
class datalogger():
    """
    Collects model evaluation results and mirrors them to 'output.csv'.

    Every logged row starts with the run settings given at construction
    (features, output size, number of stocks), followed by the model name and
    its result values.
    """

    def __init__(self,features,output_size,stocks):
        # Run-level settings repeated at the start of every logged row.
        self.init = [features, output_size, stocks]
        # records[0] is the header row; result rows are appended after it.
        self.records = [[
            "Features",
            "Output Size",
            "Num of Stocks",
            "Model",
            "Output Accuracy",
            "Individual Accuracy",
            "Mean Square Error",
        ]]

    def save_info(self, model, results):
        """
        Appends one result row and rewrites 'output.csv' with all records.

        Parameters:
        - model: Identifier of the evaluated model, stored in the 'Model' column.
        - results (iterable): Metric values appended after the model.

        Returns:
        - int: Always 0.
        """
        self.records.append([*self.init, model, *results])

        # header=False: the header is already stored as the first record.
        pd.DataFrame(self.records).to_csv('output.csv', index=False, header=False)
        return 0
    