In [1]:
import pandas as pd
import polars as pl
import numpy as np
import streamlit
import plotly.express as px

from preprocessing import bulk_preprocessing, streamed_preprocessing


from xgboost import XGBRegressor
# from statsmodels.tsa.statespace.sarimax import SARIMAX

import simfin as sf
import os
from time import sleep
import re

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [4]:
# Both inputs are semicolon-separated CSV files read with polars.
# FinancialPredictions.__load_datasets__ picks COM and PRI up as module-level
# globals, so this cell has to run before the class is instantiated.
COM, PRI = (
    pl.read_csv(csv_file, separator=';')
    for csv_file in ('us-companies.csv', 'us-shareprices-daily.csv')
)

In [8]:
# Count the null values in every column of the daily share-price table.
PRI.null_count()

Ticker,SimFinId,Date,Open,High,Low,Close,Adj. Close,Volume,Dividend,Shares Outstanding
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,5771597,526004


In [2]:
class FinancialPredictions():
    '''
    Loads company and share-price data for a set of tickers, fits an XGBoost
    regressor on the 'returns' column and prints a buy / sell / hold message
    for each ticker based on the return predicted for its latest price row.

    Attributes
    ----------
    chosen_companies : list of the tickers being analysed.
    start_date, end_date : bounds of the historical window (pandas Timestamp) or None.
    companies, prices : polars frames taken from the notebook globals COM and PRI.
    new_data : latest prices, as returned by streamed_preprocessing.
    data : historical frame, as returned by bulk_preprocessing.
    updateable_data : copy of `data` that grows when new predicted rows are appended.
    '''

    # Environment variable the SimFin API key is read from. The key used to be
    # hard-coded in this class; a key that has been committed to a notebook
    # should be treated as leaked and rotated.
    API_KEY_ENV_VAR = 'SIMFIN_API_KEY'

    # A prediction is a buy/sell signal when it exceeds this fraction of the
    # ticker's historical range of returns (max - min).
    SIGNAL_THRESHOLD = 0.2

    # Accepted layout of the start_date / end_date strings.
    _DATE_FORMAT = re.compile(r'\d{4}-\d{2}-\d{2}')

    def __init__(self, chosen_companies:list, start_date:str=None, end_date:str=None):
        '''
        chosen_companies : List of the companies the analysis will be performed on. A single ticker passed as a string is treated as a one-element list.

        start_date : initial date of the historical data. If None, retrieves from the beginning of the available information. Data starts on '2019-04-08'

        end_date : final date of the historical data. If None, retrieves until the end of the available information. Data ends on '2024-03-11'

        Raises
        ------
        ValueError : if chosen_companies is empty, a date is not a valid '%Y-%m-%d' string, or start_date is later than end_date.

        RuntimeError : if the SimFin API key environment variable is not set.
        '''
        # Validate everything up front so that bad input fails before any data
        # is loaded, instead of printing a message and carrying on.
        if isinstance(chosen_companies, str):
            chosen_companies = [chosen_companies]
        if len(chosen_companies) == 0:
            raise ValueError("The chosen companies' list cannot be empty")

        self.start_date = self._parse_date(start_date, 'start_date')
        self.end_date = self._parse_date(end_date, 'end_date')
        if (self.start_date is not None and self.end_date is not None
                and self.start_date > self.end_date):
            raise ValueError("The start_date parameter cannot be later than the end_date parameter")

        api_key = os.environ.get(self.API_KEY_ENV_VAR)
        if not api_key:
            raise RuntimeError(
                f"Set the {self.API_KEY_ENV_VAR} environment variable to your SimFin API key"
            )

        self.chosen_companies = chosen_companies
        self.__api_key = api_key
        self.companies, self.prices = self.__load_datasets__()
        self.new_data = self.get_new_prices()
        self.data = self.get_historical_data()
        # Independent copy of the same frame; avoids running the bulk
        # preprocessing a second time.
        self.updateable_data = self.data.copy()
        self.__model = self.__predictive_model__()

    @staticmethod
    def _parse_date(value, name):
        '''
        Converts a '%Y-%m-%d' string into a pandas Timestamp.

        Returns
        -------
        None when value is None, otherwise the parsed Timestamp.

        Raises
        ------
        ValueError : if value is not a string in the expected format or is not a real calendar date.
        '''
        if value is None:
            return None
        if not isinstance(value, str) or not FinancialPredictions._DATE_FORMAT.fullmatch(value):
            raise ValueError(f"The {name} parameter must be a string passed in the format '%Y-%m-%d'")
        return pd.to_datetime(value, format='%Y-%m-%d')

    def __load_datasets__(self):
        '''
        Takes the companies and prices tables from the notebook globals COM and
        PRI, parses the 'Date' column of the prices and restricts it to the
        [start_date, end_date] window (either bound may be None).

        Returns
        -------
        Tuple (companies, prices) of polars dataframes
        '''
        companies = COM
        prices = PRI.with_columns(pl.col('Date').str.to_datetime('%Y-%m-%d'))

        # Each bound is optional and applied independently (both inclusive).
        if self.start_date is not None:
            prices = prices.filter(pl.col('Date') >= self.start_date)
        if self.end_date is not None:
            prices = prices.filter(pl.col('Date') <= self.end_date)

        return companies, prices

    def get_historical_data(self):
        '''
        Returns
        -------
        Dataframe with consolidated and preprocessed historical information
        '''
        return bulk_preprocessing(self.companies, self.prices, self.chosen_companies)

    def get_new_prices(self):
        '''
        Fetches new prices from the Simfin platform using the API

        Returns
        -------
        Dataframe with latest information (1 day) for every stock in the USA market
        '''
        # NOTE(review): the pause presumably throttles repeated API calls - confirm.
        sleep(0.5)
        sf.set_api_key(self.__api_key)
        sf.set_data_dir(os.path.join(os.getcwd(), 'streamed'))

        stream = sf.load_shareprices(market='us', variant='latest')

        return streamed_preprocessing(self.companies, stream, self.chosen_companies)

    def __predictive_model__(self):
        '''
        Fits a fresh XGBoost regressor on `updateable_data`, using 'returns' as
        the target and every other column as a feature.

        Returns
        -------
        The fitted XGBRegressor
        '''
        data = self.updateable_data
        features = data.drop('returns', axis=1)
        target = data['returns']

        model = XGBRegressor(objective='reg:squarederror', learning_rate=0.15, n_estimators=200, subsample=0.4, enable_categorical=True)
        model.fit(features, target)

        return model

    def predict_new_return(self, stock_data):
        '''
        stock_data : dataframe with the same feature columns the model was trained on.

        Returns
        -------
        Array with one predicted return per row of stock_data
        '''
        return self.__model.predict(stock_data)

    def investing_strategy(self):
        '''
        Prints a buy / sell / hold message for every chosen company, comparing
        the predicted return with SIGNAL_THRESHOLD times the range of the
        company's historical returns, and then retrains the model.
        '''
        # The history was already preprocessed in __init__; reuse it rather
        # than recomputing it once per ticker.
        historical_data = self.data

        for stock in self.chosen_companies:
            latest = self.new_data[self.new_data['ticker'] == stock]
            if len(latest) == 0:
                # Without a new row there is nothing to predict on.
                print(f"No new price data is available for {stock}, so no recommendation can be made")
                continue

            # Reduce the prediction array to a scalar so the comparisons below
            # are well defined; if several rows come back the last one is used.
            pred = float(np.ravel(self.predict_new_return(latest))[-1])

            rel = historical_data[historical_data['ticker'] == stock]['returns']
            rang = rel.max() - rel.min()
            threshold = self.SIGNAL_THRESHOLD * rang

            if pred > threshold:
                print(f'According to our model, the return tomorrow for {stock} will be greatly positive, you should buy')
            elif pred < -threshold:
                print(f"According to our model, the return tomorrow for {stock} will be highly negative, you should sell")
            else:
                print(f"According to our model, the return tomorrow for {stock} won't surpass 20% change in any direction, you should hold")

        self.__continuous_training__()

    def __continuous_training__(self):
        '''
        Function to keep training the model with new predictions

        Appends the rows of `new_data` that are not yet in `updateable_data`
        (matched on Date index and ticker), filling 'returns' with the model's
        own prediction, and refits the model.
        '''
        # NOTE(review): the appended 'returns' are predictions, not observed
        # returns, so the model is retrained on its own output - confirm this
        # is intended.
        aux = self.new_data.copy()
        aux['returns'] = self.predict_new_return(self.new_data)

        # Several tickers share the same dates, so a row is identified by the
        # (Date, ticker) pair rather than by the Date index alone.
        known_rows = set(zip(self.updateable_data.index,
                             self.updateable_data['ticker'].astype(str)))
        is_new = np.fromiter(
            (key not in known_rows for key in zip(aux.index, aux['ticker'].astype(str))),
            dtype=bool,
            count=len(aux),
        )
        fresh_rows = aux.loc[is_new]

        if len(fresh_rows) > 0:
            self.updateable_data = pd.concat([self.updateable_data, fresh_rows])

        self.__model = self.__predictive_model__()



In [5]:
# Build the predictor for AAPL over the full available history (no date bounds).
# The constructor loads the datasets, fetches the latest prices and fits the model.
fp=FinancialPredictions(['AAPL'])

Dataset "us-shareprices-latest" on disk (2 days old).
- Loading from disk ... Done!


In [5]:
# Recompute the preprocessed historical frame for the chosen tickers and preview its first rows.
data=fp.get_historical_data()
data.head()

Unnamed: 0_level_0,ticker,open,high,low,close,adj_close,volume,shares_outstanding,industry_id,number_employees,returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-04-09,AAPL,50.08,50.71,49.81,49.88,47.79,143072948,18429136000,101001,147000,-0.14
2019-04-10,AAPL,49.67,50.19,49.55,50.16,48.06,86781152,18429136000,101001,147000,0.27
2019-04-11,AAPL,50.21,50.25,49.61,49.74,47.66,83603232,18429136000,101001,147000,-0.4
2019-04-12,AAPL,49.8,50.03,49.05,49.72,47.64,111042672,18429136000,101001,147000,-0.02
2019-04-15,AAPL,49.65,49.96,49.5,49.81,47.72,70146584,18429136000,101001,147000,0.08


In [6]:
# Fetch the latest share prices again through SimFin (see FinancialPredictions.get_new_prices).
new_data=fp.get_new_prices()

Dataset "us-shareprices-latest" on disk (0 days old).
- Loading from disk ... Done!


In [7]:
# Display the freshly fetched rows for the chosen tickers.
new_data

Unnamed: 0_level_0,ticker,open,high,low,close,adj_close,volume,shares_outstanding,industry_id,number_employees
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2025-03-06,AAPL,234.44,237.86,233.16,235.33,235.33,44856248,15022070000.0,101001.0,147000.0


In [8]:
# Print a buy / sell / hold message per chosen ticker.
# Side effect: the call finishes by retraining the model (__continuous_training__).
fp.investing_strategy()

According to our model, the return tomorrow for AAPL won't surpass 20% change in any direction, you should hold


In [9]:
# Latest prices stored on the instance when it was constructed.
fp.new_data

Unnamed: 0_level_0,ticker,open,high,low,close,adj_close,volume,shares_outstanding,industry_id,number_employees
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2025-03-06,AAPL,234.44,237.86,233.16,235.33,235.33,44856248,15022070000.0,101001.0,147000.0


In [10]:
# Historical frame stored on the instance when it was constructed.
fp.data

Unnamed: 0_level_0,ticker,open,high,low,close,adj_close,volume,shares_outstanding,industry_id,number_employees,returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-04-09,AAPL,50.08,50.71,49.81,49.88,47.79,143072948,18429136000,101001,147000,-0.14
2019-04-10,AAPL,49.67,50.19,49.55,50.16,48.06,86781152,18429136000,101001,147000,0.27
2019-04-11,AAPL,50.21,50.25,49.61,49.74,47.66,83603232,18429136000,101001,147000,-0.40
2019-04-12,AAPL,49.80,50.03,49.05,49.72,47.64,111042672,18429136000,101001,147000,-0.02
2019-04-15,AAPL,49.65,49.96,49.50,49.81,47.72,70146584,18429136000,101001,147000,0.08
...,...,...,...,...,...,...,...,...,...,...,...
2024-03-05,AAPL,170.76,172.04,169.62,170.12,169.32,95132355,15441881000,101001,147000,-4.96
2024-03-06,AAPL,171.06,171.24,168.68,169.12,168.32,68587707,15441881000,101001,147000,-1.00
2024-03-07,AAPL,169.15,170.73,168.49,169.00,168.21,71765061,15441881000,101001,147000,-0.11
2024-03-08,AAPL,169.00,173.70,168.94,170.73,169.93,76267041,15441881000,101001,147000,1.72


In [11]:
# Training frame; __continuous_training__ appends newly predicted rows to it.
fp.updateable_data

Unnamed: 0_level_0,ticker,open,high,low,close,adj_close,volume,shares_outstanding,industry_id,number_employees,returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-04-09,AAPL,50.08,50.71,49.81,49.88,47.79,143072948,1.842914e+10,101001.0,147000.0,-0.14000
2019-04-10,AAPL,49.67,50.19,49.55,50.16,48.06,86781152,1.842914e+10,101001.0,147000.0,0.27000
2019-04-11,AAPL,50.21,50.25,49.61,49.74,47.66,83603232,1.842914e+10,101001.0,147000.0,-0.40000
2019-04-12,AAPL,49.80,50.03,49.05,49.72,47.64,111042672,1.842914e+10,101001.0,147000.0,-0.02000
2019-04-15,AAPL,49.65,49.96,49.50,49.81,47.72,70146584,1.842914e+10,101001.0,147000.0,0.08000
...,...,...,...,...,...,...,...,...,...,...,...
2024-03-06,AAPL,171.06,171.24,168.68,169.12,168.32,68587707,1.544188e+10,101001.0,147000.0,-1.00000
2024-03-07,AAPL,169.15,170.73,168.49,169.00,168.21,71765061,1.544188e+10,101001.0,147000.0,-0.11000
2024-03-08,AAPL,169.00,173.70,168.94,170.73,169.93,76267041,1.544188e+10,101001.0,147000.0,1.72000
2024-03-11,AAPL,172.94,174.38,172.05,172.75,171.94,58929918,1.544188e+10,101001.0,147000.0,2.01000


In [21]:
# Show the companies that have both a name and a ticker.
# The result is only displayed; it is not assigned, so COM itself is unchanged.
COM.drop_nulls(subset=['Company Name', 'Ticker'])

Ticker,SimFinId,Company Name,IndustryId,ISIN,End of financial year (month),Number Employees,Business Summary,Market,CIK,Main Currency
str,i64,str,i64,str,i64,i64,str,str,i64,str
"""A""",45846,"""AGILENT TECHNOLOGIES INC""",106001,"""US00846U1016""",10,16400,"""Agilent Technologies Inc is en…","""us""",1090872,"""USD"""
"""A21""",1333027,"""Li Auto Inc.""",,,12,,,"""us""",1791706,"""USD"""
"""AA""",367153,"""Alcoa Corp""",110004,"""US0138721065""",12,12900,"""Alcoa Corp is an integrated al…","""us""",1675149,"""USD"""
"""AAC""",7962652,"""Ares Acquisition Corporation""",104002,"""US0003071083""",12,,"""Ares Acquisition Corporation d…","""us""",1829432,"""USD"""
"""AACI""",11820349,"""Armada Acquisition Corp. I""",104002,"""US04208V1035""",9,,"""Armada Acquisition Corp. I foc…","""us""",1844817,"""USD"""
…,…,…,…,…,…,…,…,…,…,…
"""ZWS""",17663788,"""Zurn Elkay Water Solutions Cor…",100001,"""US98983L1089""",12,2700,"""Zurn Elkay Water Solutions Cor…","""us""",1439288,"""USD"""
"""ZY""",1243193,"""Zymergen Inc.""",106002,"""US98985X1000""",12,758,"""Zymergen is a biofacturing com…","""us""",1645842,"""USD"""
"""ZYME""",17663790,"""Zymeworks Inc.""",106002,"""CA98985W1023""",12,291,"""Zymeworks Inc., a clinical-sta…","""us""",1403752,"""USD"""
"""ZYNE""",901704,"""Zynerba Pharmaceuticals, Inc.""",106002,"""US98986X1090""",12,25,"""Zynerba Pharmaceuticals Inc to…","""us""",1621443,"""USD"""
