In [11]:
# Setup mlflow tracking
import mlflow

# Host and port of the MLflow tracking server this notebook talks to.
url, port = '127.0.0.1', '8080'

# Build the URI once, register it, then echo back what MLflow actually stored.
tracking_uri = f"http://{url}:{port}"
mlflow.set_tracking_uri(tracking_uri)
print(f"MLflow tracking uri set to: {mlflow.get_tracking_uri()}")

MLflow tracking uri set to: http://127.0.0.1:8080


In [None]:
# Data processing
import datetime
import utils
import json

# Load API key. The context manager closes keys.json as soon as it is parsed
# instead of leaving the handle open for the lifetime of the kernel.
with open('keys.json') as key_file:
    polygon_api_key = json.load(key_file)['POLYGON_API_KEY']

# Get min date: the download resumes from whatever utils.get_max_date reports
# for the files already stored under data/daily-aggregates.
# NOTE(review): the result must support .strftime (it is formatted below);
# behaviour on an empty directory is not visible from here, so confirm in utils.
min_date = utils.get_max_date(file_dir='data/daily-aggregates')

# First time min date (no data downloaded yet): start five years back instead.
# min_date = (datetime.datetime.now() - datetime.timedelta(days=5 * 365)).strftime('%Y-%m-%d')

# Download stock data from min_date up to today (both formatted YYYY-MM-DD)
utils.update_stock_data(
    min_date=min_date.strftime('%Y-%m-%d'),
    max_date=datetime.datetime.now().strftime('%Y-%m-%d'),
    api_key=polygon_api_key
)

In [None]:
import os

import pandas as pd

# Assemble one dataframe from the files under data/daily-aggregates.
stock_df = utils.create_stock_dataframe(file_dir='data/daily-aggregates')

# Sort data by date, most recent first. Parsing to datetime first makes the
# ordering chronological rather than lexical.
stock_df['Date'] = pd.to_datetime(stock_df['Date'])
stock_df = stock_df.sort_values(by=['Date'], ascending=False)

# Save dataframe. Create the target directory first so a fresh checkout does
# not fail on to_csv (the forecast export below uses the same guard).
os.makedirs('data/stock_data', exist_ok=True)
stock_df.to_csv('data/stock_data/data.csv', index=False)

In [None]:
import plotly.express as px

# Count, for every date, how many rows have a missing `Transactions` value.
null_df = (
    stock_df
    .groupby(['Date'])
    .agg({'Transactions': lambda column: column.isnull().sum()})
    .reset_index()
    .rename(columns={'index': 'Date'})
)

# Line chart of the per-date missing-value counts.
fig = px.line(null_df, x='Date', y='Transactions')
fig.show()

In [None]:
import plotly.express as px

# Count, for every date, how many rows have a missing `Close_Price` value.
null_df = (
    stock_df
    .groupby(['Date'])
    .agg({'Close_Price': lambda column: column.isnull().sum()})
    .reset_index()
    .rename(columns={'index': 'Date'})
)

# Line chart of the per-date missing-value counts.
fig = px.line(null_df, x='Date', y='Close_Price')
fig.show()

In [None]:
# Available summary data
from matplotlib import pyplot as plt

# Number of rows (daily summaries) available for each exchange symbol.
vals = stock_df.Exchange_Symbol.value_counts()

# Distribution of those per-symbol counts, drawn on an explicit axes object.
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(vals)
hist_ax.set_title('Number of Daily Summaries for Stocks')
hist_ax.set_xlabel('Daily Summaries')
hist_ax.set_ylabel('Frequency')
plt.show()

In [None]:
import mlflow
import utils
import os
import json
import datetime

import numpy as np
import pandas as pd
import multiprocessing as mp

from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from mlflow.models import infer_signature

# Load stock data if not already loaded
try:
    stock_df
except NameError as e:
    # Only an undefined name means "fresh kernel"; any other error should
    # surface instead of silently triggering a reload from disk.
    print(e)
    stock_df = pd.read_csv('data/stock_data/data.csv')
    stock_df['Date'] = pd.to_datetime(stock_df['Date'])

# Limit stocks to those with more than 1000 daily summaries
summary_counts = stock_df.Exchange_Symbol.value_counts()
forecast_symbols = list(summary_counts[summary_counts > 1000].index)

# One parameter dict per symbol; each carries the full dataframe for the worker
param_list = [
    {'stock_symbol': symbol, 'df': stock_df}
    for symbol in forecast_symbols
]

# Leave one core free, but never ask for zero workers (single-core machines
# would otherwise make Pool raise ValueError).
n_workers = max(1, mp.cpu_count() - 1)

# The context manager tears the pool down even if a worker raises; map() has
# already returned every result by the time the block exits.
with mp.Pool(processes=n_workers) as pool:
    params = pool.map(utils.create_forecast_params, param_list)


In [None]:
# Create nested runs
run_stamp = datetime.datetime.now().strftime('%Y%m%d')
experiment_name = f"experiment-{run_stamp}"

# Reuse today's experiment when it already exists: create_experiment raises if
# the name is taken, which made this cell fail on any same-day re-run.
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = existing_experiment.experiment_id

# Each worker gets the experiment id and tracking URI in its own parameter
# dict alongside the forecast parameters.
current_tracking_uri = mlflow.get_tracking_uri()
for param in params:
    param['experiment_id'] = experiment_id
    param['tracking_uri'] = current_tracking_uri

with mlflow.start_run(
    run_name=f"stock-predictions-{run_stamp}",
    experiment_id=experiment_id,
    description="parent"
) as parent_run:
    mlflow.log_param("parent", "yes")
    # Keep one core free but always use at least one worker; the context
    # manager releases the pool even when training raises.
    with mp.Pool(processes=max(1, mp.cpu_count() - 1)) as pool:
        pool.map(utils.train_model, params)

# Model Evaluation

In [3]:
import mlflow
import utils
import os
import json
import datetime

import numpy as np
import pandas as pd

from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly
from prophet.diagnostics import cross_validation, performance_metrics

# Load stock data if not already loaded
try:
    stock_df
except NameError as e:
    # Only an undefined name means "fresh kernel"; other errors should surface.
    print(e)
    stock_df = pd.read_csv('data/stock_data/data.csv')
    stock_df['Date'] = pd.to_datetime(stock_df['Date'])

# Load model data: runs of the chosen experiment whose rmse metric is below 2
experiment_id = '868266089519611414'
model_runs = mlflow.search_runs(experiment_ids=[experiment_id], filter_string="metrics.rmse < 2")

# Run names are split on '-' and the first piece is treated as the stock symbol
run_symbols = model_runs['tags.mlflow.runName'].str.split('-').str[0]

# Simulate selecting stock symbol
stock_symbols = list(run_symbols)
stock_symbol = stock_symbols[10]

# filter table based on selected stock. Compare the parsed symbol exactly: a
# substring/regex match on the run name can pick up a different ticker whose
# run name merely contains the selected symbol.
model_path = model_runs[run_symbols == stock_symbol]['artifact_uri'].values[0]
model_path = os.path.join(model_path, 'prophet-model')
model = mlflow.prophet.load_model(model_path)

# Predict over the model's history plus 30 additional periods and plot it
forecast = model.predict(model.make_future_dataframe(periods=30))
plot_plotly(model, forecast, xlabel='Date', ylabel='Close Price')

name 'stock_df' is not defined


Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

In [4]:
%%capture
from IPython.display import clear_output

# Evaluate forecasts
stock_forecast_dict = {
    'stock_symbol': [],
    'ratio_change': [],
    'most_recent_price': [],
    'forecast_start': [],
    'forecast_end': []
}

# Load model data
experiment_id = '868266089519611414'
model_runs = mlflow.search_runs(experiment_ids=experiment_id)
model_runs = model_runs[~model_runs['end_time'].isna()]

# Simulate selecting stock symbol
stock_symbols = list(model_runs['tags.mlflow.runName'])
stock_symbols = [x.split('-')[0] for x in stock_symbols]

# Get future dates
baseline_date = datetime.datetime.now() - datetime.timedelta(days=1)
counter = 1
for stock_symbol in stock_symbols:
    # filter table based on selected stock
    model_path = model_runs[model_runs['tags.mlflow.runName'].apply(lambda x: x.split('-')[0]) == stock_symbol]['artifact_uri'].values[0]
    model_path = os.path.join(model_path, 'prophet-model')
    model = mlflow.prophet.load_model(model_path)

    forecast = model.predict(model.make_future_dataframe(periods=30))
    sub_forecast_df = forecast.loc[forecast['ds'] > baseline_date][['ds', 'yhat_lower', 'yhat', 'yhat_upper']]

    # Store info
    stock_forecast_dict['stock_symbol'].append(stock_symbol)
    stock_forecast_dict['forecast_start'].append(forecast.head(1)['yhat'].values[0])
    stock_forecast_dict['forecast_end'].append(forecast.tail(1)['yhat'].values[0])
    stock_forecast_dict['most_recent_price'].append(model.history['y'].tail(1).values[0])
    stock_forecast_dict['ratio_change'].append(forecast.tail(1)['yhat'].values[0] / forecast.head(1)['yhat'].values[0])

    # if counter % 50 == 0:
    # clear_output(wait=True)
    # print(f'{counter / len(stock_symbols) * 100}%')
    counter += 1

MlflowException: The following failures occurred while downloading one or more artifacts from http://127.0.0.1:8080/api/2.0/mlflow-artifacts/artifacts/868266089519611414/b5a013ef29cd49a1abb4cd2c22a4d11a/artifacts:
##### File prophet-model #####
API request to http://127.0.0.1:8080/api/2.0/mlflow-artifacts/artifacts/868266089519611414/b5a013ef29cd49a1abb4cd2c22a4d11a/artifacts/prophet-model failed with exception HTTPConnectionPool(host='127.0.0.1', port=8080): Max retries exceeded with url: /api/2.0/mlflow-artifacts/artifacts/868266089519611414/b5a013ef29cd49a1abb4cd2c22a4d11a/artifacts/prophet-model (Caused by ResponseError('too many 500 error responses'))

In [7]:
# Make sure the export directory exists before anything is written to it.
os.makedirs(name='data/forecast_data', exist_ok=True)

# Turn the collected per-symbol lists into a table and persist it without the
# positional index column.
stock_forecast_df = pd.DataFrame(data=stock_forecast_dict)
stock_forecast_df.to_csv(
    'data/forecast_data/stock_forecast.csv',
    index=False,
)

In [20]:
# Rank stocks by forecast ratio, largest first.
stock_forecast_df = stock_forecast_df.sort_values(by=['ratio_change'], ascending=False)

# Keep the pre-sort row position in an `index` column, but only once:
# re-running an unguarded reset_index() stacks `index`/`level_0` columns and
# eventually raises, so on later runs the positional index is just renumbered.
if 'index' in stock_forecast_df.columns:
    stock_forecast_df = stock_forecast_df.reset_index(drop=True)
else:
    stock_forecast_df = stock_forecast_df.reset_index()

In [25]:
# Show only the stocks whose most recent price is below 40.
low_price_mask = stock_forecast_df['most_recent_price'].lt(40)
stock_forecast_df[low_price_mask]

Unnamed: 0,index,stock_symbol,ratio_change,most_recent_price,forecast_start,forecast_end
1,211,SAVA,90.640375,22.03,0.332180,30.108900
5,1407,CLDX,16.276447,33.30,2.203306,35.861989
8,1702,AVDL,14.676057,15.95,1.215030,17.831850
11,3313,GME,11.975080,23.14,1.213486,14.531593
12,3099,MARA,11.869998,19.52,1.290203,15.314704
...,...,...,...,...,...,...
3665,274,CUTR,-0.217315,2.15,19.488017,-4.235032
3666,543,LAC,-0.296590,3.39,3.531434,-1.047389
3668,1150,BTAI,-0.391037,1.81,11.475181,-4.487215
3669,2152,ACRS,-0.459778,1.03,2.836445,-1.304134


In [29]:
stock_symbol = 'CUTR'

# Find the runs whose name starts with the selected symbol (text before the
# first '-') and take the artifact location of the first one.
run_symbols = model_runs['tags.mlflow.runName'].apply(lambda run_name: run_name.split('-')[0])
matching_runs = model_runs[run_symbols == stock_symbol]
model_path = os.path.join(matching_runs['artifact_uri'].values[0], 'prophet-model')
model = mlflow.prophet.load_model(model_path)

# Predict over the model's history plus 30 additional periods, then plot.
future_df = model.make_future_dataframe(periods=30)
forecast = model.predict(future_df)
plot_plotly(model, forecast, xlabel='Date', ylabel='Close Price')

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]