<a href="https://colab.research.google.com/github/j03m/lstm-price-predictor/blob/main/coin_charts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import datasets and libraries


In [54]:
# Mount Google Drive at /content/drive so the notebooks stored there are reachable
from google.colab import drive
drive.mount('/content/drive')

import sys

# Put the shared notebooks folder at the front of the module search path
sys.path.insert(0,'/content/drive/My Drive/ml-trde-notebooks')


import warnings
import pandas as pd

# Ignore warnings whose message starts with the pandas frame.append deprecation notice
warnings.filterwarnings("ignore", message="The frame.append method is deprecated")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Install dependencies

In [55]:
# WARNING: installs dependencies by running installs.ipynb in this namespace (-i).
# This builds TA-Lib and takes a while.
%run -i '/content/drive/My Drive/ml-trde-notebooks/installs.ipynb'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [56]:
import yfinance as yf
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose
import plotly.graph_objs as go
import numpy as np
from scipy.stats import norm
from scipy.signal import find_peaks, peak_prominences
from plotly.subplots import make_subplots
from scipy.stats import poisson

In [57]:
# Run common.ipynb inside this notebook's namespace (-i), making all of its
# functions/globals available to the cells below.
%run -i '/content/drive/My Drive/ml-trde-notebooks/common.ipynb'
# Model workflow flags. NOTE(review): nothing visible in this notebook reads them;
# presumably they are consumed by helpers from common.ipynb - TODO confirm.
train_models = False
save_models = False
load_models = True


Modified
1681172628
1681172628.0
1681172628.0


# Charting and analysis helper functions

In [134]:
def scale_to_price(series, df):
    """Min-max rescale `series` onto the range spanned by df['Close'].

    The smallest value of `series` maps to the lowest close and the largest
    value maps to the highest close, so the result can be drawn on the same
    axis as the price.

    Args:
        series: values to rescale; must support `.min()`, `.max()` and
            element-wise arithmetic (e.g. a pandas Series or numpy array).
        df: DataFrame with a 'Close' column that defines the target range.

    Returns:
        The rescaled values, in the same container type as `series`.
    """
    close = df['Close']
    lowest_close, highest_close = close.min(), close.max()

    # Normalise to [0, 1] first, then stretch and shift onto the price range.
    unit_scaled = (series - series.min()) / (series.max() - series.min())
    return unit_scaled * (highest_close - lowest_close) + lowest_close

def plot(df, trend, prob_above_trend, prob_below_trend, model, df_durations):
  """Draw the two-row overview figure: price panel on top, durations below.

  Row 1 shows the close price, the trend, both probability series rescaled
  onto the price range, the marker columns and the model's prediction line.
  Row 2 shows one horizontal red segment per duration window.

  Args:
    df: DataFrame whose index is the x axis; reads the 'Close',
      'high_prob_start', 'cross_over_positive' and 'cross_over_negative'
      columns.
    trend: trend values with an `.index`; drawn against the last
      `len(trend)` index entries of `df`.
    prob_above_trend: probability series, rescaled with `scale_to_price`.
    prob_below_trend: probability series, rescaled with `scale_to_price`.
    model: object exposing `predict`, called with a (59, 1) array of the
      integers 1..59.
    df_durations: DataFrame with 'start', 'end' and 'duration' columns.
  """
  dates = df.index

  # Rescale the probabilities so they share the price axis.
  above_on_price_axis = pd.Series(scale_to_price(prob_above_trend, df))  # .rolling(window=30, center=True).mean()
  below_on_price_axis = pd.Series(scale_to_price(prob_below_trend, df))  # .rolling(window=30, center=True)

  trend_dates = df.tail(len(trend)).index

  # Two stacked panels that share the date axis.
  fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.05)

  def add_to_price_panel(**scatter_kwargs):
    # Every series of the first panel goes through here.
    fig.add_trace(go.Scatter(**scatter_kwargs), row=1, col=1)

  add_to_price_panel(x=dates, y=df["Close"], mode="lines", name="Value")
  add_to_price_panel(x=trend_dates, y=trend, mode="lines", name="Trend")
  add_to_price_panel(x=dates, y=above_on_price_axis, mode='lines', name='Prob Above Trend')
  add_to_price_panel(x=dates, y=below_on_price_axis, mode='lines', name='Prob Below Trend')

  # Marker columns: (column, legend label, marker colour).
  marker_specs = (
      ('high_prob_start', 'Window Start', 'blue'),
      ('cross_over_positive', 'Up Cross', 'green'),
      ('cross_over_negative', 'Down Cross', 'red'),
  )
  for column, label, colour in marker_specs:
    add_to_price_panel(
        x=dates,
        y=df[column],
        mode='markers',
        name=label,
        marker=dict(symbol='diamond', size=8, color=colour),
    )

  # Model prediction line. The x range starts one day after the first of the
  # last 30 trend points and ends 31 days after the last date in df.
  # NOTE(review): 59 predicted points are paired with this date range; the two
  # lengths are not guaranteed to match - confirm against how `model` was fit.
  recent_trend = trend[-30:]
  steps = np.arange(1, 60)
  predicted_line = model.predict(steps.reshape(-1, 1))
  forecast_dates = pd.date_range(
      start=recent_trend.index[0] + pd.DateOffset(days=1),
      end=dates[-1] + pd.DateOffset(days=31),
  )
  add_to_price_panel(x=forecast_dates, y=predicted_line, mode='lines', name='Linear Regression Line')

  # Second panel: one segment per window, line width proportional to duration.
  for _, window in df_durations.iterrows():
    days = window['duration']
    segment = go.Scatter(
        x=[window['start'], window['end']],
        y=[5, 5],
        mode='lines',
        line=dict(color='red', width=days / 2),
        name=f'Duration {days}',
    )
    fig.add_trace(segment, row=2, col=1)

  fig.update_layout(
      title='Time Series with Trend, Scaled Seasonal Component, and Probabilities',
      xaxis_title='Date',
      height=800,
  )
  fig.show()


def calc_durations_with_extremes(df_raw):
    """Measure every high-probability window and the price extreme inside it.

    A window opens at the first row whose 'high_prob_start' is not NaN and
    closes at the first later row where 'cross_over_positive' or
    'cross_over_negative' is not NaN. The search for the next window starts
    on the row after that cross.

    Args:
        df_raw: DataFrame with 'Close', 'high_prob_start',
            'cross_over_positive' and 'cross_over_negative' columns. The
            index is assumed to be unique and datetime-like, because the
            duration is taken from `(end - start).days`.

    Returns:
        DataFrame with one row per closed window and the columns 'start',
        'end', 'duration' (days), 'extreme' and 'extreme_index'. When the
        window ends on a positive cross the extreme is the minimum close in
        the window, otherwise it is the maximum. The frame is empty (but has
        all five columns) when no window is found.
    """
    columns = ['start', 'end', 'duration', 'extreme', 'extreme_index']

    if len(df_raw) == 0:
        return pd.DataFrame(columns=columns)

    last_index = df_raw.index[-1]
    window_starts = df_raw['high_prob_start']
    has_cross = df_raw['cross_over_positive'].notna() | df_raw['cross_over_negative'].notna()

    # Collect plain records and build the frame once at the end instead of
    # concatenating a one-row frame per window.
    records = []

    # first_valid_index() returns None when there is no window start at all;
    # guard against it before comparing with the last index.
    start_index = window_starts.first_valid_index()
    while start_index is not None and start_index < last_index:
        start_pos = df_raw.index.get_loc(start_index)

        # Rows strictly after the window start that carry a cross.
        later_crosses = has_cross.iloc[start_pos + 1:]
        cross_labels = later_crosses.index[later_crosses.to_numpy()]
        if len(cross_labels) == 0:
            # No cross after this start means none after any later start either.
            break
        end_index = cross_labels[0]

        window_close = df_raw.loc[start_index:end_index, 'Close']
        if pd.notna(df_raw.loc[end_index, 'cross_over_positive']):
            # Window ended on a positive cross: record the lowest close.
            extreme_value = window_close.min()
            extreme_index = window_close.idxmin()
        else:
            # Window ended on a negative cross: record the highest close.
            extreme_value = window_close.max()
            extreme_index = window_close.idxmax()

        records.append({
            'start': start_index,
            'end': end_index,
            'duration': (end_index - start_index).days,
            'extreme': extreme_value,
            'extreme_index': extreme_index,
        })

        # Find the next high probability window start AFTER the cross.
        end_pos = df_raw.index.get_loc(end_index)
        start_index = window_starts.iloc[end_pos + 1:].first_valid_index()

    return pd.DataFrame(records, columns=columns)


def attach_markers(df_raw, trend, prob_above_trend, threshold=0.85, threshold_low=0.15):
  """Add window-start and trend-cross marker columns to `df_raw`.

  `df_raw` is modified in place and also returned. Three columns are written,
  each holding the row's 'Close' where the marker applies and NaN elsewhere:

  * 'high_prob_start': `prob_above_trend` is above `threshold` or below
    `threshold_low`.
  * 'cross_over_positive': sign(trend - Close) flipped from +1 to -1, i.e.
    the close moved from below the trend to above it.
  * 'cross_over_negative': sign(trend - Close) flipped from -1 to +1.

  Args:
    df_raw: DataFrame with a 'Close' column.
    trend: trend values that can be subtracted from df_raw['Close'].
    prob_above_trend: probabilities, one per row of `df_raw`; re-wrapped as
      a Series on df_raw's index.
    threshold: upper probability bound for a high-probability row.
    threshold_low: lower probability bound for a high-probability row.

  Returns:
    The same `df_raw` object with the three marker columns set.
  """
  prob_above_trend = pd.Series(prob_above_trend, index=df_raw.index)
  high_prob_zones = (prob_above_trend > threshold) | (prob_above_trend < threshold_low)

  # Keep the close on high-probability rows, NaN everywhere else.
  df_raw['high_prob_start'] = df_raw['Close'].where(high_prob_zones)

  # Sign of (trend - Close) at each point in time, and its row-to-row change.
  diff_sign = np.sign(trend - df_raw["Close"])
  cross_over = diff_sign.diff().fillna(0)

  # A change of -2 / +2 is a full sign flip. Two identical flips can never be
  # adjacent (the sign would have to leave [-1, 1]), so the flip mask alone
  # already marks each crossing exactly once.
  crossed_up = cross_over == -2
  crossed_down = cross_over == 2

  df_raw['cross_over_positive'] = np.nan
  df_raw['cross_over_negative'] = np.nan
  df_raw.loc[crossed_up, 'cross_over_positive'] = df_raw.loc[crossed_up, 'Close']
  df_raw.loc[crossed_down, 'cross_over_negative'] = df_raw.loc[crossed_down, 'Close']

  return df_raw

def histogram(df):
  """Show a histogram of df['probability'] (plotly `nbinsx=20`).

  Args:
    df: DataFrame with a 'probability' column.
  """
  probability_bins = go.Histogram(x=df['probability'], nbinsx=20)
  fig = go.Figure(data=[probability_bins])

  # Title and axis labels.
  labels = dict(
      title="Probability Distribution",
      xaxis_title="Probability",
      yaxis_title="Frequency",
  )
  fig.update_layout(**labels)

  fig.show()


def bar_chart(df, long=True):
    """Show df['probability'] as blue bars against the DataFrame index.

    Args:
        df: DataFrame with a 'probability' column; its index is the x axis.
        long: unused; kept so existing callers that pass it keep working.
    """
    # Use a scalar colour so every bar is blue. A one-element list is treated
    # as a per-bar colour array, which only specifies a colour for the first bar.
    fig = go.Figure(data=[go.Bar(x=df.index, y=df['probability'], marker_color='blue')])

    # Set the title and axis labels
    fig.update_layout(
        title="Probabilities Over Time",
        xaxis_title="Date",
        yaxis_title="Probability",
    )

    # Show the plot
    fig.show()

def prob_chart(df, values):
  """Show `values` as a line chart against the index of `df`.

  Args:
    df: DataFrame whose index supplies the x axis.
    values: y values to draw; labelled 'Prob Above Trend' in the legend.
  """
  fig = go.Figure()
  probability_line = go.Scatter(x=df.index, y=values, mode='lines', name='Prob Above Trend')
  fig.add_trace(probability_line)

  labels = dict(
    title="Probabilities Over Time",
    xaxis_title="Date",
    yaxis_title="Probability",
  )
  fig.update_layout(**labels)

  fig.show()


def calculate_and_graph_price_probabilities(percentage_differences):
    """Fit a normal distribution to the percentage differences and chart its PDF.

    The PDF is evaluated at every whole percent between the floor of the
    smallest and the ceiling of the largest finite value, charted with
    `graph_pdf_bar`, and the last value of the input is printed.

    Args:
        percentage_differences: one-dimensional numeric data (pandas Series,
            numpy array or list). Non-finite entries are ignored for the fit.

    Returns:
        None. Prints a notice and returns early when no finite value exists.
    """
    # Work on a plain float array: this makes [-1] positional regardless of the
    # input's index (label-based lookup on a date-indexed Series is deprecated
    # for integer keys) and lets NaN/inf be dropped before fitting.
    values = np.asarray(percentage_differences, dtype=float)
    finite_values = values[np.isfinite(values)]
    if finite_values.size == 0:
        print("No finite percentage differences to analyze.")
        return

    # Fit percentage differences to a normal distribution
    mean, std = norm.fit(finite_values)

    # Define the percentage deviation range in whole-percent steps
    min_percentage = int(np.floor(finite_values.min()))
    max_percentage = int(np.ceil(finite_values.max()))
    num_points = max_percentage - min_percentage + 1
    percentage_range = np.linspace(min_percentage, max_percentage, num_points)

    # Calculate the PDF of the normal distribution for the range of percentage deviations
    pdf_values = norm.pdf(percentage_range, mean, std)

    # Create a DataFrame with the percentage deviations and their corresponding PDF values
    pdf_df = pd.DataFrame({"Percentage Deviation": percentage_range, "PDF Value": pdf_values})

    graph_pdf_bar(pdf_df)
    print("Current price diff:", values[-1])

def graph_pdf_bar(df):
    """Show the 'PDF Value' column as blue bars over 'Percentage Deviation'.

    Args:
        df: DataFrame with 'Percentage Deviation' and 'PDF Value' columns.
    """
    fig = go.Figure()

    pdf_bars = go.Bar(
        x=df["Percentage Deviation"],
        y=df["PDF Value"],
        marker_color='blue',
        name="PDF",
    )
    fig.add_trace(pdf_bars)

    labels = dict(
        title="Probability Density Function",
        xaxis_title="Percentage Deviation from Trend (%)",
        yaxis_title="Probability",
    )
    fig.update_layout(**labels)

    fig.show()

def calculate_and_graph_duration_probabilities(start_date, df_raw, df_durations, n_periods=60, min_duration=5):
    """Chart the Poisson probability of a trend cross for each upcoming day.

    The Poisson rate is the mean of the recorded durations that are at least
    `min_duration` days long. The probability mass for 1..n_periods days is
    charted with `bar_chart` against `n_periods` consecutive dates.

    Args:
        start_date: first date on the x axis; must support adding a
            `pd.DateOffset`.
        df_raw: unused; kept so existing callers keep working.
        df_durations: DataFrame with a 'duration' column.
        n_periods: number of days to chart.
        min_duration: durations shorter than this are ignored.

    Returns:
        None. Prints a notice and returns early when no duration passes the
        filter.
    """
    # Keep only the windows that lasted long enough to be meaningful.
    long_enough = df_durations[df_durations["duration"] >= min_duration]
    durations = long_enough['duration'].values.tolist()
    if not durations:
        # Nothing to fit; indexing durations[-1] below would raise otherwise.
        print(f"No durations of at least {min_duration} days; skipping duration probability chart.")
        return
    print("Last duration:", durations[-1])

    # Fit a Poisson distribution to the durations, then take the probability
    # of a cross for each day count from 1 to n_periods.
    rate = np.mean(durations)
    poisson_dist = poisson(rate)
    numbers = np.arange(1, n_periods + 1)
    cdf_values = poisson_dist.cdf(numbers)

    # Differences of the CDF give the probability of each individual day count;
    # the first entry is cdf(1) and therefore also contains the mass at 0.
    window_probabilities = np.diff(cdf_values, prepend=0)

    total_probability = np.sum(window_probabilities)
    print("total:", total_probability)

    # NOTE(review): dates start at start_date + 0 days while the day counts
    # start at 1, so the bar for "1 day" sits on start_date itself - confirm
    # whether that offset is intended.
    dates = [start_date + pd.DateOffset(days=i) for i in range(n_periods)]
    df = pd.DataFrame({'date': dates})
    df['probability'] = window_probabilities
    df = df.set_index("date")
    bar_chart(df, False)

def calc_extreme_percentage_deviations(df_durations, trend):
    """Return each window's extreme price as a percentage deviation from trend.

    For every row the trend value is looked up at the row's 'extreme_index'
    and the deviation is `(extreme - trend_value) / trend_value * 100`.

    Args:
        df_durations: DataFrame with 'extreme' and 'extreme_index' columns.
        trend: Series of trend values, indexed by the labels stored in
            'extreme_index'.

    Returns:
        List of percentage deviations, one per row, in row order. Empty when
        `df_durations` has no rows.
    """
    extreme_percentage_deviations = []

    # Row-wise iteration keeps the empty-frame case working even when the
    # 'extreme_index' column is absent.
    for _, window in df_durations.iterrows():
        trend_value = trend.loc[window['extreme_index']]
        deviation_percentage = (window['extreme'] - trend_value) / trend_value * 100
        extreme_percentage_deviations.append(deviation_percentage)

    return extreme_percentage_deviations


def analyze_extreme_deviations(df_durations, trend):
    """Fit a normal distribution to the extreme deviations and chart its PDF.

    The deviations come from `calc_extreme_percentage_deviations`. The PDF is
    evaluated in steps of 1 starting at the smallest deviation and stopping
    before the largest one, then shown as a bar chart.

    Args:
        df_durations: DataFrame of windows, passed through to
            `calc_extreme_percentage_deviations`.
        trend: trend values, passed through to
            `calc_extreme_percentage_deviations`.

    Returns:
        None. Prints a notice and returns early when there are no deviations.
    """
    extreme_percentage_deviations = calc_extreme_percentage_deviations(df_durations, trend)

    # With no windows there is nothing to fit; np.min/np.max would raise below.
    if len(extreme_percentage_deviations) == 0:
        print("No extreme deviations to analyze.")
        return

    mean, std = norm.fit(extreme_percentage_deviations)
    numbers = np.arange(np.min(extreme_percentage_deviations), np.max(extreme_percentage_deviations))
    pdf_values = norm.pdf(numbers, mean, std)
    trace = go.Bar(
        x=numbers,
        y=pdf_values
    )

    layout = go.Layout(
        title='PDF Values vs Numbers',
        xaxis=dict(title='Numbers'),
        yaxis=dict(title='PDF Values')
    )

    fig = go.Figure(data=[trace], layout=layout)

    fig.show()


# Run the trend analysis and render the charts

In [133]:
# Candidate window sizes; only used by the commented-out loop below.
windows = [300, 600, 900, 1500]

#for window in windows:
window = 300
# NOTE(review): these two flags are not read anywhere in this notebook; presumably
# get_coin_data_frames (from common.ipynb) reads them as globals - TODO confirm.
coin_base = False
ku_coin = True
# get_coin_data_frames is not defined in this notebook; presumably provided by common.ipynb.
df_raw = get_coin_data_frames(window, "BTC-USDC")

# Alternative data source (disabled): daily SPY bars from yfinance.
#tickerObj = yf.download(tickers = "SPY", interval = "1d")
#df_raw = pd.DataFrame(tickerObj).tail(365)  
#df_raw = df_raw.reset_index()

#[results, data, features, fig] = renderPredictions(df_raw, models, [], False)
#features = features.set_index("Date")
# Index by date and sort so the rows are in chronological order.
df_raw = df_raw.set_index("Date")
df_raw = df_raw.sort_index()
# generate_probability is not defined in this notebook; `volatility` is not used below.
trend, prob_above_trend, prob_below_trend, volatility, model = generate_probability(df_raw)

# Add the marker columns, derive the window durations, then draw the overview charts.
df_raw = attach_markers(df_raw, trend, prob_above_trend)
df_durations = calc_durations_with_extremes(df_raw)
plot(df_raw, trend, prob_above_trend, prob_below_trend, model, df_durations)
prob_chart(df_raw, prob_above_trend)

# Hard-coded first date for the duration probability chart.
start_date = pd.to_datetime('2023-04-14')
calculate_and_graph_duration_probabilities(start_date, df_raw, df_durations)

# Percentage distance of each close from the trend.
percent_diff_from_trend = ((df_raw["Close"] - trend) / trend) * 100
calculate_and_graph_price_probabilities(percent_diff_from_trend)

analyze_extreme_deviations(df_durations, trend)


Last duration: 33
total: 0.9999999999841633


Current price diff: -3.024094899401758
[0.011674 0.013081 0.014559 0.016097 0.017679 0.019288 0.020902 0.022501
 0.024061 0.025558 0.026967 0.028265 0.029428 0.030435 0.031267 0.031907
 0.032345 0.032570 0.032578 0.032369 0.031948 0.031322 0.030504 0.029510
 0.028358 0.027070 0.025668 0.024177 0.022621 0.021024 0.019410 0.017801
 0.016216 0.014674]
