In [2]:
import pandas as pd
import joblib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys

sys.path.append("../..")

import src.indicators.leavitt_indicator as lu

### 🔹 Step 1: Load the Test Dataset
- Ensure the dataset contains the same features used during training.

In [3]:
# Read the prediction dataset from disk
test_data_path = "../data/Predict_EUR_USD_D.csv"  # Adjust path as needed
df_analyze = pd.read_csv(test_data_path)

# Sanity check: the most recent row, followed by the (rows, columns) shape
print(df_analyze.tail(1), df_analyze.shape, sep="\n")

      Unnamed: 0                       time  volume    mid_o    mid_h   mid_l  \
1846        1846  2025-02-11 22:00:00-05:00   55878  1.03612  1.03858  1.0354   

        mid_c    bid_o   bid_h    bid_l    bid_c    ask_o    ask_h    ask_l  \
1846  1.03737  1.03604  1.0385  1.03532  1.03729  1.03621  1.03865  1.03548   

        ask_c  
1846  1.03745  
(1847, 15)


✅ Now we have the prediction dataset loaded.

---

### 🔹 Step 2: Extract lookback

- Now, grab the rows we are interested in for our lookback analysis.


---

In [4]:
lookback = 100

# Keep only the most recent `lookback` rows and parse the 'time' column into
# datetimes. `assign` returns a new frame, so df_analyze is left untouched.
df_lookback = df_analyze.tail(lookback).assign(
    time=lambda frame: pd.to_datetime(frame["time"])
)

# Sanity check: the most recent row, followed by the (rows, columns) shape
print(df_lookback.tail(1), df_lookback.shape, sep="\n")

      Unnamed: 0                      time  volume    mid_o    mid_h   mid_l  \
1846        1846 2025-02-11 22:00:00-05:00   55878  1.03612  1.03858  1.0354   

        mid_c    bid_o   bid_h    bid_l    bid_c    ask_o    ask_h    ask_l  \
1846  1.03737  1.03604  1.0385  1.03532  1.03729  1.03621  1.03865  1.03548   

        ask_c  
1846  1.03745  
(100, 15)


In [5]:
# Write the lookback window to CSV, keeping the row index as the first column
lookback_csv_path = "../data/lookback_data.csv"
df_lookback.to_csv(lookback_csv_path, index=True)
print("âœ… File saved successfully.")

âœ… File saved successfully.


### 🔹 Step 3: Run it through data ingestion

- Compute all the features.


---

In [6]:
from src.services.data_ingestion_service import DataIngestionService

ingestion_service = DataIngestionService()
# Run the project's ingestion pipeline on the lookback window.
# Per the log output recorded for this cell, it converts 'time' to a 'Date' index,
# drops the bid/ask and 'Unnamed: 0' columns, renames the mid prices to OHLC, and
# then computes the Leavitt data and the indicators.
# NOTE(review): the result has fewer rows than the input (78 vs. 100 in the recorded
# run) - presumably rows without enough history for the indicators are dropped; confirm.
df_cleaned = ingestion_service.preprocess_data(df_lookback)


[ 2025-02-14 04:46:24,076 ] INFO [src.services.data_ingestion_service:49] - Converted 'time' column to 'Date' with datetime format.
[ 2025-02-14 04:46:24,077 ] INFO [src.services.data_ingestion_service:67] - Dropped bid and ask price columns.
[ 2025-02-14 04:46:24,077 ] INFO [src.services.data_ingestion_service:80] - Renamed mid-price columns to OHLC format.
[ 2025-02-14 04:46:24,078 ] INFO [src.services.data_ingestion_service:85] - Dropped 'Unnamed: 0' index column.
[ 2025-02-14 04:46:24,078 ] INFO [src.services.data_ingestion_service:93] - Set index to Date with unique timestamps.
[ 2025-02-14 04:46:24,079 ] INFO [src.services.data_ingestion_service:95] - Processing leavitt data.
[ 2025-02-14 04:46:24,089 ] INFO [src.services.data_ingestion_service:97] - Finished leavitt data.
[ 2025-02-14 04:46:24,089 ] INFO [src.services.data_ingestion_service:99] - Processing indicators.
[ 2025-02-14 04:46:24,091 ] INFO [src.services.data_ingestion_service:101] - Finished indicators.
[ 2025-02-14 

In [7]:
# Shape, summary statistics and column names of the engineered feature frame
print(df_cleaned.shape, df_cleaned.describe(), df_cleaned.columns, sep="\n")

(78, 29)
              Volume       Open       High        Low      Close       AHMA  \
count      78.000000  78.000000  78.000000  78.000000  78.000000  78.000000   
mean   158979.858974   1.050034   1.053723   1.045359   1.049511   1.049304   
std     48031.345766   0.017334   0.017146   0.017282   0.017086   0.016844   
min     55878.000000   1.024340   1.025010   1.017790   1.024340   1.025912   
25%    132557.750000   1.037985   1.042588   1.034203   1.037497   1.037979   
50%    160770.500000   1.048200   1.052290   1.041620   1.047060   1.046549   
75%    178745.500000   1.056715   1.059658   1.052647   1.056607   1.053758   
max    398838.000000   1.092970   1.093740   1.087260   1.092970   1.088000   

       Leavitt_Projection  Leavitt_Convolution   LC_Slope  LC_Intercept  ...  \
count           78.000000            78.000000  78.000000     78.000000  ...   
mean             1.048633             1.048046  -0.000582      1.049792  ...   
std              0.017553             0

#### What is the last record?

- It should be the record that we will be predicting.

In [8]:
# Display the final row of the engineered frame (the record to be predicted)
df_cleaned.iloc[-1:]

Unnamed: 0_level_0,Volume,Open,High,Low,Close,AHMA,Leavitt_Projection,Leavitt_Convolution,LC_Slope,LC_Intercept,...,Returns_T-10,Momentum_T-10,Returns_T-21,Momentum_T-21,Hour,Day_Of_Week,Month,Year,ATR,Movement_Class
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-02-11 22:00:00-05:00,55878,1.03612,1.03858,1.0354,1.03737,1.034488,1.032559,1.032416,-1.3e-05,1.032455,...,-0.001026,0.002232,0.006326,-0.00512,22,1,2,2025,0.007814,1


#### Verify ATR

In [9]:
from src.indicators.atr import calculate_atr

atr_period = 14  # second argument handed to calculate_atr (presumably the ATR period)

# Work on a copy so this check adds its columns to df_analysis, not df_cleaned
df_analysis = df_cleaned.copy()

# Recompute ATR with the project helper ...
df_analysis["ATR_Computed_EMA"] = calculate_atr(df_analysis, atr_period)

# ... and place the ATR column produced by the ingestion pipeline beside it
df_analysis["ATR_Dataset"] = df_cleaned["ATR"]


In [10]:
# Side-by-side view of recomputed vs. pipeline ATR for the 50 most recent rows
df_analysis.loc[:, ["ATR_Computed_EMA", "ATR_Dataset"]].tail(50)

Unnamed: 0_level_0,ATR_Computed_EMA,ATR_Dataset
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-12-02 22:00:00-05:00,0.008905,0.008917
2024-12-03 22:00:00-05:00,0.008676,0.008687
2024-12-04 22:00:00-05:00,0.008609,0.008618
2024-12-05 22:00:00-05:00,0.008629,0.008637
2024-12-08 22:00:00-05:00,0.008308,0.008315
2024-12-09 22:00:00-05:00,0.008131,0.008137
2024-12-10 22:00:00-05:00,0.007836,0.007841
2024-12-11 22:00:00-05:00,0.007687,0.007692
2024-12-12 22:00:00-05:00,0.00761,0.007614
2024-12-15 22:00:00-05:00,0.007265,0.007268


#### Verify Adaptive HMA

In [11]:
# Recompute the Adaptive Hull Moving Average from Close and place it next to the
# AHMA column produced by the ingestion pipeline, so the two can be compared.
df_ahma = df_cleaned.copy()
ahma_period = 9

df_ahma["AHMA_Computed"] = lu.adaptive_hull_moving_average(df_ahma["Close"], ahma_period)
# Extract AHMA from the dataset for comparison
df_ahma["AHMA_Dataset"] = df_cleaned["AHMA"]  # Assuming AHMA column exists in dataset



In [12]:
# Side-by-side view of recomputed vs. pipeline AHMA
df_ahma.loc[:, ["AHMA_Computed", "AHMA_Dataset"]]

Unnamed: 0_level_0,AHMA_Computed,AHMA_Dataset
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-10-23 21:00:00-05:00,1.082790,1.078503
2024-10-24 21:00:00-05:00,1.081839,1.078226
2024-10-27 21:00:00-05:00,1.081327,1.078478
2024-10-28 21:00:00-05:00,1.081264,1.079085
2024-10-29 21:00:00-05:00,1.082496,1.080851
...,...,...
2025-02-05 22:00:00-05:00,1.038044,1.038044
2025-02-06 22:00:00-05:00,1.036377,1.036377
2025-02-09 22:00:00-05:00,1.034043,1.034043
2025-02-10 22:00:00-05:00,1.033742,1.033742


#### Verify Leavitt 

In [13]:
# Recompute the Leavitt features from the AHMA column (not from Close) and keep
# the pipeline's own values alongside them for comparison.
df_leavitt = df_cleaned.copy()

plength = 9  # Projection lookback period
clength = 3  # Convolution lookback period

df_leavitt["Leavitt_Projection_Computed"] = lu.leavitt_projection(df_leavitt["AHMA"], plength)

# leavitt_convolution returns three results: convolution, slope, intercept
_lc_values, _lc_slope, _lc_intercept = lu.leavitt_convolution(
    df_leavitt["AHMA"], plength, clength
)
df_leavitt["Leavitt_Convolution_Computed"] = _lc_values
df_leavitt["LC_Slope_Computed"] = _lc_slope
df_leavitt["LC_Intercept_Computed"] = _lc_intercept

# Copy the pipeline-produced columns under explicit "*_Dataset" names
_dataset_columns = {
    "Leavitt_Projection_Dataset": "Leavitt_Projection",
    "Leavitt_Convolution_Dataset": "Leavitt_Convolution",
    "LC_Slope_Dataset": "LC_Slope",
    "LC_Intercept_Dataset": "LC_Intercept",
}
for _target, _source in _dataset_columns.items():
    df_leavitt[_target] = df_cleaned[_source]

In [14]:
# Side-by-side view of recomputed vs. pipeline Leavitt projection
df_leavitt.loc[:, ["Leavitt_Projection_Computed", "Leavitt_Projection_Dataset"]]

Unnamed: 0_level_0,Leavitt_Projection_Computed,Leavitt_Projection_Dataset
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-10-23 21:00:00-05:00,1.087959,1.075802
2024-10-24 21:00:00-05:00,1.087959,1.075383
2024-10-27 21:00:00-05:00,1.087959,1.075637
2024-10-28 21:00:00-05:00,1.087959,1.076432
2024-10-29 21:00:00-05:00,1.087959,1.077918
...,...,...
2025-02-05 22:00:00-05:00,1.034700,1.034700
2025-02-06 22:00:00-05:00,1.033670,1.033670
2025-02-09 22:00:00-05:00,1.032585,1.032585
2025-02-10 22:00:00-05:00,1.032182,1.032182


In [15]:
# Compare the recomputed Leavitt projection with the pipeline's value.
# NOTE(review): this cell repeats the previous cell's expression verbatim and shows
# the same table - it looks like a copy-paste leftover; consider removing it.
df_leavitt[["Leavitt_Projection_Computed", "Leavitt_Projection_Dataset"]]

Unnamed: 0_level_0,Leavitt_Projection_Computed,Leavitt_Projection_Dataset
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-10-23 21:00:00-05:00,1.087959,1.075802
2024-10-24 21:00:00-05:00,1.087959,1.075383
2024-10-27 21:00:00-05:00,1.087959,1.075637
2024-10-28 21:00:00-05:00,1.087959,1.076432
2024-10-29 21:00:00-05:00,1.087959,1.077918
...,...,...
2025-02-05 22:00:00-05:00,1.034700,1.034700
2025-02-06 22:00:00-05:00,1.033670,1.033670
2025-02-09 22:00:00-05:00,1.032585,1.032585
2025-02-10 22:00:00-05:00,1.032182,1.032182


In [16]:
# Side-by-side view of recomputed vs. pipeline Leavitt convolution
df_leavitt.loc[:, ["Leavitt_Convolution_Computed", "Leavitt_Convolution_Dataset"]]

Unnamed: 0_level_0,Leavitt_Convolution_Computed,Leavitt_Convolution_Dataset
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-10-23 21:00:00-05:00,1.087959,1.074185
2024-10-24 21:00:00-05:00,1.087959,1.074572
2024-10-27 21:00:00-05:00,1.087959,1.075442
2024-10-28 21:00:00-05:00,1.087959,1.076866
2024-10-29 21:00:00-05:00,1.087959,1.078942
...,...,...
2025-02-05 22:00:00-05:00,1.032029,1.032029
2025-02-06 22:00:00-05:00,1.032162,1.032162
2025-02-09 22:00:00-05:00,1.031537,1.031537
2025-02-10 22:00:00-05:00,1.031325,1.031325


In [17]:
# Side-by-side view of recomputed vs. pipeline convolution slope
df_leavitt.loc[:, ["LC_Slope_Computed", "LC_Slope_Dataset"]]

Unnamed: 0_level_0,LC_Slope_Computed,LC_Slope_Dataset
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-10-23 21:00:00-05:00,,-0.001465
2024-10-24 21:00:00-05:00,,-0.000713
2024-10-27 21:00:00-05:00,-1.909471e-16,-0.000083
2024-10-28 21:00:00-05:00,-1.909471e-16,0.000524
2024-10-29 21:00:00-05:00,-1.909471e-16,0.001140
...,...,...
2025-02-05 22:00:00-05:00,-2.439982e-03,-0.002440
2025-02-06 22:00:00-05:00,-1.388656e-03,-0.001389
2025-02-09 22:00:00-05:00,-1.057426e-03,-0.001057
2025-02-10 22:00:00-05:00,-7.435772e-04,-0.000744


In [18]:
# Side-by-side view of recomputed vs. pipeline convolution intercept
df_leavitt.loc[:, ["LC_Intercept_Computed", "LC_Intercept_Dataset"]]

Unnamed: 0_level_0,LC_Intercept_Computed,LC_Intercept_Dataset
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-10-23 21:00:00-05:00,,1.078579
2024-10-24 21:00:00-05:00,,1.076710
2024-10-27 21:00:00-05:00,1.087959,1.075690
2024-10-28 21:00:00-05:00,1.087959,1.075293
2024-10-29 21:00:00-05:00,1.087959,1.075522
...,...,...
2025-02-05 22:00:00-05:00,1.039349,1.039349
2025-02-06 22:00:00-05:00,1.036328,1.036328
2025-02-09 22:00:00-05:00,1.034709,1.034709
2025-02-10 22:00:00-05:00,1.033556,1.033556


#### 🔹 Make sure all calculated fields above are equal.

- This way we know how many lookback bars need to be passed so that the features, and therefore the model's predictions, are correct.
---

## All the preamble above will let us know how many lookback bars are needed

If we do not have enough bars to calculate all the features we will not be able to predict the values we need.

We will now proceed to do a prediction from the model

#### 🔹 Step 1: Normalize or Scale
- If you applied scaling (StandardScaler, MinMaxScaler) during training, apply the same scaling here.

In [19]:
# Load the saved scaler
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code -
# only load artifacts that come from a trusted training run.
scaler_path = "../../artifacts/preprocessor.pkl"  # Adjust path
scaler = joblib.load(scaler_path)

print(df_cleaned.describe())

# Transform test data
# NOTE(review): df_cleaned is passed whole, including the 'Movement_Class' column that
# appears in its tail and looks like the label - presumably the fitted preprocessor
# selects its own input columns; confirm the target is not fed to the model.
# In the recorded run the 29 input columns become 47 output columns.
X_predict = scaler.transform(df_cleaned)

              Volume       Open       High        Low      Close       AHMA  \
count      78.000000  78.000000  78.000000  78.000000  78.000000  78.000000   
mean   158979.858974   1.050034   1.053723   1.045359   1.049511   1.049304   
std     48031.345766   0.017334   0.017146   0.017282   0.017086   0.016844   
min     55878.000000   1.024340   1.025010   1.017790   1.024340   1.025912   
25%    132557.750000   1.037985   1.042588   1.034203   1.037497   1.037979   
50%    160770.500000   1.048200   1.052290   1.041620   1.047060   1.046549   
75%    178745.500000   1.056715   1.059658   1.052647   1.056607   1.053758   
max    398838.000000   1.092970   1.093740   1.087260   1.092970   1.088000   

       Leavitt_Projection  Leavitt_Convolution   LC_Slope  LC_Intercept  ...  \
count           78.000000            78.000000  78.000000     78.000000  ...   
mean             1.048633             1.048046  -0.000582      1.049792  ...   
std              0.017553             0.017992  

In [20]:
# Shapes before and after the preprocessing transform
print(df_cleaned.shape, X_predict.shape, sep="\n")

(78, 29)
(78, 47)


✅ This ensures feature scaling at prediction time is consistent with training.

#### 🔹 Step 2: Load our saved model

- The model should have been saved during the training phase.

In [21]:
# Load trained model (XGBoost or CatBoost)
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code -
# only load model artifacts that come from a trusted training run.
model_path = "../../artifacts/models/model.pkl"  # Adjust based on best model
model = joblib.load(model_path)

#### 🔹 Step 3: Predict Direction

- We assume the model predicts the direction.

In [22]:
# Predict a class label for every row and cast the labels to integers in one step
y_hat = model.predict(X_predict).astype(int)
y_hat.shape

(78, 1)

#### 🔹 Step 4: Calculate the probabilities

- Compute the class probabilities for each prediction.

In [23]:
# Class probabilities for every row (one column per class)
y_proba = model.predict_proba(X_predict)
np.shape(y_proba)

(78, 3)

#### 🔹 Step 5: Get the probability of the prediction

- The confidence of the y_hat value is the largest probability in the row.
- There are 3 classes, and we have one column per class probability (think softmax).

In [24]:
# Confidence of each prediction = the largest class probability in its row
y_confidence = np.max(y_proba, axis=1)
np.shape(y_confidence)

(78,)

#### 🔹 Step 6: Create a DataFrame

In [25]:
# Flatten labels and confidences to 1-D (np.array makes a fresh copy first),
# then collect them as the two columns of the predictions frame.
_predicted_labels = np.array(y_hat).ravel()
_confidence_values = np.array(y_confidence).ravel()

df_predictions = pd.DataFrame(
    {
        "Predicted_Label": _predicted_labels,
        "Prediction_Confidence": _confidence_values,
    }
)

#### 🔹 Step 7: Do what you want with the DataFrame

In [26]:
# Display the prediction for the most recent row
df_predictions.iloc[-1:]

Unnamed: 0,Predicted_Label,Prediction_Confidence
77,1,0.73984


In [27]:
# Attach the predictions to the raw lookback rows they were computed from.
#
# Predictions exist only for the rows that survived feature engineering
# (df_cleaned), which is shorter than df_lookback. We take the last N raw rows,
# where N = number of predictions, and join them position by position.
n_predictions = len(df_predictions)
df_lookback_subset = df_lookback.tail(n_predictions).reset_index(drop=True)

# Reset index for df_predictions to ensure alignment
df_predictions = df_predictions.reset_index(drop=True)

# Positional alignment is only valid if ingestion dropped rows exclusively from the
# start of the window. Verify that assumption against the timestamps the features
# were actually computed for, so a mismatch fails loudly instead of silently
# pairing predictions with the wrong dates. Both sides are normalised to UTC so the
# comparison does not depend on how each frame stores its timezone.
_subset_times = pd.DatetimeIndex(pd.to_datetime(df_lookback_subset["time"], utc=True))
_feature_times = pd.DatetimeIndex(pd.to_datetime(df_cleaned.index, utc=True))
if not _subset_times.equals(_feature_times):
    raise ValueError(
        "Predictions cannot be aligned with df_lookback by position: the last "
        f"{n_predictions} lookback timestamps do not match df_cleaned's index."
    )

# Merge datasets
df_lookback_subset = pd.concat([df_lookback_subset, df_predictions], axis=1)

In [28]:
# Display the 20 most recent raw rows together with their predictions
df_lookback_subset.iloc[-20:]

Unnamed: 0.1,Unnamed: 0,time,volume,mid_o,mid_h,mid_l,mid_c,bid_o,bid_h,bid_l,bid_c,ask_o,ask_h,ask_l,ask_c,Predicted_Label,Prediction_Confidence
58,1827,2025-01-15 22:00:00-05:00,160175,1.02904,1.03152,1.02608,1.0302,1.02895,1.03144,1.02598,1.03012,1.02912,1.03159,1.02615,1.03029,1,0.533388
59,1828,2025-01-16 22:00:00-05:00,141219,1.0302,1.03308,1.02653,1.02718,1.03012,1.033,1.02645,1.02683,1.03029,1.03316,1.02661,1.02752,0,0.677034
60,1829,2025-01-19 22:00:00-05:00,187472,1.02718,1.04345,1.02664,1.0417,1.02683,1.04338,1.02656,1.04161,1.02752,1.04353,1.02672,1.04178,2,0.96152
61,1830,2025-01-20 22:00:00-05:00,214932,1.0417,1.04356,1.03417,1.0429,1.04161,1.04347,1.03409,1.04244,1.04178,1.04364,1.03425,1.04335,1,0.586455
62,1831,2025-01-21 22:00:00-05:00,148134,1.0429,1.04574,1.03922,1.04102,1.04244,1.04566,1.03914,1.04092,1.04335,1.04582,1.03929,1.04111,0,0.67386
63,1832,2025-01-22 22:00:00-05:00,165378,1.04102,1.0438,1.03721,1.04156,1.04092,1.04373,1.03713,1.04147,1.04111,1.04388,1.03728,1.04165,1,0.656375
64,1833,2025-01-23 22:00:00-05:00,174843,1.04156,1.05215,1.04116,1.04953,1.04147,1.05207,1.04108,1.04903,1.04165,1.05223,1.04124,1.05003,2,0.911999
65,1834,2025-01-26 22:00:00-05:00,178570,1.04953,1.05333,1.04539,1.04921,1.04903,1.05325,1.04531,1.04912,1.05003,1.05342,1.04547,1.0493,0,0.659874
66,1835,2025-01-27 22:00:00-05:00,166715,1.04921,1.0494,1.04138,1.04307,1.04912,1.04927,1.0413,1.04299,1.0493,1.04958,1.04146,1.04315,0,0.940875
67,1836,2025-01-28 22:00:00-05:00,173910,1.04307,1.04438,1.03824,1.042,1.04299,1.04431,1.03815,1.04192,1.04315,1.04445,1.03831,1.04209,1,0.674367
