In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, classification_report, ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

import joblib
import json
import warnings
warnings.simplefilter("ignore")

# I. Load the backtested trading data

In [2]:
# load data
raw_data = pd.read_csv('trades_data.csv')
raw_data.head()

Unnamed: 0,EntryPrice,win_loss,long_short,entry_month,entry_date,entry_day,entry_hour
0,1.3566,loss,short,4,23,0,8
1,1.3566,loss,short,4,23,0,8
2,1.3568,loss,short,4,24,1,0
3,1.3568,loss,short,4,24,1,0
4,1.3568,loss,short,4,24,1,0


In [3]:
# data decription
raw_data.describe()

Unnamed: 0,EntryPrice,entry_month,entry_date,entry_day,entry_hour
count,2443.0,2443.0,2443.0,2443.0,2443.0
mean,1.259341,6.76709,15.620958,2.109701,10.167826
std,0.131127,3.453237,8.706682,1.635073,5.874165
min,1.03965,1.0,1.0,0.0,0.0
25%,1.133285,4.0,8.0,1.0,8.0
50%,1.25675,7.0,15.0,2.0,12.0
75%,1.35937,10.0,23.0,3.0,16.0
max,1.59281,12.0,31.0,6.0,20.0


# II. Preprocessing

## II.I. define dataframe for inferencing

In [4]:
# Define how many inferences we want to generate
inf_count =  round(raw_data.shape[0]*0.05)
inf_count

122

In [5]:
# Get Data for model inference
data_inf = raw_data.sample(inf_count, random_state=33)
data_inf.sample(5)

Unnamed: 0,EntryPrice,win_loss,long_short,entry_month,entry_date,entry_day,entry_hour
2279,1.22396,win,short,1,3,6,12
1428,1.1294,win,short,9,16,2,8
1253,1.33881,loss,long,8,11,0,8
2187,1.12722,win,short,6,16,1,12
2423,1.12803,loss,short,12,6,0,0


## II.II. Split data

In [6]:
# Copy the raw_data
raw_data_1 = raw_data.copy()

In [7]:
# define features variable as X
X = raw_data_1.drop('win_loss', axis=1)

In [8]:
# define target variable as y
y = raw_data_1['win_loss']
y.sample(5)

659      win
1580    loss
2042     win
1227    loss
1314    loss
Name: win_loss, dtype: object

In [9]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.075, random_state=33, stratify=y)

print(f'Train size: {X_train.shape[0]}')
print(f'Test size: {X_test.shape[0]}')

Train size: 2259
Test size: 184


## II.III. Handling empty values

In [10]:
# sum null value
X_train.isnull().sum()

EntryPrice     0
long_short     0
entry_month    0
entry_date     0
entry_day      0
entry_hour     0
dtype: int64

## II.IV. Feature Scaling

In [11]:
# Get Numerical Columns and Categorical Columns

num_columns = X_train.select_dtypes(include=np.number).columns.tolist()
cat_columns = X_train.select_dtypes(include=['object']).columns.tolist()

print('Numerical Columns   : ', num_columns)
print('Categorical Columns : ', cat_columns)

Numerical Columns   :  ['EntryPrice', 'entry_month', 'entry_date', 'entry_day', 'entry_hour']
Categorical Columns :  ['long_short']


In [12]:
X_train_2 = X_train.copy()

In [13]:
# Define scaler using min max scaler
scaler = MinMaxScaler()

# fir and transform scaling
X_train_scaled = scaler.fit_transform(X_train_2[num_columns])
X_test_scaled = scaler.transform(X_test[num_columns])

In [14]:
# # Define scaler using standard scaler
# scaler = StandardScaler()

# # fir and transform scaling
# X_train_scaled = scaler.fit_transform(X_train_2[num_columns])
# X_test_scaled = scaler.transform(X_test[num_columns])

## II.V. Feature Encoding

In [15]:
# Define Encoder
encoder = OneHotEncoder(sparse=False)

# fit and transform scaling
X_train_encoded = encoder.fit_transform(X_train_2[cat_columns])
X_test_encoded = encoder.transform(X_test[cat_columns])

In [16]:
# Summarize new encoded data
X_test_encoded

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.

In [17]:
# concat encoded and scaled training dataset
X_train_fin = np.concatenate((X_train_encoded, X_train_scaled), axis=1 )

In [18]:
# concat encoded and scaled test dataset
X_test_fin = np.concatenate((X_test_encoded, X_test_scaled), axis=1 )

# III. Model Productions

In [19]:
# Define model
model = RandomForestClassifier(max_depth=650, n_estimators=870)

In [20]:
# fit model
model.fit(X_train_fin, y_train)

In [21]:
# Predict the dataset
y_train_pred=model.predict(X_train_fin)
y_test_pred=model.predict(X_test_fin)

In [22]:
# Evaluate training model
print(classification_report(y_train,y_train_pred))

              precision    recall  f1-score   support

        loss       0.92      0.95      0.93      1393
         win       0.91      0.86      0.88       866

    accuracy                           0.91      2259
   macro avg       0.91      0.90      0.91      2259
weighted avg       0.91      0.91      0.91      2259



In [23]:
# Evaluate test model
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

        loss       0.73      0.86      0.79       114
         win       0.68      0.49      0.57        70

    accuracy                           0.72       184
   macro avg       0.71      0.67      0.68       184
weighted avg       0.71      0.72      0.71       184



# IV. Inferences

## IV.I. Saving models

In [24]:
with open('model.pkl', 'wb') as file_1:
  joblib.dump(model, file_1)

with open('model_scaler.pkl', 'wb') as file_2:
  joblib.dump(scaler, file_2)

with open('model_encoder.pkl', 'wb') as file_3:
  joblib.dump(encoder, file_3)

with open('list_num.txt', 'w') as file_4:
  json.dump(num_columns, file_4)

with open('list_cat.txt', 'w') as file_5:
  json.dump(cat_columns, file_5)

## IV.II. Preprocessing inferences

In [25]:
# Summarize inference dataset
data_inf.sample(3)

Unnamed: 0,EntryPrice,win_loss,long_short,entry_month,entry_date,entry_day,entry_hour
2076,1.10287,loss,long,9,4,2,8
2187,1.12722,win,short,6,16,1,12
1057,1.29932,win,long,6,2,6,12


In [26]:
# Reseting index
data_inf.reset_index(inplace=True)

In [27]:
# Encoding and scaling the dataset
inf_scaled = scaler.transform(data_inf[num_columns])
inf_encoded = encoder.transform(data_inf[cat_columns])

In [28]:
# Concat encoded and scaled dataset
X_inf = np.concatenate((inf_encoded, inf_scaled), axis=1 )
X_inf

array([[0.        , 1.        , 0.56578567, 0.36363636, 0.23333333,
        0.16666667, 0.4       ],
       [1.        , 0.        , 0.03219683, 0.09090909, 0.73333333,
        0.5       , 1.        ],
       [0.        , 1.        , 0.07057633, 0.18181818, 0.93333333,
        0.33333333, 0.2       ],
       [1.        , 0.        , 0.54082002, 0.63636364, 0.33333333,
        0.        , 0.4       ],
       [0.        , 1.        , 0.50715887, 1.        , 0.66666667,
        0.66666667, 0.        ],
       [0.        , 1.        , 0.25616458, 0.90909091, 0.3       ,
        0.16666667, 0.4       ],
       [0.        , 1.        , 0.24463085, 0.63636364, 0.46666667,
        0.16666667, 0.2       ],
       [1.        , 0.        , 0.18226191, 0.90909091, 0.7       ,
        0.5       , 0.        ],
       [0.        , 1.        , 0.16224962, 0.72727273, 0.5       ,
        0.33333333, 0.4       ],
       [1.        , 0.        , 0.17325909, 1.        , 0.53333333,
        0.        , 0.6

In [29]:
# predict the dataset
inf_pred = model.predict(X_inf)
inf_pred

array(['win', 'win', 'win', 'loss', 'loss', 'win', 'loss', 'loss', 'loss',
       'win', 'loss', 'win', 'loss', 'win', 'loss', 'loss', 'loss',
       'loss', 'loss', 'win', 'loss', 'loss', 'loss', 'loss', 'loss',
       'win', 'win', 'win', 'win', 'loss', 'loss', 'loss', 'loss', 'win',
       'loss', 'loss', 'loss', 'win', 'loss', 'loss', 'win', 'win',
       'loss', 'loss', 'loss', 'win', 'loss', 'loss', 'loss', 'win',
       'loss', 'loss', 'loss', 'loss', 'loss', 'win', 'loss', 'loss',
       'win', 'loss', 'loss', 'loss', 'win', 'win', 'win', 'loss', 'loss',
       'win', 'loss', 'win', 'loss', 'win', 'win', 'win', 'win', 'loss',
       'win', 'win', 'win', 'loss', 'win', 'loss', 'win', 'win', 'loss',
       'win', 'loss', 'win', 'loss', 'loss', 'loss', 'loss', 'loss',
       'loss', 'loss', 'loss', 'loss', 'win', 'loss', 'loss', 'win',
       'win', 'win', 'loss', 'loss', 'loss', 'loss', 'loss', 'loss',
       'win', 'loss', 'win', 'win', 'loss', 'loss', 'loss', 'loss',
       'lo

In [30]:
# Evaluate test model
print(classification_report(data_inf['win_loss'],inf_pred))

              precision    recall  f1-score   support

        loss       0.87      0.96      0.91        68
         win       0.94      0.81      0.87        54

    accuracy                           0.89       122
   macro avg       0.90      0.89      0.89       122
weighted avg       0.90      0.89      0.89       122



In [31]:
# Evaluate test model
print(precision_score(data_inf['win_loss'], inf_pred, pos_label='win'))

0.9361702127659575
