In [123]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Import statistics Library
import statistics

In [115]:
# Reading Dataset
df = pd.read_csv('../../data/XAUUSD_2010-2023.csv').head(100_000)

In [116]:
# Dataset Overview
df.head(5)

Unnamed: 0,time,open,high,low,close,rsi14,sma14
0,2010-01-03 18:00:00,1098.45,1100.0,1098.05,1099.95,81.98,1096.38
1,2010-01-03 18:05:00,1100.0,1100.3,1099.45,1099.75,79.17,1096.72
2,2010-01-03 18:10:00,1099.7,1100.1,1099.3,1099.45,75.02,1097.03
3,2010-01-03 18:15:00,1099.5,1099.6,1098.5,1099.45,75.02,1097.3
4,2010-01-03 18:20:00,1099.4,1099.6,1098.9,1098.9,67.5,1097.5


In [117]:
# number of rows and columns
df.shape

(100000, 7)

In [118]:
# check for missing values
df.isnull().sum()

time     0
open     0
high     0
low      0
close    0
rsi14    0
sma14    0
dtype: int64

In [119]:

# getting some basic informations about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   time    100000 non-null  object 
 1   open    100000 non-null  float64
 2   high    100000 non-null  float64
 3   low     100000 non-null  float64
 4   close   100000 non-null  float64
 5   rsi14   100000 non-null  float64
 6   sma14   100000 non-null  float64
dtypes: float64(6), object(1)
memory usage: 5.3+ MB


In [120]:
# checking the number of missing values
df.isnull().sum()

time     0
open     0
high     0
low      0
close    0
rsi14    0
sma14    0
dtype: int64

In [124]:
statistics.median(df['open'])

1256.94

In [125]:
# getting the statistical measures of the data
df.describe()

Unnamed: 0,open,high,low,close,rsi14,sma14
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,1282.214728,1282.679546,1281.746151,1282.218261,50.929789,1282.191062
std,126.523868,126.5538,126.49312,126.524619,11.197596,126.514056
min,1048.58,1050.45,1044.18,1048.53,7.08,1050.85
25%,1176.63,1177.08,1176.1175,1176.63,43.66,1176.775
50%,1256.94,1257.3,1256.58,1256.94,51.04,1256.95
75%,1385.35,1385.78,1384.93,1385.35,58.37,1385.35
max,1574.3,1576.3,1573.7,1574.45,91.98,1568.56


In [126]:
# Checking Duplicates
df.duplicated().sum()

0

In [127]:
# Checking Missing Values
df.isnull().sum()

time     0
open     0
high     0
low      0
close    0
rsi14    0
sma14    0
dtype: int64

In [128]:
# Preprocess the data
df = df.dropna()  # Handle missing values

In [132]:
df['time'] = pd.to_datetime(df['time'])
df.sort_values(by='time', ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)
# df.index = df['time']
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   time    100000 non-null  datetime64[ns]
 1   open    100000 non-null  float64       
 2   high    100000 non-null  float64       
 3   low     100000 non-null  float64       
 4   close   100000 non-null  float64       
 5   rsi14   100000 non-null  float64       
 6   sma14   100000 non-null  float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 5.3 MB


In [133]:
# Dataset Overview
df.head(5)

Unnamed: 0,time,open,high,low,close,rsi14,sma14
0,2010-01-03 18:00:00,1098.45,1100.0,1098.05,1099.95,81.98,1096.38
1,2010-01-03 18:05:00,1100.0,1100.3,1099.45,1099.75,79.17,1096.72
2,2010-01-03 18:10:00,1099.7,1100.1,1099.3,1099.45,75.02,1097.03
3,2010-01-03 18:15:00,1099.5,1099.6,1098.5,1099.45,75.02,1097.3
4,2010-01-03 18:20:00,1099.4,1099.6,1098.9,1098.9,67.5,1097.5


In [134]:

# Define the conditions for each possible combination
conditions = [
    (df['open'] < df['low']),
    (df['open'] == df['low']),
    (df['open'] > df['low']),
]

# Define the corresponding choices
choices = [1, 2, 3]

# Apply the conditions to create the result column
df['status_low'] = np.select(conditions, choices, default=np.nan)
df['status_low'].value_counts()

status_low
3.0    89644
2.0    10356
Name: count, dtype: int64

In [135]:
# Define the conditions for each possible combination
conditions = [
    (df['open'] < df['high']),
    (df['open'] == df['high']),
    (df['open'] > df['high']),
]

# Define the corresponding choices
choices = [1, 2, 3]

# Apply the conditions to create the result column
df['status_high'] = np.select(conditions, choices, default=np.nan)
df['status_high'].value_counts()

status_high
1.0    90328
2.0     9672
Name: count, dtype: int64

In [136]:
# Define the conditions for each possible combination
conditions = [
    (df['open'] < df['close']),
    (df['open'] == df['close']),
    (df['open'] > df['close']),
]

# Define the corresponding choices
choices = [1, 2, 3]

# Apply the conditions to create the result column
df['status_close'] = np.select(conditions, choices, default=np.nan)
df['status_close'].value_counts()

status_close
1.0    48728
3.0    47820
2.0     3452
Name: count, dtype: int64

In [138]:

# df.drop(['time'], axis=1, inplace=True)
# NumCols = df.columns.drop(['Date'])
# df[NumCols] = df[NumCols].replace({',': ''}, regex=True)
# df[NumCols] = df[NumCols].astype('float64')
df.info()
df = df.drop(columns=['time', 'sma14', 'rsi14', 'open'], axis=1)
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   high          100000 non-null  float64
 1   low           100000 non-null  float64
 2   close         100000 non-null  float64
 3   status_low    100000 non-null  float64
 4   status_high   100000 non-null  float64
 5   status_close  100000 non-null  float64
dtypes: float64(6)
memory usage: 4.6 MB


KeyError: "['time', 'sma14', 'rsi14', 'open'] not found in axis"

In [139]:
X = df.drop(columns='status_close', axis=1)
Y = df['status_close']

In [140]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [141]:
print(X.shape, X_train.shape, X_test.shape)

(100000, 5) (80000, 5) (20000, 5)


In [142]:
X_test

Unnamed: 0,high,low,close,status_low,status_high
86158,1424.40,1423.58,1423.58,3.0,2.0
33293,1257.18,1255.98,1257.08,3.0,1.0
44964,1225.33,1224.38,1224.88,3.0,2.0
9900,1115.95,1114.88,1115.33,3.0,1.0
93567,1500.43,1500.13,1500.33,3.0,2.0
...,...,...,...,...,...
78229,1338.55,1338.05,1338.48,3.0,1.0
10746,1097.85,1095.80,1096.60,3.0,1.0
96792,1482.03,1479.53,1479.88,3.0,1.0
25825,1238.58,1236.23,1237.03,3.0,1.0


In [143]:
print(X_train)

          high      low    close  status_low  status_high
87585  1426.78  1426.68  1426.68         2.0          1.0
78263  1336.43  1335.43  1335.53         3.0          1.0
75182  1368.78  1368.03  1368.63         3.0          1.0
86995  1404.58  1404.08  1404.08         3.0          2.0
87999  1431.73  1430.25  1431.68         2.0          1.0
...        ...      ...      ...         ...          ...
22963  1163.38  1162.58  1162.58         3.0          2.0
96909  1494.50  1493.25  1494.10         3.0          1.0
38450  1211.68  1211.40  1211.43         3.0          1.0
41873  1175.85  1174.48  1175.70         3.0          1.0
48746  1248.28  1248.18  1248.28         3.0          1.0

[80000 rows x 5 columns]


In [146]:
model = LogisticRegression(solver='liblinear', random_state=0)

In [147]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

In [179]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [180]:
print('Accuracy on Training data : ', training_data_accuracy)


Accuracy on Training data :  0.85565


In [181]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [182]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.8537


In [183]:
print('min value')
print(df.min())
print('max value')
print(df.max())

min value
high            1050.45
low             1044.18
close           1048.53
status_low         2.00
status_high        1.00
status_close       1.00
dtype: float64
max value
high            1576.30
low             1573.70
close           1574.45
status_low         3.00
status_high        2.00
status_close       3.00
dtype: float64


In [195]:

# status
# 1 high
# 2 equals
# 3 less
#beli
input_data = (
    1100,  # high
    114,  # low
    1048,  # close
    3,  # status_low
    1,  # status_high
    # 2.0,  #status_close
)

# change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

prediction = model.predict(input_data_reshaped)

print('------')
print(prediction)
print('------')
print(f"High is {input_data[0]}")
print(f"Low is {input_data[1]}")
print(f"Close is {input_data[2]}")
print(f"Status_low is {input_data[3]}")
print(f"Status_High is {input_data[4]}")
print('------')

if (prediction[0] == 1):
    print('high')
elif (prediction[0] == 2):
    print('medium')
else:
    print('low')
    

------
[1.]
------
High is 1100
Low is 114
Close is 1048
Status_low is 3
Status_High is 1
------
high




In [184]:

# status
# 1 high
# 2 equals
# 3 less
#beli
input_data = (
    1050,  # high
    1050,  # low
    1050,  # close
    3,  # status_low
    2,  # status_high
    # 2.0,  #status_close
)

# change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

prediction = model.predict(input_data_reshaped)

print('------')
print(prediction)
print('------')
print(f"High is {input_data[0]}")
print(f"Low is {input_data[1]}")
print(f"Close is {input_data[2]}")
print(f"Status_low is {input_data[3]}")
print(f"Status_High is {input_data[4]}")
print('------')

if (prediction[0] == 1):
    print('high')
elif (prediction[0] == 2):
    print('medium')
else:
    print('low')
    

------
[3.]
------
High is 1050
Low is 1050
Close is 1050
Status_low is 3
Status_High is 2
------
low




In [186]:
#sell
input_data = (
    1000,  # high
    1100,  # low
    1050,  # close
    1,  # status_low
    2,  # status_high
    # 2.0,  #status_close
)

# change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

prediction = model.predict(input_data_reshaped)

print('------')
print(prediction)
print('------')
print(f"High is {input_data[0]}")
print(f"Low is {input_data[1]}")
print(f"Close is {input_data[2]}")
print(f"Status_low is {input_data[3]}")
print(f"Status_High is {input_data[4]}")
print('------')

if (prediction[0] == 1):
    print('low')
elif (prediction[0] == 2):
    print('medium')
else:
    print('high')
    

------
[2.]
------
High is 1000
Low is 1100
Close is 1050
Status_low is 1
Status_High is 2
------
medium


