In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')



In [2]:
df_original = pd.read_csv('Dataset/crypto_price_movement_dataset.csv')
df_original.head()

Unnamed: 0,Date,Open_Price,Close_Price,High_Price,Low_Price,Price_Change,Volume,MA_5,MA_10,RSI,Volatility,Sentiment_Score,Global_Economy,Event_Impact,Price_Movement
0,2023-01-01,41236.203565,41583.440143,42164.219187,41514.228466,347.236578,6746503,0.0,0.0,39.78297,0.011679,0.806251,1,0.05281,1
1,2023-01-02,58521.429192,58515.946236,59042.917886,57930.249488,-5.482956,6569064,0.0,0.0,52.66776,0.042812,0.010386,0,0.08215,0
2,2023-01-03,51959.818254,51655.283865,52006.320815,50856.415124,-304.534389,8066677,0.0,0.0,30.473258,0.031347,0.555665,1,0.092787,0
3,2023-01-04,47959.754526,48196.396313,48689.60897,47431.923218,236.641787,8759881,0.0,0.0,54.708497,0.038986,-0.56344,0,0.079497,1
4,2023-01-05,34680.559213,34599.237349,34964.333984,33761.841141,-81.321864,7696906,46910.060781,0.0,30.916759,0.045281,0.242467,0,0.064309,0


In [3]:
#drop Price_Movement and Price_Change 
df = df_original.drop(['Price_Movement', 'Price_Change'], axis=1)
#drop rows with NaN values
df.dropna(inplace=True)

print(df.head())

         Date    Open_Price   Close_Price    High_Price     Low_Price  \
0  2023-01-01  41236.203565  41583.440143  42164.219187  41514.228466   
1  2023-01-02  58521.429192  58515.946236  59042.917886  57930.249488   
2  2023-01-03  51959.818254  51655.283865  52006.320815  50856.415124   
3  2023-01-04  47959.754526  48196.396313  48689.608970  47431.923218   
4  2023-01-05  34680.559213  34599.237349  34964.333984  33761.841141   

    Volume          MA_5  MA_10        RSI  Volatility  Sentiment_Score  \
0  6746503      0.000000    0.0  39.782970    0.011679         0.806251   
1  6569064      0.000000    0.0  52.667760    0.042812         0.010386   
2  8066677      0.000000    0.0  30.473258    0.031347         0.555665   
3  8759881      0.000000    0.0  54.708497    0.038986        -0.563440   
4  7696906  46910.060781    0.0  30.916759    0.045281         0.242467   

   Global_Economy  Event_Impact  
0               1      0.052810  
1               0      0.082150  
2       

In [4]:
df.dtypes

Date                object
Open_Price         float64
Close_Price        float64
High_Price         float64
Low_Price          float64
Volume               int64
MA_5               float64
MA_10              float64
RSI                float64
Volatility         float64
Sentiment_Score    float64
Global_Economy       int64
Event_Impact       float64
dtype: object

In [5]:
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
df.dtypes

Date               datetime64[ns]
Open_Price                float64
Close_Price               float64
High_Price                float64
Low_Price                 float64
Volume                      int64
MA_5                      float64
MA_10                     float64
RSI                       float64
Volatility                float64
Sentiment_Score           float64
Global_Economy              int64
Event_Impact              float64
dtype: object

In [6]:
is_sorted = df['Date'].is_monotonic_increasing
print("DataFrame sorted by date:", is_sorted)

DataFrame sorted by date: True


In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Ensure 'Date' is in datetime format and sort by date (important for time-series)
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date')

# Define your target and features.
# Here, we assume the target is 'Close_Price' so we drop both 'Date' and 'Close_Price' from the features.
features_columns = df.columns.difference(['Date', 'Close_Price'])
target_column = 'Close_Price'

# Scale only the feature columns
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[features_columns])
scaled_features_df = pd.DataFrame(scaled_features, columns=features_columns, index=df.index)

# Optionally, add the target back (kept in its original scale) to the DataFrame if needed for later processing
scaled_features_df[target_column] = df[target_column]

# If you need the sorted dates for reference or splitting, keep them in a separate variable
dates = df['Date']

print(scaled_features_df.head())

   Event_Impact  Global_Economy  High_Price  Low_Price      MA_10       MA_5  \
0      0.095368        0.996765   -0.381495  -0.340800 -16.115421 -11.554465   
1      1.113404       -1.003245    1.567637   1.555072 -16.115421 -11.554465   
2      1.482448        0.996765    0.755059   0.738121 -16.115421 -11.554465   
3      1.021326       -1.003245    0.372049   0.342630 -16.115421 -11.554465   
4      0.494340       -1.003245   -1.212929  -1.236116 -16.115421   0.500609   

   Open_Price       RSI  Sentiment_Score  Volatility    Volume   Close_Price  
0   -0.431074 -0.889210         1.397085   -1.585910  0.481725  41583.440143  
1    1.566732  0.227973         0.019860    1.115561  0.413334  58515.946236  
2    0.808348 -1.696415         0.963452    0.120672  0.990565  51655.283865  
3    0.346026  0.404917        -0.973132    0.783532  1.257749  48196.396313  
4   -1.188768 -1.657960         0.421471    1.329813  0.848043  34599.237349  


In [20]:
import numpy as np

# Define the window size
window_size = 10

def create_time_windows(features, target, window_size):
    """
    Create sliding window sequences for features and corresponding targets.
    
    Parameters:
        features (pd.DataFrame): DataFrame containing only the feature columns.
        target (pd.Series): Series containing the scaled target variable.
        window_size (int): Number of time steps in each input window.
        
    Returns:
        X_windows (np.array): Array of feature sequences with shape (samples, window_size, num_features).
        y_windows (np.array): Array of target values with shape (samples,).
    """
    X_windows = []
    y_windows = []
    
    # Loop through the data, starting at window_size index
    for i in range(window_size, len(features)):
        # Get the previous window_size days as the feature window
        X_window = features.iloc[i-window_size:i].values
        # Get the target for the current day from the scaled target values
        y_window = target.iloc[i]
        X_windows.append(X_window)
        y_windows.append(y_window)
    
    return np.array(X_windows), np.array(y_windows)

# Prepare the features. Exclude 'Date', 'Close_Price' and also any unscaled columns you don't want.
features = scaled_features_df.drop(['Close_Price', 'Close_Price_scaled'], axis=1)

# Use the scaled target column for learning
target = scaled_features_df['Close_Price_scaled']

# Create time window sequences
X_seq, y_seq = create_time_windows(features, target, window_size)

# Check the shapes: X_seq should have shape (samples, window_size, num_features)
print("X_seq shape:", X_seq.shape)
print("y_seq shape:", y_seq.shape)


X_seq shape: (49990, 10, 11)
y_seq shape: (49990,)


In [21]:
def temporal_train_val_test_split(X, y, train_ratio=0.7, val_ratio=0.15):
    """
    Splits the sequence data into training, validation, and test sets sequentially.
    
    Parameters:
      X (np.array): Input sequences of shape (samples, window_size, num_features).
      y (np.array): Target values array of shape (samples,).
      train_ratio (float): Fraction of data to use for training.
      val_ratio (float): Fraction of data to use for validation.
      
    Returns:
      X_train, y_train, X_val, y_val, X_test, y_test: The split datasets.
    """
    n = X.shape[0]
    train_end = int(n * train_ratio)
    val_end = train_end + int(n * val_ratio)
    
    X_train, y_train = X[:train_end], y[:train_end]
    X_val, y_val = X[train_end:val_end], y[train_end:val_end]
    X_test, y_test = X[val_end:], y[val_end:]
    
    return X_train, y_train, X_val, y_val, X_test, y_test

# Split the data
X_train, y_train, X_val, y_val, X_test, y_test = temporal_train_val_test_split(X_seq, y_seq)

# Verify the shapes
print(f"Training set: X shape = {X_train.shape}, y shape = {y_train.shape}")
print(f"Validation set: X shape = {X_val.shape}, y shape = {y_val.shape}")
print(f"Test set: X shape = {X_test.shape}, y shape = {y_test.shape}")


Training set: X shape = (34993, 10, 11), y shape = (34993,)
Validation set: X shape = (7498, 10, 11), y shape = (7498,)
Test set: X shape = (7499, 10, 11), y shape = (7499,)


In [22]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Build the LSTM model
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2])),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)
])
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
]
# Train the model using training and validation sets
history = model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, y_val)
)

# Evaluate model performance on the validation set
val_loss, val_mae = model.evaluate(X_val, y_val)
print("Validation MSE:", val_loss)
print("Validation MAE:", val_mae)

# Compute additional performance metrics

y_val_pred = model.predict(X_val)
mse_val = mean_squared_error(y_val, y_val_pred)
mae_val = mean_absolute_error(y_val, y_val_pred)
r2_val = r2_score(y_val, y_val_pred)

print(f"Sklearn MSE: {mse_val:.4f}")
print(f"Sklearn MAE: {mae_val:.4f}")
print(f"Sklearn R2 Score: {r2_val:.4f}")

Epoch 1/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 1.0098 - mae: 0.8669 - val_loss: 1.0153 - val_mae: 0.8720
Epoch 2/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.9924 - mae: 0.8594 - val_loss: 1.0124 - val_mae: 0.8709
Epoch 3/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.9899 - mae: 0.8596 - val_loss: 1.0133 - val_mae: 0.8711
Epoch 4/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.9938 - mae: 0.8604 - val_loss: 1.0166 - val_mae: 0.8723
Epoch 5/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.9892 - mae: 0.8586 - val_loss: 1.0169 - val_mae: 0.8726
Epoch 6/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.9895 - mae: 0.8596 - val_loss: 1.0158 - val_mae: 0.8720
Epoch 7/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[

In [None]:
import tensorflow as tf
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from tensorflow.keras import layers, Sequential

model = Sequential([
    layers.LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    layers.Dropout(0.2),
    layers.LSTM(32),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
])
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()


# Callbacks for early stopping and learning rate reduction
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
]

history = model.fit(
    X_train, y_train,
    epochs=50,  # increased number of epochs
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=callbacks
)


# Evaluate model performance on the validation set
val_loss, val_mae = model.evaluate(X_val, y_val)
print("Validation MSE (scaled):", val_loss)
print("Validation MAE (scaled):", val_mae)

# Predict using the validation set
y_val_pred_scaled = model.predict(X_val)

# Inverse scale predictions and true values back to original scale
y_val_pred = target_scaler.inverse_transform(y_val_pred_scaled)
y_val_true = target_scaler.inverse_transform(y_val.reshape(-1, 1))

mse_val = mean_squared_error(y_val_true, y_val_pred)
mae_val = mean_absolute_error(y_val_true, y_val_pred)
r2_val = r2_score(y_val_true, y_val_pred)

print(f"Inversed Sklearn MSE: {mse_val:.4f}")
print(f"Inversed Sklearn MAE: {mae_val:.4f}")
print(f"Inversed Sklearn R2 Score: {r2_val:.4f}")


Epoch 1/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.9993 - mae: 0.8645 - val_loss: 1.0131 - val_mae: 0.8718 - learning_rate: 0.0010
Epoch 2/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.9953 - mae: 0.8616 - val_loss: 1.0132 - val_mae: 0.8720 - learning_rate: 0.0010
Epoch 3/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.9941 - mae: 0.8613 - val_loss: 1.0120 - val_mae: 0.8715 - learning_rate: 0.0010
Epoch 4/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.9956 - mae: 0.8628 - val_loss: 1.0141 - val_mae: 0.8723 - learning_rate: 0.0010
Epoch 5/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.9966 - mae: 0.8637 - val_loss: 1.0137 - val_mae: 0.8719 - learning_rate: 0.0010
Epoch 6/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.9879 - mae: 

In [24]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Baseline: Predict the mean of the training target
baseline_pred_train = np.full(shape=y_train.shape, fill_value=y_train.mean())
baseline_pred_val = np.full(shape=y_val.shape, fill_value=y_train.mean())  # note using training mean

# Inverse-transform baseline predictions for comparison (if needed)
baseline_pred_val_inv = target_scaler.inverse_transform(baseline_pred_val.reshape(-1,1))
y_val_true_inv = target_scaler.inverse_transform(y_val.reshape(-1,1))

print("Baseline Validation MSE:", mean_squared_error(y_val_true_inv, baseline_pred_val_inv))
print("Baseline Validation MAE:", mean_absolute_error(y_val_true_inv, baseline_pred_val_inv))


Baseline Validation MSE: 75794707.61328782
Baseline Validation MAE: 7542.546320072083
