**Settings**

In [1]:
# Market symbol handed to the exchange when requesting candles
SYMBOL = 'BTC/USDT'

# Candle interval handed to the exchange alongside the symbol
TIMEFRAME = '1h'

# How far back the initial download reaches: two 365-day years in milliseconds
LOOKBACK_PERIOD = 2 * 365 * 86_400_000  # 86_400_000 ms == 24 h * 60 min * 60 s * 1000

**Logger Setup**

In [2]:
import logging
from enum import Enum
from typing import Optional

class LogLevel(Enum):
    """Logging verbosity levels; each value is the matching ``logging`` level name."""

    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"


class LoggerSetup:
    """
    Configure a named ``logging.Logger`` with a console handler and,
    optionally, a file handler.

    Creating a second ``LoggerSetup`` for the same logger name replaces the
    handlers installed by the first one instead of stacking on top of them.
    """

    DEFAULT_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    DEFAULT_LOG_FILE = './app.log'

    def __init__(
        self,
        name: str,
        level: LogLevel = LogLevel.DEBUG,
        formatter_str: Optional[str] = None,
        save_to_file: bool = False,
        log_file_path: Optional[str] = './app.log'
    ) -> None:
        """
        Initialize the LoggerSetup with a given name and level.

        :param name: Name of the logger.
        :param level: Logging level.
        :param formatter_str: Custom format string for logging messages.
        :param save_to_file: Flag to determine if logs should be saved to a file.
        :param log_file_path: Location where the log file should be saved.
            ``None`` falls back to ``DEFAULT_LOG_FILE``.
        """
        self.logger = logging.getLogger(name)
        self.logger.setLevel(level.value)
        self._setup_handler(level, formatter_str, save_to_file, log_file_path)

    def _setup_handler(
        self,
        level: LogLevel,
        formatter_str: Optional[str],
        save_to_file: bool,
        log_file_path: Optional[str]
    ) -> None:
        """
        Set up the logging handler with the given level and formatter.

        :param level: Logging level.
        :param formatter_str: Custom format string for logging messages.
        :param save_to_file: Flag to determine if logs should be saved to a file.
        :param log_file_path: Location where the log file should be saved.
        """
        formatter = logging.Formatter(formatter_str or self.DEFAULT_FORMAT)

        # Drop whatever a previous setup attached, otherwise every message
        # would be emitted once per LoggerSetup instance.
        self._remove_existing_stream_handlers()

        # Console output
        console_handler = logging.StreamHandler()
        console_handler.setLevel(level.value)
        console_handler.setFormatter(formatter)
        self.logger.addHandler(console_handler)

        # Optional file output
        if save_to_file:
            # FileHandler cannot open ``None``; use the documented default path.
            file_handler = logging.FileHandler(
                log_file_path or self.DEFAULT_LOG_FILE)
            file_handler.setLevel(level.value)
            file_handler.setFormatter(formatter)
            self.logger.addHandler(file_handler)

    def _remove_existing_stream_handlers(self) -> None:
        """
        Remove and close any existing stream/file handlers on the logger.

        Removing avoids duplicate log lines; closing releases the file
        descriptor held by a replaced ``FileHandler``.
        """
        # Iterate over a copy: removeHandler mutates ``self.logger.handlers``.
        for handler in list(self.logger.handlers):
            # FileHandler subclasses StreamHandler, so this covers both.
            if isinstance(handler, logging.StreamHandler):
                self.logger.removeHandler(handler)
                handler.close()

    def get_logger(self) -> logging.Logger:
        """
        Retrieve the logger instance.

        :return: Logger instance.
        """
        return self.logger

**Data Collector**

In [3]:
import os
from time import sleep
from typing import Any, List, Optional

import ccxt
import pandas as pd

class DataCollector:
    """
    Collect OHLCV candles from a crypto exchange and keep them in a CSV file.

    The CSV lives at ``<data_dir>/raw/crypto_data_<symbol>.csv``. The first
    download reaches ``lookback_period`` milliseconds into the past; later
    downloads continue after the last stored candle.
    """

    MAX_RETRIES = 3            # consecutive failed requests tolerated
    NETWORK_ERROR_SLEEP = 5    # seconds to wait after a ccxt.NetworkError
    EXCHANGE_ERROR_SLEEP = 10  # seconds to wait after a ccxt.ExchangeError

    def __init__(
        self,
        symbol: str,
        timeframe: str,
        lookback_period: int,
        data_dir: str = 'data',
        exchange_name: str = 'binance'
    ) -> None:
        """
        Initialize the DataCollector.

        :param symbol: The symbol of the cryptocurrency to fetch.
        :param timeframe: Time interval for data fetching.
        :param lookback_period: Duration in milliseconds to look back and fetch data.
        :param data_dir: Directory where data will be saved.
        :param exchange_name: Name of the exchange platform.
        :raises ValueError: If ``exchange_name`` is not an attribute of ccxt.
        """
        self.exchange = self.initialize_exchange(exchange_name)
        log_setup = LoggerSetup("Data Collector")
        self.log: Any = log_setup.get_logger()

        symbol_name = symbol.replace('/', '_').lower()
        self.file_path = os.path.join(
            data_dir, 'raw', f'crypto_data_{symbol_name}.csv')
        self.symbol = symbol
        self.timeframe = timeframe
        self.lookback_period = lookback_period

        # Ensure directory exists
        os.makedirs(os.path.dirname(self.file_path), exist_ok=True)

    def initialize_exchange(self, exchange_name: str) -> ccxt.Exchange:
        """
        Dynamically initialize the requested exchange.

        :param exchange_name: Name of the exchange platform.
        :return: Initialized exchange object.
        :raises ValueError: If ccxt has no attribute with that name.
        """
        exchange_class = getattr(ccxt, exchange_name, None)
        if not exchange_class:
            raise ValueError(
                f"Exchange '{exchange_name}' not supported by ccxt.")
        return exchange_class()

    def _determine_since_timestamp(self, last_timestamp: Optional[Any] = None) -> int:
        """
        Determine the timestamp to start fetching data from.

        :param last_timestamp: Timestamp of the last stored candle (anything
            ``pd.Timestamp`` accepts), or None for an initial download.
        :return: Starting timestamp for fetching, in milliseconds.
        """
        if last_timestamp:
            last_ms = int(pd.Timestamp(last_timestamp).timestamp() * 1000)
            # Start just AFTER the stored candle: the pagination in fetch_data
            # treats ``since`` as inclusive, so starting at ``last_ms`` would
            # download that candle again and append a duplicate row.
            return last_ms + 1
        return self.exchange.milliseconds() - self.lookback_period

    def _get_last_timestamp(self) -> Optional[pd.Timestamp]:
        """
        Retrieve the last timestamp from the stored data.

        :return: Timestamp of the last stored row, or None if nothing is stored.
        """
        data = self.get_data()
        if data.empty:
            return None
        return data['timestamp'].iloc[-1]

    def fetch_data(self, last_timestamp: Optional[Any] = None) -> pd.DataFrame:
        """
        Fetch historical market data from the exchange.

        Pages through the exchange until it returns no more candles. A failed
        request is retried after a pause; the download stops early once
        ``MAX_RETRIES`` requests in a row have failed, returning whatever was
        collected so far.

        :param last_timestamp: The last known timestamp.
        :return: Fetched data as a DataFrame with columns
            timestamp, open, high, low, close, volume.
        """
        since = self._determine_since_timestamp(last_timestamp)

        # Logging
        if last_timestamp:
            self.log.info(
                f"Fetching data since last timestamp: {last_timestamp}")
        else:
            self.log.info(f"Fetching initial data for {self.symbol}...")

        ohlcv: List[Any] = []
        fetch_count: int = 0
        retries = 0
        while retries < self.MAX_RETRIES:
            try:
                new_data: List[Any] = self.exchange.fetch_ohlcv(
                    self.symbol,
                    self.timeframe,
                    since
                )
            except ccxt.NetworkError as e:
                self.log.error(f"Network Error at timestamp {since}: {e}")
                sleep(self.NETWORK_ERROR_SLEEP)
                retries += 1
                continue
            except ccxt.ExchangeError as e:
                self.log.error(f"Exchange Error at timestamp {since}: {e}")
                sleep(self.EXCHANGE_ERROR_SLEEP)
                retries += 1
                continue

            # An empty page means the exchange has nothing newer.
            if not new_data:
                break

            # Only consecutive failures count towards the limit; otherwise a
            # few transient errors spread over a long download would abort it.
            retries = 0

            fetch_count += len(new_data)
            self.log.info(
                f"Retrieved {len(new_data)} new data points. Total data points fetched for {self.symbol}: {fetch_count}")

            # Next page starts right after the last candle received.
            since = new_data[-1][0] + 1
            ohlcv += new_data
        else:
            # Reached only when the loop ended without ``break``,
            # i.e. the retry budget was exhausted.
            self.log.warning(
                f"Giving up after {self.MAX_RETRIES} consecutive failed requests; data for {self.symbol} may be incomplete.")

        self.log.info(
            f"Data fetch completed. Total data points for {self.symbol}: {len(ohlcv)}")

        df: pd.DataFrame = pd.DataFrame(
            ohlcv, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

        return df

    def fetch_and_append_data(self) -> pd.DataFrame:
        """
        Fetch new data and append it to the CSV file.

        :return: Appended data as a DataFrame (empty if nothing new was fetched).
        """
        last_timestamp = self._get_last_timestamp()
        new_data = self.fetch_data(last_timestamp)

        if new_data.empty:
            self.log.info("No new data to append.")
            return pd.DataFrame()

        # Write the header when the file is new OR exists but is empty;
        # a header-less CSV could not be read back by get_data.
        needs_header = (
            not os.path.exists(self.file_path)
            or os.path.getsize(self.file_path) == 0
        )
        # Mode 'a' creates the file when it does not exist yet.
        new_data.to_csv(self.file_path, mode='a',
                        header=needs_header, index=False)
        self.log.info(
            f"Appended {len(new_data)} new rows of data to {self.file_path}.")
        return new_data

    def get_data(self) -> pd.DataFrame:
        """
        Retrieve all data from the CSV file.

        :return: Data as a DataFrame or an empty DataFrame if no data is found.
        """
        if not os.path.exists(self.file_path):
            self.log.warning(f"No data found at {self.file_path}.")
            return pd.DataFrame()

        try:
            return pd.read_csv(self.file_path, parse_dates=['timestamp'])
        except pd.errors.EmptyDataError:
            # The file exists but has no content at all (not even a header).
            self.log.warning(f"Data file {self.file_path} is empty.")
            return pd.DataFrame()

**Define the Data Frame**

In [4]:
import pandas as pd

# Initialize the logger
log_setup = LoggerSetup("Main Logger")
log = log_setup.get_logger()
log.info("Main Logger Initialized")

# Initialize the data collector
collector = DataCollector(
    symbol=SYMBOL,
    timeframe=TIMEFRAME,
    lookback_period=LOOKBACK_PERIOD
)

# Read what is already on disk BEFORE fetching: the fetch appends its rows to
# the same file, so reading afterwards would pick the new rows up twice.
stored_data = collector.get_data()
fetched_data = collector.fetch_and_append_data()
data = pd.concat([stored_data, fetched_data], ignore_index=True)

# The same candle can be present more than once (for example a candle that an
# incremental update downloaded again). Keep the most recent copy of each
# timestamp and restore chronological order so the series is clean for modelling.
if not data.empty:
    data = (
        data.drop_duplicates(subset='timestamp', keep='last')
        .sort_values('timestamp')
        .reset_index(drop=True)
    )

# print(data.head())
# print(data.tail())

2023-10-28 11:18:00,955 - Main Logger - INFO - Main Logger Initialized
2023-10-28 11:18:01,281 - Data Collector - INFO - Fetching data since last timestamp: 2023-10-27 17:00:00
2023-10-28 11:18:09,396 - Data Collector - INFO - Retrieved 23 new data points. Total data points fetched for BTC/USDT: 23
2023-10-28 11:18:09,813 - Data Collector - INFO - Data fetch completed. Total data points for BTC/USDT: 23
2023-10-28 11:18:09,820 - Data Collector - INFO - Appended 23 new rows of data to data\raw\crypto_data_btc_usdt.csv.


**Data Preprocessing**

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np

# Extracting 'close' prices as a single-column 2-D array (the layout the scaler expects)
prices = data['close'].values.reshape(-1, 1)

# Number of past time steps the model sees for each prediction
sequence_length = 60

if len(prices) <= sequence_length:
    raise ValueError(
        f"Need more than {sequence_length} rows to build sequences, got {len(prices)}.")

# Index (into `prices`) of every prediction target, split chronologically with
# exactly the same rule that is applied to X and y below.
target_indices = np.arange(sequence_length, len(prices))
train_target_indices, _ = train_test_split(
    target_indices, test_size=0.2, shuffle=False)

# Fit the scaler ONLY on prices the training set can see. Fitting on the whole
# series would leak the min/max of the test period into training.
n_fit_rows = train_target_indices[-1] + 1
scaler = MinMaxScaler()
scaler.fit(prices[:n_fit_rows])

# Scale the full series with the training-period statistics
# (test-period values may therefore fall outside [0, 1]).
scaled_prices = scaler.transform(prices)

# Each sample is the `sequence_length` prices preceding its target
X = np.array([scaled_prices[i - sequence_length:i] for i in target_indices])
y = scaled_prices[sequence_length:]

# Split the dataset (chronological: the last 20% of samples form the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

**Build & Train Model**

In [6]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

# Three stacked LSTM layers, each followed by dropout, feeding one linear output unit.
# The first two LSTMs return full sequences so the next LSTM receives one vector per time step.
model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    LSTM(units=50),
    Dropout(0.2),
    Dense(units=1),
])

# Train with Adam on mean squared error
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=5, batch_size=64)

# Report the loss on the held-out test samples
loss = model.evaluate(X_test, y_test)
log.info(f"Model loss on test data: {loss}")

KeyboardInterrupt: 

**Prediction**

In [None]:
import plotly.graph_objects as go

# Predict prices for train and test datasets
predicted_train = model.predict(X_train)
predicted_test = model.predict(X_test)

# Transform the predicted prices back to original scale
predicted_train_original = scaler.inverse_transform(predicted_train)
predicted_test_original = scaler.inverse_transform(predicted_test)

# Sample k predicts the price at row `sequence_length + k`, so both the actual
# prices and the timestamps must be offset by `sequence_length`. Using one set
# of bounds for x and y keeps every trace aligned with its own time axis.
train_begin = sequence_length
train_stop = train_begin + len(predicted_train_original)
test_stop = train_stop + len(predicted_test_original)

train_timestamps = data['timestamp'][train_begin:train_stop]
test_timestamps = data['timestamp'][train_stop:test_stop]
actual_train_prices = prices[train_begin:train_stop].flatten()
actual_test_prices = prices[train_stop:test_stop].flatten()

# Create a figure with dark theme
fig = go.Figure(layout=go.Layout(template='plotly_dark', height=800))

# Actual train prices
fig.add_trace(go.Scatter(x=train_timestamps,
                         y=actual_train_prices,
                         mode='lines',
                         name='Actual Train Prices',
                         line=dict(color='blue')))

# Predicted train prices
fig.add_trace(go.Scatter(x=train_timestamps,
                         y=predicted_train_original.flatten(),
                         mode='lines',
                         name='Predicted Train Prices',
                         line=dict(color='red')))

# Actual test prices
fig.add_trace(go.Scatter(x=test_timestamps,
                         y=actual_test_prices,
                         mode='lines',
                         name='Actual Test Prices',
                         line=dict(color='cyan')))

# Predicted test prices
fig.add_trace(go.Scatter(x=test_timestamps,
                         y=predicted_test_original.flatten(),
                         mode='lines',
                         name='Predicted Test Prices',
                         line=dict(color='green')))

fig.update_layout(title=f'{SYMBOL} Price Prediction using LSTM',
                  xaxis_title='Time',
                  yaxis_title='Price')

# Display the figure
fig.show()