# **<div align="center">DATASETS CREATION AND SPLIT </div>**

In [12]:
from typing import Tuple
import pandas as pd
from sklearn.model_selection import train_test_split

In [13]:
def create_price_dataset(df: pd.DataFrame, target_col: str = 'price_usd', split_ratio: float = 0.8) -> Tuple:

    '''
    Create a dataset for price prediction tasks.

    The target variable is the price shifted by one time step ahead. 
    The dataset is split into training and testing subsets based on a temporal index.

    Parameters:
        - df: Input DataFrame containing time series features and a price column.
        - target_col: Name of the column containing price data. Default is 'price_usd'.
        - split_ratio: Proportion of data to include in the training set (default is 0.8).

    Returns:
        - Tuple containing the training and testing DataFrames.
    '''
    df = df.copy()
    df['target_price'] = df[target_col].shift(-1)
    df = df.dropna(subset=['target_price'])
    split_idx = int(len(df) * split_ratio)
    return df.iloc[:split_idx], df.iloc[split_idx:]


In [14]:
def create_returns_dataset(df: pd.DataFrame, target_col: str = 'returns', split_ratio: float = 0.8) -> Tuple:

    '''
    Create a dataset for returns prediction and classification tasks.

    This function generates both a regression target (next-period return)
    and a binary classification target (1 = positive return, 0 = negative or zero).
    The dataset is then split into training and testing subsets based on a temporal index.

    Parameters:
        - df: Input DataFrame containing time series features and return data.
        - target_col: Name of the column containing return data. Default is 'returns'.
        - split_ratio: Proportion of data to include in the training set (default is 0.8).

    Returns
        - Tuple containing the training and testing DataFrames.
    '''
    df = df.copy()
    df['target_return'] = df[target_col].shift(-1)
    df['target_class'] = (df['target_return'] > 0).astype(int)
    df = df.dropna(subset=['target_return'])
    split_idx = int(len(df) * split_ratio)
    return df.iloc[:split_idx], df.iloc[split_idx:]

In [15]:
def export_dataset(df: pd.DataFrame, path: str, include_index: bool = False) -> None:

    '''
    Export the DataFrame to a CSV file.

    Parameters:
    df (pd.DataFrame): The DataFrame to export.
    path (str): The file path where to save the CSV.
    include_index (bool): Whether to include the index in the CSV file (default False).

    Returns:
    None: This function prints status messages but does not return a value.
    '''
    
    if not isinstance(path, str):
        raise ValueError("The path must be a string.")

    try:
        df.to_csv(path, index=include_index)
        print(f"Data exported successfully to {path}")
    except Exception as e:
        print(f"Error exporting data: {e}")

## **Load data**

In [16]:
binance_features = pd.read_csv("../data/processed/binance_features.csv", index_col=0, parse_dates=['date'])
binance_features = binance_features.sort_index()

## **Datasets creations**

### **Dataset 1: Price predictions (ARIMA, Prophet, LSTM, etc.)**

In [17]:
binance_price = binance_features.copy() 
binance_price['target_price'] = binance_price['price_usd'].shift(-1) 
binance_price = binance_price.dropna(subset=['target_price'])
train_price, test_price = create_price_dataset(binance_price)

print("Dataset PRICE prepared:")
print(f"Train: {train_price.shape}, Test: {test_price.shape}")

Dataset PRICE prepared:
Train: (65, 46), Test: (17, 46)


### **Dataset 2: Returns prediction (ML and DL)**

In [18]:
binance_returns = binance_features.copy() 
binance_returns['target_return'] = binance_returns['returns'].shift(-1) 
binance_returns = binance_returns.dropna(subset=['target_return']) 

# Also, a binary variable is create (increase / decrease)
binance_returns['target_class'] = (binance_returns['target_return'] > 0).astype(int)

train_returns, test_returns = create_returns_dataset(binance_features)
print("Dataset RETURNS prepared:")
print(f"Train: {train_returns.shape}, Test: {test_returns.shape}")

Dataset RETURNS prepared:
Train: (66, 47), Test: (17, 47)


## **Export datasets**

In [19]:
export_dataset(binance_price, "../data/processed/binance_target_price.csv")
export_dataset(binance_returns, "../data/processed/binance_target_returns.csv")
export_dataset(train_price, "../data/processed/binance_train_price.csv")
export_dataset(test_price, "../data/processed/binance_test_price.csv")
export_dataset(train_returns, "../data/processed/binance_train_returns.csv")
export_dataset(test_returns, "../data/processed/binance_test_returns.csv")

Data exported successfully to ../data/processed/binance_target_price.csv
Data exported successfully to ../data/processed/binance_target_returns.csv
Data exported successfully to ../data/processed/binance_train_price.csv
Data exported successfully to ../data/processed/binance_test_price.csv
Data exported successfully to ../data/processed/binance_train_returns.csv
Data exported successfully to ../data/processed/binance_test_returns.csv
