# **<div align="center">ADVANCED FEATURE ENGINEERING FOR BINANCE COIN </div>**

In [5]:
import os

import numpy as np
import pandas as pd

In [6]:
def compute_RSI(series: pd.Series, window: int = 14) -> pd.Series:
    '''
    Compute the Relative Strength Index (RSI) of a price series, using simple
    rolling means of the upward and downward price moves.

    Parameters:
        - series: Price time series.
        - window: Number of periods in the rolling window. Default is 14.

    Returns:
        - rsi: Relative Strength Index series, aligned with the input. Values are NaN
          until a full window of price changes is available, and also where both the
          average gain and the average loss are zero (0 / 0).
    '''
    # Period-over-period price change; the first element is NaN by construction
    change = series.diff()

    # Split the changes into non-negative "up" and "down" magnitudes
    ups = change.clip(lower=0)
    downs = change.clip(upper=0).mul(-1)

    # Simple moving averages of both components
    mean_up = ups.rolling(window).mean()
    mean_down = downs.rolling(window).mean()

    # Relative strength; a zero average loss yields inf, which maps to an RSI of 100
    relative_strength = mean_up / mean_down

    return 100 - (100 / (1 + relative_strength))

In [7]:
def compute_MACD(series: pd.Series, fast: int = 12, slow: int = 26,
                 signal: int = 9) -> tuple[pd.Series, pd.Series, pd.Series]:
    '''
    Compute the Moving Average Convergence Divergence (MACD) indicator.

    The MACD line is the gap between a fast and a slow exponential moving
    average (EMA) of the price; the signal line is an EMA of the MACD line;
    the histogram is the gap between those two.

    Parameters:
        - series: Price time series.
        - fast: Span of the fast EMA. Default is 12.
        - slow: Span of the slow EMA. Default is 26.
        - signal: Span of the EMA applied to the MACD line. Default is 9.

    Returns:
        - macd: Fast EMA minus slow EMA.
        - signal_line: EMA of the MACD line.
        - hist: MACD line minus signal line.
    '''

    def _ema(values: pd.Series, span: int) -> pd.Series:
        # Recursive (adjust=False) EMA, seeded with the first observation
        return values.ewm(span=span, adjust=False).mean()

    macd_line = _ema(series, fast) - _ema(series, slow)
    smoothed = _ema(macd_line, signal)
    histogram = macd_line - smoothed

    return macd_line, smoothed, histogram

In [13]:
def compute_Bollinger_Bands(series: pd.Series, window: int = 14, n_std: int = 2) -> tuple[pd.Series, pd.Series, pd.Series]:
    '''
    Compute Bollinger Bands around a rolling mean of the price series.

    Parameters:
        - series: Price time series.
        - window: Size of the rolling window used for both mean and standard deviation. Default is 14.
        - n_std: Band half-width, expressed in rolling standard deviations. Default is 2.

    Returns:
        - sma: Rolling mean of the series (middle band).
        - upper: Middle band plus n_std rolling standard deviations.
        - lower: Middle band minus n_std rolling standard deviations.
    '''
    # One rolling view feeds both statistics
    windowed = series.rolling(window=window)
    middle = windowed.mean()
    half_width = n_std * windowed.std()

    return middle, middle + half_width, middle - half_width

In [21]:
def make_features(df: pd.DataFrame, price_col: str = 'price_usd', volume_col: str = 'volume',
                  market_cap_col: str = 'market_cap') -> pd.DataFrame:
    '''
    Build lag, return, rolling, technical-indicator and calendar features for a
    cryptocurrency time series.

    The input frame is not modified: a copy is sorted by index and extended with the
    columns listed below. Rows that contain a NaN in any column are removed before
    returning, so the leading rows for which the longest rolling window (30) is not
    yet full never appear in the result.

    Parameters
        - df: Input DataFrame containing time series data (must have a DateTime index).
        - price_col: Name of the column containing price data. Default is 'price_usd'.
        - volume_col: Name of the column containing trading volume. Default is 'volume'.
        - market_cap_col: Name of the column containing market capitalization. Default is 'market_cap'.

    Returns
        pd.DataFrame: A new DataFrame with the original columns plus:

            **Lag features** (row shifts of 1, 2, 3, 7 and 14):
                - {price_col}_lag{n}, {volume_col}_lag{n}, {market_cap_col}_lag{n}

            **Return-based features**:
                - returns: Row-over-row percentage change in price (in percent).
                - volatility_7d: Rolling 7-row standard deviation of returns.
                - volatility_14d: Rolling 14-row standard deviation of returns.
                - returns_rolling_mean_7d: Rolling 7-row mean of returns.

            **Rolling statistics** (windows of 7, 14 and 30):
                - {price_col}_sma{w}, {price_col}_std{w}
                - {volume_col}_sma{w}
                - {market_cap_col}_sma{w}

            **Technical indicators**:
                - RSI_14: Relative Strength Index with a 14-period window.
                - MACD, MACD_signal, MACD_hist: MACD indicator components.
                - Bollinger_sma, Bollinger_upper, Bollinger_lower: Middle, upper and lower Bollinger Bands.

            **Temporal features**:
                - day_of_week: Integer (0=Monday, 6=Sunday).
                - month: Calendar month (1–12).
                - quarter: Calendar quarter (1–4).
    '''
    # Chronologically ordered working copy; the caller's frame stays untouched
    feats = df.copy().sort_index()
    raw_cols = (price_col, volume_col, market_cap_col)

    # Lagged copies of every raw column (price, volume, market cap for each lag)
    for n_back in (1, 2, 3, 7, 14):
        for col in raw_cols:
            feats[f'{col}_lag{n_back}'] = feats[col].shift(n_back)

    # Percentage returns and their rolling dispersion / mean
    feats['returns'] = feats[price_col].pct_change() * 100
    returns_7 = feats['returns'].rolling(7)
    feats['volatility_7d'] = returns_7.std()
    feats['volatility_14d'] = feats['returns'].rolling(14).std()
    feats['returns_rolling_mean_7d'] = returns_7.mean()

    # Rolling means (and price standard deviation) over several horizons
    for span in (7, 14, 30):
        price_window = feats[price_col].rolling(span)
        feats[f'{price_col}_sma{span}'] = price_window.mean()
        feats[f'{price_col}_std{span}'] = price_window.std()
        feats[f'{volume_col}_sma{span}'] = feats[volume_col].rolling(span).mean()
        feats[f'{market_cap_col}_sma{span}'] = feats[market_cap_col].rolling(span).mean()

    # Technical indicators computed on the price column
    feats['RSI_14'] = compute_RSI(feats[price_col], window=14)
    feats['MACD'], feats['MACD_signal'], feats['MACD_hist'] = compute_MACD(feats[price_col])
    band_mid, band_up, band_low = compute_Bollinger_Bands(feats[price_col])
    feats['Bollinger_sma'] = band_mid
    feats['Bollinger_upper'] = band_up
    feats['Bollinger_lower'] = band_low

    # Calendar attributes read from the index
    stamps = feats.index
    feats['day_of_week'] = stamps.dayofweek  # mondays=0, sundays=6
    feats['month'] = stamps.month
    feats['quarter'] = stamps.quarter

    # Discard every row that still has a missing value in any column
    return feats.dropna()


In [24]:
def export_dataset(df: pd.DataFrame, path: "str | os.PathLike", include_index: bool = False) -> None:

    '''
    Export the DataFrame to a CSV file.

    Parameters:
    df (pd.DataFrame): The DataFrame to export.
    path (str | os.PathLike): The file path where to save the CSV. Plain strings and
        path-like objects such as pathlib.Path are both accepted.
    include_index (bool): Whether to include the index in the CSV file (default False).

    Returns:
    None: This function prints status messages but does not return a value.

    Raises:
    ValueError: If path is neither a string nor a path-like object.
    '''

    # Reject anything that cannot name a file (None, numbers, buffers, ...) up front,
    # while still allowing pathlib.Path and other os.PathLike objects.
    if not isinstance(path, (str, os.PathLike)):
        raise ValueError("The path must be a string or a path-like object.")

    try:
        df.to_csv(path, index=include_index)
    except Exception as e:
        # Best-effort export: report the failure instead of interrupting the caller
        print(f"Error exporting data: {e}")
    else:
        # Only announce success once the write has actually completed
        print(f"Data exported successfully to {path}")

## **Import dataset**

In [10]:
# Load the cleaned price history, parse the dates and index the frame
# chronologically (the 'date' column is kept alongside the index).
binance = (
    pd.read_csv("../data/processed/binance_cleared.csv")
    .assign(date=lambda raw: pd.to_datetime(raw['date']))
    .set_index('date', drop=False)
    .sort_index()
)
binance.head()

Unnamed: 0_level_0,date,price_usd,market_cap,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-02-05,2025-02-05,571.03,83275330000.0,1241403000.0
2025-02-06,2025-02-06,580.05,84578180000.0,471188900.0
2025-02-07,2025-02-07,586.11,85563000000.0,1075487000.0
2025-02-09,2025-02-09,634.258701,85563000000.0,1075487000.0
2025-02-10,2025-02-10,605.28,88258100000.0,1516135000.0


## **Feature engineering**

In [22]:
# Derive the full feature set from the indexed price history
binance_features = make_features(
    binance,
    price_col='price_usd',
    volume_col='volume',
    market_cap_col='market_cap',
)
binance_features.head()

Unnamed: 0_level_0,date,price_usd,market_cap,volume,price_usd_lag1,volume_lag1,market_cap_lag1,price_usd_lag2,volume_lag2,market_cap_lag2,...,RSI_14,MACD,MACD_signal,MACD_hist,Bollinger_sma,Bollinger_upper,Bollinger_lower,day_of_week,month,quarter
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-03-11,2025-03-11,558.43,81532670000.0,1881237000.0,564.46,1039957000.0,82276350000.0,577.116083,883532100.0,87538880000.0,...,32.393096,-11.745815,-3.812251,-7.933563,596.115668,645.380104,546.851233,1,3,1
2025-03-12,2025-03-12,557.54,81366460000.0,1231965000.0,558.43,1881237000.0,81532670000.0,564.46,1039957000.0,82276350000.0,...,35.052632,-13.571649,-5.764131,-7.807518,590.334954,637.121078,543.54883,2,3,1
2025-03-13,2025-03-13,581.45,84729580000.0,1171115000.0,557.54,1231965000.0,81366460000.0,558.43,1881237000.0,81532670000.0,...,45.12147,-12.940133,-7.199331,-5.740802,588.499954,634.454618,542.54529,3,3,1
2025-03-14,2025-03-14,580.22,84579800000.0,1728902000.0,581.45,1171115000.0,84729580000.0,557.54,1231965000.0,81366460000.0,...,40.254464,-12.396009,-8.238667,-4.157342,585.10424,625.216488,544.991992,4,3,1
2025-03-16,2025-03-16,598.131721,84579800000.0,1728902000.0,580.22,1728902000.0,84579800000.0,581.45,1171115000.0,84729580000.0,...,46.501562,-10.399581,-8.67085,-1.728731,583.856506,620.847787,546.865224,6,3,1


## **Export dataframe**

In [25]:
# Persist the engineered features; the date index is written as the first column
export_dataset(
    binance_features,
    "../data/processed/binance_features.csv",
    include_index=True,
)

Data exported successfully to ../data/processed/binance_features.csv
