# Install all required dependencies

In [2]:
!pip install pandas-ta tqdm



In [3]:
import pandas as pd
import numpy as np
import pandas_ta as ta
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from typing import Tuple
from tqdm import tqdm

# Step 1: Work on the columns

Some columns are duplicated across data sources (for example `cg_c` and `cq_close`), which makes them redundant.

In [5]:
# Relative path of the input CSV.
data_path = Path("btc_2024-2025_combined_data.csv")
# Read the entire file into a DataFrame.
df = pd.read_csv(data_path)

In [6]:
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple

class OHLCVUnifier:
    """
    Collapse the per-source OHLC price columns ('cg' and 'cq') into one series per metric.

    Column pairs handled:
      - close: cg_c, cq_close -> unified_close
      - high:  cg_h, cq_high  -> unified_high
      - low:   cg_l, cq_low   -> unified_low
      - open:  cg_o, cq_open  -> unified_open

    Per row, values further than 3 * MAD (median absolute deviation) from the
    row median are masked, the remaining values are reduced with a median, and
    the source columns are removed from the result.
    """
    def __init__(self):
        # unified metric name -> source columns carrying that metric
        self.mapping: Dict[str, List[str]] = {
            'close': ['cg_c', 'cq_close'],
            'high':  ['cg_h', 'cq_high'],
            'low':   ['cg_l', 'cq_low'],
            'open':  ['cg_o', 'cq_open'],
        }

    def unify(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
        """
        Build the unified_<metric> columns on a copy of ``df``.

        Returns:
          - the copy, with unified_<metric> columns appended and the source columns dropped
          - a dict mapping each processed metric to the source columns that were found

        Metrics for which no source column exists in ``df`` are skipped entirely.
        """
        result = df.copy()
        combined: Dict[str, List[str]] = {}

        for metric, candidates in self.mapping.items():
            # only work with the source columns that are actually there
            available = [name for name in candidates if name in result.columns]
            if len(available) == 0:
                continue
            combined[metric] = available

            sources = result[available]
            row_median = sources.median(axis=1)
            # absolute distance of every source value from its row median
            deviation = sources.sub(row_median, axis=0).abs()
            row_mad = deviation.median(axis=1)
            # True where the value lies inside the 3 * MAD band
            within_band = deviation.le(3 * row_mad, axis=0)

            result[f'unified_{metric}'] = sources.where(within_band).median(axis=1)
            # the source columns are no longer needed
            result.drop(columns=available, inplace=True)

        return result, combined

In [7]:
def detect_and_prepare_datetime(df: pd.DataFrame) -> pd.DataFrame:
    """
    Make sure ``df`` has a "date" column, deriving one from a timestamp column if needed.

    Resolution order:
      1. an existing "date" column is left untouched;
      2. "start_time" is parsed as epoch milliseconds;
      3. otherwise the first column whose name ends in "_time" is parsed as epoch seconds.

    The new column is written into ``df`` itself (no copy is made) and that same
    object is returned.

    Raises:
        ValueError: if none of the columns above is present.
    """
    columns = df.columns
    if "date" in columns:
        return df

    if "start_time" in columns:
        source, unit = "start_time", "ms"
    else:
        source = next((name for name in columns if name.endswith("_time")), None)
        unit = "s"
        if source is None:
            raise ValueError("No suitable timestamp column found")

    df["date"] = pd.to_datetime(df[source], unit=unit)
    return df

In [8]:
# Merge the cg_/cq_ OHLC columns into unified_* columns; `used` records which
# source columns were combined for each metric.
unifier = OHLCVUnifier()
df_unified, used = unifier.unify(df)
# Make sure a "date" column exists, deriving it from a timestamp column if necessary.
df_unified = detect_and_prepare_datetime(df_unified)

In [9]:
class ColumnAbbreviator:
    """
    Shortens DataFrame column names by swapping known long words for abbreviations.
    """
    # lookup table: long word (singular and plural forms) -> short form
    PART_ABBREV: dict[str, str] = {
        'transaction': 'txn',
        'transactions': 'txn',
        'address': 'addr',
        'addresses': 'addr',
        'blockchain': 'bc',
        'count': 'cnt',
        'counts': 'cnt',
        'exchange': 'exch',
        'exchanges': 'exch',
        'breakdown': 'brkdwn',
        'breakdowns': 'brkdwn',
        'liquidation': 'liq',
        'liquidations': 'liq',
    }

    @classmethod
    def abbreviate(cls, col: str) -> str:
        """
        Split ``col`` on underscores, abbreviate every piece listed in PART_ABBREV,
        and join the pieces back together with underscores.
        """
        table = cls.PART_ABBREV
        shortened = [table[piece] if piece in table else piece for piece in col.split('_')]
        return '_'.join(shortened)

    @classmethod
    def rename_df(cls, df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a renamed copy of ``df`` in which every column label has been passed through ``abbreviate``.
        """
        return df.rename(mapper=cls.abbreviate, axis='columns')

In [10]:
# Shorten long column names (e.g. "transactions" -> "txn") on a renamed copy.
df_renamed = ColumnAbbreviator.rename_df(df_unified)

# Step 2: Feature Engineering

In [12]:
from dataclasses import dataclass
from typing import List, Callable, Dict
import pandas as pd
import numpy as np
import pandas_ta as ta

@dataclass
class BTCFeatureEngineer:
    """
    Feature pipeline for BTC data: price returns, technical indicators (computed
    with pandas_ta) and ratio/change features derived from on-chain columns.

    Every field holds the NAME of a column in the input frame. Only ``key_date``,
    ``price_col``, ``high_col`` and ``low_col`` are mandatory (see ``_validate``);
    on-chain features whose source columns are absent are skipped.
    """
    # mandatory columns
    key_date: str = 'date'
    price_col: str = 'unified_close'
    high_col: str = 'unified_high'
    low_col: str = 'unified_low'
    # core on-chain
    market_cap_col: str = 'market_cap'
    tx_volume_col: str = 'tx_volume_usd'
    realized_cap_col: str = 'realized_cap'
    active_addr_col: str = 'addresses_active_count'
    tx_count_col: str = 'transactions_count'
    cdd_col: str = 'coin_days_destroyed'
    hash_rate_col: str = 'hash_rate'
    difficulty_col: str = 'difficulty'
    mempool_txs_col: str = 'mempool_txs_count_sum'
    fear_greed_col: str = 'fear_greed_index'
    sopr_col: str = 'spent_output_profit_ratio'
    # additional on-chain (pass-through features; skipped when the column is absent)
    realized_price_col: str = 'realized_price'
    miner_outflows_col: str = 'miner_outflows'
    exchange_flow_col: str = 'exchange_flows'
    dormant_coins_col: str = 'avg_dormancy'
    hodl_waves_col: str = 'hodl_waves_1y_share'
    miner_revenue_col: str = 'miner_revenue'
    whale_balances_col: str = 'whale_balance_ratio'
    active_addrs_col: str = 'active_addresses'
    mev_col: str = 'mev_value'
    exchange_reserves_col: str = 'exchange_reserves'
    tx_fees_col: str = 'tx_fees_usd'

    def _validate(self, df: pd.DataFrame):
        """Raise KeyError listing any of the date/close/high/low columns missing from ``df``."""
        missing = [c for c in [self.key_date, self.price_col, self.high_col, self.low_col]
                   if c not in df.columns]
        if missing:
            raise KeyError(f"Missing required columns: {missing}")

    def _set_index(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` indexed by the date column parsed as datetimes."""
        df = df.copy()
        df[self.key_date] = pd.to_datetime(df[self.key_date])
        return df.set_index(self.key_date)

    def _clean(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drop duplicate rows, then every row that has a missing value in any column."""
        return df.drop_duplicates().dropna()

    def _add_price_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add 'ret' (simple return), 'log_ret' (log return) and 'cum_ret' to ``df`` in place."""
        df['ret']     = df[self.price_col].pct_change()
        df['log_ret'] = np.log(df[self.price_col] / df[self.price_col].shift(1))
        # running SUM of simple returns (not a compounded return)
        df['cum_ret'] = df['ret'].cumsum()
        return df

    def _add_moving_averages(self, df: pd.DataFrame, windows: List[int] = (7,21,50,100,200)) -> pd.DataFrame:
        """Add 'sma_<w>' and 'ema_<w>' of the price column for every window length in ``windows``."""
        for w in windows:
            df[f'sma_{w}'] = df[self.price_col].rolling(w).mean()
            df[f'ema_{w}'] = ta.ema(df[self.price_col], length=w)
        return df

    def _add_volatility(self, df: pd.DataFrame, window: int = 14) -> pd.DataFrame:
        """
        Add 'vol_<window>' (rolling std of 'log_ret'), 'atr' and the Bollinger bands
        'bb_up' / 'bb_mid' / 'bb_low'. Requires 'log_ret', i.e. ``_add_price_features``
        must have run first.
        """
        df[f'vol_{window}'] = df['log_ret'].rolling(window).std()
        df['atr'] = ta.atr(df[self.high_col], df[self.low_col], df[self.price_col], length=window)
        bb = ta.bbands(df[self.price_col], length=window)
        df[['bb_up','bb_mid','bb_low']] = bb[[f'BBU_{window}_2.0', f'BBM_{window}_2.0', f'BBL_{window}_2.0']]
        return df

    def _add_momentum(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add 'rsi_14' and the MACD line, signal and histogram ('macd', 'macd_signal', 'macd_hist')."""
        df['rsi_14'] = ta.rsi(df[self.price_col], length=14)
        macd = ta.macd(df[self.price_col])
        df[['macd','macd_signal','macd_hist']] = macd[['MACD_12_26_9','MACDs_12_26_9','MACDh_12_26_9']]
        return df

    @staticmethod
    def _combine_psar(psar: pd.DataFrame) -> pd.Series:
        """
        Merge the long-side and short-side Parabolic SAR columns into one series.

        pandas_ta reports the long-side stop in 'PSARl_0.02_0.2' and the short-side
        stop in 'PSARs_0.02_0.2'; each is NaN while the other side is active. Using
        the long column alone left 'psar' NaN on those rows, and ``run`` then
        discarded them in its final dropna().
        """
        long_side = psar['PSARl_0.02_0.2']
        short_name = 'PSARs_0.02_0.2'
        if short_name not in psar.columns:
            # nothing to merge with; keep the previous behaviour
            return long_side
        return long_side.fillna(psar[short_name])

    def _add_trend_strength(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add 'adx_14', 'aroon_up', 'aroon_dn' and 'psar' (long/short stop merged)."""
        adx = ta.adx(df[self.high_col], df[self.low_col], df[self.price_col], length=14)
        df['adx_14'] = adx['ADX_14']
        ar = ta.aroon(df[self.high_col], df[self.low_col], length=14)
        df['aroon_up'], df['aroon_dn'] = ar['AROONU_14'], ar['AROOND_14']
        psar = ta.psar(df[self.high_col], df[self.low_col], df[self.price_col])
        df['psar'] = self._combine_psar(psar)
        return df

    def _add_volume_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
        """Placeholder step: no volume column is configured, so ``df`` is returned unchanged."""
        # Skip volume-based indicators if volume_col is not available
        return df

    def _add_onchain_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add on-chain features to ``df`` in place.

        Each feature is produced by a small generator; a generator whose source
        column is missing raises KeyError and that feature is silently skipped.
        """
        # mapping of feature name → generator
        funcs: Dict[str, Callable[[pd.DataFrame], pd.Series]] = {
            'addr_growth':     lambda d: d[self.active_addr_col].pct_change(),
            'tx_per_addr':     lambda d: d[self.tx_count_col] / d[self.active_addr_col],
            'nvt':             lambda d: d[self.market_cap_col] / d[self.tx_volume_col],
            'mvrv':            lambda d: d[self.market_cap_col] / d[self.realized_cap_col],
            'nupl':            lambda d: (d[self.market_cap_col]-d[self.realized_cap_col]) / d[self.market_cap_col],
            'cdd_diff':        lambda d: d[self.cdd_col].diff(),
            'hashrate_chg':    lambda d: d[self.hash_rate_col].pct_change(),
            'diff_chg':        lambda d: d[self.difficulty_col].pct_change(),
            'mempool_backlog': lambda d: d[self.mempool_txs_col],
            'fear_greed':      lambda d: d[self.fear_greed_col],
            'sopr':            lambda d: d[self.sopr_col],
            # required additional on-chain
            'realized_price':  lambda d: d[self.realized_price_col],
            'miner_outflows':  lambda d: d[self.miner_outflows_col],
            'exchange_flows':  lambda d: d[self.exchange_flow_col],
            'avg_dormancy':    lambda d: d[self.dormant_coins_col],
            'hodl_1y_share':   lambda d: d[self.hodl_waves_col],
            'miner_revenue':   lambda d: d[self.miner_revenue_col],
            'whale_balance_ratio': lambda d: d[self.whale_balances_col],
            'active_addresses':    lambda d: d[self.active_addrs_col],
            'mev_value':           lambda d: d[self.mev_col],
            'exchange_reserves':   lambda d: d[self.exchange_reserves_col],
            'tx_fees_usd':         lambda d: d[self.tx_fees_col],
            'avg_tx_fee':          lambda d: d[self.tx_fees_col] / d[self.tx_count_col]
        }
        for name, fn in funcs.items():
            try:
                df[name] = fn(df)
            except KeyError:
                # skip if the required column isn't present
                continue
        return df

    def run(self, raw_df: pd.DataFrame) -> pd.DataFrame:
        """
        Execute the whole pipeline on ``raw_df`` and return the engineered frame.

        ``raw_df`` is not modified: the work happens on the date-indexed copy made
        by ``_set_index``.

        Raises:
            KeyError: if a mandatory column is missing (see ``_validate``).
        """
        self._validate(raw_df)
        df = self._set_index(raw_df)
        df = self._clean(df)
        pipeline = [
            self._add_price_features,
            self._add_moving_averages,
            self._add_volatility,
            self._add_momentum,
            self._add_trend_strength,
            self._add_volume_indicators,
            self._add_onchain_metrics
        ]
        for step in pipeline:
            df = step(df)
        # keep only columns with more than one distinct value, then drop every ROW
        # that still holds a missing value (e.g. the leading rows of the rolling windows)
        df = df.loc[:, df.nunique() > 1].dropna()
        return df

In [13]:
# Build the feature pipeline with its default column names.
fe = BTCFeatureEngineer()

# Run every feature step on the renamed frame.
df_features = fe.run(df_renamed)

# Preview engineered features
df_features.head()

Unnamed: 0_level_0,start_time,cg_longShortRatio,cg_time,cg_buy,cg_sell,cg_longAccount,cg_shortAccount,cg_t,cq_datetime,cq_netflow_total,...,bb_mid,bb_low,rsi_14,macd,macd_signal,macd_hist,adx_14,aroon_up,aroon_dn,psar
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-04-28 10:00:00,1714298400000,0.72,1714298400,3508.786,4811.658,65.42,34.58,1714298400,2024-04-28 10:00:00,-233.803723,...,65649.061429,65006.513014,48.833304,94.707828,66.389041,28.318787,24.021262,50.0,14.285714,65202.865242
2024-04-28 11:00:00,1714302000000,0.75,1714302000,2373.46,3050.441,65.42,34.58,1714302000,2024-04-28 11:00:00,207.558104,...,65676.869286,65114.288201,45.668961,64.537394,66.018712,-1.481318,22.426558,42.857143,7.142857,65241.823432
2024-04-28 12:00:00,1714305600000,1.18,1714305600,2347.462,1904.684,65.9,34.1,1714305600,2024-04-28 12:00:00,-309.672804,...,65718.956429,65311.468069,48.666744,49.837283,62.782426,-12.945143,21.060897,35.714286,0.0,65279.223295
2024-04-28 13:00:00,1714309200000,1.28,1714309200,3174.536,2607.007,65.9,34.1,1714309200,2024-04-28 13:00:00,597.516932,...,65743.070357,65399.713445,51.167809,46.089934,59.443928,-13.353994,19.820982,28.571429,0.0,65315.127163
2024-04-28 14:00:00,1714312800000,0.78,1714312800,2265.655,2630.369,65.83,34.17,1714312800,2024-04-28 14:00:00,534.232012,...,65730.438214,65357.749191,46.29424,25.900131,52.735168,-26.835038,18.470062,21.428571,0.0,65349.594876


# Step 3: Data Cleaning

In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression
from sklearn.impute import SimpleImputer

class OutlierRemover:
    """
    Drops the rows that an IsolationForest (fixed contamination rate of 0.05) does not label as inliers.
    The forest is fitted on the numeric columns only; all other columns are carried along untouched.
    """
    def __init__(self):
        self.detector = IsolationForest(contamination=0.05, random_state=0)

    def remove(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Fit the detector on the numeric columns of ``df`` and return a copy holding
        only the rows it labels 1, with the index renumbered from zero.
        """
        frame = df.copy()
        numeric_cols = list(frame.select_dtypes(include=[np.number]).columns)
        try:
            # a numeric column called 'date' must not take part in the detection
            numeric_cols.remove('date')
        except ValueError:
            pass
        labels = self.detector.fit_predict(frame[numeric_cols])
        inliers = frame.loc[labels == 1]
        return inliers.reset_index(drop=True)

class VarianceCorrelationFilter:
    """
    Two-stage column filter:
      1. numeric columns whose variance (as reported by VarianceThreshold) is not positive are removed;
      2. a numeric column is removed when its absolute correlation with any column
         placed before it exceeds 0.95.
    The unified close/high/low price columns are exempt from both stages.
    """
    def __init__(self):
        self.var_thresh = VarianceThreshold(threshold=0.0)
        self.corr_threshold = 0.95
        self.preserve = {'unified_close', 'unified_high', 'unified_low'}

    def filter(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a filtered copy of ``df``.

        The surviving numeric columns come first, followed by the non-numeric
        columns. A frame without numeric columns is returned as a plain copy.
        """
        frame = df.copy()
        numeric = list(frame.select_dtypes(include=[np.number]).columns)
        if 'date' in numeric:
            numeric.remove('date')
        if len(numeric) == 0:
            return frame

        # stage 1: variance screen (only the fitted variances_ are used)
        self.var_thresh.fit_transform(frame[numeric])
        kept = []
        for name, variance in zip(numeric, self.var_thresh.variances_):
            if variance > 0 or name in self.preserve:
                kept.append(name)
        for name in self.preserve:
            if name in frame.columns and name not in kept:
                kept.append(name)

        others = [name for name in frame.columns if name not in numeric]
        frame = frame[kept + others]

        # stage 2: correlation screen against the columns that come earlier
        corr = frame[kept].corr().abs()
        # strict upper triangle: entry [i, j] is True only when i < j
        upper = np.triu(np.ones(corr.shape), k=1).astype(bool)
        to_drop = []
        for name in corr.columns:
            if name in self.preserve:
                continue
            position = corr.columns.get_loc(name)
            against_earlier = corr[name][upper[:, position]]
            if any(against_earlier > self.corr_threshold):
                to_drop.append(name)
        return frame.drop(columns=to_drop)

class TopKSelector:
    """
    Keeps, for each source prefix (cg_, cq_, gn_), the K numeric features with the best
    univariate regression score (f_regression) against the price target.
    unified_close, unified_high, unified_low, 'date' and every column whose name contains
    'time' are always kept; all remaining columns are dropped.
    """
    def __init__(self, k: int = 50):
        self.k = k

    def select(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a copy of ``df`` reduced to the target, the always-kept columns and
        the selected features, in their original column order.

        If there is no candidate feature at all, the unreduced copy is returned.

        Raises:
            ValueError: if no target column can be identified.
        """
        frame = df.copy()

        # columns that survive regardless of their score
        always_keep = {name for name in frame.columns if 'time' in name.lower()}
        if 'date' in frame.columns:
            always_keep.add('date')
        always_keep.update(
            name for name in ('unified_close', 'unified_high', 'unified_low')
            if name in frame.columns
        )

        # target lookup: exact name first, then progressively looser matches
        rules = (
            lambda name: name == 'unified_close',
            lambda name: 'close' in name.lower(),
            lambda name: 'price' in name.lower() or 'usd' in name.lower(),
        )
        for rule in rules:
            matches = [name for name in frame.columns if rule(name)]
            if matches:
                target = matches[0]
                break
        else:
            raise ValueError("No suitable price target column found for TopKSelector")

        # candidate features: numeric, not protected, not the target itself
        numeric = frame.select_dtypes(include=[np.number]).columns.tolist()
        feature_cols = [name for name in numeric if name not in always_keep and name != target]
        if len(feature_cols) == 0:
            return frame

        y = frame[target]
        chosen = []
        for prefix in ('cg_', 'cq_', 'gn_'):
            group = [name for name in feature_cols if name.startswith(prefix)]
            if len(group) == 0:
                continue
            picker = SelectKBest(score_func=f_regression, k=min(self.k, len(group)))
            picker.fit(frame[group], y)
            chosen.extend(name for name, flag in zip(group, picker.get_support()) if flag)

        # final column set, emitted in the frame's own column order
        wanted = {target, *chosen, *always_keep}
        return frame[[name for name in frame.columns if name in wanted]]

class BTCFeatureScaler:
    """
    Applies scikit-learn's RobustScaler to the numeric columns:
     - centering is enabled (median)
     - scaling uses the 25th-75th percentile range
    Non-numeric columns, 'date' and any column with 'time' in its name are left as they are.
    """
    def __init__(self):
        self.scaler = RobustScaler(quantile_range=(25.0, 75.0), with_centering=True, with_scaling=True)

    def scale(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a copy of ``df`` whose eligible numeric columns have been replaced by
        the scaler's fit_transform output. With no eligible column the copy is
        returned unchanged.
        """
        frame = df.copy()
        excluded = {'date'}
        excluded.update(name for name in frame.columns if 'time' in name.lower())

        numeric = frame.select_dtypes(include=[np.number]).columns
        targets = [name for name in numeric if name not in excluded]
        if len(targets) == 0:
            return frame

        values = self.scaler.fit_transform(frame[targets].values)
        # rebuild a frame on the same index so the assignment lines up row by row
        frame[targets] = pd.DataFrame(values, columns=targets, index=frame.index)
        return frame

class NullDuplicateCleaner:
    """
    Tidies a DataFrame in three moves:
      - duplicate rows are dropped and the index is renumbered
      - gaps in numeric columns are filled by a median SimpleImputer
      - gaps in object-dtype columns are filled by a most-frequent SimpleImputer
    """
    def __init__(self):
        self.num_imputer = SimpleImputer(strategy='median')
        self.cat_imputer = SimpleImputer(strategy='most_frequent')

    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a cleaned copy of ``df``; the input frame is not modified.
        Both imputers are re-fitted on every call.
        """
        frame = df.copy()
        frame = frame.drop_duplicates()
        frame = frame.reset_index(drop=True)

        numeric = list(frame.select_dtypes(include=[np.number]).columns)
        textual = list(frame.select_dtypes(include=['object']).columns)

        if len(numeric) > 0:
            frame[numeric] = self.num_imputer.fit_transform(frame[numeric])
        if len(textual) > 0:
            frame[textual] = self.cat_imputer.fit_transform(frame[textual])
        return frame

class JSONColumnRemover:
    """
    Removes every object-dtype (string-valued) column from a DataFrame.
    """
    def remove(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` without its object-dtype columns; ``df`` itself is left alone."""
        frame = df.copy()
        object_cols = list(frame.select_dtypes(include=['object']).columns)
        return frame.drop(columns=object_cols)


In [16]:
# One instance of each cleaning step.
# NOTE: var_filter and selector are created here, but the cells that would call
# them further down are commented out.
str_remover = JSONColumnRemover()
cleaner = NullDuplicateCleaner()
outlier = OutlierRemover()
scaler = BTCFeatureScaler()
var_filter = VarianceCorrelationFilter()
selector = TopKSelector(k=30)

In [17]:
# Display the unified frame, which is the input of the cleaning steps below.
df_unified

Unnamed: 0,start_time,cg_longShortRatio,cg_time,cg_buy,cg_sell,cg_longAccount,cg_shortAccount,cg_t,cq_datetime,cq_netflow_total,...,gn_transactions_transfers_volume_whales_to_exchanges_sum,gn_transactions_transfers_volume_sth_to_exchanges_sum,gn_transactions_transfers_volume_to_exchanges_mean,gn_transactions_transfers_whales_to_exchanges_count,gn_transactions_transfers_volume_within_exchanges_sum,unified_close,unified_high,unified_low,unified_open,date
0,1713571200000,0.98,1713571200,9022.680,9139.190,59.37,40.63,1713571200,2024-04-20 00:00:00,-10.350818,...,17.748878,303.488061,0.280791,686,796.289480,64872.965,65332.395,64638.410,64841.175,2024-04-20 00:00:00
1,1713574800000,1.10,1713574800,6180.941,5526.508,60.52,39.48,1713574800,2024-04-20 01:00:00,-66.924835,...,26.063658,57.187703,0.084468,934,3381.814742,64712.235,65039.225,64376.130,64872.965,2024-04-20 01:00:00
2,1713578400000,1.19,1713578400,3679.545,3254.022,60.52,39.48,1713578400,2024-04-20 02:00:00,-209.168108,...,7.988589,43.403876,0.122227,474,1750.341869,64683.520,64983.365,64531.510,64712.235,2024-04-20 02:00:00
3,1713582000000,1.06,1713582000,2210.444,2239.444,60.80,39.20,1713582000,2024-04-20 03:00:00,-118.699493,...,37.715779,373.912846,0.513910,321,883.518175,64898.015,64957.820,64639.840,64683.515,2024-04-20 03:00:00
4,1713585600000,1.05,1713585600,3432.443,3542.689,60.70,39.30,1713585600,2024-04-20 04:00:00,-29.468521,...,22.510729,236.541572,0.286910,346,1599.332184,64955.665,65112.600,64793.585,64898.020,2024-04-20 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8642,1744682400000,1.43,1744682400,3169.000,2103.294,50.35,49.65,1744682400,2025-04-15 02:00:00,82.324000,...,5.619835,633.814736,0.353428,181,4118.908258,81150.855,81184.445,80592.675,80731.115,2025-04-15 02:00:00
8643,1744686000000,0.99,1744686000,3307.964,2994.788,49.92,50.08,1744686000,2025-04-15 03:00:00,-508.505838,...,39.690454,1074.282391,0.769706,189,3872.841206,81290.530,81317.840,81009.560,81150.855,2025-04-15 03:00:00
8644,1744689600000,1.06,1744689600,2387.399,2307.978,49.92,50.08,1744689600,2025-04-15 04:00:00,683.257952,...,72.989646,502.094219,0.410894,148,3809.753024,81533.125,81567.105,81176.390,81290.535,2025-04-15 04:00:00
8645,1744693200000,1.18,1744693200,6363.665,4730.791,48.55,51.45,1744693200,2025-04-15 05:00:00,-114.225874,...,105.987922,641.097218,0.483340,193,3619.336579,81754.205,82181.340,81530.690,81533.130,2025-04-15 05:00:00


In [18]:
# Drop the object-dtype (string) columns.
# NOTE(review): the input is df_unified, not df_features or df_renamed, so the
# Step 2 features and the abbreviated column names never enter this cleaning
# chain — confirm this is intended.
df_no_str = str_remover.remove(df_unified)
df_no_str

Unnamed: 0,start_time,cg_longShortRatio,cg_time,cg_buy,cg_sell,cg_longAccount,cg_shortAccount,cg_t,cq_netflow_total,cq_funding_rates,...,gn_transactions_transfers_volume_whales_to_exchanges_sum,gn_transactions_transfers_volume_sth_to_exchanges_sum,gn_transactions_transfers_volume_to_exchanges_mean,gn_transactions_transfers_whales_to_exchanges_count,gn_transactions_transfers_volume_within_exchanges_sum,unified_close,unified_high,unified_low,unified_open,date
0,1713571200000,0.98,1713571200,9022.680,9139.190,59.37,40.63,1713571200,-10.350818,0.010000,...,17.748878,303.488061,0.280791,686,796.289480,64872.965,65332.395,64638.410,64841.175,2024-04-20 00:00:00
1,1713574800000,1.10,1713574800,6180.941,5526.508,60.52,39.48,1713574800,-66.924835,0.010000,...,26.063658,57.187703,0.084468,934,3381.814742,64712.235,65039.225,64376.130,64872.965,2024-04-20 01:00:00
2,1713578400000,1.19,1713578400,3679.545,3254.022,60.52,39.48,1713578400,-209.168108,0.010000,...,7.988589,43.403876,0.122227,474,1750.341869,64683.520,64983.365,64531.510,64712.235,2024-04-20 02:00:00
3,1713582000000,1.06,1713582000,2210.444,2239.444,60.80,39.20,1713582000,-118.699493,0.009197,...,37.715779,373.912846,0.513910,321,883.518175,64898.015,64957.820,64639.840,64683.515,2024-04-20 03:00:00
4,1713585600000,1.05,1713585600,3432.443,3542.689,60.70,39.30,1713585600,-29.468521,0.006602,...,22.510729,236.541572,0.286910,346,1599.332184,64955.665,65112.600,64793.585,64898.020,2024-04-20 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8642,1744682400000,1.43,1744682400,3169.000,2103.294,50.35,49.65,1744682400,82.324000,0.000097,...,5.619835,633.814736,0.353428,181,4118.908258,81150.855,81184.445,80592.675,80731.115,2025-04-15 02:00:00
8643,1744686000000,0.99,1744686000,3307.964,2994.788,49.92,50.08,1744686000,-508.505838,0.002832,...,39.690454,1074.282391,0.769706,189,3872.841206,81290.530,81317.840,81009.560,81150.855,2025-04-15 03:00:00
8644,1744689600000,1.06,1744689600,2387.399,2307.978,49.92,50.08,1744689600,683.257952,0.005216,...,72.989646,502.094219,0.410894,148,3809.753024,81533.125,81567.105,81176.390,81290.535,2025-04-15 04:00:00
8645,1744693200000,1.18,1744693200,6363.665,4730.791,48.55,51.45,1744693200,-114.225874,0.006364,...,105.987922,641.097218,0.483340,193,3619.336579,81754.205,82181.340,81530.690,81533.130,2025-04-15 05:00:00


In [19]:
# Drop duplicate rows and impute missing values.
df_cleaned = cleaner.clean(df_no_str)
df_cleaned

Unnamed: 0,start_time,cg_longShortRatio,cg_time,cg_buy,cg_sell,cg_longAccount,cg_shortAccount,cg_t,cq_netflow_total,cq_funding_rates,...,gn_transactions_transfers_volume_whales_to_exchanges_sum,gn_transactions_transfers_volume_sth_to_exchanges_sum,gn_transactions_transfers_volume_to_exchanges_mean,gn_transactions_transfers_whales_to_exchanges_count,gn_transactions_transfers_volume_within_exchanges_sum,unified_close,unified_high,unified_low,unified_open,date
0,1.713571e+12,0.98,1.713571e+09,9022.680,9139.190,59.37,40.63,1.713571e+09,-10.350818,0.010000,...,17.748878,303.488061,0.280791,686.0,796.289480,64872.965,65332.395,64638.410,64841.175,2024-04-20 00:00:00
1,1.713575e+12,1.10,1.713575e+09,6180.941,5526.508,60.52,39.48,1.713575e+09,-66.924835,0.010000,...,26.063658,57.187703,0.084468,934.0,3381.814742,64712.235,65039.225,64376.130,64872.965,2024-04-20 01:00:00
2,1.713578e+12,1.19,1.713578e+09,3679.545,3254.022,60.52,39.48,1.713578e+09,-209.168108,0.010000,...,7.988589,43.403876,0.122227,474.0,1750.341869,64683.520,64983.365,64531.510,64712.235,2024-04-20 02:00:00
3,1.713582e+12,1.06,1.713582e+09,2210.444,2239.444,60.80,39.20,1.713582e+09,-118.699493,0.009197,...,37.715779,373.912846,0.513910,321.0,883.518175,64898.015,64957.820,64639.840,64683.515,2024-04-20 03:00:00
4,1.713586e+12,1.05,1.713586e+09,3432.443,3542.689,60.70,39.30,1.713586e+09,-29.468521,0.006602,...,22.510729,236.541572,0.286910,346.0,1599.332184,64955.665,65112.600,64793.585,64898.020,2024-04-20 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8642,1.744682e+12,1.43,1.744682e+09,3169.000,2103.294,50.35,49.65,1.744682e+09,82.324000,0.000097,...,5.619835,633.814736,0.353428,181.0,4118.908258,81150.855,81184.445,80592.675,80731.115,2025-04-15 02:00:00
8643,1.744686e+12,0.99,1.744686e+09,3307.964,2994.788,49.92,50.08,1.744686e+09,-508.505838,0.002832,...,39.690454,1074.282391,0.769706,189.0,3872.841206,81290.530,81317.840,81009.560,81150.855,2025-04-15 03:00:00
8644,1.744690e+12,1.06,1.744690e+09,2387.399,2307.978,49.92,50.08,1.744690e+09,683.257952,0.005216,...,72.989646,502.094219,0.410894,148.0,3809.753024,81533.125,81567.105,81176.390,81290.535,2025-04-15 04:00:00
8645,1.744693e+12,1.18,1.744693e+09,6363.665,4730.791,48.55,51.45,1.744693e+09,-114.225874,0.006364,...,105.987922,641.097218,0.483340,193.0,3619.336579,81754.205,82181.340,81530.690,81533.130,2025-04-15 05:00:00


In [20]:
# Remove the rows flagged as outliers by the IsolationForest step.
df_no_outliers = outlier.remove(df_cleaned)
df_no_outliers

Unnamed: 0,start_time,cg_longShortRatio,cg_time,cg_buy,cg_sell,cg_longAccount,cg_shortAccount,cg_t,cq_netflow_total,cq_funding_rates,...,gn_transactions_transfers_volume_whales_to_exchanges_sum,gn_transactions_transfers_volume_sth_to_exchanges_sum,gn_transactions_transfers_volume_to_exchanges_mean,gn_transactions_transfers_whales_to_exchanges_count,gn_transactions_transfers_volume_within_exchanges_sum,unified_close,unified_high,unified_low,unified_open,date
0,1.713571e+12,0.98,1.713571e+09,9022.680,9139.190,59.37,40.63,1.713571e+09,-10.350818,0.010000,...,17.748878,303.488061,0.280791,686.0,796.289480,64872.965,65332.395,64638.410,64841.175,2024-04-20 00:00:00
1,1.713575e+12,1.10,1.713575e+09,6180.941,5526.508,60.52,39.48,1.713575e+09,-66.924835,0.010000,...,26.063658,57.187703,0.084468,934.0,3381.814742,64712.235,65039.225,64376.130,64872.965,2024-04-20 01:00:00
2,1.713578e+12,1.19,1.713578e+09,3679.545,3254.022,60.52,39.48,1.713578e+09,-209.168108,0.010000,...,7.988589,43.403876,0.122227,474.0,1750.341869,64683.520,64983.365,64531.510,64712.235,2024-04-20 02:00:00
3,1.713582e+12,1.06,1.713582e+09,2210.444,2239.444,60.80,39.20,1.713582e+09,-118.699493,0.009197,...,37.715779,373.912846,0.513910,321.0,883.518175,64898.015,64957.820,64639.840,64683.515,2024-04-20 03:00:00
4,1.713586e+12,1.05,1.713586e+09,3432.443,3542.689,60.70,39.30,1.713586e+09,-29.468521,0.006602,...,22.510729,236.541572,0.286910,346.0,1599.332184,64955.665,65112.600,64793.585,64898.020,2024-04-20 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8209,1.744682e+12,1.43,1.744682e+09,3169.000,2103.294,50.35,49.65,1.744682e+09,82.324000,0.000097,...,5.619835,633.814736,0.353428,181.0,4118.908258,81150.855,81184.445,80592.675,80731.115,2025-04-15 02:00:00
8210,1.744686e+12,0.99,1.744686e+09,3307.964,2994.788,49.92,50.08,1.744686e+09,-508.505838,0.002832,...,39.690454,1074.282391,0.769706,189.0,3872.841206,81290.530,81317.840,81009.560,81150.855,2025-04-15 03:00:00
8211,1.744690e+12,1.06,1.744690e+09,2387.399,2307.978,49.92,50.08,1.744690e+09,683.257952,0.005216,...,72.989646,502.094219,0.410894,148.0,3809.753024,81533.125,81567.105,81176.390,81290.535,2025-04-15 04:00:00
8212,1.744693e+12,1.18,1.744693e+09,6363.665,4730.791,48.55,51.45,1.744693e+09,-114.225874,0.006364,...,105.987922,641.097218,0.483340,193.0,3619.336579,81754.205,82181.340,81530.690,81533.130,2025-04-15 05:00:00


In [21]:
# Robust-scale the numeric columns ('date' and columns containing 'time' are skipped).
df_scaled = scaler.scale(df_no_outliers)
df_scaled

Unnamed: 0,start_time,cg_longShortRatio,cg_time,cg_buy,cg_sell,cg_longAccount,cg_shortAccount,cg_t,cq_netflow_total,cq_funding_rates,...,gn_transactions_transfers_volume_whales_to_exchanges_sum,gn_transactions_transfers_volume_sth_to_exchanges_sum,gn_transactions_transfers_volume_to_exchanges_mean,gn_transactions_transfers_whales_to_exchanges_count,gn_transactions_transfers_volume_within_exchanges_sum,unified_close,unified_high,unified_low,unified_open,date
0,1.713571e+12,0.000000,1.713571e+09,1.641255,1.671783,-0.066354,0.066354,-0.966250,0.114417,0.329767,...,-0.332057,-0.369954,-0.292316,3.192857,-0.926827,-0.728339,-0.712158,-0.725960,-0.729968,2024-04-20 00:00:00
1,1.713575e+12,0.444444,1.713575e+09,0.851569,0.663849,0.023419,-0.023419,-0.966020,-0.027367,0.329767,...,-0.296612,-0.722696,-0.893277,4.964286,-0.394475,-0.738678,-0.731037,-0.742762,-0.727924,2024-04-20 01:00:00
2,1.713578e+12,0.777778,1.713578e+09,0.156460,0.029828,0.023419,-0.023419,-0.965790,-0.383851,0.329767,...,-0.373665,-0.742436,-0.777695,1.678571,-0.730391,-0.740525,-0.734634,-0.732808,-0.738258,2024-04-20 02:00:00
3,1.713582e+12,0.296296,1.713582e+09,-0.251786,-0.253238,0.045277,-0.045277,-0.965561,-0.157122,0.220576,...,-0.246940,-0.269095,0.421279,0.585714,-0.908867,-0.726727,-0.736279,-0.725868,-0.740104,2024-04-20 03:00:00
4,1.713586e+12,0.259259,1.713586e+09,0.087793,0.110366,0.037471,-0.037471,-0.965331,0.066505,-0.132023,...,-0.311758,-0.465832,-0.273586,0.764286,-0.761483,-0.723019,-0.726312,-0.716019,-0.726313,2024-04-20 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8209,1.744682e+12,1.666667,1.744682e+09,0.014585,-0.291224,-0.770492,0.770492,1.017908,0.346674,-1.016244,...,-0.383763,0.103126,-0.069969,-0.414286,-0.242710,0.318785,0.308660,0.296070,0.291687,2025-04-15 02:00:00
8210,1.744686e+12,0.037037,1.744686e+09,0.053202,-0.042498,-0.804059,0.804059,1.018138,-1.134038,-0.644411,...,-0.238522,0.733946,1.204293,-0.357143,-0.293374,0.327770,0.317250,0.322776,0.318674,2025-04-15 03:00:00
8211,1.744690e+12,0.296296,1.744690e+09,-0.202612,-0.234117,-0.804059,0.804059,1.018368,1.852709,-0.320454,...,-0.096570,-0.085519,0.105939,-0.650000,-0.306364,0.343375,0.333302,0.333463,0.327655,2025-04-15 04:00:00
8212,1.744693e+12,0.740741,1.744693e+09,0.902346,0.441845,-0.911007,0.911007,1.018597,-0.145910,-0.164433,...,0.044099,0.113556,0.327701,-0.328571,-0.345570,0.357597,0.372856,0.356160,0.343253,2025-04-15 05:00:00


In [22]:
# Variance/correlation filtering is currently disabled: the scaled frame is
# passed through as an unchanged copy.
# df_filtered = var_filter.filter(df_scaled)
df_filtered = df_scaled.copy()

In [23]:
# Top-K feature selection is currently disabled: the previous frame is passed
# through as an unchanged copy.
# df_final = selector.select(df_filtered)
df_final = df_filtered.copy()

# Step 4: Save the final dataset

In [25]:
# Destination CSV (relative path).
out_path = Path("btc_features_output_2024_2025.csv")

try:
    # index=False: the row index is not written to the file
    df_final.to_csv(out_path, index=False)
    print(f"✅ Done — features saved to: {out_path}")
except PermissionError:
    # only a permission failure is reported here; any other error propagates
    print("❌ File is currently open or locked. Please close it and try again.")

# Display the 'date' column of the final frame.
df_final['date']

✅ Done — features saved to: btc_features_output_2024_2025.csv


0      2024-04-20 00:00:00
1      2024-04-20 01:00:00
2      2024-04-20 02:00:00
3      2024-04-20 03:00:00
4      2024-04-20 04:00:00
               ...        
8209   2025-04-15 02:00:00
8210   2025-04-15 03:00:00
8211   2025-04-15 04:00:00
8212   2025-04-15 05:00:00
8213   2025-04-15 06:00:00
Name: date, Length: 8214, dtype: datetime64[ns]