In [1]:
import sys
!{sys.executable} -m pip install --no-deps pygooglenews==0.1.2

!{sys.executable} -m pip install dateparser

!pip install numpy pandas yfinance scikit-learn tensorflow

!pip install git+https://github.com/aarigs/pandas-ta.git
    
!pip install xgboost lightgbm requests pytrends



Collecting git+https://github.com/aarigs/pandas-ta.git
  Cloning https://github.com/aarigs/pandas-ta.git to /tmp/pip-req-build-rlck92x9
  Running command git clone --filter=blob:none --quiet https://github.com/aarigs/pandas-ta.git /tmp/pip-req-build-rlck92x9
  Resolved https://github.com/aarigs/pandas-ta.git to commit 7a2a4210c71334929c482366d255d57eed5bdbfc
  Preparing metadata (setup.py) ... [?25ldone


In [4]:
import os
import numpy as np
import pandas as pd
import pandas_ta as ta
import yfinance as yf
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

START_DATE = "2020-01-01"
END_DATE = "2025-09-19"
TRAIN_END = "2024-12-31"
SENTIMENT_CSV = "daily_sentiment.csv"
TICKER = "ETH-USD"
LSTM_LOOKBACK = 7

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except Exception:
        pass

def load_sentiment(csv_path):
    df = pd.read_csv(csv_path)
    df['date'] = pd.to_datetime(df['date']).dt.normalize()
    df = df[df['date'] >= pd.to_datetime(START_DATE)].copy()
    sentiment_col = next((col for col in ['mean_sentiment', 'sentiment', 'label'] if col in df.columns), None)
    df['sentiment'] = pd.to_numeric(df[sentiment_col], errors='coerce').fillna(0.0) if sentiment_col else 0.0
    df = df[['date','sentiment']].drop_duplicates(subset=['date']).set_index('date')
    return df

def fetch_price(start_date, end_date, ticker=TICKER):
    df = yf.download(ticker, start=start_date, end=(pd.to_datetime(end_date)+pd.Timedelta(days=1)).strftime("%Y-%m-%d"), progress=False, auto_adjust=False)
    if isinstance(df.columns, pd.MultiIndex):
        df.columns=df.columns.get_level_values(0)
    df = df.reset_index()
    df = df.rename(columns={'Date':'date', 'Open': 'open', 'High':'high', 'Low':'low', 'Close':'close', 'Volume':'volume'})
    df['date'] = pd.to_datetime(df['date']).dt.normalize()
    df = df[['date', 'open', 'high', 'low', 'close', 'volume']]
    df = df[df['date'] >= pd.to_datetime(START_DATE)].reset_index(drop=True)
    return df

def prepare_data(price_df, sentiment_df):
    df = price_df.copy().set_index('date').sort_index()
    
    df.ta.rsi(close='close', length=14, append=True)
    df.ta.macd(close='close', fast=12, slow=26, signal=9, append=True)
    df.ta.bbands(close='close', length=20, append=True)
    df.ta.obv(close='close', volume=df['volume'], append=True)
    
    df['return'] = df['close'].pct_change()
    df['target'] = (df['return'] > 0).astype(int)
    df['volatility_14d'] = df['return'].rolling(window=14).std()
    
    s = sentiment_df.copy()
    s['sentiment_ma_3'] = s['sentiment'].rolling(window=3).mean()
    s['sentiment_ma_7'] = s['sentiment'].rolling(window=7).mean()
    
    merged = df.merge(s, how='left', left_index=True, right_index=True)
    merged['sentiment'] = merged['sentiment'].fillna(0.0)
    merged['sentiment_ma_3'] = merged['sentiment_ma_3'].fillna(0.0)
    merged['sentiment_ma_7'] = merged['sentiment_ma_7'].fillna(0.0)

    merged['sentiment_x_volatility'] = merged['sentiment'] * merged['volatility_14d']
    
    feature_columns_to_shift = [col for col in merged.columns if col != 'target']
    merged[feature_columns_to_shift] = merged[feature_columns_to_shift].shift(1)
        
    merged = merged.dropna().reset_index().rename(columns={'index':'date'})
    
    price_only_cols = [col for col in merged.columns if any(indicator in col for indicator in ['RSI', 'MACD', 'BB', 'OBV', 'volatility']) and 'sentiment' not in col]
    feature_cols = price_only_cols + [col for col in merged.columns if col.startswith('sentiment')]

    return merged, feature_cols, price_only_cols

def create_lstm_dataset(X, y, lookback=LSTM_LOOKBACK):
    X_seq, y_seq = [], []
    for i in range(len(X) - lookback):
        X_seq.append(X[i:i+lookback])
        y_seq.append(y[i+lookback])
    return np.array(X_seq), np.array(y_seq)

def train_classifiers_basic(X_train, y_train):
    logit = LogisticRegression(max_iter=500, solver='liblinear', random_state=42).fit(X_train, y_train)
    mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, early_stopping=True, random_state=42, n_iter_no_change=20).fit(X_train, y_train)
    svc = SVC(kernel='rbf', C=1.0, probability=True, random_state=42).fit(X_train, y_train)
    lgbm = LGBMClassifier(random_state=42, device='gpu').fit(X_train, y_train)
    return {'logit':logit, 'mlp':mlp, 'svc':svc, 'lgbm':lgbm}

def train_lstm_classifier(X_train_seq, y_train_seq, units=64, epochs=30, batch_size=32):
    model = Sequential([
        Input(shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
        LSTM(units, activation='tanh', return_sequences=True),
        Dropout(0.3),
        LSTM(units // 2, activation='tanh'),
        Dropout(0.3),
        Dense(units // 4, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train_seq, y_train_seq, epochs=epochs, batch_size=batch_size, verbose=0, validation_split=0.15, callbacks=[early_stopping])
    return model

def predict_lstm_classifier(model, X_seq):
    p = model.predict(X_seq, verbose=0).flatten()
    return (p >= 0.5).astype(int)

def build_casebase(pred_matrix, y_train):
    return {'preds': np.array(pred_matrix), 'y': np.array(y_train)}

def cbr_classify(casebase, query_vec, k):
    n_neighbors = min(k, len(casebase['preds']))
    if n_neighbors == 0: return 0
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean').fit(casebase['preds'])
    _, idx = nbrs.kneighbors([query_vec])
    neigh = casebase['y'][idx.flatten()]
    return Counter(neigh).most_common(1)[0][0]

def mde(y_true, y_pred):
    return 1.0 - accuracy_score(y_true, y_pred)

def evaluate_models(train_df, test_df, feature_cols, price_only_cols, best_xgb_model):
    if len(test_df) < LSTM_LOOKBACK:
        return {'error':'not enough test data'}

    y_clf_train = train_df['target'].values
    y_clf_test = test_df['target'].values
    
    scaler_p = StandardScaler().fit(train_df[price_only_cols])
    Xp_train_scaled = scaler_p.transform(train_df[price_only_cols])
    Xp_test_scaled = scaler_p.transform(test_df[price_only_cols])

    scaler_n = StandardScaler().fit(train_df[feature_cols])
    Xn_train_scaled = scaler_n.transform(train_df[feature_cols])
    Xn_test_scaled = scaler_n.transform(test_df[feature_cols])

    price_models = train_classifiers_basic(Xp_train_scaled, y_clf_train)
    news_models = train_classifiers_basic(Xn_train_scaled, y_clf_train)
    news_models['xgb'] = best_xgb_model
    
    xgb_price_only = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, tree_method='hist', device='cuda')
    xgb_price_only.fit(Xp_train_scaled, y_clf_train)
    price_models['xgb'] = xgb_price_only

    Xp_train_seq, yp_train_seq = create_lstm_dataset(Xp_train_scaled, y_clf_train)
    Xp_test_seq, yp_test_seq = create_lstm_dataset(Xp_test_scaled, y_clf_test)
    Xn_train_seq, yn_train_seq = create_lstm_dataset(Xn_train_scaled, y_clf_train)
    Xn_test_seq, yn_test_seq = create_lstm_dataset(Xn_test_scaled, y_clf_test)

    lstm_price = train_lstm_classifier(Xp_train_seq, yp_train_seq)
    p_lstm_test = predict_lstm_classifier(lstm_price, Xp_test_seq)
    lstm_news = train_lstm_classifier(Xn_train_seq, yn_train_seq)
    n_lstm_test = predict_lstm_classifier(lstm_news, Xn_test_seq)
    
    p_logit_test = price_models['logit'].predict(Xp_test_scaled)
    p_mlp_test = price_models['mlp'].predict(Xp_test_scaled)
    p_svc_test = price_models['svc'].predict(Xp_test_scaled)
    p_lgbm_test = price_models['lgbm'].predict(Xp_test_scaled)
    p_xgb_test = price_models['xgb'].predict(Xp_test_scaled)
    
    n_logit_test = news_models['logit'].predict(Xn_test_scaled)
    n_mlp_test = news_models['mlp'].predict(Xn_test_scaled)
    n_svc_test = news_models['svc'].predict(Xn_test_scaled)
    n_lgbm_test = news_models['lgbm'].predict(Xn_test_scaled)
    n_xgb_test = news_models['xgb'].predict(Xn_test_scaled)
    
    results = {}
    results['price_test_acc'] = {
        'P_Logit': accuracy_score(y_clf_test, p_logit_test), 'P_ANN': accuracy_score(y_clf_test, p_mlp_test),
        'P_SVM': accuracy_score(y_clf_test, p_svc_test), 'P_LGBM': accuracy_score(y_clf_test, p_lgbm_test),
        'P_XGB': accuracy_score(y_clf_test, p_xgb_test), 'P_LSTM': accuracy_score(yp_test_seq, p_lstm_test)
    }
    results['news_test_acc'] = {
        'N_Logit': accuracy_score(y_clf_test, n_logit_test), 'N_ANN': accuracy_score(y_clf_test, n_mlp_test),
        'N_SVM': accuracy_score(y_clf_test, n_svc_test), 'N_LGBM': accuracy_score(y_clf_test, n_lgbm_test),
        'N_XGB': accuracy_score(y_clf_test, n_xgb_test), 'N_LSTM': accuracy_score(yn_test_seq, n_lstm_test)
    }
    
    p_train_preds = np.column_stack([m.predict(Xp_train_scaled) for m in price_models.values()])
    n_train_preds = np.column_stack([m.predict(Xn_train_scaled) for m in news_models.values()])
    p_lstm_train = predict_lstm_classifier(lstm_price, Xp_train_seq)
    n_lstm_train = predict_lstm_classifier(lstm_news, Xn_train_seq)
    
    p_train_preds_cbr = np.column_stack([p_train_preds[LSTM_LOOKBACK:], p_lstm_train])
    n_train_preds_cbr = np.column_stack([n_train_preds[LSTM_LOOKBACK:], n_lstm_train])

    p_test_preds_cbr = np.column_stack([p_logit_test[LSTM_LOOKBACK:], p_mlp_test[LSTM_LOOKBACK:], p_svc_test[LSTM_LOOKBACK:], p_lgbm_test[LSTM_LOOKBACK:], p_xgb_test[LSTM_LOOKBACK:], p_lstm_test])
    n_test_preds_cbr = np.column_stack([n_logit_test[LSTM_LOOKBACK:], n_mlp_test[LSTM_LOOKBACK:], n_svc_test[LSTM_LOOKBACK:], n_lgbm_test[LSTM_LOOKBACK:], n_xgb_test[LSTM_LOOKBACK:], n_lstm_test])
    
    p_case = build_casebase(p_train_preds_cbr, yp_train_seq)
    n_case = build_casebase(n_train_preds_cbr, yn_train_seq)
    
    best_k_p, best_acc_p, best_preds_p = 1, 0.0, None
    for k in range(1, 12, 2):
        ypreds = [cbr_classify(p_case, p_test_preds_cbr[i], k) for i in range(len(p_test_preds_cbr))]
        acc = accuracy_score(yp_test_seq, ypreds)
        if acc > best_acc_p:
            best_acc_p, best_k_p, best_preds_p = acc, k, ypreds
    
    best_k_n, best_acc_n, best_preds_n = 1, 0.0, None
    for k in range(1, 12, 2):
        ypreds = [cbr_classify(n_case, n_test_preds_cbr[i], k) for i in range(len(n_test_preds_cbr))]
        acc = accuracy_score(yn_test_seq, ypreds)
        if acc > best_acc_n:
            best_acc_n, best_k_n, best_preds_n = acc, k, ypreds

    results['price_cbr'] = {'k': best_k_p, 'accuracy': best_acc_p, 'MDE': mde(yp_test_seq, np.array(best_preds_p)) if best_preds_p is not None else None}
    results['news_cbr'] = {'k': best_k_n, 'accuracy': best_acc_n, 'MDE': mde(yn_test_seq, np.array(best_preds_n)) if best_preds_n is not None else None}
    
    df_price = pd.DataFrame.from_dict(results['price_test_acc'], orient='index', columns=['Accuracy'])
    df_price['MDE'] = 1 - df_price['Accuracy']
    df_news = pd.DataFrame.from_dict(results['news_test_acc'], orient='index', columns=['Accuracy'])
    df_news['MDE'] = 1 - df_news['Accuracy']

    print("PRICE TEST (Selected Features)")
    print(df_price.to_string())
    print("\nNEWS TEST (Selected Features)")
    print(df_news.to_string())
    print(f"\nPRICE CBR k= {results['price_cbr']['k']}  accuracy= {results['price_cbr']['accuracy']:.6f}  MDE= {results['price_cbr']['MDE']:.6f}")
    print(f"NEWS CBR k= {results['news_cbr']['k']}  accuracy= {results['news_cbr']['accuracy']:.6f}  MDE= {results['news_cbr']['MDE']:.6f}")
    
    return results

def main():
    sentiment_df = load_sentiment(SENTIMENT_CSV)
    price_df = fetch_price(START_DATE, END_DATE)
    data_df, all_feature_cols, all_price_cols = prepare_data(price_df, sentiment_df)

    train_df = data_df[data_df['date'] <= pd.to_datetime(TRAIN_END)].copy()
    test_df = data_df[data_df['date'] > pd.to_datetime(TRAIN_END)].copy()

    X_train_all = train_df[all_feature_cols]
    y_train = train_df['target']
    
    temp_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, tree_method='hist', device='cuda')
    temp_xgb.fit(X_train_all, y_train)
    
    importances = pd.DataFrame({
        'feature': all_feature_cols,
        'importance': temp_xgb.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("--- Feature Importances ---")
    print(importances)
    
    selected_features_news = importances[importances['importance'] > 0.01]['feature'].tolist()
    if not selected_features_news:
        selected_features_news = importances.head(10)['feature'].tolist()
        
    selected_features_price = [f for f in selected_features_news if f in all_price_cols]
    
    print(f"\nSelected News Features ({len(selected_features_news)}): {selected_features_news}")
    print(f"Selected Price Features ({len(selected_features_price)}): {selected_features_price}\n")
    
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.05, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    
    xgb_grid = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, tree_method='hist', device='cuda')
    grid_search = GridSearchCV(estimator=xgb_grid, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1, scoring='accuracy')
    
    scaler_for_grid = StandardScaler().fit(train_df[selected_features_news])
    X_train_scaled_for_grid = scaler_for_grid.transform(train_df[selected_features_news])
    
    print("--- Starting GridSearchCV for XGBoost ---")
    grid_search.fit(X_train_scaled_for_grid, y_train)
    print("--- GridSearchCV Finished ---")
    
    best_xgb = grid_search.best_estimator_
    print(f"\nBest XGBoost Params: {grid_search.best_params_}")
    print(f"Best XGBoost Accuracy on Train Set: {grid_search.best_score_:.4f}\n")

    results = evaluate_models(train_df, test_df, selected_features_news, selected_features_price, best_xgb)
    return results

if __name__ == "__main__":
    results = main()

--- Feature Importances ---
                   feature  importance
7                      OBV    0.085548
11          sentiment_ma_7    0.084113
12  sentiment_x_volatility    0.082340
5                   BBM_20    0.081892
0                   RSI_14    0.081572
10          sentiment_ma_3    0.078807
6                   BBU_20    0.075885
3            MACDS_12_26_9    0.074026
8           volatility_14d    0.073414
4                   BBL_20    0.072884
2            MACDH_12_26_9    0.072358
1             MACD_12_26_9    0.070011
9                sentiment    0.067150

Selected News Features (13): ['OBV', 'sentiment_ma_7', 'sentiment_x_volatility', 'BBM_20', 'RSI_14', 'sentiment_ma_3', 'BBU_20', 'MACDS_12_26_9', 'volatility_14d', 'BBL_20', 'MACDH_12_26_9', 'MACD_12_26_9', 'sentiment']
Selected Price Features (9): ['OBV', 'BBM_20', 'RSI_14', 'BBU_20', 'MACDS_12_26_9', 'volatility_14d', 'BBL_20', 'MACDH_12_26_9', 'MACD_12_26_9']

--- Starting GridSearchCV for XGBoost ---
Fitting 3 folds f

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


--- GridSearchCV Finished ---

Best XGBoost Params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Best XGBoost Accuracy on Train Set: 0.5078

[LightGBM] [Info] Number of positive: 945, number of negative: 859
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 1804, number of used features: 9
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A6000, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (0.02 MB) transferred to GPU in 0.000919 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.523836 -> initscore=0.095416
[LightGBM] [Info] Start training from score 0.095416
[LightGBM] [Info] Number of positive: 945, number of negative: 859
[LightGBM] [Info] 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
I0000 00:00:1758698408.819754 3909568 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 44923 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:1f:00.0, compute capability: 8.6
2025-09-24 16:20:11.589618: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91300
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


PRICE TEST (Selected Features)
         Accuracy       MDE
P_Logit  0.507634  0.492366
P_ANN    0.557252  0.442748
P_SVM    0.511450  0.488550
P_LGBM   0.526718  0.473282
P_XGB    0.496183  0.503817
P_LSTM   0.525490  0.474510

NEWS TEST (Selected Features)
         Accuracy       MDE
N_Logit  0.534351  0.465649
N_ANN    0.503817  0.496183
N_SVM    0.522901  0.477099
N_LGBM   0.526718  0.473282
N_XGB    0.503817  0.496183
N_LSTM   0.537255  0.462745

PRICE CBR k= 7  accuracy= 0.513725  MDE= 0.486275
NEWS CBR k= 3  accuracy= 0.533333  MDE= 0.466667


In [4]:
import pandas as pd
import pandas_ta as ta
import yfinance as yf
import numpy as np
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import requests
import time
from datetime import datetime, timedelta
from pytrends.request import TrendReq
import warnings
warnings.filterwarnings('ignore')

START_DATE = "2020-01-01"
END_DATE = "2025-09-19"
TRAIN_END = "2024-12-31"
SENTIMENT_CSV = "daily_sentiment.csv"
TICKER = "ETH-USD"
LSTM_LOOKBACK = 7

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except Exception:
        pass

def load_sentiment(csv_path):
    try:
        df = pd.read_csv(csv_path)
        df['date'] = pd.to_datetime(df['date']).dt.normalize()
        df = df[df['date'] >= pd.to_datetime(START_DATE)].copy()
        sentiment_col = next((col for col in ['mean_sentiment', 'sentiment', 'label'] if col in df.columns), None)
        df['sentiment'] = pd.to_numeric(df[sentiment_col], errors='coerce').fillna(0.0) if sentiment_col else 0.0
        df = df[['date','sentiment']].drop_duplicates(subset=['date']).set_index('date')
        return df
    except FileNotFoundError:
        print(f"Warning: Sentiment file '{csv_path}' not found. Proceeding without sentiment data.")
        return pd.DataFrame(index=pd.to_datetime(pd.date_range(START_DATE, END_DATE)), columns=['sentiment']).fillna(0.0)

def fetch_fear_greed_index():
    try:
        url = "https://api.alternative.me/fng/?limit=2000"
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        data = response.json()
        
        df_list = []
        for item in data.get('data', []):
            date = pd.to_datetime(item['timestamp'], unit='s').normalize()
            value = float(item['value'])
            df_list.append({'date': date, 'fear_greed': value})
        
        if not df_list:
            return pd.DataFrame()
            
        df = pd.DataFrame(df_list)
        df = df[df['date'] >= pd.to_datetime(START_DATE)]
        df = df.set_index('date').sort_index()
        return df
    except requests.RequestException:
        print("Warning: Could not fetch Fear & Greed Index.")
        return pd.DataFrame()

def fetch_defi_data():
    try:
        url = "https://api.llama.fi/charts"
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        data = response.json()
        
        df_list = []
        for item in data:
            date = pd.to_datetime(item['date'], unit='s').normalize()
            tvl = float(item.get('totalLiquidityUSD', 0))
            df_list.append({'date': date, 'defi_tvl': tvl})
            
        if not df_list:
            return pd.DataFrame()
            
        df = pd.DataFrame(df_list)
        df = df[df['date'] >= pd.to_datetime(START_DATE)]
        df = df.set_index('date').sort_index()
        return df
    except requests.RequestException:
        print("Warning: Could not fetch DeFi TVL data.")
        return pd.DataFrame()

def fetch_social_metrics():
    try:
        pytrends = TrendReq(hl='en-US', tz=360)
        keyword = "Ethereum"
        timeframe = f'{START_DATE} {END_DATE}'
        
        pytrends.build_payload([keyword], cat=0, timeframe=timeframe, geo='', gprop='')
        google_trends_df = pytrends.interest_over_time()

        if google_trends_df.empty or keyword not in google_trends_df.columns:
            print("Warning: Could not fetch Google Trends data.")
            return pd.DataFrame()

        google_trends_df = google_trends_df.rename(columns={keyword: 'google_trends'})
        google_trends_df = google_trends_df[['google_trends']]
        google_trends_df = google_trends_df.resample('D').interpolate(method='linear')
        return google_trends_df
    except Exception as e:
        print(f"Warning: An error occurred while fetching Google Trends data: {e}")
        return pd.DataFrame()

def fetch_traditional_markets():
    tickers = ['^GSPC', '^DJI', '^IXIC', '^VIX', 'GLD', 'DX-Y.NYB']
    market_dfs = []
    
    for ticker in tickers:
        try:
            data = yf.download(ticker, start=START_DATE, end=END_DATE, progress=False)
            if not data.empty:
                clean_ticker = ticker.replace('^', '').replace('-', '_').replace('.', '_')
                temp_df = pd.DataFrame(index=data.index)
                temp_df[f'{clean_ticker}_close'] = data['Close']
                temp_df[f'{clean_ticker}_return'] = data['Close'].pct_change()
                temp_df[f'{clean_ticker}_vol'] = data['Close'].pct_change().rolling(14).std()
                market_dfs.append(temp_df)
            time.sleep(0.1)
        except Exception:
            print(f"Warning: Could not fetch data for traditional market ticker {ticker}.")
            continue
    
    if market_dfs:
        df = pd.concat(market_dfs, axis=1)
        df.index = pd.to_datetime(df.index).normalize()
        return df
    return pd.DataFrame()

def fetch_price(start_date, end_date, ticker=TICKER):
    df = yf.download(ticker, start=start_date, end=(pd.to_datetime(end_date)+pd.Timedelta(days=1)).strftime("%Y-%m-%d"), progress=False, auto_adjust=False)
    if isinstance(df.columns, pd.MultiIndex):
        df.columns=df.columns.get_level_values(0)
    df = df.reset_index()
    df = df.rename(columns={'Date':'date', 'Open': 'open', 'High':'high', 'Low':'low', 'Close':'close', 'Volume':'volume'})
    df['date'] = pd.to_datetime(df['date']).dt.normalize()
    df = df[['date', 'open', 'high', 'low', 'close', 'volume']]
    df = df[df['date'] >= pd.to_datetime(START_DATE)].reset_index(drop=True)
    return df

def add_advanced_technical_indicators(df):
    df.ta.rsi(close='close', length=14, append=True)
    df.ta.rsi(close='close', length=21, append=True)
    df.ta.macd(close='close', fast=12, slow=26, signal=9, append=True)
    df.ta.bbands(close='close', length=20, append=True)
    df.ta.obv(close='close', volume=df['volume'], append=True)
    df.ta.stoch(high='high', low='low', close='close', append=True)
    df.ta.willr(high='high', low='low', close='close', append=True)
    df.ta.cci(high='high', low='low', close='close', append=True)
    df.ta.adx(high='high', low='low', close='close', append=True)
    df.ta.atr(high='high', low='low', close='close', append=True)
    
    for period in [5, 10, 20, 50]:
        df[f'sma_{period}'] = df['close'].rolling(window=period).mean()
        df[f'ema_{period}'] = df['close'].ewm(span=period).mean()
        df[f'price_vs_sma_{period}'] = df['close'] / df[f'sma_{period}'] - 1
        df[f'price_vs_ema_{period}'] = df['close'] / df[f'ema_{period}'] - 1
    
    df['price_momentum_5'] = df['close'] / df['close'].shift(5) - 1
    df['price_momentum_10'] = df['close'] / df['close'].shift(10) - 1
    df['volume_sma_20'] = df['volume'].rolling(window=20).mean()
    df['volume_ratio'] = df['volume'] / df['volume_sma_20']
    df['hl_ratio'] = (df['high'] - df['low']) / df['close']
    df['oc_ratio'] = abs(df['open'] - df['close']) / df['close']
    
    return df

def prepare_data(price_df, sentiment_df):
    df = price_df.copy().set_index('date').sort_index()
    
    df = add_advanced_technical_indicators(df)
    
    df['return'] = df['close'].pct_change()
    df['target'] = (df['return'] > 0).astype(int)
    df['volatility_7d'] = df['return'].rolling(window=7).std()
    df['volatility_14d'] = df['return'].rolling(window=14).std()
    df['volatility_30d'] = df['return'].rolling(window=30).std()
    
    fear_greed_df = fetch_fear_greed_index()
    if not fear_greed_df.empty:
        df = df.merge(fear_greed_df, how='left', left_index=True, right_index=True)
        df['fear_greed'] = df['fear_greed'].fillna(method='ffill').fillna(50)
        df['fear_greed_ma_3'] = df['fear_greed'].rolling(window=3).mean()
        df['fear_greed_ma_7'] = df['fear_greed'].rolling(window=7).mean()
        df['fear_greed_change'] = df['fear_greed'].diff()
    
    defi_df = fetch_defi_data()
    if not defi_df.empty:
        defi_df['defi_tvl_change'] = defi_df['defi_tvl'].pct_change()
        defi_df['defi_tvl_ma_7'] = defi_df['defi_tvl'].rolling(window=7).mean()
        df = df.merge(defi_df, how='left', left_index=True, right_index=True)
        df['defi_tvl'] = df['defi_tvl'].fillna(method='ffill')
        df['defi_tvl_change'] = df['defi_tvl_change'].fillna(0)
        df['defi_tvl_ma_7'] = df['defi_tvl_ma_7'].fillna(method='ffill')

    social_df = fetch_social_metrics()
    if not social_df.empty:
        for col in ['google_trends']:
            social_df[f'{col}_ma_3'] = social_df[col].rolling(window=3).mean()
            social_df[f'{col}_ma_7'] = social_df[col].rolling(window=7).mean()
            social_df[f'{col}_change'] = social_df[col].pct_change()
        df = df.merge(social_df, how='left', left_index=True, right_index=True)
    
    market_df = fetch_traditional_markets()
    if not market_df.empty:
        df = df.merge(market_df, how='left', left_index=True, right_index=True)

    s = sentiment_df.copy()
    s['sentiment_ma_3'] = s['sentiment'].rolling(window=3).mean()
    s['sentiment_ma_7'] = s['sentiment'].rolling(window=7).mean()
    s['sentiment_change'] = s['sentiment'].diff()
    
    merged = df.merge(s, how='left', left_index=True, right_index=True)
    
    merged['sentiment_x_volatility'] = merged['sentiment'] * merged['volatility_14d']
    if 'fear_greed' in merged.columns and 'volume_ratio' in merged.columns:
        merged['fear_greed_x_volume'] = merged['fear_greed'] / 100 * merged['volume_ratio']
    
    feature_columns_to_shift = [col for col in merged.columns if col != 'target']
    merged[feature_columns_to_shift] = merged[feature_columns_to_shift].shift(1)
        
    merged = merged.replace([np.inf, -np.inf], np.nan).dropna()
    merged = merged.reset_index().rename(columns={'index':'date'})
    
    price_technical_cols = [col for col in merged.columns if any(indicator in col.lower() for indicator in 
                           ['rsi', 'macd', 'bb', 'obv', 'stoch', 'willr', 'cci', 'adx', 'atr', 'sma', 'ema', 
                            'momentum', 'volume', 'volatility', 'ratio', 'price_vs', 'hl_ratio', 'oc_ratio']) and 
                           not any(external in col.lower() for external in 
                           ['sentiment', 'fear', 'greed', 'defi', 'google', 'gspc', 'dji', 'ixic', 'vix', 'gld', 'dx'])]
    
    alternative_data_cols = [col for col in merged.columns if any(indicator in col.lower() for external in 
                            ['sentiment', 'fear', 'greed', 'defi', 'google'] for indicator in [external])]
    
    market_data_cols = [col for col in merged.columns if any(market in col.lower() for market in 
                       ['gspc', 'dji', 'ixic', 'vix', 'gld', 'dx'])]
    
    all_feature_cols = price_technical_cols + alternative_data_cols + market_data_cols
    
    return merged, all_feature_cols, price_technical_cols

def remove_correlated_features(df, feature_cols, threshold=0.8):
    corr_matrix = df[feature_cols].corr().abs()
    upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    
    to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > threshold)]
    return [col for col in feature_cols if col not in to_drop]

def create_lstm_dataset(X, y, lookback=LSTM_LOOKBACK):
    X_seq, y_seq = [], []
    for i in range(len(X) - lookback):
        X_seq.append(X[i:i+lookback])
        y_seq.append(y[i+lookback])
    return np.array(X_seq), np.array(y_seq)

def train_classifiers_basic(X_train, y_train):
    logit = LogisticRegression(max_iter=500, solver='liblinear', random_state=42).fit(X_train, y_train)
    mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, early_stopping=True, random_state=42, n_iter_no_change=20).fit(X_train, y_train)
    svc = SVC(kernel='rbf', C=1.0, probability=True, random_state=42).fit(X_train, y_train)
    lgbm = LGBMClassifier(random_state=42, device='gpu', verbose=-1).fit(X_train, y_train)
    return {'logit':logit, 'mlp':mlp, 'svc':svc, 'lgbm':lgbm}

def train_lstm_classifier(X_train_seq, y_train_seq, units=64, epochs=30, batch_size=32):
    model = Sequential([
        Input(shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
        LSTM(units, activation='tanh', return_sequences=True),
        Dropout(0.3),
        LSTM(units // 2, activation='tanh'),
        Dropout(0.3),
        Dense(units // 4, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train_seq, y_train_seq, epochs=epochs, batch_size=batch_size, verbose=0, validation_split=0.15, callbacks=[early_stopping])
    return model

def predict_lstm_classifier(model, X_seq):
    p = model.predict(X_seq, verbose=0).flatten()
    return (p >= 0.5).astype(int)

def build_casebase(pred_matrix, y_train):
    return {'preds': np.array(pred_matrix), 'y': np.array(y_train)}

def cbr_classify(casebase, query_vec, k):
    n_neighbors = min(k, len(casebase['preds']))
    if n_neighbors == 0: return 0
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean').fit(casebase['preds'])
    _, idx = nbrs.kneighbors([query_vec])
    neigh = casebase['y'][idx.flatten()]
    return Counter(neigh).most_common(1)[0][0]

def mde(y_true, y_pred):
    return 1.0 - accuracy_score(y_true, y_pred)

def optimize_xgb_model(X_train, y_train):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.05, 0.1, 0.2],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9]
    }
    
    grid_search = GridSearchCV(
        estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, tree_method='hist', device='cuda'),
        param_grid=param_grid,
        scoring='accuracy',
        cv=3,
        n_jobs=-1,
        verbose=1
    )
    
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

def evaluate_feature_combinations(train_df, test_df):
    results = {}
    
    price_technical_cols = [col for col in train_df.columns if any(indicator in col.lower() for indicator in 
                           ['rsi', 'macd', 'bb', 'obv', 'stoch', 'willr', 'cci', 'adx', 'atr', 'sma', 'ema', 
                            'momentum', 'volume', 'volatility', 'ratio', 'price_vs', 'hl_ratio', 'oc_ratio']) and 
                           col != 'target']
    
    alternative_data_cols = [col for col in train_df.columns if any(indicator in col.lower() for external in 
                            ['sentiment', 'fear', 'greed', 'defi', 'google'] for indicator in [external])]
    
    market_data_cols = [col for col in train_df.columns if any(market in col.lower() for market in 
                       ['gspc', 'dji', 'ixic', 'vix', 'gld', 'dx'])]
    
    all_features = price_technical_cols + alternative_data_cols + market_data_cols
    
    feature_combinations = {
        'price_only': price_technical_cols,
        'price_uncorr': remove_correlated_features(train_df, price_technical_cols, 0.8),
        'enhanced': all_features,
        'enhanced_uncorr': remove_correlated_features(train_df, all_features, 0.8)
    }
    
    for combo_name, features in feature_combinations.items():
        if len(features) == 0:
            continue
            
        print(f"\nEvaluating {combo_name} with {len(features)} features...")
        
        X_train = train_df[features]
        y_train = train_df['target']
        X_test = test_df[features]
        y_test = test_df['target']
        
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        best_xgb, best_params = optimize_xgb_model(X_train_scaled, y_train)
        basic_models = train_classifiers_basic(X_train_scaled, y_train)
        
        combo_results = {}
        
        for model_name, model in basic_models.items():
            y_pred = model.predict(X_test_scaled)
            combo_results[f'{model_name}'] = accuracy_score(y_test, y_pred)
        
        xgb_pred = best_xgb.predict(X_test_scaled)
        combo_results['xgb_optimized'] = accuracy_score(y_test, xgb_pred)
        
        if len(X_train_scaled) >= LSTM_LOOKBACK:
            X_train_seq, y_train_seq = create_lstm_dataset(X_train_scaled, y_train.values)
            X_test_seq, y_test_seq = create_lstm_dataset(X_test_scaled, y_test.values)
            
            if len(X_train_seq) > 0 and len(X_test_seq) > 0:
                lstm_model = train_lstm_classifier(X_train_seq, y_train_seq)
                lstm_pred = predict_lstm_classifier(lstm_model, X_test_seq)
                combo_results['lstm'] = accuracy_score(y_test_seq, lstm_pred)
        
        results[combo_name] = {
            'features': features,
            'num_features': len(features),
            'accuracies': combo_results,
            'best_xgb_params': best_params
        }
    
    return results

def main():
    print("Loading sentiment data...")
    sentiment_df = load_sentiment(SENTIMENT_CSV)
    
    print("Fetching price data...")
    price_df = fetch_price(START_DATE, END_DATE)
    
    print("Preparing enhanced dataset with alternative data sources...")
    data_df, all_feature_cols, price_cols = prepare_data(price_df, sentiment_df)

    print(f"Total features created: {len(all_feature_cols)}")
    print(f"Technical indicators: {len(price_cols)}")
    print(f"Alternative & Market data features: {len(all_feature_cols) - len(price_cols)}")

    train_df = data_df[data_df['date'] <= pd.to_datetime(TRAIN_END)].copy()
    test_df = data_df[data_df['date'] > pd.to_datetime(TRAIN_END)].copy()
    
    if train_df.empty or test_df.empty:
        print("Error: Not enough data for training or testing after processing. Exiting.")
        return
    
    print("\n--- COMPREHENSIVE MODEL COMPARISON ---")
    results = evaluate_feature_combinations(train_df, test_df)
    
    print("\n--- FINAL COMPARISON RESULTS ---")
    comparison_df = []
    for combo_name, combo_results in results.items():
        for model_name, accuracy in combo_results['accuracies'].items():
            comparison_df.append({
                'Feature_Set': combo_name,
                'Model': model_name,
                'Num_Features': combo_results['num_features'],
                'Accuracy': accuracy,
                'MDE': 1 - accuracy
            })
    
    comparison_df = pd.DataFrame(comparison_df)
    print(comparison_df.pivot_table(index=['Feature_Set', 'Num_Features'], columns='Model', values='Accuracy').round(4))
    
    best_combo = comparison_df.loc[comparison_df['Accuracy'].idxmax()]
    print(f"\nBEST PERFORMING COMBINATION:")
    print(f"Feature Set: {best_combo['Feature_Set']}")
    print(f"Model: {best_combo['Model']}")
    print(f"Number of Features: {best_combo['Num_Features']}")
    print(f"Accuracy: {best_combo['Accuracy']:.4f}")
    print(f"MDE: {best_combo['MDE']:.4f}")
    
    price_only_best = comparison_df[comparison_df['Feature_Set'] == 'price_only']['Accuracy'].max()
    enhanced_best = comparison_df[comparison_df['Feature_Set'].str.contains('enhanced')]['Accuracy'].max()
    
    print(f"\nFEATURE IMPACT ANALYSIS:")
    print(f"Best Price-Only Model Accuracy: {price_only_best:.4f}")
    print(f"Best Enhanced Model Accuracy: {enhanced_best:.4f}")
    print(f"Improvement with Alternative Data: {enhanced_best - price_only_best:.4f} ({((enhanced_best - price_only_best) / price_only_best * 100):.2f}%)")
    
    return results

if __name__ == "__main__":
    final_results = main()
    print("\nProcess finished.")

Loading sentiment data...
Fetching price data...
Preparing enhanced dataset with alternative data sources...
Total features created: 78
Technical indicators: 42
Alternative & Market data features: 36

--- COMPREHENSIVE MODEL COMPARISON ---

Evaluating price_only with 45 features...
Fitting 3 folds for each of 243 candidates, totalling 729 fits


I0000 00:00:1758643171.517736 3904924 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 45829 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:1f:00.0, compute capability: 8.6
2025-09-24 00:59:34.295670: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91300



Evaluating price_uncorr with 12 features...
Fitting 3 folds for each of 243 candidates, totalling 729 fits

Evaluating enhanced with 81 features...
Fitting 3 folds for each of 243 candidates, totalling 729 fits

Evaluating enhanced_uncorr with 25 features...
Fitting 3 folds for each of 243 candidates, totalling 729 fits

--- FINAL COMPARISON RESULTS ---
Model                           lgbm   logit    lstm     mlp     svc  \
Feature_Set     Num_Features                                           
enhanced        81            0.4819  0.5542  0.5409  0.5904  0.5241   
enhanced_uncorr 25            0.5060  0.5422  0.5346  0.5181  0.5241   
price_only      45            0.5422  0.5422  0.5346  0.5181  0.5000   
price_uncorr    12            0.5000  0.5241  0.5031  0.5241  0.4880   

Model                         xgb_optimized  
Feature_Set     Num_Features                 
enhanced        81                   0.5542  
enhanced_uncorr 25                   0.5181  
price_only      45        