In [1]:
import pandas as pd
import data_engineering as de
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, mean_squared_error
import numpy as np
from joblib import Parallel, delayed
import simulation
pd.options.mode.chained_assignment = None

In [2]:
# Load the per-ticker data. The training cells below iterate this object with
# `.items()` and treat every entry as a (ticker, DataFrame) pair.
# NOTE(review): the exact columns returned by `de.separate_by_stock()` are not
# visible here; the pipeline reads finvader_*, Open/High/Low/Close/Volume,
# the article-count columns and `y` -- confirm against the helper module.
ticker_frames = de.separate_by_stock()

In [3]:
# Feature engineering

def add_technical_indicators(df):
    """Attach price-based indicator columns to ``df`` and return it.

    The frame is modified in place: the returned object is the very frame
    that was passed in. Every indicator is derived from the ``Close`` column:

    * ``SMA_5`` / ``SMA_10`` -- rolling means over 5 and 10 rows
    * ``EMA_5`` / ``EMA_10`` -- exponentially weighted means (``adjust=False``)
    * ``RSI``                -- relative strength index over 14 rows, built
      from plain rolling means of the per-row gains and losses
    """
    close = df['Close']
    spans = (5, 10)

    # Simple moving averages first, then the exponential ones, so the new
    # columns land in the order SMA_5, SMA_10, EMA_5, EMA_10.
    for span in spans:
        df[f'SMA_{span}'] = close.rolling(window=span).mean()
    for span in spans:
        df[f'EMA_{span}'] = close.ewm(span=span, adjust=False).mean()

    # RSI: split the row-to-row change into its upward and downward parts.
    # `where(..., 0)` also turns the leading NaN change into 0, which lets the
    # first complete 14-row window include the very first row.
    rsi_window = 14
    change = close.diff(1)
    ups = change.where(change > 0, 0)
    downs = -change.where(change < 0, 0)
    mean_up = ups.rolling(window=rsi_window).mean()
    mean_down = downs.rolling(window=rsi_window).mean()
    relative_strength = mean_up / mean_down
    df['RSI'] = 100 - (100 / (1 + relative_strength))

    return df

def add_sentiment_features(df):
    """Attach sentiment-derived columns to ``df`` and return it.

    The frame is modified in place (the same object is returned). Inputs are
    the ``finvader_neg`` / ``finvader_neu`` / ``finvader_pos`` /
    ``finvader_tot`` columns; the new columns are row-wise combinations,
    running sums, 5-row rolling statistics, lags, ratios, threshold flags and
    the first difference of ``finvader_tot``.
    """
    neg = df['finvader_neg']
    pos = df['finvader_pos']
    tot = df['finvader_tot']

    window = 5
    tot_rolling = tot.rolling(window=window)

    def _slope(values):
        # Gradient of the least-squares line through one rolling window.
        return np.polyfit(range(window), values, 1)[0]

    # Row-wise combinations of the three class scores.
    df['sentiment_avg'] = df[['finvader_neg', 'finvader_neu', 'finvader_pos']].mean(axis=1)
    df['sentiment_diff'] = pos - neg
    df['sentiment_moving_avg'] = tot_rolling.mean()

    # Running totals from the first row onwards.
    df['cumulative_pos'] = pos.cumsum()
    df['cumulative_neg'] = neg.cumsum()
    df['cumulative_tot'] = tot.cumsum()

    df['sentiment_volatility'] = tot_rolling.std()

    # Total score from one and two rows earlier.
    for lag in (1, 2):
        df[f'lagged_sentiment_tot_{lag}'] = tot.shift(lag)

    # The +1 in the denominators keeps a zero score from dividing by zero.
    df['pos_neg_ratio'] = pos / (neg + 1)
    df['pos_tot_ratio'] = pos / (tot + 1)

    df['sentiment_trend'] = tot_rolling.apply(_slope)

    # Binary flags for scores above 0.8.
    df['extreme_pos'] = (pos > 0.8).astype(int)
    df['extreme_neg'] = (neg > 0.8).astype(int)

    df['sentiment_momentum'] = tot.diff()

    return df

In [4]:
# Train GBDT / XGBoost classifiers and score them on the validation folds (the held-out test split is not used here)

def train_and_evaluate_trees(model, param_grid, ticker, df, features, importance_threshold=0.01):
    """Grid-search a tree classifier for one ticker and score it on validation folds.

    Parameters
    ----------
    model : estimator
        Unfitted classifier that exposes ``feature_importances_`` once fitted.
    param_grid : dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    ticker : str
        Label used in the log lines and as the key of ``feature_ranks``.
    df : pandas.DataFrame
        Data for one ticker. It must provide the ``finvader_*`` columns,
        ``Close``, ``Open``, the target ``y`` and every non-engineered column
        named in ``features``. The caller's frame is not modified; all work
        happens on an internal copy.
    features : list of str
        Column names used as model inputs.
    importance_threshold : float, default 0.01
        Threshold passed to ``SelectFromModel`` when listing the retained
        features.

    Returns
    -------
    tuple
        ``(ticker, best_params, avg_val_score, metrics, feature_importances,
        trade_predictions, test_prices, feature_ranks)`` where ``metrics``,
        ``trade_predictions`` and ``test_prices`` hold one entry per
        validation fold and ``feature_ranks`` is ``{ticker: [feature, ...]}``.

    Raises
    ------
    ValueError
        If a fold refers to labels missing from the training index, or if no
        folds are produced at all.
    """
    print(f"Processing {ticker}")

    # Work on a private copy: the feature helpers add columns in place and the
    # NaN filter drops rows, neither of which should leak into the caller's frame.
    df = df.copy()

    # Add feature engineered variables
    df = add_technical_indicators(df)
    df = add_sentiment_features(df)

    # Drop rows with NaN values in finvader scores
    df = df.dropna(subset=['finvader_neg', 'finvader_neu', 'finvader_pos', 'finvader_tot'])

    X = df[features].copy()
    y = df['y']

    # Fill remaining NaN values in X (e.g. the warm-up rows of rolling features)
    X = de.fillna(X)

    # The classifiers need a binary target: 1 when y is positive, else 0
    y_binary = (y > 0).astype(int)

    # Only the training part is used here; the test part is deliberately ignored
    train_df, _ = de.train_test_split(df)
    X_train = X.loc[train_df.index]
    y_train = y_binary.loc[train_df.index]

    # Create the validation splits
    cv_splits = de.get_cv_splits(train_df)

    # NOTE(review): no `cv` argument is given, so GridSearchCV applies its
    # default splitter inside every training fold -- confirm that this is
    # acceptable for time-ordered data.
    search = GridSearchCV(
        model,
        param_grid,
        scoring='accuracy'
    )

    val_scores = []
    metrics = []
    trade_predictions = []
    test_prices = []
    feature_ranks = {}

    for train_idx, val_idx in cv_splits:
        train_pos = train_df.index.get_indexer(train_idx)
        val_pos = train_df.index.get_indexer(val_idx)

        # get_indexer marks unknown labels with -1, which `.iloc` would
        # silently interpret as "last row"; fail loudly instead.
        if (train_pos < 0).any() or (val_pos < 0).any():
            raise ValueError(
                f"Cross-validation split for {ticker} refers to labels that are not in the training index"
            )

        X_cv_train, X_cv_val = X_train.iloc[train_pos], X_train.iloc[val_pos]
        y_cv_train, y_cv_val = y_train.iloc[train_pos], y_train.iloc[val_pos]

        search.fit(X_cv_train, y_cv_train)
        y_pred = search.predict(X_cv_val)

        # Store trading predictions and opening prices. Prices are looked up by
        # the labels of the validation rows, so they stay aligned with `y_pred`
        # even when the training rows are not the first rows of `df`.
        trade_predictions.append(y_pred)
        test_prices.append(df['Open'].loc[y_cv_val.index].values)

        # Metrics. zero_division=0 keeps the value the default would produce
        # but without a warning when a fold has no positive predictions/labels.
        accuracy = accuracy_score(y_cv_val, y_pred)
        precision = precision_score(y_cv_val, y_pred, zero_division=0)
        recall = recall_score(y_cv_val, y_pred, zero_division=0)
        f1 = f1_score(y_cv_val, y_pred, zero_division=0)
        # On 0/1 labels the MSE equals the misclassification rate.
        mse = mean_squared_error(y_cv_val, y_pred)
        # Fixed labels keep the matrix 2x2 even if a fold contains one class.
        cm = confusion_matrix(y_cv_val, y_pred, labels=[0, 1])

        val_scores.append(accuracy)
        metrics.append({
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'mse': mse,
            'confusion_matrix': cm.tolist()
        })

    if not val_scores:
        raise ValueError(f"No cross-validation splits were produced for {ticker}")

    avg_val_score = np.mean(val_scores)

    # NOTE: the search object is refitted on every fold, so the parameters and
    # importances reported below come from the last fold only.
    best_params = search.best_params_
    feature_importances = search.best_estimator_.feature_importances_

    # Feature ranking: keep the features whose importance reaches the threshold
    selector = SelectFromModel(search.best_estimator_, threshold=importance_threshold, prefit=True)
    feature_ranks[ticker] = [features[i] for i in selector.get_support(indices=True)]
    print(f"Best parameters for {ticker}: {best_params}")
    print(f"Cross-validation accuracy for {ticker}: {avg_val_score}")
    print(f"Feature ranks for {ticker}: {feature_ranks[ticker]}")

    return ticker, best_params, avg_val_score, metrics, feature_importances, trade_predictions, test_prices, feature_ranks

In [5]:
# Parallel jobs to make it faster 
def run_parallel_pipeline(ticker_frames, model, features, param_grid, n_jobs=-1, importance_threshold=0.01):
    """Run ``train_and_evaluate_trees`` for every ticker through joblib.

    Parameters
    ----------
    ticker_frames : mapping
        Iterated with ``.items()`` as ``ticker -> DataFrame``.
    model : estimator
        Unfitted classifier forwarded to every task.
    features : list of str
        Input column names forwarded to every task.
    param_grid : dict
        Hyper-parameter grid forwarded to every task.
    n_jobs : int, default -1
        Passed to ``joblib.Parallel``. Lower it (e.g. ``1``) to debug
        sequentially or to limit memory use.
    importance_threshold : float, default 0.01
        Forwarded to ``train_and_evaluate_trees``.

    Returns
    -------
    list
        One result tuple per ticker, in the iteration order of ``ticker_frames``.
    """
    tasks = (
        delayed(train_and_evaluate_trees)(
            model, param_grid, ticker, df, features,
            importance_threshold=importance_threshold,
        )
        for ticker, df in ticker_frames.items()
    )
    return Parallel(n_jobs=n_jobs)(tasks)

In [6]:
def save_results(results, model_name):
    """Write the outputs of a pipeline run to CSV files under ``../results``.

    Parameters
    ----------
    results : iterable of tuple
        Result tuples as returned by ``train_and_evaluate_trees``. The
        positions used are 0 ticker, 1 best parameters, 2 mean validation
        accuracy, 3 per-fold metrics, 4 feature importances, 5 per-fold
        predictions, 6 per-fold opening prices and 7 feature ranks.
    model_name : str
        Suffix appended to every file name.

    The output directory is created when it does not exist, so a finished
    training run is not lost to a missing folder.
    """
    import os  # local import: only needed to create the output directory

    out_dir = "../results"
    os.makedirs(out_dir, exist_ok=True)

    def _by_ticker(position):
        # Map ticker -> the element stored at `position` of each result tuple.
        return {res[0]: res[position] for res in results}

    best_params = _by_ticker(1)
    cv_scores = _by_ticker(2)
    metrics = _by_ticker(3)
    feature_importances = _by_ticker(4)
    trade_predictions = _by_ticker(5)
    test_prices = _by_ticker(6)
    feature_ranks = _by_ticker(7)

    # Save best parameters (one row per ticker)
    best_params_df = pd.DataFrame(best_params).T
    best_params_df.to_csv(f"{out_dir}/best_params_per_stock_{model_name}.csv")

    # Save cv scores
    cv_scores_df = pd.DataFrame(cv_scores, index=['accuracy']).T
    cv_scores_df.to_csv(f"{out_dir}/cv_scores_per_stock_{model_name}.csv")

    # Save metrics (one column per fold, each cell holding that fold's dict)
    metrics_df = pd.DataFrame(metrics).T
    metrics_df.to_csv(f"{out_dir}/metrics_per_stock_{model_name}.csv")

    # Save feature importance
    feature_importances_df = pd.DataFrame(feature_importances).T
    feature_importances_df.to_csv(f"{out_dir}/feature_importances_per_stock_{model_name}.csv")

    # Save trade preds and test prices, one pair of files per ticker
    for ticker in trade_predictions:
        trade_df = pd.DataFrame(trade_predictions[ticker])
        trade_df.to_csv(f"{out_dir}/trade_predictions_{ticker}_{model_name}.csv", index=False)
        price_df = pd.DataFrame(test_prices[ticker])
        price_df.to_csv(f"{out_dir}/test_prices_{ticker}_{model_name}.csv", index=False)

    # Save feature ranks
    feature_ranks_df = pd.DataFrame(feature_ranks).T
    feature_ranks_df.to_csv(f"{out_dir}/feature_ranks_per_stock_{model_name}.csv")

In [7]:
def run_simulation(results):
    """Feed the validation predictions of a pipeline run to the simulator.

    For each result tuple the per-fold prediction arrays (position 5) and the
    per-fold opening-price arrays (position 6) are joined into one array per
    ticker. Both dictionaries are handed to ``simulation.get_performance`` and
    the value it returns is printed.
    """
    trade_dict = {}
    test_dict = {}
    for res in results:
        ticker = res[0]
        trade_dict[ticker] = np.concatenate(res[5])
        test_dict[ticker] = np.concatenate(res[6])

    performance = simulation.get_performance(trade_dict, test_dict)
    print(f"Portfolio performance: {performance}")

In [13]:
# Raw model inputs: sentiment scores, OHLCV prices and article counts.
features_basic = [
    'finvader_neg', 'finvader_neu', 'finvader_pos', 'finvader_tot',
    'Open', 'High', 'Low', 'Close', 'Volume',
    'pos_art_count', 'neg_art_count', 'neu_art_count', 'total_articles',
]

# The basic inputs followed by the columns created by
# add_technical_indicators and add_sentiment_features.
features_engineering = features_basic + [
    'SMA_5', 'SMA_10', 'RSI', 'EMA_5', 'EMA_10',
    'sentiment_avg', 'sentiment_diff', 'sentiment_moving_avg',
    'cumulative_pos', 'cumulative_neg', 'cumulative_tot',
    'sentiment_volatility',
    'lagged_sentiment_tot_1', 'lagged_sentiment_tot_2',
    'pos_neg_ratio', 'pos_tot_ratio',
    'sentiment_trend',
    'extreme_pos', 'extreme_neg',
    'sentiment_momentum',
]

# Wider candidate ranges; the grid below does not reference them.
n_estimators = np.linspace(100, 1500, num=15).astype(int)
learning_rate = np.array([0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1])
max_depth = np.arange(2, 16)

# Grid actually searched: two values per hyper-parameter.
param_grid = dict(
    n_estimators=[100, 600],
    learning_rate=[0.0001, 0.01],
    max_depth=[2, 8],
)

In [14]:
# Gradient-boosted trees (scikit-learn) on the basic feature set, one grid
# search per ticker. The suffix of the saved file names mirrors the values in
# `param_grid`.
print("Training Gradient Boosted Decision Trees with basic features")
results_gbdt_basic = run_parallel_pipeline(
    ticker_frames=ticker_frames,
    model=GradientBoostingClassifier(random_state=42),
    features=features_basic,
    param_grid=param_grid,
)

save_results(results=results_gbdt_basic, model_name="gbt_basic_100_600_0p0001_0p01_2_8")

Training Gradient Boosted Decision Trees with basic features
Processing ABBV
Processing AMZN
Processing BAC
Processing GOOGL
Processing JNJ
Processing JPM
Processing LLY
Processing MSFT
Processing AAPL
Processing MA
Processing MRK
Processing NVDA


In [11]:
# Gradient-boosted trees (scikit-learn) on the engineered feature set, one
# grid search per ticker.
print("Training Gradient Boosted Decision Trees with engineered features")
results_gbdt_engineered = run_parallel_pipeline(
    ticker_frames,
    GradientBoostingClassifier(random_state=42),
    features_engineering,
    param_grid
)

# The file-name suffix records the grid that produced the results. It is kept
# in step with `param_grid` (n_estimators 100/600, learning_rate 0.0001/0.01,
# max_depth 2/8) so a fresh run is not stored under the label of an older grid.
save_results(results_gbdt_engineered, "gbt_engineered_100_600_0p0001_0p01_2_8")

Training Gradient Boosted Decision Trees with engineered features
Processing AMZN
Processing GOOGL
Processing JPM
Processing ABBV
Processing AAPL
Processing BAC
Processing JNJ
Processing MA
Processing LLY
Processing MRK
Processing NVDA
Processing MSFT
Best parameters for LLY: {'learning_rate': 0.001, 'max_depth': 6, 'n_estimators': 100}
Cross-validation accuracy for LLY: 0.5047018239113827
Feature ranks for LLY: ['finvader_neg', 'finvader_neu', 'finvader_pos', 'Open', 'Low', 'Close', 'Volume', 'SMA_5', 'SMA_10', 'RSI', 'EMA_10', 'sentiment_diff', 'sentiment_moving_avg', 'cumulative_tot', 'sentiment_volatility', 'lagged_sentiment_tot_1', 'pos_neg_ratio', 'pos_tot_ratio']
Processing UNH
Best parameters for ABBV: {'learning_rate': 0.001, 'max_depth': 2, 'n_estimators': 100}
Cross-validation accuracy for ABBV: 0.577088910712604
Feature ranks for ABBV: ['RSI', 'sentiment_avg', 'lagged_sentiment_tot_2']
Processing V
Best parameters for BAC: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimato

In [12]:
# XGBoost on the basic feature set, one grid search per ticker.
print("Training XGBoost with basic features")
results_xgb_basic = run_parallel_pipeline(
    ticker_frames=ticker_frames,
    model=XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    features=features_basic,
    param_grid=param_grid,
)

save_results(results=results_xgb_basic, model_name="xgb_basic")

Training XGBoost with basic features
Processing AAPL
Processing ABBV
Processing AMZN
Processing JNJ
Processing LLY
Processing BAC
Processing GOOGL
Processing JPM
Processing MRK
Processing MSFT
Processing NVDA
Processing MA
Best parameters for LLY: {'learning_rate': 0.001, 'max_depth': 2, 'n_estimators': 100}
Cross-validation accuracy for LLY: 0.4975589667685256
Feature ranks for LLY: ['finvader_neg', 'finvader_neu', 'Open', 'Close']
Processing UNH
Best parameters for BAC: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 500}
Cross-validation accuracy for BAC: 0.40831043956043955
Feature ranks for BAC: ['finvader_neg', 'finvader_neu', 'finvader_pos', 'finvader_tot', 'Open', 'High', 'Low', 'Close', 'Volume', 'pos_art_count', 'neg_art_count', 'neu_art_count', 'total_articles']
Processing V
Best parameters for ABBV: {'learning_rate': 0.001, 'max_depth': 2, 'n_estimators': 100}
Cross-validation accuracy for ABBV: 0.577088910712604
Feature ranks for ABBV: ['finvader_neg', 'Open', 'Clo

  _warn_prf(average, modifier, msg_start, len(result))


Best parameters for UNH: {'learning_rate': 0.001, 'max_depth': 6, 'n_estimators': 500}
Cross-validation accuracy for UNH: 0.5436339522546418
Feature ranks for UNH: ['finvader_neg', 'finvader_neu', 'finvader_pos', 'finvader_tot', 'Open', 'High', 'Close', 'Volume', 'pos_art_count', 'neu_art_count']
Best parameters for V: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 500}
Cross-validation accuracy for V: 0.6082347972972972
Feature ranks for V: ['finvader_neg', 'finvader_neu', 'finvader_pos', 'finvader_tot', 'Open', 'High', 'Low', 'Close', 'Volume', 'pos_art_count', 'neg_art_count', 'neu_art_count', 'total_articles']
Best parameters for WFC: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 500}
Cross-validation accuracy for WFC: 0.5850516621743037
Feature ranks for WFC: ['finvader_neg', 'finvader_neu', 'finvader_pos', 'finvader_tot', 'Open', 'High', 'Low', 'Close', 'Volume', 'pos_art_count', 'neg_art_count', 'neu_art_count', 'total_articles']


In [None]:
# XGBoost on the engineered feature set, one grid search per ticker.
print("Training XGBoost with engineered features")
results_xgb_engineered = run_parallel_pipeline(
    ticker_frames, 
    XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    # Use the engineered list here: passing `features_basic` would just repeat
    # the basic run while saving it under the "xgb_engineered" name.
    features_engineering,
    param_grid
)

save_results(results_xgb_engineered, "xgb_engineered")