In [1]:
import pandas as pd
import itertools
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve
import os

# Load the dataset
def load_and_prepare_data(file_path):
    """Read the training CSV and return it cleaned and sorted by entry date.

    The ``Entry_Date`` column is parsed to datetimes, the rows are sorted by
    that column, and every row that contains a missing value in any column is
    dropped. Row labels from the CSV are kept (the index is not reset).

    Args:
        file_path: Path of the CSV file; it must contain an ``Entry_Date``
            column.

    Returns:
        The prepared ``pandas.DataFrame``.
    """
    raw_frame = pd.read_csv(file_path)
    # Parse before sorting so the order is chronological, not lexicographic.
    parsed_dates = pd.to_datetime(raw_frame['Entry_Date'])
    prepared = (
        raw_frame.assign(Entry_Date=parsed_dates)
        .sort_values(by='Entry_Date')
        .dropna()
    )
    return prepared

# Calculate KS statistic
def calculate_ks_statistic(y_true, y_pred_proba):
    """Return the KS statistic of predicted scores against the true labels.

    The value is the largest gap between the true-positive rate and the
    false-positive rate over the thresholds produced by ``roc_curve``.

    Args:
        y_true: True binary labels.
        y_pred_proba: Predicted scores/probabilities for the positive class.

    Returns:
        The maximum of ``tpr - fpr``.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_pred_proba)
    separation = true_pos_rate - false_pos_rate
    return max(separation)

# Test feature combinations with periodic saving
def test_feature_combinations_with_saving(train_data, valid_combinations, target, params, save_path, save_interval=100):
    """Fit and score a calibrated random forest for each feature combination.

    For every feature subset, a ``RandomForestClassifier`` (hyper-parameters
    taken from ``params``, ``random_state=42``) is wrapped in a sigmoid
    ``CalibratedClassifierCV`` and fitted on ``train_data``. Accuracy,
    precision, recall, F1 and the KS statistic are computed and the result
    rows are appended to ``save_path`` every ``save_interval`` combinations
    and after the last one.

    NOTE: all metrics are computed on the same rows the model was fitted on,
    so they are in-sample scores.

    Args:
        train_data: DataFrame containing the feature columns and ``target``.
        valid_combinations: Sized sequence of feature-name lists to evaluate.
        target: Name of the label column in ``train_data``.
        params: Mapping with the keys ``n_estimators``, ``max_depth``,
            ``min_samples_split`` and ``min_samples_leaf``.
        save_path: CSV file that receives the results. If the file already
            exists, rows are appended without a header; otherwise the file is
            created with a header.
        save_interval: Number of combinations between two saves; must be
            positive.

    Raises:
        ValueError: If ``save_interval`` is not positive.
    """
    # Fail before any model is trained instead of hitting a ZeroDivisionError
    # in the modulo check after the first (expensive) fit.
    if save_interval <= 0:
        raise ValueError(f"save_interval must be positive, got {save_interval!r}")

    total = len(valid_combinations)
    labels = train_data[target]
    pending = []

    for i, subset in enumerate(valid_combinations, start=1):
        print(f"Testing combination {i}/{total}: {subset}")
        features = train_data[subset]

        rf_clf = RandomForestClassifier(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split'],
            min_samples_leaf=params['min_samples_leaf'],
            random_state=42
        )
        # With the default cross-validated mode, CalibratedClassifierCV clones
        # the estimator and refits it on each fold, so fitting rf_clf
        # beforehand would be discarded work.
        calibrated_rf_clf = CalibratedClassifierCV(estimator=rf_clf, method='sigmoid')
        calibrated_rf_clf.fit(features, labels)

        y_train_pred = calibrated_rf_clf.predict(features)
        y_train_pred_proba = calibrated_rf_clf.predict_proba(features)[:, 1]

        pending.append({
            'Features': subset,
            'Accuracy': accuracy_score(labels, y_train_pred),
            'Precision': precision_score(labels, y_train_pred),
            'Recall': recall_score(labels, y_train_pred),
            'F1-Score': f1_score(labels, y_train_pred),
            'KS-Statistic': calculate_ks_statistic(labels, y_train_pred_proba)
        })

        # The last combination always triggers a flush, so nothing is left
        # unsaved once the loop ends.
        if i % save_interval == 0 or i == total:
            _append_results_to_csv(pending, save_path)
            print(f"Saved {i} combinations to {save_path}")
            pending = []  # Start a fresh batch after saving


def _append_results_to_csv(results, save_path):
    """Append result rows to ``save_path``, writing the header only for a new file."""
    write_header = not os.path.exists(save_path)
    pd.DataFrame(results).to_csv(save_path, mode='a', header=write_header, index=False)

# Generate all valid combinations
def generate_valid_combinations_corrected():
    """Build every allowed feature list for the combination search.

    Each list contains two SMA, two EMA, two RSI, one ROC, two stochastic
    features, two complete Bollinger-band sets (three columns each), one ATR
    feature, and finally the two mandatory features, in that order.

    Returns:
        A list of feature-name lists, one per combination.
    """
    always_included = ['MACD_At_Entry', 'Day_Of_Week_At_Entry']
    bollinger_sets = [
        ['BB3_High_At_Entry', 'BB3_Low_At_Entry', 'BB3_MAvg_At_Entry'],
        ['BB5_High_At_Entry', 'BB5_Low_At_Entry', 'BB5_MAvg_At_Entry'],
        ['BB10_High_At_Entry', 'BB10_Low_At_Entry', 'BB10_MAvg_At_Entry'],
        ['BB15_High_At_Entry', 'BB15_Low_At_Entry', 'BB15_MAvg_At_Entry'],
    ]

    # One entry per indicator family, in output order; each entry lists the
    # alternative picks for that family.
    picks_per_family = [
        list(itertools.combinations(
            ['SMA5_At_Entry', 'SMA7_At_Entry', 'SMA10_At_Entry', 'SMA15_At_Entry'], 2)),
        list(itertools.combinations(
            ['EMA5_At_Entry', 'EMA7_At_Entry', 'EMA10_At_Entry', 'EMA15_At_Entry'], 2)),
        list(itertools.combinations(
            ['RSI5_At_Entry', 'RSI10_At_Entry', 'RSI15_At_Entry'], 2)),
        list(itertools.combinations(['ROC14_At_Entry', 'ROC15_At_Entry'], 1)),
        list(itertools.combinations(
            ['Stoch5_K_At_Entry', 'Stoch7_K_At_Entry', 'Stoch21_K_At_Entry'], 2)),
        # A Bollinger pick is a pair of whole sets; flatten each pair up front
        # so every pick is a plain sequence of column names.
        [
            list(itertools.chain.from_iterable(set_pair))
            for set_pair in itertools.combinations(bollinger_sets, 2)
        ],
        list(itertools.combinations(['ATR5_At_Entry', 'ATR15_At_Entry'], 1)),
    ]

    # product() varies the last family fastest, which is the same order as
    # nesting one loop per family from first to last.
    return [
        list(itertools.chain.from_iterable(chosen)) + always_included
        for chosen in itertools.product(*picks_per_family)
    ]

# Main execution
# Input and output locations
file_path = r'C:\Users\Kyle Henry\Desktop\Honours Thesis\Research_Assignment\train_data(new2).csv'  # Replace with the path to your training dataset
save_path = 'feature_combination_results.csv'  # Path to save the results

# Random forest hyper-parameters shared by every feature combination
params = dict(
    n_estimators=20,
    max_depth=20,
    min_samples_split=50,
    min_samples_leaf=20,
)

# Load the data, enumerate the feature sets, then evaluate each of them
train_data = load_and_prepare_data(file_path)
valid_combinations = generate_valid_combinations_corrected()

test_feature_combinations_with_saving(
    train_data,
    valid_combinations,
    target='Target',
    params=params,
    save_path=save_path,
)

print(f"Feature combination testing complete. Results saved to '{save_path}'.")


Testing combination 1/7776: ['SMA5_At_Entry', 'SMA7_At_Entry', 'EMA5_At_Entry', 'EMA7_At_Entry', 'RSI5_At_Entry', 'RSI10_At_Entry', 'ROC14_At_Entry', 'Stoch5_K_At_Entry', 'Stoch7_K_At_Entry', 'BB3_High_At_Entry', 'BB3_Low_At_Entry', 'BB3_MAvg_At_Entry', 'BB5_High_At_Entry', 'BB5_Low_At_Entry', 'BB5_MAvg_At_Entry', 'ATR5_At_Entry', 'MACD_At_Entry', 'Day_Of_Week_At_Entry']
Testing combination 2/7776: ['SMA5_At_Entry', 'SMA7_At_Entry', 'EMA5_At_Entry', 'EMA7_At_Entry', 'RSI5_At_Entry', 'RSI10_At_Entry', 'ROC14_At_Entry', 'Stoch5_K_At_Entry', 'Stoch7_K_At_Entry', 'BB3_High_At_Entry', 'BB3_Low_At_Entry', 'BB3_MAvg_At_Entry', 'BB5_High_At_Entry', 'BB5_Low_At_Entry', 'BB5_MAvg_At_Entry', 'ATR15_At_Entry', 'MACD_At_Entry', 'Day_Of_Week_At_Entry']
Testing combination 3/7776: ['SMA5_At_Entry', 'SMA7_At_Entry', 'EMA5_At_Entry', 'EMA7_At_Entry', 'RSI5_At_Entry', 'RSI10_At_Entry', 'ROC14_At_Entry', 'Stoch5_K_At_Entry', 'Stoch7_K_At_Entry', 'BB3_High_At_Entry', 'BB3_Low_At_Entry', 'BB3_MAvg_At_Entry