In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, roc_auc_score

# Load and prepare the dataset
def load_and_prepare_data():
    """Read the four CSV splits and return them date-ordered and NaN-free.

    Each of the train / validation / test / holdout files is read from the
    current working directory, its ``Entry_Date`` column is parsed into
    datetimes, the rows are sorted by that column, and every row that
    contains a missing value is dropped. Original row labels are kept
    (the index is not reset).

    Returns:
        tuple: ``(train_data, val_data, test_data, holdout_data)`` DataFrames.
    """
    csv_paths = (
        'train_data(new2).csv',
        'val_data(new2).csv',
        'test_data(new2).csv',
        'holdout_data(new2).csv',
    )

    # Read every file up front so a missing file fails before any processing.
    frames = [pd.read_csv(path) for path in csv_paths]

    for frame in frames:
        frame['Entry_Date'] = pd.to_datetime(frame['Entry_Date'])

    # Order chronologically first, then discard incomplete rows.
    train_data, val_data, test_data, holdout_data = (
        frame.sort_values(by='Entry_Date').dropna() for frame in frames
    )

    return train_data, val_data, test_data, holdout_data

# Train the model with provided parameters
def train_model(train_data, val_data, params, features, target):
    """Fit a Platt-calibrated random forest and print train/validation metrics.

    Args:
        train_data: DataFrame used to fit the calibrated classifier.
        val_data: DataFrame used only for the printed validation metrics;
            it takes no part in fitting or calibration.
        params: Mapping with the keys ``'n_estimators'``, ``'max_depth'``,
            ``'min_samples_split'`` and ``'min_samples_leaf'``.
        features: List of feature column names.
        target: Name of the label column.

    Returns:
        The fitted ``CalibratedClassifierCV`` wrapping the random forest.
    """
    rf_clf = RandomForestClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        min_samples_leaf=params['min_samples_leaf'],
        random_state=42
    )

    # Apply Platt scaling. With the default `cv`, CalibratedClassifierCV
    # clones `rf_clf` and fits those clones on its own internal folds, so
    # `rf_clf` is only a template here: fitting it separately beforehand
    # would be thrown-away work and is deliberately not done.
    calibrated_rf_clf = CalibratedClassifierCV(estimator=rf_clf, method='sigmoid')
    calibrated_rf_clf.fit(train_data[features], train_data[target])

    # Same report for both splits; only the label in the printed text differs.
    for split_name, split in (("Training", train_data), ("Validation", val_data)):
        y_pred = calibrated_rf_clf.predict(split[features])
        # Column 1 holds the probability of the positive class.
        y_pred_proba = calibrated_rf_clf.predict_proba(split[features])[:, 1]
        print(f"----- {split_name} Set Evaluation -----")
        print(classification_report(split[target], y_pred))
        roc_auc = roc_auc_score(split[target], y_pred_proba)
        print(f"{split_name} ROC AUC: {roc_auc}")

    return calibrated_rf_clf

# Final evaluation on the test set
def evaluate_on_test_set(model, test_data, features, target, set_name="Test Set"):
    """Print classification metrics for one split and return its ranked predictions.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        test_data: DataFrame holding the feature columns and the label column.
        features: List of feature column names.
        target: Name of the label column.
        set_name: Label used in the printed headers.

    Returns:
        DataFrame with columns ``Predicted_Probability`` and ``Actual_Profit``,
        ordered by descending predicted probability. Rows keep the index
        labels of ``test_data``; they are NOT in ``test_data`` row order.
    """
    X = test_data[features]
    y_true = test_data[target]

    predicted_labels = model.predict(X)
    # Column 1 holds the probability of the positive class.
    positive_proba = model.predict_proba(X)[:, 1]

    print(f"----- {set_name} Evaluation -----")
    print(classification_report(y_true, predicted_labels))
    auc = roc_auc_score(y_true, positive_proba)
    print(f"{set_name} ROC AUC: {auc}")

    # Pair each probability with its actual label, then rank by confidence.
    results = pd.DataFrame({
        'Predicted_Probability': positive_proba,
        'Actual_Profit': y_true
    })
    ranked = results.sort_values(by='Predicted_Probability', ascending=False)
    print("Sorted test results by predicted probabilities:")
    print(ranked)

    # Show the highest-confidence rows.
    top_n = 10
    print(f"Top {top_n} predicted probabilities and actual profits:")
    print(ranked.head(top_n))

    return ranked

# Load the four date-sorted, NaN-free splits.
train_data, val_data, test_data, holdout_data = load_and_prepare_data()

# Set the parameters for the model
params = {
    'n_estimators': 20,
    'max_depth': 20,
    'min_samples_split': 50,
    'min_samples_leaf': 20
}

# Define the features and target
selected_features = [
    'SMA5_At_Entry', 'SMA10_At_Entry', 'EMA5_At_Entry', 'EMA15_At_Entry', 'RSI5_At_Entry', 'RSI10_At_Entry',
    'ATR5_At_Entry', 'ATR15_At_Entry', 'Stoch7_K_At_Entry', 'Stoch21_K_At_Entry',
    'BB10_High_At_Entry', 'BB10_Low_At_Entry', 'BB10_MAvg_At_Entry',
    'BB15_High_At_Entry', 'BB15_Low_At_Entry', 'BB15_MAvg_At_Entry',
    'MACD_At_Entry', 'Day_Of_Week_At_Entry',
    'ROC14_At_Entry', 'ROC15_At_Entry', 'Open', 'Low', 'High', 'Last'
]

target = 'Target'

# Train the model with the specified parameters
calibrated_rf_clf = train_model(train_data, val_data, params, selected_features, target)

# Final evaluation on the test set
sorted_test_results_test = evaluate_on_test_set(calibrated_rf_clf, test_data, selected_features, target, set_name="Test Set")

# Evaluation on the holdout set
sorted_test_results_holdout = evaluate_on_test_set(calibrated_rf_clf, holdout_data, selected_features, target, set_name="Holdout Set")

# Add predicted confidence to test_data for use in allocation methods.
# The result frames are ordered by descending probability but keep the row
# labels of the frame they were computed from. Assign the Series itself (not
# `.values`) so pandas aligns each probability with its own row by index
# label; assigning the raw array would paste the sorted probabilities
# positionally onto date-ordered rows and attach every value to the wrong row.
test_data['Predicted_Confidence'] = sorted_test_results_test['Predicted_Probability']
holdout_data['Predicted_Confidence'] = sorted_test_results_holdout['Predicted_Probability']

----- Training Set Evaluation -----
              precision    recall  f1-score   support

         0.0       0.78      0.50      0.61      3476
         1.0       0.63      0.86      0.73      3445

    accuracy                           0.68      6921
   macro avg       0.71      0.68      0.67      6921
weighted avg       0.71      0.68      0.67      6921

Training ROC AUC: 0.7674402621500782
----- Validation Set Evaluation -----
              precision    recall  f1-score   support

         0.0       0.63      0.49      0.55       480
         1.0       0.55      0.69      0.61       442

    accuracy                           0.59       922
   macro avg       0.59      0.59      0.58       922
weighted avg       0.59      0.59      0.58       922

Validation ROC AUC: 0.6386783559577677
----- Test Set Evaluation -----
              precision    recall  f1-score   support

         0.0       0.62      0.49      0.55       475
         1.0       0.56      0.69      0.62       447



In [3]:
from scipy.stats import ks_2samp

def ks_test(y_true, y_pred_proba):
    """Two-sample Kolmogorov-Smirnov test between the two classes' scores.

    Splits the predicted probabilities into those belonging to rows labelled
    1 and rows labelled 0, runs ``scipy.stats.ks_2samp`` on the two groups,
    prints the result and returns it.

    Args:
        y_true: Array-like of labels (0 / 1), one per row.
        y_pred_proba: Array-like of predicted probabilities, in the same row
            order as ``y_true``.

    Returns:
        tuple: ``(ks_statistic, p_value)``.
    """
    import numpy as np  # local import: this cell only imports scipy at the top

    # Coerce both inputs to plain arrays so labels and scores are paired
    # strictly by position. Masking a pandas Series with a boolean Series
    # would align on index labels instead, and plain lists cannot be masked.
    labels = np.asarray(y_true)
    scores = np.asarray(y_pred_proba)

    # Separate the probabilities by class
    positive_proba = scores[labels == 1]
    negative_proba = scores[labels == 0]

    # Perform the KS test
    ks_statistic, p_value = ks_2samp(positive_proba, negative_proba)
    print(f"KS Statistic: {ks_statistic}, P-value: {p_value}")

    return ks_statistic, p_value

# Run the KS test on the validation split, comparing the calibrated model's
# positive-class probabilities (column 1) between the two label groups.
ks_stat, p_val = ks_test(
    y_true=val_data[target],
    y_pred_proba=calibrated_rf_clf.predict_proba(val_data[selected_features])[:, 1],
)


KS Statistic: 0.2435049019607843, P-value: 1.7608243387793867e-12
