In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import learning_curve
# Read the three pre-split datasets from CSV files in the working directory.
train_data, val_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('train_data.csv', 'val_data.csv', 'test_data.csv')
)

# Convert 'Entry_Date' to pandas datetimes before sorting on it.
for split_frame in (train_data, val_data, test_data):
    split_frame['Entry_Date'] = pd.to_datetime(split_frame['Entry_Date'])

# Order every split by 'Entry_Date' (ascending, the sort_values default).
train_data, val_data, test_data = (
    split_frame.sort_values(by='Entry_Date')
    for split_frame in (train_data, val_data, test_data)
)

# Hand-picked model inputs, grouped by indicator family (one family per line).
# The order here is the column order of the feature matrices built from it.
selected_features = [
    'Open', 'High', 'Low', 'Last',
    'SMA5_At_Entry', 'SMA10_At_Entry',
    'EMA5_At_Entry', 'EMA15_At_Entry',
    'RSI5_At_Entry', 'RSI10_At_Entry',
    'ATR5_At_Entry', 'ATR15_At_Entry',
    'Stoch7_K_At_Entry', 'Stoch21_K_At_Entry',
    'BB10_High_At_Entry', 'BB10_Low_At_Entry', 'BB10_MAvg_At_Entry',
    'BB15_High_At_Entry', 'BB15_Low_At_Entry', 'BB15_MAvg_At_Entry',
    'MACD_At_Entry',
    'Day_Of_Week_At_Entry',
    'ROC14_At_Entry', 'ROC15_At_Entry',
]

# Per-trade bookkeeping columns: not in the feature list above, but carried
# along when predictions are exported.
trade_info = [
    'Strategy',
    'Trade_ID',
    'Entry_Date',
    'Exit_Date',
    'Profit_Loss',
]

# For each split, pair the model inputs (the selected feature columns)
# with the label taken from the 'Target' column.
X_train, y_train = train_data[selected_features], train_data['Target']
X_val, y_val = val_data[selected_features], val_data['Target']
X_test, y_test = test_data[selected_features], test_data['Target']

# Random Forest configuration, gathered in one mapping so it is easy to tweak.
rf_hyperparams = {
    'n_estimators': 20,       # number of trees in the forest
    'max_depth': 10,          # maximum depth of each tree
    'min_samples_split': 50,  # minimum samples needed to split an internal node
    'min_samples_leaf': 10,   # minimum samples required at a leaf node
    'bootstrap': True,        # build each tree on a bootstrap sample
    'random_state': 42,       # fixed seed for reproducible runs
}
rf_clf = RandomForestClassifier(**rf_hyperparams)

# Train on the training split only
rf_clf.fit(X_train, y_train)

def _evaluate_split(model, features, labels, split_name):
    """Predict on one data split, print its metrics, and return the results.

    Replaces two copies of the same predict / report / ROC AUC / preview
    sequence that differed only in the data passed in and the printed label.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        features: Feature matrix for the split.
        labels: True target values for the split.
        split_name: Prefix for the printed lines ("Validation" or "Test").

    Returns:
        Tuple ``(predictions, probabilities, roc_auc)``. ``probabilities`` is
        the full ``predict_proba`` output; ``roc_auc`` is computed from its
        second column.
    """
    predictions = model.predict(features)
    probabilities = model.predict_proba(features)  # Probabilities for both classes

    # Per-class precision / recall / F1 for the hard predictions
    print(classification_report(labels, predictions))

    # ROC AUC is scored on column 1 of the probability matrix
    roc_auc = roc_auc_score(labels, probabilities[:, 1])
    print(f"{split_name} ROC AUC: {roc_auc}")

    # Show the first 5 probability rows for a quick manual inspection
    print(f"{split_name} set probabilities (first 5 samples):")
    print(probabilities[:5])

    return predictions, probabilities, roc_auc


# Evaluate on the validation set first, then on the test set
y_val_pred, y_val_proba, roc_auc_val = _evaluate_split(rf_clf, X_val, y_val, "Validation")
y_test_pred, y_test_proba, roc_auc_test = _evaluate_split(rf_clf, X_test, y_test, "Test")

# Columns written to each predictions file: trade bookkeeping fields, the
# model inputs, the true label, and the predicted probability.
export_columns = [*trade_info, *selected_features, 'Target', 'Predicted_Prob']

# Validation split: attach column 1 of the probability matrix, then write
# the selected columns to CSV without the index.
val_data['Predicted_Prob'] = y_val_proba[:, 1]
val_predictions_with_proba = val_data[export_columns]
val_predictions_with_proba.to_csv('val_predictions_with_proba.csv', index=False)

# Test split: same treatment as the validation split.
test_data['Predicted_Prob'] = y_test_proba[:, 1]
test_predictions_with_proba = test_data[export_columns]
test_predictions_with_proba.to_csv('test_predictions_with_proba.csv', index=False)

print("Validation and test predictions with probabilities have been saved to CSV files.")

              precision    recall  f1-score   support

         0.0       0.54      0.52      0.53       454
         1.0       0.56      0.58      0.57       472

    accuracy                           0.55       926
   macro avg       0.55      0.55      0.55       926
weighted avg       0.55      0.55      0.55       926

Validation ROC AUC: 0.581434798028821
Validation set probabilities (first 5 samples):
[[0.47281835 0.52718165]
 [0.48910959 0.51089041]
 [0.47398377 0.52601623]
 [0.53342987 0.46657013]
 [0.47027463 0.52972537]]
              precision    recall  f1-score   support

         0.0       0.55      0.51      0.53       461
         1.0       0.55      0.59      0.57       466

    accuracy                           0.55       927
   macro avg       0.55      0.55      0.55       927
weighted avg       0.55      0.55      0.55       927

Test ROC AUC: 0.5679480137413535
Test set probabilities (first 5 samples):
[[0.47421988 0.52578012]
 [0.59024039 0.40975961]
 [0.55369