In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from itertools import product

# Load the train / validation / test splits. Each file goes through the same
# three steps: read the CSV, parse 'Entry_Date' into datetimes, and order the
# rows by that column.
def _load_sorted_by_entry_date(csv_path):
    """Return the CSV at *csv_path* as a DataFrame sorted by 'Entry_Date'."""
    frame = pd.read_csv(csv_path)
    frame['Entry_Date'] = pd.to_datetime(frame['Entry_Date'])
    return frame.sort_values(by='Entry_Date')


train_data = _load_sorted_by_entry_date('train_data.csv')
val_data = _load_sorted_by_entry_date('val_data.csv')
test_data = _load_sorted_by_entry_date('test_data.csv')

# Columns handed to the model as inputs; every other column in the CSVs
# (apart from the target below) is left out of X.
feature_columns = [
    'SMA5_At_Entry',
    'SMA7_At_Entry',
    'EMA5_At_Entry',
    'EMA15_At_Entry',
    'RSI5_At_Entry',
    'RSI10_At_Entry',
    'ATR5_At_Entry',
    'ATR15_At_Entry',
    'Stoch5_K_At_Entry',
    'Stoch7_K_At_Entry',
    'BB5_High_At_Entry',
    'BB5_Low_At_Entry',
    'BB5_MAvg_At_Entry',
    'BB10_High_At_Entry',
    'BB10_Low_At_Entry',
    'BB10_MAvg_At_Entry',
    'Open',
    'High',
    'Low',
    'Last',
    'MACD_At_Entry',
    'Day_Of_Week_At_Entry',
    'ROC14_At_Entry',
]

# Name of the label column shared by all three splits.
target_column = 'Target'

# Split each dataset into its feature matrix (X) and label vector (y).
X_train, y_train = train_data[feature_columns], train_data[target_column]
X_val, y_val = val_data[feature_columns], val_data[target_column]
X_test, y_test = test_data[feature_columns], test_data[target_column]

# Hyper-parameter grid for the RandomForestClassifier; every combination of
# these values is fitted exactly once in the loop below.
param_grid = {
    'n_estimators': [5, 10, 50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 25],
    'min_samples_leaf': [1, 2, 4, 5]
}

# Column order of the results table.
result_columns = [
    'n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf',
    'roc_auc_val', 'roc_auc_test', 'roc_auc_diff'
]

# One dict per fitted combination. Rows are gathered in a plain list and the
# DataFrame is built once after the loop: growing a DataFrame with pd.concat
# on every iteration re-copies all earlier rows each time, and seeding it with
# an empty (object-dtype) frame makes the resulting column dtypes depend on
# how the installed pandas treats empty frames in concat.
result_rows = []

# Iterate through all combinations of parameters
for n_estimators, max_depth, min_samples_split, min_samples_leaf in product(
    param_grid['n_estimators'],
    param_grid['max_depth'],
    param_grid['min_samples_split'],
    param_grid['min_samples_leaf']
):
    # Fixed random_state so each combination is reproducible between runs.
    rf_clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )

    # Fit the model to the training data
    rf_clf.fit(X_train, y_train)

    # ROC AUC on the validation set, scored with the probability column at
    # index 1 of predict_proba.
    y_val_pred = rf_clf.predict_proba(X_val)[:, 1]
    roc_auc_val = roc_auc_score(y_val, y_val_pred)

    # Same metric on the test set.
    y_test_pred = rf_clf.predict_proba(X_test)[:, 1]
    roc_auc_test = roc_auc_score(y_test, y_test_pred)

    # Absolute gap between validation and test ROC AUC
    roc_auc_diff = abs(roc_auc_val - roc_auc_test)

    # Print the results for this combination
    print(f"n_estimators: {n_estimators}, max_depth: {max_depth}, min_samples_split: {min_samples_split}, min_samples_leaf: {min_samples_leaf}")
    print(f"Validation ROC AUC: {roc_auc_val}, Test ROC AUC: {roc_auc_test}, Difference: {roc_auc_diff}\n")

    result_rows.append({
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'roc_auc_val': roc_auc_val,
        'roc_auc_test': roc_auc_test,
        'roc_auc_diff': roc_auc_diff
    })

# Build the results table in a single step.
results = pd.DataFrame(result_rows, columns=result_columns)

# max_depth mixes None ("no limit") with integers; the nullable integer dtype
# keeps the depths as whole numbers and represents None as a missing value.
results['max_depth'] = results['max_depth'].astype('Int64')

# Pick the parameter combination whose validation and test ROC AUC are
# closest to each other.
# NOTE(review): this criterion consults the test set during model selection
# and ignores how high either AUC is, so a combination that is equally poor on
# both sets can win. Confirm that this is the intended selection rule.
#
# The column is coerced to a numeric dtype before idxmin(): if it reaches this
# point as object dtype, pandas does not support arg-reductions on it.
roc_auc_gap = pd.to_numeric(results['roc_auc_diff'])
best_params = results.loc[roc_auc_gap.idxmin()]
print("\nBest parameters based on smallest ROC AUC difference:")
print(best_params)

# Save the full grid-search table (one row per combination) to a CSV file
results.to_csv('grid_search_results.csv', index=False)


n_estimators: 5, max_depth: None, min_samples_split: 2, min_samples_leaf: 1
Validation ROC AUC: 0.5025199731202867, Test ROC AUC: 0.5128452794354501, Difference: 0.010325306315163374

n_estimators: 5, max_depth: None, min_samples_split: 2, min_samples_leaf: 2
Validation ROC AUC: 0.5114238781452998, Test ROC AUC: 0.5246478545427462, Difference: 0.013223976397446435

n_estimators: 5, max_depth: None, min_samples_split: 2, min_samples_leaf: 4
Validation ROC AUC: 0.49659103636227875, Test ROC AUC: 0.5022878981128913, Difference: 0.00569686175061257

n_estimators: 5, max_depth: None, min_samples_split: 2, min_samples_leaf: 5
Validation ROC AUC: 0.5254213955051146, Test ROC AUC: 0.49508439388156, Difference: 0.03033700162355457

n_estimators: 5, max_depth: None, min_samples_split: 5, min_samples_leaf: 1
Validation ROC AUC: 0.5150615060106026, Test ROC AUC: 0.49572211929654697, Difference: 0.019339386714055662

n_estimators: 5, max_depth: None, min_samples_split: 5, min_samples_leaf: 2
Valida