In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import json

# Load the dataset containing the feature combinations
combinations_file_path = 'feature_combination_results_retest.csv'
combinations_data = pd.read_csv(combinations_file_path)

# Filter the dataset for combinations with accuracy greater than 53%
filtered_combinations = combinations_data[combinations_data['Accuracy'] > 0.53]

# Load the train, validation, and test datasets
train_data = pd.read_csv('train_data.csv')
val_data = pd.read_csv('val_data.csv')
test_data = pd.read_csv('test_data.csv')

# Ensure the data is sorted by the 'Entry_Date' column
train_data['Entry_Date'] = pd.to_datetime(train_data['Entry_Date'])
val_data['Entry_Date'] = pd.to_datetime(val_data['Entry_Date'])
test_data['Entry_Date'] = pd.to_datetime(test_data['Entry_Date'])

train_data = train_data.sort_values(by='Entry_Date')
val_data = val_data.sort_values(by='Entry_Date')
test_data = test_data.sort_values(by='Entry_Date')

# Prepare to store results
results = []

# Loop over each feature combination
for index, row in filtered_combinations.iterrows():
    feature_list_str = row['Features']
    selected_features = json.loads(feature_list_str.replace("'", "\""))
    
    # Separating features and target variable
    X_train = train_data[selected_features]
    y_train = train_data['Target']
    X_val = val_data[selected_features]
    y_val = val_data['Target']
    X_test = test_data[selected_features]
    y_test = test_data['Target']

    # Initialize the Random Forest Classifier
    rf_clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=25,
        min_samples_leaf=5,
        bootstrap=True,
        random_state=42
    )

    # Fit the model to the training data
    rf_clf.fit(X_train, y_train)

    # Predict on the validation set
    y_val_pred = rf_clf.predict(X_val)

    # Calculate evaluation metrics for the validation set
    val_report = classification_report(y_val, y_val_pred, output_dict=True)
    roc_auc_val = roc_auc_score(y_val, y_val_pred)

    # Predict on the test set
    y_test_pred = rf_clf.predict(X_test)

    # Calculate evaluation metrics for the test set
    test_report = classification_report(y_test, y_test_pred, output_dict=True)
    roc_auc_test = roc_auc_score(y_test, y_test_pred)

    # Store the results
    results.append({
        'Features': feature_list_str,
        'Validation Report': val_report,
        'Validation ROC AUC': roc_auc_val,
        'Test Report': test_report,
        'Test ROC AUC': roc_auc_test
    })

    # Print progress update
    print(f"Completed {index + 1}/{len(filtered_combinations)} tests.")

    # Save results every 10 tests
    if (index + 1) % 10 == 0:
        partial_results_df = pd.DataFrame(results)
        partial_results_df.to_csv('partial_feature_combination_results.csv', index=False)
        print(f"Saved progress at {index + 1} tests.")

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Save final results to a CSV file
results_df.to_csv('feature_combination_results.csv', index=False)

print("Final results have been saved to 'feature_combination_results.csv'")


Completed 8/915 tests.
Completed 17/915 tests.
Completed 27/915 tests.
Completed 33/915 tests.
Completed 38/915 tests.
Completed 50/915 tests.
Saved progress at 50 tests.
Completed 51/915 tests.
Completed 52/915 tests.
Completed 53/915 tests.
Completed 59/915 tests.
Completed 62/915 tests.
Completed 74/915 tests.
Completed 91/915 tests.
Completed 98/915 tests.
Completed 104/915 tests.
Completed 107/915 tests.
Completed 119/915 tests.
Completed 127/915 tests.
Completed 136/915 tests.
Completed 137/915 tests.
Completed 138/915 tests.
Completed 140/915 tests.
Saved progress at 140 tests.
Completed 151/915 tests.
Completed 152/915 tests.
Completed 153/915 tests.
Completed 154/915 tests.
Completed 155/915 tests.
Completed 156/915 tests.
Completed 163/915 tests.
Completed 168/915 tests.
Completed 181/915 tests.
Completed 182/915 tests.
Completed 187/915 tests.
Completed 188/915 tests.
Completed 200/915 tests.
Saved progress at 200 tests.
Completed 206/915 tests.
Completed 228/915 tests.
Comp