In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def _indent(text, prefix="    "):
    """Return *text* with *prefix* inserted at the start of every line."""
    return prefix + text.replace("\n", "\n" + prefix)


def _format_class_counts(y):
    """Return the per-class sample counts of *y* as ``"<count> (<label>), ..."``.

    Uses ``np.unique`` rather than ``np.bincount`` so that any number of
    classes and any label values are handled, not only the labels 0 and 1.
    """
    labels, counts = np.unique(y, return_counts=True)
    return ", ".join(f"{count} ({label})" for label, count in zip(labels, counts))


def _load_split(split_path):
    """Read the four header-less CSV files of one split directory.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as NumPy arrays; the two
        target arrays are flattened to 1-D.

    Raises:
        FileNotFoundError: if one of the files does not exist.
        pandas.errors.EmptyDataError / ParserError: if a file cannot be parsed.
    """
    def read(file_name):
        return pd.read_csv(split_path / file_name, header=None).values

    return (
        read("train_features.csv"),
        read("train_target.csv").ravel(),
        read("test_features.csv"),
        read("test_target.csv").ravel(),
    )


def main(base_dir="../dataset_classification_toy"):
    """
    Loads split_0 for each dataset, prints stats, trains a Random Forest,
    and evaluates its performance.

    Args:
        base_dir: Directory whose sub-directories are the datasets. Each
            dataset is expected to contain a ``split_0`` folder holding
            ``train_features.csv``, ``train_target.csv``,
            ``test_features.csv`` and ``test_target.csv``.

    Datasets with a missing ``split_0`` folder or unreadable files are
    reported and skipped; nothing is returned.
    """
    base_dir = Path(base_dir)

    if not base_dir.is_dir():
        print(f"Error: Base directory '{base_dir}' not found.")
        return

    # Sorted so datasets are always processed in the same order.
    dataset_dirs = sorted(d for d in base_dir.iterdir() if d.is_dir())

    for dataset_dir in dataset_dirs:
        dataset_name = dataset_dir.name
        print(f"\n{'='*20} Processing: {dataset_name} {'='*20}")

        split_path = dataset_dir / "split_0"

        if not split_path.is_dir():
            print(f"  - split_0 not found for {dataset_name}. Skipping.")
            continue

        # --- 1. Load Data ---
        # Empty or malformed CSVs are skipped just like missing ones, so a
        # single broken dataset does not abort the whole run.
        try:
            X_train, y_train, X_test, y_test = _load_split(split_path)
        except (
            FileNotFoundError,
            pd.errors.EmptyDataError,
            pd.errors.ParserError,
        ) as e:
            print(f"  - Error loading files in {split_path}: {e}. Skipping.")
            continue

        # --- 2. Print Statistics ---
        print("\n## Data Statistics (split_0)")
        print(f"  - Training features shape: {X_train.shape}")
        print(f"  - Training target shape:  {y_train.shape}")
        print(f"  - Test features shape:     {X_test.shape}")
        print(f"  - Test target shape:      {y_test.shape}")

        # Each entry is printed as "<count> (<label>)".
        print("\n  - Class Distribution (Negative: 0, Positive: 1):")
        print(f"    - Training Set: {_format_class_counts(y_train)}")
        print(f"    - Test Set:     {_format_class_counts(y_test)}")

        # --- 3. Train Random Forest Model ---
        # Using random_state for reproducible results
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # --- 4. Evaluate Model ---
        y_pred = model.predict(X_test)

        # Fix the label set explicitly so the matrix keeps one row/column per
        # known class even when the test split happens to miss a class.
        class_labels = np.unique(np.concatenate([y_train, y_test]))

        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)

        print("\n## Model Evaluation (Random Forest)")
        print(f"  - Accuracy: {accuracy:.4f}")
        print("\n  - Classification Report:")
        # Indent the report for better readability
        print(_indent(report))

        print("  - Confusion Matrix:")
        if len(class_labels) == 2:
            # The TN/FP/FN/TP legend only makes sense for two classes.
            print("      [TN, FP]")
            print("      [FN, TP]")
        else:
            print(
                "      rows = true class, columns = predicted class; "
                f"labels: {class_labels.tolist()}"
            )
        # Indent every row of the matrix, not just the first one.
        print(_indent(str(cm)))
        

# Script entry point: run the evaluation only when this code is executed
# directly (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()





## Data Statistics (split_0)
  - Training features shape: (1234, 4)
  - Training target shape:  (1234,)
  - Test features shape:     (138, 4)
  - Test target shape:      (138,)

  - Class Distribution (Negative: 0, Positive: 1):
    - Training Set: 685 (0), 549 (1)
    - Test Set:     77 (0), 61 (1)

## Model Evaluation (Random Forest)
  - Accuracy: 1.0000

  - Classification Report:
                  precision    recall  f1-score   support
    
               0       1.00      1.00      1.00        77
               1       1.00      1.00      1.00        61
    
        accuracy                           1.00       138
       macro avg       1.00      1.00      1.00       138
    weighted avg       1.00      1.00      1.00       138
    
  - Confusion Matrix:
      [TN, FP]
      [FN, TP]
    [[77  0]
 [ 0 61]]


## Data Statistics (split_0)
  - Training features shape: (512, 30)
  - Training target shape:  (512,)
  - Test features shape:     (57, 30)
  - Test target shape:      (5