In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import cv2
from tqdm import tqdm

# Seed NumPy's global random generator so repeated runs behave the same
np.random.seed(42)

# Read the image index table (one row per image) from CSV
file_path = ''
data = pd.read_csv(file_path)

# Binary target taken from the 'filename' text:
# 0 when it contains 'benign', 1 for everything else
data['label'] = data['filename'].map(lambda name: int('benign' not in name))
# Keep only the rows recorded at magnification (mag) 40
data = data.loc[data['mag'].eq(40)]

# Parameters
target_size = (224, 224)  # Size every image is resized to before flattening

# Function to preprocess images
def preprocess_images(dataframe, target_size):
    """Load every image listed in *dataframe* as a flat, normalized row.

    Each image is read with OpenCV, resized to ``target_size``, converted
    from BGR to RGB, scaled by 1/255 and flattened, so the result can be fed
    to an estimator that expects a 2-D feature matrix.

    Args:
        dataframe: Table with a 'filename' column (image paths) and a
            'label' column.
        target_size: ``(width, height)`` tuple handed to ``cv2.resize``.

    Returns:
        Tuple ``(images, labels)``. ``images`` is a float32 array of shape
        ``(len(dataframe), width * height * 3)``; ``labels`` is a 1-D array
        with the 'label' values in the same row order.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot load an image.
    """
    width, height = target_size
    n_samples = len(dataframe)

    # Fill one preallocated float32 matrix instead of collecting float64
    # vectors in a list and copying them all again with np.array(): the old
    # approach needed roughly twice the float64 dataset at its peak.
    # scikit-learn's forest estimators document that they convert X to
    # float32 internally, so float64 storage bought no extra precision.
    # An empty dataframe now yields a well-formed (0, n_features) matrix.
    images = np.empty((n_samples, width * height * 3), dtype=np.float32)
    labels = np.array(dataframe['label'].tolist())

    # Iterate the column directly; building a row Series per image via
    # iloc is needless overhead.
    paths = tqdm(dataframe['filename'], total=n_samples, desc="Preprocessing images")
    for row_idx, img_path in enumerate(paths):
        # The prefix is currently empty, so the path is used as stored.
        img_path = "" + img_path

        # cv2.imread signals an unreadable/missing file by returning None.
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f"Image not found: {img_path}")

        img = cv2.resize(img, target_size)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # Normalize and flatten; the assignment casts the row to float32.
        images[row_idx] = img.ravel() / 255.0

    return images, labels

# Split data into train and test using the predefined 'grp' column
train_df = data[data['grp'] == 'train']
test_df = data[data['grp'] == 'test']

# Fail early with a clear message; an empty subset would otherwise only show
# up further down as a less obvious error.
if train_df.empty or test_df.empty:
    raise ValueError(
        f"Empty data subset: {len(train_df)} train rows, {len(test_df)} test rows"
    )

# Preprocess data (loads every image from disk into a flat feature row)
X_train, y_train = preprocess_images(train_df, target_size)
X_test, y_test = preprocess_images(test_df, target_size)

# Further split train data into training and validation sets
# NOTE(review): this split is not stratified; if the classes are imbalanced,
# consider passing stratify=y_train so both parts keep the same class ratio.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize Random Forest model
# class_weight='balanced' reweights classes by inverse frequency;
# n_jobs=-1 uses all available cores.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)

# Train the model
print("Training Random Forest model...")
rf_model.fit(X_train, y_train)

# Validate the model on the held-out 20% of the training data
y_val_pred = rf_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

# Test the model on the predefined test group
y_test_pred = rf_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Print classification report
# Labels are encoded as 0 = benign, 1 = malignant. Passing labels=[0, 1]
# explicitly keeps the report working even when the test labels and
# predictions contain only one class; without it, classification_report
# raises because the inferred class count no longer matches target_names.
target_names = ['Benign', 'Malignant']
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred, labels=[0, 1], target_names=target_names))
