In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

In [2]:
# --- 1. Define Constants and Load Data Splits ---
# Directory with the split CSVs; the path is relative to the notebook's
# working directory.
PROCESSED_DATA_DIR = '../data/processed/'

# Each CSV is later passed to load_data_and_flatten, which reads its
# 'filepath' and 'label' columns.
train_df = pd.read_csv(
    os.path.join(PROCESSED_DATA_DIR, 'train.csv')
)
val_df = pd.read_csv(
    os.path.join(PROCESSED_DATA_DIR, 'val.csv')
)

In [3]:
# --- 2. Create a Function to Load and Flatten Images ---
def load_data_and_flatten(dataframe):
    """
    Load the .npy image arrays listed in a dataframe and flatten each into a 1D vector.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide a 'filepath' column (location of each .npy file, joined
        onto the project root) and a 'label' column.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The stacked flattened images (one row per dataframe row) and the
        corresponding labels, both in dataframe row order.

    Raises
    ------
    ValueError
        If an image flattens to a different length than the first image
        loaded, since the vectors could not form a rectangular matrix.
    """
    # The project root sits two levels above PROCESSED_DATA_DIR; abspath
    # resolves it against the current working directory.
    project_root = os.path.abspath(os.path.join(PROCESSED_DATA_DIR, '..', '..'))

    images = []
    labels = []
    expected_size = None

    # Walk only the two columns that are needed. iterrows() would build a
    # Series for every row (slow) and may upcast the values' dtypes.
    path_label_pairs = zip(dataframe['filepath'], dataframe['label'])

    for rel_path, label in tqdm(path_label_pairs, total=len(dataframe), desc="Loading data"):
        # Construct the absolute path to the image
        img_path = os.path.join(project_root, rel_path)

        # Load the processed .npy array and collapse it to one dimension
        # (e.g. a 128x128 image becomes a vector of length 16384). ravel()
        # avoids an extra copy; np.array() below copies the data anyway.
        flat_img = np.load(img_path).ravel()

        # Fail early, naming the file, instead of letting np.array() produce
        # an object array or an opaque "inhomogeneous shape" error.
        if expected_size is None:
            expected_size = flat_img.size
        elif flat_img.size != expected_size:
            raise ValueError(
                f"Image {img_path!r} has {flat_img.size} values after flattening; "
                f"expected {expected_size} to match the first image."
            )

        images.append(flat_img)
        labels.append(label)

    return np.array(images), np.array(labels)

In [4]:
# --- 3. Prepare Training and Validation Sets ---
# Training split: flattened image matrix and its label vector.
print("Preparing training data...")
X_train, y_train = load_data_and_flatten(dataframe=train_df)

# Validation split, loaded the same way.
print("\nPreparing validation data...")
X_val, y_val = load_data_and_flatten(dataframe=val_df)

# Report both matrix shapes (rows = samples, columns = flattened pixels).
print(
    f"\nTraining data shape: {X_train.shape}",
    f"Validation data shape: {X_val.shape}",
    sep="\n",
)

Preparing training data...


Loading data: 100%|██████████| 1750/1750 [00:00<00:00, 5801.73it/s]



Preparing validation data...


Loading data: 100%|██████████| 375/375 [00:00<00:00, 4884.41it/s]


Training data shape: (1750, 16384)
Validation data shape: (375, 16384)





In [5]:
# --- 4. Train the Logistic Regression Model ---
print("\nTraining Logistic Regression model...")

# Build the classifier. The iteration cap is raised to 1000 so the solver
# has room to converge on this many features; random_state is pinned to 42.
model = LogisticRegression(random_state=42, max_iter=1000)

# Fit on the flattened training images and their labels.
model.fit(X=X_train, y=y_train)

print("Model training complete.")


Training Logistic Regression model...
Model training complete.


In [6]:
# --- 5. Evaluate the Model ---
print("\nEvaluating model on the validation set...")

# Predicted class for every validation sample
y_pred = model.predict(X_val)

# Overall fraction of validation samples classified correctly
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f"\nValidation Accuracy: {accuracy:.4f}")

# Per-class precision, recall and F1 alongside the support counts
print("\nClassification Report:")
print(
    classification_report(
        y_true=y_val,
        y_pred=y_pred,
        target_names=['Normal (0)', 'Hemorrhage (1)'],
    )
)


Evaluating model on the validation set...

Validation Accuracy: 0.9387

Classification Report:
                precision    recall  f1-score   support

    Normal (0)       0.96      0.97      0.96       328
Hemorrhage (1)       0.76      0.74      0.75        47

      accuracy                           0.94       375
     macro avg       0.86      0.86      0.86       375
  weighted avg       0.94      0.94      0.94       375

