In [None]:
import pandas as pd
import numpy as np
from scipy.io import arff

# ==========================================
# STEP 1: LOAD & DECODE THE DATA
# ==========================================
# Read the ARFF file: `data` holds the records, `meta` the attribute info
data, meta = arff.loadarff('bone-marrow.arff')
df = pd.DataFrame(data)

# Object-dtype columns hold raw bytes (e.g. b'survived'); convert them to str
bytes_columns = list(df.select_dtypes(include=[object]).columns)
for col in bytes_columns:
    df[col] = df[col].str.decode('utf-8')

# ==========================================
# STEP 2: PREPROCESSING (The "Sanitizer")
# ==========================================
target_col = 'survival_status'

# 1. Handle Missing Values
#    '?' placeholders become NaN first. Numeric columns are then filled with
#    their column mean.
df = df.replace('?', np.nan)
df = df.fillna(df.mean(numeric_only=True))

#    Text columns have no mean, so the fill above skips them. Left as NaN,
#    pd.factorize would encode the gap as the sentinel code -1 and the
#    standardization below would treat it as one more ordinal level. Fill
#    feature columns with their most frequent value instead.
#    Any non-numeric dtype counts as text here, not only 'object', so string
#    columns stored under another dtype are still encoded.
text_cols = [c for c in df.columns if not pd.api.types.is_numeric_dtype(df[c])]
for col in text_cols:
    if col == target_col:
        # Never invent labels. NOTE(review): rows with a missing target still
        # end up with y == -1 below -- confirm the target has no gaps.
        continue
    most_common = df[col].mode(dropna=True)
    if not most_common.empty:  # an all-NaN column has no mode; leave it as is
        df[col] = df[col].fillna(most_common.iloc[0])

# 2. Categorical Encoding: Turn text columns (Gender, etc.) into integer codes
#    NOTE(review): codes follow order of first appearance, so which class of
#    the target becomes 0 and which becomes 1 depends on row order -- confirm
#    this matches the intended meaning of y.
for col in text_cols:
    df[col] = pd.factorize(df[col])[0]

# 3. Separate Features (X) and Target (y)
#    NOTE(review): every remaining column is used as a feature. Confirm none
#    of them is recorded after, or derived from, the outcome (which would
#    leak the target into X).
X = df.drop(target_col, axis=1).values.astype(float)
y = df[target_col].values.astype(float)

# 4. STANDARDIZATION (Crucial to prevent 'Overflow' errors)
#    Zero mean / unit variance per column; the tiny epsilon keeps a constant
#    column from dividing by zero (it simply becomes all zeros).
X = (X - np.mean(X, axis=0)) / (np.std(X, axis=0) + 1e-15)

# 5. INTERCEPT COLUMN
#    After standardization every feature has mean zero, so a model of the
#    form sigmoid(X @ w) is forced to predict 0.5 at the average sample and
#    cannot represent the class base rate. A trailing column of ones gives
#    the last weight the role of a bias term; the original feature columns
#    keep their positions.
X = np.hstack([X, np.ones((X.shape[0], 1))])

# ==========================================
# STEP 3: THE MATH ENGINE (The "Scratch" Build)
# ==========================================
def sigmoid(z):
    """Logistic function: map *z* (scalar or array) to a value between 0 and 1.

    The input is limited to [-250, 250] before exponentiation so that
    ``np.exp`` cannot overflow on extreme values.
    """
    bounded = np.clip(z, -250, 250)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

# Reproducible random starting point: one weight per column of X
np.random.seed(42)
weights = np.random.randn(X.shape[1])

# ==========================================
# STEP 4: GRADIENT DESCENT (The "Learning" Loop)
# ==========================================
learning_rate = 0.1
epochs = 2000

print("Starting training...")
for i in range(epochs):
    # Forward pass: current probability for every row
    z = X.dot(weights)
    probs = sigmoid(z)

    # Gradient of the mean log-loss with respect to the weights
    error = probs - y
    gradient = X.T.dot(error) / len(y)

    # Step against the gradient
    weights = weights - (learning_rate * gradient)

    # Only report on every 200th iteration
    if i % 200 != 0:
        continue

    # Log-loss of the predictions made *before* this iteration's update;
    # the 1e-15 keeps log() away from zero
    log_pos = np.log(probs + 1e-15)
    log_neg = np.log(1 - probs + 1e-15)
    loss = -np.mean(y * log_pos + (1 - y) * log_neg)
    print(f"Iteration {i}: Loss = {loss:.4f}")

# ==========================================
# STEP 5: FINAL EVALUATION
# ==========================================
# Score the same X the weights were fitted on and threshold at 0.5,
# so the figure printed below is accuracy on the training rows.
final_z = X.dot(weights)
final_probs = sigmoid(final_z)
final_preds = np.where(final_probs > 0.5, 1, 0)

# Fraction of rows whose predicted class equals the label
accuracy = np.mean(final_preds == y)
print(f"\n--- MISSION COMPLETE ---")
print(f"Final Model Accuracy: {accuracy * 100:.2f}%")

Starting training...
Iteration 0: Loss = 1.4200
Iteration 200: Loss = 0.1983
Iteration 400: Loss = 0.1426
Iteration 600: Loss = 0.1205
Iteration 800: Loss = 0.1071
Iteration 1000: Loss = 0.0977
Iteration 1200: Loss = 0.0905
Iteration 1400: Loss = 0.0848
Iteration 1600: Loss = 0.0802
Iteration 1800: Loss = 0.0762

--- MISSION COMPLETE ---
Final Model Accuracy: 98.40%
