# Baseline Model: Logistic Regression

Train a baseline logistic regression model for comparison.

**Acceptance Criteria:**
- Model trained successfully
- ROC-AUC reported
- Baseline documented

In [None]:
import warnings
# Silence every warning for the rest of the session to keep cell output compact.
# NOTE(review): a blanket "ignore" filter also hides library warnings such as
# convergence or deprecation notices — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

## 1. Load Data

In [None]:
# Read the main application table, then report its dimensions and the
# relative frequency of each TARGET value.
df = pd.read_csv('../data/raw/application_train.csv')

table_shape = df.shape
target_share = df['TARGET'].value_counts(normalize=True)  # fractions, not counts
print(f"Dataset shape: {table_shape}")
print(f"Target distribution:\n{target_share}")

In [None]:
# Show the first rows of the raw table as the cell output (head() with its default row count).
df.head()

## 2. Basic Preprocessing

In [None]:
# Build the label vector and the feature matrix from the raw table.
# SK_ID_CURR is removed together with TARGET, so neither is used as a feature.
y = df['TARGET']
X = df.drop(columns=['SK_ID_CURR', 'TARGET'])

for label, part in (("Features", X), ("Target", y)):
    print(f"{label} shape: {part.shape}")

In [None]:
# Partition the feature columns by dtype so each group gets its own preprocessing.
# 'number' matches every integer/float width rather than only int64/float64, so
# narrower numeric columns are not silently left out of imputation.
numeric_cols = X.select_dtypes(include='number').columns.tolist()
# Everything that is neither numeric nor boolean is treated as categorical.
# Selecting by exclusion (instead of include=['object']) also picks up columns
# stored with a dedicated string dtype; booleans stay in neither list, as before.
categorical_cols = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()

print(f"Numeric columns: {len(numeric_cols)}")
print(f"Categorical columns: {len(categorical_cols)}")
print(f"\nCategorical columns: {categorical_cols}")

In [None]:
# Median-impute the numeric features in place; columns without gaps are left untouched.
# NOTE(review): each median is taken over all rows of X, i.e. before the
# train/test split further down, so held-out rows influence the fill values.
# Consider deriving the medians from the training split only.
incomplete_numeric = [name for name in numeric_cols if X[name].isnull().any()]
for col in incomplete_numeric:
    X[col] = X[col].fillna(X[col].median())

remaining_numeric_nulls = X[numeric_cols].isnull().sum().sum()
print(f"Missing values in numeric columns after filling: {remaining_numeric_nulls}")

In [None]:
# Integer-encode every categorical feature and keep the fitted encoder per
# column in `label_encoders` so the mapping can be inspected or reused later.
# NOTE(review): integer codes imply an ordering the categories may not have;
# accepted here for baseline simplicity.
label_encoders = {}
for col in categorical_cols:
    # Missing entries become the explicit category 'Missing'; values are then
    # cast to str so the encoder sees a single, uniform type.
    as_text = X[col].fillna('Missing').astype(str)
    le = LabelEncoder()
    X[col] = le.fit_transform(as_text)
    label_encoders[col] = le

n_encoded = len(categorical_cols)
print(f"Encoded {n_encoded} categorical columns")

In [None]:
# Sanity check: count the NaNs left anywhere in X, then preview the processed features.
total_missing = X.isna().sum().sum()
print(f"Total missing values: {total_missing}")
X.head()

## 3. Train-Test Split

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions approximately equal in both parts; the fixed seed makes the
# split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Class proportions per split, as plain dicts for compact printing.
train_balance = y_train.value_counts(normalize=True).to_dict()
test_balance = y_test.value_counts(normalize=True).to_dict()

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nTarget distribution in train: {train_balance}")
print(f"Target distribution in test: {test_balance}")

In [None]:
# Standardise the features: the scaler learns its statistics from the training
# split only, and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully")

## 4. Train Logistic Regression Model

In [None]:
# Fit the baseline classifier on the standardised training features.
# NOTE(review): n_jobs may have no effect for a single binary lbfgs fit —
# confirm against the installed scikit-learn version.
lr_settings = {
    'solver': 'lbfgs',
    'max_iter': 1000,      # iteration cap for the solver
    'random_state': 42,
    'n_jobs': -1,
}
model = LogisticRegression(**lr_settings).fit(X_train_scaled, y_train)
print("Model trained successfully!")

## 5. Evaluate Model

In [None]:
# Score both splits using the probability in column 1 of predict_proba
# (presumably the positive class, TARGET == 1 — verify via model.classes_).
y_train_pred_proba, y_test_pred_proba = (
    model.predict_proba(features)[:, 1]
    for features in (X_train_scaled, X_test_scaled)
)

# ROC-AUC on train and test; the gap between them hints at over- or under-fitting.
train_auc = roc_auc_score(y_train, y_train_pred_proba)
test_auc = roc_auc_score(y_test, y_test_pred_proba)

rule = "=" * 50
summary_lines = [
    rule,
    "BASELINE MODEL RESULTS",
    rule,
    f"Training ROC-AUC: {train_auc:.4f}",
    f"Test ROC-AUC:     {test_auc:.4f}",
    rule,
]
print("\n".join(summary_lines))

In [None]:
# Hard class labels from predict() (the model's default decision rule),
# summarised per class on the held-out split.
y_test_pred = model.predict(X_test_scaled)
report_text = classification_report(y_test, y_test_pred)

print("Classification Report:", report_text, sep="\n")

In [None]:
# Cross-tabulate true vs. predicted test labels
# (scikit-learn convention: rows are true classes, columns are predictions).
cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
print("Confusion Matrix:", cm, sep="\n")

## 6. Baseline Documentation

### Model Summary
- **Model**: Logistic Regression
- **Features**: All columns from application_train.csv (excluding SK_ID_CURR and TARGET)
- **Preprocessing**:
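  - Numeric columns: Missing values filled with the column median (computed on the full dataset, before the train/test split)
  - Categorical columns: Missing values filled with 'Missing', then label encoded
  - All features scaled with StandardScaler (fit on the training split only)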
- **Train/Test Split**: 80/20, stratified by target

### Performance Metrics
| Metric | Train | Test |
|--------|-------|------|
| ROC-AUC | TBD (`train_auc`, printed in Section 5) | TBD (`test_auc`, printed in Section 5) |

### Notes
- This is a baseline model using only the main application table
- No feature engineering from bureau, previous_application, or other tables
- Future models with engineered features should be compared against this baseline

In [None]:
# Collect the baseline metrics and dataset sizes in one dict so later
# experiments can be compared against them. Nothing is written to disk here;
# the dict lives in the notebook session and is echoed below.
baseline_results = dict(
    model='Logistic Regression',
    features='application_train only',
    train_auc=train_auc,
    test_auc=test_auc,
    n_features=len(X.columns),
    n_train_samples=len(X_train),
    n_test_samples=len(X_test),
)

print("Baseline Results Summary:")
for name in baseline_results:
    print(f"  {name}: {baseline_results[name]}")