# Baseline Model: Logistic Regression

Train a baseline logistic regression model for comparison.

**Acceptance Criteria:**
- Model trained successfully
- ROC-AUC reported
- Baseline documented

In [31]:
import warnings
# Blanket filter: every Python warning raised from this point on is suppressed,
# including any that pandas or scikit-learn would otherwise show in later cells.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

## 1. Load Data

In [32]:
# Read the raw application table and report its size and class balance
df = pd.read_csv('../data/raw/application_train.csv')
target_share = df['TARGET'].value_counts(normalize=True)

print(f"Dataset shape: {df.shape}")
print(f"Target distribution:\n{target_share}")

Dataset shape: (307511, 122)
Target distribution:
TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64


In [33]:
# Peek at the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


## 2. Basic Preprocessing

In [34]:
# TARGET is the label; everything except SK_ID_CURR and TARGET becomes a feature
y = df['TARGET']
X = df.drop(columns=['SK_ID_CURR', 'TARGET'])

print(f"Features shape: {X.shape}", f"Target shape: {y.shape}", sep="\n")

Features shape: (307511, 120)
Target shape: (307511,)


In [35]:
# Partition the feature columns by dtype: int64/float64 are treated as numeric,
# object columns as categorical.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Guard: a column with any other dtype (e.g. bool, int32, category) would land in
# neither list and silently skip the imputation and encoding loops that iterate
# over these lists. Fail loudly here instead of letting it reach the model untouched.
classified_cols = set(numeric_cols) | set(categorical_cols)
unclassified_cols = [c for c in X.columns if c not in classified_cols]
assert not unclassified_cols, f"Columns with unhandled dtypes: {unclassified_cols}"

print(f"Numeric columns: {len(numeric_cols)}")
print(f"Categorical columns: {len(categorical_cols)}")
print(f"\nCategorical columns: {categorical_cols}")

Numeric columns: 104
Categorical columns: 16

Categorical columns: ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']


In [36]:
# Median-impute every numeric column that contains at least one NaN.
# NOTE(review): the medians are computed over all rows of X, i.e. before the
# train/test split performed further down, so held-out rows contribute to them.
missing_per_col = X[numeric_cols].isnull().sum()
for col in missing_per_col[missing_per_col > 0].index:
    X[col] = X[col].fillna(X[col].median())

remaining_missing = X[numeric_cols].isnull().sum().sum()
print(f"Missing values in numeric columns after filling: {remaining_missing}")

Missing values in numeric columns after filling: 0


In [37]:
# Integer-encode each categorical column (simple choice for a baseline) and
# keep the fitted encoder per column in `label_encoders`.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    # NaNs become their own 'Missing' category before encoding
    filled = X[col].fillna('Missing').astype(str)
    X[col] = encoder.fit_transform(filled)
    label_encoders[col] = encoder

print(f"Encoded {len(categorical_cols)} categorical columns")

Encoded 16 categorical columns


In [38]:
# Sanity check: count NaNs across the whole feature matrix, then preview it
total_missing = X.isnull().sum().sum()
print(f"Total missing values: {total_missing}")
X.head()

Total missing values: 0


Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,0,1,0,1,0,202500.0,406597.5,24700.5,351000.0,7,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,1,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1,1,1,0,67500.0,135000.0,6750.0,135000.0,7,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,1,0,135000.0,312682.5,29686.5,297000.0,7,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,1,0,1,0,121500.0,513000.0,21865.5,513000.0,7,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


## 3. Train-Test Split

In [39]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# close to identical in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

for label, features in (("Training set", X_train), ("Test set", X_test)):
    print(f"{label}: {features.shape}")

train_dist = y_train.value_counts(normalize=True).to_dict()
test_dist = y_test.value_counts(normalize=True).to_dict()
print(f"\nTarget distribution in train: {train_dist}")
print(f"Target distribution in test: {test_dist}")

Training set: (246008, 120)
Test set: (61503, 120)

Target distribution in train: {0: 0.9192709180189262, 1: 0.08072908198107379}
Target distribution in test: {0: 0.9192722306228964, 1: 0.08072776937710356}


In [40]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the test partition.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully")

Features scaled successfully


## 4. Train Logistic Regression Model

In [41]:
# Baseline classifier: logistic regression with the L-BFGS solver; every
# hyperparameter not listed here is left at the library default.
lr_params = dict(solver='lbfgs', max_iter=1000, n_jobs=-1, random_state=42)
model = LogisticRegression(**lr_params)
model.fit(X_train_scaled, y_train)

print("Model trained successfully!")

Model trained successfully!


## 5. Evaluate Model

In [42]:
# Score each partition using the predicted probability of the positive class
# (column 1 of predict_proba) and compute ROC-AUC from it.
y_train_pred_proba = model.predict_proba(X_train_scaled)[:, 1]
train_auc = roc_auc_score(y_train, y_train_pred_proba)

y_test_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
test_auc = roc_auc_score(y_test, y_test_pred_proba)

rule = "=" * 50
summary_lines = [
    rule,
    "BASELINE MODEL RESULTS",
    rule,
    f"Training ROC-AUC: {train_auc:.4f}",
    f"Test ROC-AUC:     {test_auc:.4f}",
    rule,
]
print("\n".join(summary_lines))

BASELINE MODEL RESULTS
Training ROC-AUC: 0.7432
Test ROC-AUC:     0.7457


In [43]:
# Hard class labels from model.predict (the estimator's default decision rule)
y_test_pred = model.predict(X_test_scaled)

report_text = classification_report(y_test, y_test_pred)
print("Classification Report:", report_text, sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56538
           1       0.53      0.01      0.02      4965

    accuracy                           0.92     61503
   macro avg       0.72      0.50      0.49     61503
weighted avg       0.89      0.92      0.88     61503



In [44]:
# Confusion matrix on the test set: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:", cm, sep="\n")

Confusion Matrix:
[[56491    47]
 [ 4913    52]]


## 6. Baseline Documentation

### Model Summary
- **Model**: Logistic Regression
- **Features**: All columns from application_train.csv (excluding SK_ID_CURR and TARGET)
- **Preprocessing**:
  - Numeric columns: Missing values filled with the column median (computed on the full dataset, before the train/test split)
  - Categorical columns: Label encoded, missing values filled with 'Missing'
  - All features scaled with StandardScaler
- **Train/Test Split**: 80/20, stratified by target

### Performance Metrics
| Metric | Train | Test |
|--------|-------|------|
| ROC-AUC | 0.7432 | 0.7457 |

### Notes
- This is a baseline model using only the main application table
- No feature engineering from bureau, previous_application, or other tables
- Future models with engineered features should be compared against this baseline

In [45]:
# Gather the headline numbers in one dict so later models can be compared
# against them. The dict lives in memory only; nothing is written to disk here.
baseline_results = {
    'model': 'Logistic Regression',
    'features': 'application_train only',
    'train_auc': train_auc,
    'test_auc': test_auc,
    'n_features': X.shape[1],
    'n_train_samples': len(X_train),
    'n_test_samples': len(X_test),
}

print("Baseline Results Summary:")
print("\n".join(f"  {name}: {val}" for name, val in baseline_results.items()))

Baseline Results Summary:
  model: Logistic Regression
  features: application_train only
  train_auc: 0.7432084047180219
  test_auc: 0.7456812281463541
  n_features: 120
  n_train_samples: 246008
  n_test_samples: 61503
