In [26]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
import joblib

In [27]:
# Load data
# Read the home-loan training CSV straight from its GitHub raw URL.
url = "https://raw.githubusercontent.com/kenstare/Practice_datasets/master/home_loan_train.csv"
df = pd.read_csv(filepath_or_buffer=url)

In [28]:
# Preview the first five rows (rich display as the cell's last expression).
df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [29]:
# Structural overview of the loaded frame: dimensions, null count per column,
# and the list of column names.
missing_per_column = df.isnull().sum()
column_names = list(df.columns)

print(f"Original Shape: {df.shape}")
print(f"Missing Values:\n{missing_per_column}\n")
print(f"Columns: {column_names}\n")


Original Shape: (614, 13)
Missing Values:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Columns: ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']



In [30]:
# Drop Unnecessary Column
# 'Loan_ID' looks like a per-row identifier (e.g. 'LP001002'), so it is removed
# before modelling.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
df = df.drop(columns='Loan_ID', errors='ignore')

In [33]:
# Same overview as the first inspection cell, repeated on the current frame.
# NOTE(review): the "Original Shape" label is reused verbatim even though the
# frame has been modified since loading; consider renaming the label.
shape_now = df.shape
nulls_now = df.isnull().sum()
columns_now = list(df.columns)

print(f"Original Shape: {shape_now}")
print(f"Missing Values:\n{nulls_now}\n")
print(f"Columns: {columns_now}\n")

Original Shape: (614, 12)
Missing Values:
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Columns: ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']



In [39]:
# Handle Missing Values
# NOTE(review): medians/modes are computed on the full frame, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# fill values. Confirm whether that leakage is acceptable here.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Numerical: fill each column's gaps with that column's median
numeric_medians = df[numerical_cols].median()
df[numerical_cols] = df[numerical_cols].fillna(numeric_medians)

# Categorical: fill each column's gaps with that column's mode
# (.mode() may return several rows when values tie; the first row is used)
categorical_modes = df[categorical_cols].mode().iloc[0]
df[categorical_cols] = df[categorical_cols].fillna(categorical_modes)

# Total count of nulls left anywhere in the frame
remaining_missing = df.isnull().sum().sum()
print("Missing values after imputation:")
print(remaining_missing)

Missing values after imputation:
0


In [40]:
# Type Corrections
# Store 'Dependents' uniformly as strings so it is treated as a category.
df['Dependents'] = df['Dependents'].astype(str)

# Outlier Capping (99th percentile)
# Values above each column's own 99th percentile are pulled down to it.
# NOTE(review): the cap is derived from the current column contents, so
# re-running this cell re-clips already-capped data and the cap can drift.
for capped_col in ('ApplicantIncome', 'LoanAmount'):
    ceiling = df[capped_col].quantile(0.99)
    df[capped_col] = df[capped_col].clip(upper=ceiling)

income_max = df['ApplicantIncome'].max()
loan_max = df['LoanAmount'].max()

print("Outliers capped at 99th percentile:")
print(f"  ApplicantIncome max: {income_max:.2f}")
print(f"  LoanAmount max: {loan_max:.2f}\n")


Outliers capped at 99th percentile:
  ApplicantIncome max: 32540.41
  LoanAmount max: 495.87



### Feature Engineering

In [42]:
# Shorthand handles for the two base columns reused below
applicant_income = df['ApplicantIncome']
loan_amount = df['LoanAmount']

# Total Income: applicant plus co-applicant
df['TotalIncome'] = applicant_income + df['CoapplicantIncome']

# Income to Loan Ratio (+1 in the denominator avoids division by zero)
df['IncomeToLoanRatio'] = df['TotalIncome'] / (loan_amount + 1)

# Log Transformations (log1p, to reduce skewness)
df['LogApplicantIncome'] = np.log1p(applicant_income)
df['LogLoanAmount'] = np.log1p(loan_amount)

# Bin Loan Term into three bands: [0, 180], (180, 360], (360, inf)
bins = [0, 180, 360, np.inf]
labels = ['Short', 'Standard', 'Long']
df['TermBinned'] = pd.cut(
    df['Loan_Amount_Term'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

print("New features added:")
print("TotalIncome, IncomeToLoanRatio, LogApplicantIncome, LogLoanAmount, TermBinned\n")


New features added:
TotalIncome, IncomeToLoanRatio, LogApplicantIncome, LogLoanAmount, TermBinned



### Encoding Categorical Variables

In [51]:
# One-hot encode nominal categoricals
# drop_first=True drops one dummy level per column.
cat_to_encode = [
    'Gender', 'Married', 'Dependents', 'Education',
    'Self_Employed', 'Property_Area', 'TermBinned',
]
df_encoded = pd.get_dummies(data=df, columns=cat_to_encode, drop_first=True)

# Label encode target: learn the class labels first, then map the column to
# integer codes.
le = LabelEncoder()
le.fit(df_encoded['Loan_Status'])
df_encoded['Loan_Status'] = le.transform(df_encoded['Loan_Status'])

encoded_column_count = len(cat_to_encode)
print(f"After Encoding applied to {encoded_column_count} columns")
print(f"Target Classes: {le.classes_}")
print(f"Shape after encoding: {df_encoded.shape}\n")

After Encoding applied to 7 columns
Target Classes: ['N' 'Y']
Shape after encoding: (614, 21)



In [64]:
# Scaling Numerical Features
# Define Numerical Columns for Scaling
# NOTE(review): raw 'ApplicantIncome' is not in this list, so it stays
# unscaled while 'LogApplicantIncome' is standardized. Left as-is because
# adding it would change the feature set of the scaler persisted to
# 'scaler.pkl' -- confirm whether the omission is intentional.
num_cols_post = [
    'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
    'TotalIncome', 'IncomeToLoanRatio', 'LogApplicantIncome', 'LogLoanAmount'
]
print(f"Scaled {len(num_cols_post)} numerical features using StandardScaler\n")

# Train-Test Split (Stratified)
# stratify=y keeps the class proportions similar in both partitions;
# random_state pins the shuffle so the split is reproducible.
X = df_encoded.drop('Loan_Status', axis=1)
y = df_encoded['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Take explicit copies before writing scaled columns back. The split returns
# row selections of X, and assigning into those can trigger pandas'
# chained-assignment warning (hidden here by the global warnings filter).
X_train = X_train.copy()
X_test = X_test.copy()

print(f"\nTrain Shape: {X_train.shape}, Test Shape: {X_test.shape}")

# 10. Scale Numerical Features (Fit on Train, Transform Both)
scaler = StandardScaler()

# Fit on training data only, so test-set statistics never reach the scaler
X_train[num_cols_post] = scaler.fit_transform(X_train[num_cols_post])

# Transform test data with the statistics learned from the training data
X_test[num_cols_post] = scaler.transform(X_test[num_cols_post])

print("Final split:")
print(f"  X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"  y_train distribution: {y_train.value_counts(normalize=True).round(3).to_dict()}")
print(f"  y_test  distribution: {y_test.value_counts(normalize=True).round(3).to_dict()}\n")

# Class balance is computed from y instead of being hard-coded, so the summary
# cannot go stale. The target is 0/1 encoded with classes ordered ['N', 'Y'],
# hence the mean of y is the share of 'Y'.
positive_share = y.mean()

# Ready for Modeling
print("PREPROCESSING COMPLETE", "\n")
print(f"Final Training Features: {X_train.shape[1]}")
print(f"Class Balance Preserved: ~{positive_share:.1%} Y, {1 - positive_share:.1%} N")
print("Data is clean, encoded, scaled, and split.")
print("Ready for machine learning models (Logistic Regression, Random Forest, etc.)")

Scaled 8 numerical features using StandardScaler


Train Shape: (491, 20), Test Shape: (123, 20)
Final split:
  X_train: (491, 20), X_test: (123, 20)
  y_train distribution: {1: 0.686, 0: 0.314}
  y_test  distribution: {1: 0.691, 0: 0.309}

PREPROCESSING COMPLETE 

Final Training Features: 20
Class Balance Preserved: ~68.7% Y, 31.3% N
Data is clean, encoded, scaled, and split.
Ready for machine learning models (Logistic Regression, Random Forest, etc.)


In [11]:
# Persist the four split objects as CSV files in the working directory.
# index=False omits the row index, so features and targets line up by row
# order only.
for csv_name, split in (
    ('X_train.csv', X_train),
    ('X_test.csv', X_test),
    ('y_train.csv', y_train),
    ('y_test.csv', y_test),
):
    split.to_csv(csv_name, index=False)

In [58]:
# Serialize the fitted scaler so the same transformation can be reapplied later.
# Kept as the cell's last expression so the list of written files is displayed.
joblib.dump(value=scaler, filename='scaler.pkl')

['scaler.pkl']