<a href="https://colab.research.google.com/github/jc890/python/blob/master/casestudy01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Loan Prediction Preprocessing
# --------------------------------

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


df = pd.read_csv(r'/content/train_loan.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [4]:
data.describe

In [8]:
# 3. Missing value handling

missing_summary = df.isnull().sum()

## 3b. Numerical Imputation
num_cols = df.select_dtypes(include=['int64','float64']).columns
num_imputer = SimpleImputer(strategy='median')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

## 3c. Categorical Imputation
cat_cols = df.select_dtypes(include=['object']).columns
# Exclude 'Loan_ID' from categorical imputation as it's an identifier
cat_cols = cat_cols.drop('Loan_ID', errors='ignore')
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

In [9]:
# 4. Outlier detection and treatment (IQR + clipping)
for col in num_cols:
    q_low, q_hi = df[col].quantile([0.01, 0.99])
    df[col] = df[col].clip(lower=q_low, upper=q_hi)

In [10]:
# 5. Encoding
# Define target and ID columns
target = 'Loan_Status'
id_col = 'Loan_ID'

# Encode target if present
if target and target in df.columns:
    df[target] = df[target].map({'Y': 1, 'N': 0})
# Drop ID
if id_col and id_col in df.columns:
    df = df.drop(columns=[id_col])
# One-hot encode categoricals
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [11]:
# 6. Scaling
scaler = StandardScaler()
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if target in num_cols:
    num_cols.remove(target)
df[num_cols] = scaler.fit_transform(df[num_cols])


In [12]:
# Final dataset
print("Final shape:", df.shape)
print(df.head())

# Save processed dataset
df.to_csv("loan_data_preprocessed.csv", index=False)


Final shape: (614, 15)
   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0         0.145548          -0.811895   -0.214101          0.274463   
1        -0.139085           0.013787   -0.214101          0.274463   
2        -0.494989          -0.811895   -1.004570          0.274463   
3        -0.588743           0.479191   -0.316097          0.274463   
4         0.179497          -0.811895   -0.048357          0.274463   

   Credit_History  Loan_Status  Gender_Male  Married_Yes  Dependents_1  \
0        0.411733          NaN         True        False         False   
1        0.411733          NaN         True         True          True   
2        0.411733          NaN         True         True         False   
3        0.411733          NaN         True         True         False   
4        0.411733          NaN         True        False         False   

   Dependents_2  Dependents_3+  Education_Not Graduate  Self_Employed_Yes  \
0         False          Fal