In [1]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import joblib

In [2]:
# Read the merged training table from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer="train_merged.csv")

In [6]:
# ----------------------------
# 1. Custom Feature Engineering
# ----------------------------
class LoanFeatureEngineering(BaseEstimator, TransformerMixin):
    """Stateless cleaning and date-feature step for the merged loan table.

    ``transform`` works on a copy of its input and:

    * fills missing values in the categorical client columns with
      ``"Unknown"``;
    * fills missing values in the aggregated loan-history columns with ``0``;
    * parses ``creationdate``, ``approveddate`` and ``birthdate`` as
      datetimes, replacing a missing birthdate with ``MISSING_BIRTHDATE``;
    * adds ``loan_creation_to_approval_days`` and ``customer_age``;
    * drops identifier, raw-date and GPS columns, plus ``Unnamed: 0``.

    ``customer_age`` is measured at each loan's ``creationdate`` rather than
    at the wall-clock date of the call, so the same row always produces the
    same age no matter when the pipeline is fitted or used for prediction.

    Nothing is learned from the data: ``fit`` is a no-op and the transformer
    takes no constructor parameters.
    """

    # Categorical client attributes; missing values become "Unknown".
    CATEGORICAL_COLS = (
        "bank_account_type", "bank_name_clients", "bank_branch_clients",
        "employment_status_clients", "level_of_education_clients",
    )

    # Aggregated loan-history columns; missing values become 0.
    NUMERIC_COLS = (
        "systemloanid_count", "loanamount_mean", "loanamount_max",
        "totaldue_mean", "termdays_mean", "termdays_max",
        "repayment_delay_mean",
    )

    # Columns parsed with ``pd.to_datetime`` before deriving features.
    DATE_COLS = ("creationdate", "approveddate", "birthdate")

    # Placeholder for a missing birthdate. It yields an implausibly large age,
    # which keeps "birthdate unknown" rows distinguishable without NaNs.
    MISSING_BIRTHDATE = pd.Timestamp("1900-01-01")

    # Columns removed before the data reaches the encoder/model. "Unnamed: 0"
    # is the unnamed first CSV column (presumably a saved row index -- confirm);
    # leaving it in would hand the model a meaningless row counter.
    DROP_COLS = (
        "creationdate", "approveddate", "birthdate",
        "customerid", "systemloanid", "referredby",
        "longitude_gps", "latitude_gps",
        "Unnamed: 0",
    )

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (the transformer has no state)."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X`` with the derived features added.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column in ``CATEGORICAL_COLS``,
            ``NUMERIC_COLS`` and ``DATE_COLS``. Columns in ``DROP_COLS`` are
            removed only if present.

        Returns
        -------
        pandas.DataFrame
            New frame; ``X`` itself is not modified.

        Raises
        ------
        KeyError
            If a required column is missing from ``X``.
        """
        df = X.copy()

        # Selecting with a list raises KeyError on a missing column, so a
        # schema mismatch surfaces here instead of being silently ignored.
        cat_cols = list(self.CATEGORICAL_COLS)
        df[cat_cols] = df[cat_cols].fillna("Unknown")

        num_cols = list(self.NUMERIC_COLS)
        df[num_cols] = df[num_cols].fillna(0)

        # Parse first, then fill: filling a column of date strings with a
        # Timestamp would mix two types in one object column before parsing.
        for col in self.DATE_COLS:
            df[col] = pd.to_datetime(df[col])
        df["birthdate"] = df["birthdate"].fillna(self.MISSING_BIRTHDATE)

        # Whole days between loan creation and approval.
        df["loan_creation_to_approval_days"] = (
            df["approveddate"] - df["creationdate"]
        ).dt.days

        # Age in (365-day) years at the time the loan was created.
        df["customer_age"] = (df["creationdate"] - df["birthdate"]).dt.days // 365

        present = [c for c in self.DROP_COLS if c in df.columns]
        return df.drop(columns=present)

# ----------------------------
# 2. Categorical Columns for OHE
# ----------------------------
cat_cols = [
    "bank_account_type", "bank_name_clients", "bank_branch_clients",
    "employment_status_clients", "level_of_education_clients",
]

# One-hot encode the categorical columns and pass every other column through.
# No ``drop="first"`` here: together with ``handle_unknown="ignore"`` a category
# unseen during fit is encoded as all zeros, which is exactly the encoding of
# the dropped first category, so the two become indistinguishable. Older
# scikit-learn releases reject that combination outright. A tree ensemble does
# not need a dropped reference level.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="passthrough",
)

# ----------------------------
# 3. Full Pipeline
# ----------------------------
# Order matters: feature engineering creates/drops columns before encoding.
pipeline = Pipeline([
    ("features", LoanFeatureEngineering()),
    ("preprocess", preprocessor),
    ("model", RandomForestClassifier(n_estimators=300, random_state=42)),
])

# ----------------------------
# 4. Fit the pipeline
# ----------------------------
# Target is `good_bad_flag`; every other column is handed to the pipeline.
X = df.drop(columns="good_bad_flag")
y = df["good_bad_flag"]

pipeline.fit(X, y)

# ----------------------------
# 5. Save the fitted pipeline
# ----------------------------
# NOTE: pickling stores LoanFeatureEngineering by reference (module + name),
# so whatever loads this file must be able to import/define the same class
# under the same module path before calling joblib.load.
joblib.dump(pipeline, "loan_pipeline.pkl")
print("Pipeline TRAINED and saved as loan_pipeline.pkl")

Pipeline TRAINED and saved as loan_pipeline.pkl


In [3]:
# Show the column names of the table loaded from "train_merged.csv".
df.columns

Index(['Unnamed: 0', 'customerid', 'systemloanid', 'loannumber',
       'approveddate', 'creationdate', 'loanamount', 'totaldue', 'termdays',
       'referredby', 'good_bad_flag', 'birthdate', 'bank_account_type',
       'longitude_gps', 'latitude_gps', 'bank_name_clients',
       'bank_branch_clients', 'employment_status_clients',
       'level_of_education_clients', 'systemloanid_count', 'loanamount_mean',
       'loanamount_max', 'totaldue_mean', 'termdays_mean', 'termdays_max',
       'repayment_delay_mean'],
      dtype='object')

In [4]:
# Read the merged test table from disk into a DataFrame.
test = pd.read_csv(filepath_or_buffer="testmerge.csv")

In [5]:
# Show the column names of the table loaded from "testmerge.csv".
test.columns

Index(['Unnamed: 0', 'customerid', 'systemloanid', 'loannumber',
       'approveddate', 'creationdate', 'loanamount', 'totaldue', 'termdays',
       'referredby', 'birthdate', 'bank_account_type', 'longitude_gps',
       'latitude_gps', 'bank_name_clients', 'bank_branch_clients',
       'employment_status_clients', 'level_of_education_clients',
       'systemloanid_count', 'loanamount_mean', 'loanamount_max',
       'totaldue_mean', 'termdays_mean', 'termdays_max',
       'repayment_delay_mean'],
      dtype='object')