## Imports

In [15]:
# Import necessary libraries
import gc
import os
import itertools
import pickle
import re
import time

import warnings
warnings.filterwarnings('ignore')

from random import choice, choices
from functools import reduce
from tqdm import tqdm
from itertools import cycle

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

from scipy import stats
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import ensemble
from sklearn import decomposition
from sklearn import tree

import lightgbm as lgb
import xgboost as xgb

# Notebook-wide display configuration: show every column of wide frames and
# apply the "ggplot" matplotlib style to all figures.
pd.options.display.max_columns = None
plt.style.use("ggplot")

# Look the active colour list up once; expose it both as a plain list and as
# an endless iterator over an equal, independent list.
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(list(color_pal))

# Read the raw training and test tables (training file first).
train_df, test_df = map(pd.read_csv, ("/content/Train.csv", "/content/Test.csv"))

# Names of the loan-overdue columns that the rest of the notebook expects.
loan_overdue_columns = [
    'loans_within_5_days',
    'loans_within_5_to_30_days',
    'loans_within_30_to_60_days',
    'loans_within_60_to_90_days',
    'loans_over_90_days',
    'is_zero_loans_within_5_days',
    'is_zero_loans_within_5_to_30_days',
    'is_zero_loans_within_30_to_60_days',
    'is_zero_loans_within_60_to_90_days',
    'is_zero_loans_over_90_days'
]

# Any of those columns that the training frame lacks is created as a
# constant 0, in list order.
for absent in [name for name in loan_overdue_columns if name not in train_df.columns]:
    train_df[absent] = 0

# Basic EDA: preview the first rows and the schema of both frames.
print("Train Data Head:")
print(train_df.head())

print("Test Data Head:")
print(test_df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line
# after each report.
print("Train Data Info:")
train_df.info()

print("Test Data Info:")
test_df.info()



# Label encoding + min-max scaling of the model matrices.
#
# This step needs the train/validation split and the feature list, both of
# which are only created further down in this cell. Running it inline at this
# point raised NameError on a fresh kernel (and it also called
# `scaler.transform` before `scaler` was constructed). It is therefore
# packaged as a function to be called once `X_train`, `X_val` and `features`
# exist. It relies on the `pd` and `preprocessing` imports at the top of the
# notebook.
def encode_and_scale(X_train, X_val, X_test, features):
    """Label-encode string columns, then min-max scale the selected features.

    The three input frames are copied first, so the caller's frames are left
    untouched.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Training, validation and test frames.
    features : list of str
        Columns to scale. Any of them missing from the test frame is added
        there as a constant 0 before scaling.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_val_scaled, X_test_scaled)``, the outputs of a
        ``MinMaxScaler`` fitted on the training features only.

    Raises
    ------
    KeyError
        If an object-dtype column of ``X_train`` is absent from ``X_val`` or
        ``X_test``.

    Example
    -------
    After the train/validation split::

        X_train_scaled, X_val_scaled, X_test_scaled = encode_and_scale(
            X_train, X_val, test_df, features
        )
    """
    X_train_encoded = X_train.copy()
    X_val_encoded = X_val.copy()
    X_test_encoded = X_test.copy()

    # Every object-dtype column of the training frame is label-encoded.
    for col in X_train_encoded.select_dtypes(include=['object']).columns:
        # Fit on the union of values seen in all three frames so that
        # transform() never meets an unseen label.
        encoder = preprocessing.LabelEncoder()
        all_values = pd.concat(
            [X_train_encoded[col], X_val_encoded[col], X_test_encoded[col]]
        ).unique()
        encoder.fit(all_values)

        X_train_encoded[col] = encoder.transform(X_train_encoded[col])
        X_val_encoded[col] = encoder.transform(X_val_encoded[col])
        X_test_encoded[col] = encoder.transform(X_test_encoded[col])

    # The test frame must expose every feature before it can be scaled.
    for feature in features:
        if feature not in X_test_encoded.columns:
            X_test_encoded[feature] = 0

    # Fit the scaler on the training features only, then apply the same
    # transformation to the validation and test features.
    scaler = preprocessing.MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_encoded[features])
    X_val_scaled = scaler.transform(X_val_encoded[features])
    X_test_scaled = scaler.transform(X_test_encoded[features])

    return X_train_scaled, X_val_scaled, X_test_scaled



# Parse both date columns of each frame (unparseable values become NaT), then
# derive the number of days between disbursement and due date. Rows where
# that difference is missing receive the sentinel -1.
for frame in (train_df, test_df):
    for date_col in ('disbursement_date', 'due_date'):
        frame[date_col] = pd.to_datetime(frame[date_col], errors='coerce')

    elapsed = frame['due_date'] - frame['disbursement_date']
    frame['repayment_days'] = elapsed.dt.days.fillna(-1)

# Feature engineering, part 1 (applied identically to train and test).
#
# `repayment_days` is deliberately NOT recomputed here: it has already been
# derived from the parsed dates earlier in this cell, with its gaps filled
# with -1. Recomputing it would overwrite that fill and bring the NaNs back.
for frame in (train_df, test_df):
    # Calendar parts of the disbursement date.
    frame['disbursement_weekday'] = frame['disbursement_date'].dt.weekday
    frame['disbursement_month'] = frame['disbursement_date'].dt.month

    # Ratios relative to the total loan amount.
    # NOTE(review): a row with Total_Amount == 0 would yield inf/NaN here;
    # confirm the data cannot contain such rows.
    frame['repayment_percentage'] = frame['Total_Amount_to_Repay'] / frame['Total_Amount']
    frame['lender_funding_percentage'] = frame['Amount_Funded_By_Lender'] / frame['Total_Amount']

    # Interaction between loan duration and the lender-funded share.
    frame['duration_funding_interaction'] = frame['duration'] * frame['lender_funding_percentage']

    # log(1 + x) transform of the total amount.
    frame['log_Total_Amount'] = np.log1p(frame['Total_Amount'])

# One-hot encode the two categorical columns in each frame, dropping the
# first level of each.
categorical_columns = ['loan_type', 'New_versus_Repeat']
train_df = pd.get_dummies(train_df, columns=categorical_columns, drop_first=True)
test_df = pd.get_dummies(test_df, columns=categorical_columns, drop_first=True)

# Give the test frame every column the training frame has (missing ones are
# created as constant 0), put them in the training order, and finally remove
# the label column if it ended up in the test frame.
missing_cols = set(train_df.columns).difference(test_df.columns)
for absent in missing_cols:
    test_df[absent] = 0

test_df = test_df[train_df.columns].drop(columns='target', errors='ignore')

# Feature engineering, part 2: the same derived columns are added to the
# training and the test frame, in the same order.
for frame in (train_df, test_df):
    disbursed = frame['disbursement_date']

    # Calendar features. Note that 'repayment_day_of_year' is taken from the
    # disbursement date, while 'due_day_of_year' comes from the due date.
    frame['disbursement_quarter'] = disbursed.dt.quarter
    frame['repayment_day_of_year'] = disbursed.dt.dayofyear
    frame['due_day_of_year'] = frame['due_date'].dt.dayofyear

    # Ratios against duration; the +1 keeps the denominator non-zero for a
    # duration of 0.
    duration_plus_one = frame['duration'] + 1
    frame['amount_duration_ratio'] = frame['Total_Amount'] / duration_plus_one
    frame['repayment_to_duration_ratio'] = frame['repayment_days'] / duration_plus_one

    # Interaction and squared terms built on the repayment percentage.
    frame['repayment_percentage_duration_interaction'] = (
        frame['repayment_percentage'] * frame['duration']
    )
    frame['repayment_percentage_squared'] = frame['repayment_percentage'] ** 2

    # Total amount spread over the repayment window expressed in 30-day units.
    frame['total_amount_per_month'] = frame['Total_Amount'] / (frame['repayment_days'] / 30 + 1)



# Re-align the test frame with the training frame: create any column it
# lacks as constant 0, adopt the training column order, and remove the label
# column when present.
missing_cols = set(train_df.columns).difference(test_df.columns)
for absent in missing_cols:
    test_df[absent] = 0

test_df = test_df[train_df.columns].drop(columns='target', errors='ignore')

# Persist both enriched frames as CSV files, without the index column.
for frame, csv_name in ((train_df, 'Enhanced_Train_v2.csv'), (test_df, 'Enhanced_Test_v2.csv')):
    frame.to_csv(csv_name, index=False)





# Replace every remaining missing value with 0 in both frames.
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

# The label column, and the model inputs: every training column except the
# identifier, the label and the two raw date columns.
target = 'target'
non_feature_columns = {'ID', 'target', 'disbursement_date', 'due_date'}
features = [name for name in train_df.columns if name not in non_feature_columns]

# Label-encode the listed categorical columns. A name that is not part of
# `features` is skipped.
for categorical in ('country_id', 'loan_type', 'New_versus_Repeat'):
    if categorical not in features:
        continue

    # One encoder per column, fitted on the string form of the train and test
    # values together so both frames share the same mapping.
    encoder = preprocessing.LabelEncoder()
    combined_values = pd.concat([train_df[categorical], test_df[categorical]], axis=0).astype(str)
    encoder.fit(combined_values)

    train_df[categorical] = encoder.transform(train_df[categorical].astype(str))
    test_df[categorical] = encoder.transform(test_df[categorical].astype(str))

# Hold out 20% of the training rows for validation (fixed random_state, so
# the split is repeatable).
X, y = train_df[features], train_df[target]
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Wrap both parts as LightGBM datasets; this happens after label encoding,
# so the encoded values are what LightGBM sees.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_val = lgb.Dataset(data=X_val, label=y_val)

Train Data Head:
                      ID  customer_id country_id  tbl_loan_id  lender_id  \
0  ID_266671248032267278       266671      Kenya       248032     267278   
1  ID_248919228515267278       248919      Kenya       228515     267278   
2  ID_308486370501251804       308486      Kenya       370501     251804   
3  ID_266004285009267278       266004      Kenya       285009     267278   
4  ID_253803305312267278       253803      Kenya       305312     267278   

  loan_type  Total_Amount  Total_Amount_to_Repay disbursement_date  \
0    Type_1        8448.0                 8448.0        2022-08-30   
1    Type_1       25895.0                25979.0        2022-07-30   
2    Type_7        6900.0                 7142.0        2024-09-06   
3    Type_1        8958.0                 9233.0        2022-10-20   
4    Type_1        4564.0                 4728.0        2022-11-28   

     due_date  duration New_versus_Repeat  Amount_Funded_By_Lender  \
0  2022-09-06         7       Repea

In [16]:
# LightGBM hyper-parameters for a binary classifier evaluated on log-loss.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 40,
    'learning_rate': 0.04,
    'feature_fraction': 0.85,  # share of features sampled per tree
    'bagging_fraction': 0.8,   # share of rows sampled ...
    'bagging_freq': 5,         # ... re-drawn every 5 iterations
    'lambda_l1': 0.3,          # L1 regularisation
    'lambda_l2': 0.7,          # L2 regularisation
    'min_data_in_leaf': 25,
    'verbose': 0
}

# Train once. The model was previously fitted twice with identical
# arguments, which doubled the run time and produced the same booster.
# Training runs for up to 1000 rounds and stops early when the validation
# score has not improved for 100 rounds; both sets are reported under the
# names 'train' and 'eval'.
print("Training LightGBM model...")
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'eval'],
    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True)],
)

Training LightGBM model...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[260]	train's binary_logloss: 0.0028284	eval's binary_logloss: 0.011274
Training LightGBM model...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[260]	train's binary_logloss: 0.0028284	eval's binary_logloss: 0.011274


In [17]:
# Score the validation split with the trained model.
val_preds = model.predict(X_val)

# Sweep the decision threshold over 0.10, 0.11, ..., 0.89 (range(10, 90)
# stops before 0.90) and keep the first threshold that reaches the highest
# F1. `metrics` is the sklearn module imported at the top of the notebook.
best_threshold, best_f1 = 0, 0
for threshold in [i / 100 for i in range(10, 90, 1)]:
    val_preds_binary = (val_preds > threshold).astype(int)
    f1 = metrics.f1_score(y_val, val_preds_binary)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Optimal Threshold: {best_threshold}")
print(f"Best F1 Score: {best_f1}")

# Re-binarise at the selected threshold so `val_preds_binary` reflects the
# final choice rather than the last threshold tried, and report its F1.
val_preds_binary = (val_preds > best_threshold).astype(int)
print("Final Validation F1 Score:", metrics.f1_score(y_val, val_preds_binary))


Optimal Threshold: 0.69
Best F1 Score: 0.8922413793103449
Final Validation F1 Score: 0.8922413793103449


In [None]:
# Ensure every loan-overdue column exists in the training frame and then in
# the test frame; a missing column is created as constant 0.
for frame in (train_df, test_df):
    for col in loan_overdue_columns:
        if col not in frame.columns:
            frame[col] = 0

# Basic EDA: preview the first rows and the schema of both frames.
print("Train Data Head:")
print(train_df.head())

print("Test Data Head:")
print(test_df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line
# after each report.
print("Train Data Info:")
train_df.info()

print("Test Data Info:")
test_df.info()

Train Data Head:
                      ID  customer_id  country_id  tbl_loan_id  lender_id  \
0  ID_266671248032267278       266671           1       248032     267278   
1  ID_248919228515267278       248919           1       228515     267278   
2  ID_308486370501251804       308486           1       370501     251804   
3  ID_266004285009267278       266004           1       285009     267278   
4  ID_253803305312267278       253803           1       305312     267278   

   Total_Amount  Total_Amount_to_Repay disbursement_date   due_date  duration  \
0        8448.0                 8448.0        2022-08-30 2022-09-06         7   
1       25895.0                25979.0        2022-07-30 2022-08-06         7   
2        6900.0                 7142.0        2024-09-06 2024-09-13         7   
3        8958.0                 9233.0        2022-10-20 2022-10-27         7   
4        4564.0                 4728.0        2022-11-28 2022-12-05         7   

   Amount_Funded_By_Lender  Lende

In [None]:
# Score the test rows on the same feature columns the model was trained on.
X_test = test_df[features]
test_preds = model.predict(X_test)

# Turn scores into 0/1 labels using the threshold selected on validation.
test_preds_binary = np.greater(test_preds, best_threshold).astype(int)

# Assemble the submission table: the 'ID' column of the test frame next to
# its predicted label.
submission_columns = {
    'ID': test_df['ID'],
    'target': test_preds_binary,
}
submission = pd.DataFrame(submission_columns)

In [None]:

# Write the submission table to disk without the index column, then confirm.
submission.to_csv(path_or_buf="submission.csv", index=False)
print("Predictions saved to submission.csv")

Predictions saved to submission.csv
