Below are the feature descriptions.

## Description of variables in Train/Test.csv

0. **ID**: A unique identifier for each entry in the dataset.

1. **customer_id**: Unique identifier for each customer in the dataset.

2. **country_id**: Identifier or code representing the country where the customer resides or where the loan was issued.

3. **tbl_loan_id**: Unique identifier for each loan associated with the customer.

4. **Total_Amount**: The total loan amount initially disbursed to the customer.

5. **Total_Amount_to_Repay**: The total amount the customer is expected to repay, including principal, interest, and fees.

6. **loan_type**: The category or type of loan.

7. **disbursement_date**: The date when the loan amount was disbursed to the customer.

8. **duration**: The length of the loan term, typically expressed in days.

9. **lender_id**: Unique identifier for the lender or institution that issued the loan.

10. **New_versus_Repeat**: Indicates whether the loan is the customer's first loan ("New") or if the customer has taken loans before ("Repeat").

11. **Amount_Funded_By_Lender**: The portion of the loan funded directly by the lender.

12. **Lender_portion_Funded**: Percentage of the total loan amount funded by the lender.

13. **due_date**: The date by which the loan repayment is due.

14. **Lender_portion_to_be_repaid**: The portion of the outstanding loan that needs to be repaid to the lender.

15. **target**: This variable takes the value 0 or 1: 1 means the customer defaulted on the loan, whereas 0 means the customer repaid the loan.


In [None]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from datetime import datetime
from scipy.stats import zscore
import xgboost as xgb
from sklearn.metrics import roc_curve,auc,confusion_matrix,accuracy_score,precision_score,classification_report,f1_score,make_scorer,precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold,RandomizedSearchCV, GridSearchCV, train_test_split,cross_val_score
from bayes_opt import BayesianOptimization
import optuna
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
import seaborn as sns
from datetime import datetime

In [None]:
# Silence library warnings so the notebook output stays readable
warnings.simplefilter('ignore')

# Print every column of a DataFrame, without wrapping the frame across lines
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

# Importing the data

In [None]:
# Train and test sets.
# Raw strings keep every Windows backslash literal, so no part of the path can
# be read as an escape sequence.
train = pd.read_csv(r"C:\Users\josha\Downloads\african-credit-scoring-challenge20241203-14702-1yayxml\Train.csv")
test = pd.read_csv(r"C:\Users\josha\Downloads\african-credit-scoring-challenge20241203-14702-1yayxml\Test.csv")

In [None]:
# Display the training frame (the notebook renders the last expression of a cell)
train

In [None]:
# Format floats with two decimals, then summarise the numeric training columns
pd.set_option('display.float_format', '{:.2f}'.format)
train.describe()

In [None]:
# Format floats with two decimals, then summarise the numeric test columns
pd.set_option('display.float_format', '{:.2f}'.format)
test.describe()

# Data Exploration and EDA

In [None]:
# Add a log1p-transformed copy ("<name>_log") of each numeric column to both splits
numerical = [
    'Total_Amount',
    'Total_Amount_to_Repay',
    'duration',
    'Amount_Funded_By_Lender',
    'Lender_portion_Funded',
    'Lender_portion_to_be_repaid',
]
for frame in (train, test):
    for column in numerical:
        frame[f'{column}_log'] = np.log1p(frame[column])

In [None]:
# Store the identifier columns of the test set as objects (labels, not quantities)
for id_column in ('lender_id', 'customer_id', 'tbl_loan_id'):
    test[id_column] = test[id_column].astype('object')

In [None]:
# Processing dates and splitting into month, day, year
def date_processing(df):
    """Parse the two date columns and split each into year, month and day.

    The frame is modified in place: 'disbursement_date' and 'due_date' are
    converted with ``pd.to_datetime`` and six integer columns are added
    ('disbursement_year/month/day' and 'due_year/month/day').

    Returns the same DataFrame object that was passed in.
    """
    date_columns = {'disbursement': 'disbursement_date', 'due': 'due_date'}

    # Convert both date columns first
    for source in date_columns.values():
        df[source] = pd.to_datetime(df[source])

    # Then derive the calendar parts, prefix by prefix
    for prefix, source in date_columns.items():
        stamps = df[source].dt
        for part in ('year', 'month', 'day'):
            df[f'{prefix}_{part}'] = getattr(stamps, part)

    return df

# Parse and split the dates in both splits
train = date_processing(train)
test = date_processing(test)

In [None]:
# Number of loans taken by each customer
def loans_taken(df):
    loan_counts = df.groupby('customer_id').agg(loans_taken=('customer_id','count'))
    df = df.merge(loan_counts, on='customer_id')
    return df

# Add the per-customer loan count to both splits
train = loans_taken(train)
test = loans_taken(test)

In [None]:
# Compare the distribution of loans_taken between the two target classes
ax = sns.boxplot(data=train, x='target', y='loans_taken')
ax.set_xlabel('The loan default class')
ax.set_ylabel('Number of loans taken')
ax.set_title('Number of loans taken per default class')
plt.show()

In [None]:
# Object-typed copies of the month columns, so they show up as categoricals below
train['due_month_categorical'] = train['due_month'].astype('object')
train['disbursed_month_categorical'] = train['disbursement_month'].astype('object')

# The target and the identifier columns are labels rather than quantities
for label_column in ('target', 'lender_id', 'customer_id', 'tbl_loan_id'):
    train[label_column] = train[label_column].astype('object')

# Value counts for every object column, except those with too many values to read
exclusions = ['ID', 'disbursement_date','due_date','customer_id','tbl_loan_id']
categoricals = train.select_dtypes(include='object')
for c in categoricals.columns:
    if c in exclusions:
        continue
    print(f'{c}: {categoricals[c].value_counts()}')

Imbalanced target variable, this would need addressing.

In [None]:
# Raw numeric features followed by their log-transformed counterparts
numericals = train[['Total_Amount','Total_Amount_to_Repay','duration','Amount_Funded_By_Lender','Lender_portion_Funded','Lender_portion_to_be_repaid','Total_Amount_log','Total_Amount_to_Repay_log','duration_log','Amount_Funded_By_Lender_log','Lender_portion_Funded_log','Lender_portion_to_be_repaid_log']]

# Square-root rule for the number of bins. It depends only on the row count,
# so it is computed once (and not named `bin`, which would shadow the builtin).
n_bins = int(np.sqrt(len(train)))

# One histogram per feature
for n in numericals.columns:
    plt.figure(figsize=(10, 5))
    plt.hist(train[n], bins=n_bins)
    plt.title(f'Distribution of {n}')
    plt.xlabel(n)
    plt.ylabel('Frequency')
    plt.show()

Log transformed my values due to skewing. Very difficult to see the trends of the raw values.

In [None]:
# One box plot per numeric feature (raw and log), split by the target class
for feature in numericals.columns:
    plt.figure(figsize=(10, 5))
    ax = sns.boxplot(data=train, x='target', y=feature)
    ax.set_title(f'Box plot of {feature} against target')
    ax.set_xlabel('Loan default')
    ax.set_ylabel(feature)
    plt.show()


Again, due to the data spread, it's easier to spot trends in the log transformed data.

In [None]:
# Count plot for each categorical feature; identifier-like columns are left out
exclude = ['ID','customer_id','tbl_loan_id','country_id']
for c in categoricals.columns:
    if c in exclude:
        continue
    plt.figure(figsize=(10, 5))
    ax = sns.countplot(data=train, x=c)
    ax.set_title(f'Count plot of {c}')
    ax.set_xlabel(c)
    ax.set_ylabel('Frequency')
    plt.xticks(rotation=45)
    plt.show()

1. A very dominant lender id and loan_type in the dataset
2. Majority are repeat loans
3. Massive class imbalance in our target class, very few loan defaults in our dataset
4. Most of our loans are taken and paid in the second half of the year

In [None]:
# Same count plots as before, with each bar split by the target class
for c in categoricals.columns:
    if c in exclude:
        continue
    plt.figure(figsize=(10, 5))
    ax = sns.countplot(data=train, x=c, hue='target')
    ax.set_title(f'Count plot of {c}')
    ax.set_xlabel(c)
    ax.set_ylabel('Frequency')
    plt.xticks(rotation=45)
    plt.show()

# Feature Engineering

The below feature had the biggest impact.

In [None]:
# Ratio of the total amount to repay to the amount originally disbursed
for frame in (train, test):
    frame['repayment_ratio'] = frame['Total_Amount_to_Repay'].div(frame['Total_Amount'])

In [None]:
# Bucket the loan duration into four labelled ranges to reduce noise.
# NOTE(review): pd.cut intervals are right-closed by default, so a duration of
# exactly 0 falls outside the first bin and becomes NaN - confirm this is intended.
bins = [0,30,180,365, float('inf')]
labels = ['Short-term','Medium-term','Long-term','Very-long term']

for frame in (train, test):
    frame['Duration category'] = pd.cut(frame['duration'], bins=bins, labels=labels)


In [None]:
# Correlation heatmap of the raw and log-transformed numeric features
corr_matrix = numericals.corr()

plt.figure(figsize=(12, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

Our features are highly correlated as a number of columns are showing the same thing, thus will either use log transformed or raw data and not both.

In [None]:
# Which tbl_loan_id values appear on more than one row?
threshold = 1
focus = train['tbl_loan_id'].value_counts()
need = focus.loc[focus.gt(threshold)]

# Rows that belong to a repeated tbl_loan_id
is_repeated = train['tbl_loan_id'].isin(need.index)
duplicated = train.loc[is_repeated]
duplicated

In [None]:
# Which customer_id values appear on more than one row?
threshold = 1
focus = train['customer_id'].value_counts()
need = focus.loc[focus.gt(threshold)]

# Target class split among the rows of repeat customers
duplicated_id = (
    train.loc[train['customer_id'].isin(need.index)]
    .sort_values(by='customer_id', ascending=False)
)
duplicated_id['target'].value_counts()

In [None]:
# This function shows how many lenders are in a loan
def unique_id(df):
    """Return a copy of *df* with a 'Lender_numbers' column.

    'Lender_numbers' is the number of rows in *df* that share the row's
    tbl_loan_id. The notebook uses it as the number of lenders taking part in
    a loan (presumably one row per lender - verify against the data).

    The key and count columns are named explicitly, so the result does not
    depend on how the installed pandas version names ``value_counts()`` output.
    """
    rows_per_loan = (
        df['tbl_loan_id']
        .value_counts()
        .rename_axis('tbl_loan_id')
        .reset_index(name='Lender_numbers')
    )
    # Left join keeps every row of df in its original order
    return df.merge(rows_per_loan, on='tbl_loan_id', how='left')

# Add the per-loan row count (Lender_numbers) to both splits
train = unique_id(train)
test = unique_id(test)


In [None]:
# Frequency of each Lender_numbers value in the train and test sets
print(f"Training lenders:{train['Lender_numbers'].value_counts()}")
print(f"Test set lenders:{test['Lender_numbers'].value_counts()}")

In [None]:
# What's the spread of years for disbursement and due dates
def date_processing(df):
    """Convert the date columns to datetimes and add year/month/day columns.

    Same behaviour as the earlier definition of this name in the notebook,
    which this definition replaces. *df* is modified in place and returned.
    """
    df['disbursement_date'] = pd.to_datetime(df['disbursement_date'])
    df['due_date'] = pd.to_datetime(df['due_date'])

    disbursed = df['disbursement_date'].dt
    due = df['due_date'].dt

    # (new column, values), listed in the order the columns are created
    derived = (
        ('disbursement_year', disbursed.year),
        ('disbursement_month', disbursed.month),
        ('disbursement_day', disbursed.day),
        ('due_year', due.year),
        ('due_month', due.month),
        ('due_day', due.day),
    )
    for column, values in derived:
        df[column] = values
    return df

# Re-run the date split on both splits (date_processing was already applied earlier in the notebook)
train = date_processing(train)
test = date_processing(test)



In [None]:
# Store lender_id as an object column in both splits
for frame in (train, test):
    frame['lender_id'] = frame['lender_id'].astype(object)

Below is the economics indicators data, providing further info on the countries. Including them improved my score locally but not on the leaderboard. I therefore didn't include them in my features but have just shown them below.

In [None]:
# Economic indicators.
# A raw string keeps every Windows backslash literal, so no part of the path
# can be read as an escape sequence.
indicators_df = pd.read_csv(r'C:\Users\josha\Downloads\african-credit-scoring-challenge20241203-14702-1yayxml\economic_indicators.csv')
indicators_df.head()

In [None]:
# Long format: one row per (Country, Indicator, Year)
melted = indicators_df.melt(id_vars=["Country", "Indicator"], var_name="Year", value_name="Value")

# Wide format: one row per (Country, Year), one column per indicator
economics_df = (
    melted
    .pivot_table(index=["Country", "Year"], columns="Indicator", values="Value")
    .reset_index()
)

# Drop the first two characters of the year label and keep the rest as an integer
economics_df['Year'] = economics_df['Year'].str.slice(2).astype(int)
economics_df.head()


In [None]:
# A fair amount of null values; in addition, there's no 2024 data
economics_df.isnull().sum()

I commented out the merging as I didn't include the features but the code shows my approach. 

In [None]:
# # Some merging
# train = train.merge(economics_df, left_on=['country_id','disbursement_year'], right_on=['Country','Year'],how='left')
# test = test.merge(economics_df, left_on=['country_id','disbursement_year'], right_on=['Country','Year'],how='left')


In [None]:
# # Drop these cols
# train = train.drop(columns=['Average precipitation in depth (mm per year)','Fossil fuel energy consumption (% of total)'])
# test = test.drop(columns=['Average precipitation in depth (mm per year)','Fossil fuel energy consumption (% of total)'])
# # test = test.fillna(0)

In [None]:
# Created this function to get the actual day of the week from train and test set
def get_day_name(df):
    """Add the weekday name of the disbursement and due dates.

    Both date columns are parsed with the "%Y-%m-%d" format, then
    'Disbursement Day' and 'Due Day' are filled with the ``strftime("%A")``
    weekday name. *df* is modified in place and returned.
    """
    name_columns = {'disbursement_date': 'Disbursement Day', 'due_date': 'Due Day'}

    # Parse both date columns first
    for date_column in name_columns:
        df[date_column] = pd.to_datetime(df[date_column], format="%Y-%m-%d")

    # Then derive the weekday names
    for date_column, day_column in name_columns.items():
        df[day_column] = df[date_column].dt.strftime("%A")
    return df

In [None]:
# Add the weekday-name columns to both splits
train = get_day_name(train)
test = get_day_name(test)

In [None]:
# Weekday frequencies in the test set (author's observation: similar spread across days)
print(test['Disbursement Day'].value_counts())
print(test['Due Day'].value_counts())

In [None]:
# Correlation heatmap of every column that is neither object nor categorical
nums = train.select_dtypes(exclude=['object','category'])
corr_matrix = nums.corr()

plt.figure(figsize=(18, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

Quite a bit of multicollinearity.

In [None]:
# This feature simply shows the ratio of new to returning customers each lender has.
def new_vs_returning(df):
    """Return a copy of *df* with each lender's new-to-repeat loan ratio.

    For every lender_id the rows labelled 'New Loan' and 'Repeat Loan' in
    'New_versus_Repeat' are counted, and 'new vs repeat ratio' is
    new / repeat. The ratio is merged back onto every row of that lender.

    A label that never occurs in *df* is counted as 0 for every lender, so a
    frame containing only one kind of loan no longer raises a KeyError. A
    lender with no repeat loans still gets inf (or NaN for 0 / 0), as before.
    """
    # Rows per (lender, loan kind)
    grouped_ratio = df.groupby(['lender_id', 'New_versus_Repeat']).agg(
        counts=('New_versus_Repeat', 'count')
    )

    # One row per lender, one column per loan kind
    loan_counts = grouped_ratio.unstack(fill_value=0)['counts'].copy()
    for label in ('New Loan', 'Repeat Loan'):
        if label not in loan_counts.columns:
            loan_counts[label] = 0

    loan_counts['new vs repeat ratio'] = loan_counts['New Loan'] / loan_counts['Repeat Loan']

    # Bring lender_id back as a column so the ratio can be merged onto df
    loan_counts = loan_counts.reset_index()
    return df.merge(loan_counts[['lender_id', 'new vs repeat ratio']], on='lender_id')

# Add the per-lender new-to-repeat ratio to both splits (each computed from its own rows)
train = new_vs_returning(train)
test = new_vs_returning(test)

In [None]:
# Experimenting with some interaction features
def final_features(df):
    """Add two ratio features to *df* in place and return it.

    - 'lender_funded_ratio': Lender_portion_Funded / Total_Amount
    - 'lender_repay_ratio': Lender_portion_to_be_repaid / Total_Amount_to_Repay
    """
    # new column -> (numerator column, denominator column)
    ratio_definitions = {
        'lender_funded_ratio': ('Lender_portion_Funded', 'Total_Amount'),
        'lender_repay_ratio': ('Lender_portion_to_be_repaid', 'Total_Amount_to_Repay'),
    }
    for feature, (numerator, denominator) in ratio_definitions.items():
        df[feature] = df[numerator] / df[denominator]
    return df

# Add the two ratio features to both splits
train = final_features(train)
test = final_features(test)

In [None]:
# A positive numerator over a zero denominator gives +inf; store 0 for those rows.
# NOTE(review): only +inf in lender_repay_ratio is replaced; NaN (0 / 0) and
# lender_funded_ratio are left untouched.
for frame in (train, test):
    frame['lender_repay_ratio'] = frame['lender_repay_ratio'].replace(np.inf, 0)

In [None]:
# Setting weekdays or weekends
def weekday_or_weekend(df):
    """Label each row by whether its 'Due Day' is a weekday.

    Adds the column 'weekday/weekend' to *df* in place: 'Weekday' when
    'Due Day' is Monday to Friday, 'Weekend' for any other value.
    Returns the same DataFrame.
    """
    working_days = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')
    due_on_weekday = df['Due Day'].isin(working_days)
    df['weekday/weekend'] = np.where(due_on_weekday, 'Weekday', 'Weekend')
    return df

# Add the weekday/weekend flag to both splits
train = weekday_or_weekend(train)
test = weekday_or_weekend(test)

# Outlier investigation

In [None]:
# Computing z scores for numericals
zscore_source = ['Total_Amount','Total_Amount_to_Repay','Amount_Funded_By_Lender','Lender_portion_Funded','Lender_portion_to_be_repaid']

# Work on an explicit copy so the new columns are not assigned onto a slice of train
nums = train[zscore_source].copy()
for col in zscore_source:
    nums[f'{col}_zscore'] = zscore(nums[col])

nums

In [None]:
# Names of the z-score columns
zscore_cols = list(nums.filter(like='_zscore').columns)

# A row is an outlier when any of its z-scores is greater than 3
exceeds_limit = nums[zscore_cols].gt(3)
outlier_rows = nums.loc[exceeds_limit.any(axis=1)]
outlier_rows

In [None]:
# Look up the full training rows for the outliers, using their index labels
outliers_df = train.loc[outlier_rows.index]
outliers_df.head()

In [None]:
# Number of outlier rows per (lender, new/repeat, target) combination
outliers_df.groupby(['lender_id','New_versus_Repeat','target']).agg(ratio = ('New_versus_Repeat','count'))

Lender ID 245684 had most of their loan defaults coming from new loans, which is different from all the rest.

In [None]:
# Among the outlier rows that defaulted (target == 1), count the rows per lender
defaults = outliers_df[outliers_df['target']==1]
defaults['lender_id'].value_counts()

# Features

In future, a more robust feature selection method will be needed, as my approach was quite experimental and reactive based on feature importance and the impact features had on the leaderboard score and my local score.

In [None]:
# Features fed to the models, grouped by where they come from
common_features = [
    # Columns taken from the raw data
    'lender_id', 'loan_type', 'New_versus_Repeat',
    'Amount_Funded_By_Lender', 'Lender_portion_Funded', 'Lender_portion_to_be_repaid',
    # Calendar parts and weekday names
    'disbursement_month', 'disbursement_day',
    'due_year', 'due_month', 'due_day',
    'Disbursement Day', 'Due Day',
    # Engineered features
    'Duration category', 'repayment_ratio', 'loans_taken', 'Lender_numbers',
    'new vs repeat ratio', 'weekday/weekend',
    'lender_funded_ratio', 'lender_repay_ratio',
]

# The training matrix also carries the target; the test matrix has features only
predictors = train[[*common_features, 'target']]
predictions = test[common_features]


In [None]:
# Final check: missing values per feature in the test matrix
predictions.isnull().sum()

In [None]:
# Correlation heatmap of the selected non-object, non-categorical features
nums = predictors.select_dtypes(exclude=['object','category'])
corr = nums.corr()

plt.figure(figsize=(12, 6))
sns.heatmap(corr, annot=True)
plt.show()

NOTE: Despite the multicollinearity, this combination gave the best score on the public leaderboard, so I chose to keep it this way. In hindsight, I should have removed these highly correlated features.

# Baseline model

Starting with a baseline model to see how our model will learn. Will go straight to XGBoost.
In the next challenge, I'll make use of StratifiedGroupKFold, grouping by customer_id so that all loans from the same customer fall in the same fold, meaning there'll be no leakage between folds.

In [None]:
# One-hot encode the train and test features
processed_x = pd.get_dummies(predictors.drop(columns=['target']))

# The target was cast to object during the EDA. Cast it back to integers
# before splitting, since object-dtype labels are not reliably recognised as
# a binary target by scikit-learn.
processed_y = predictors['target'].astype('int64')

processed_test = pd.get_dummies(predictions)

# Align the two frames so they carry the same columns; a dummy column missing
# on one side is added there and filled with 0
final_train, X_test = processed_x.align(processed_test, join='outer', axis=1, fill_value=0)

# Stratified 5-fold splitter
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []

# Everything below works on ONE hold-out split: the last of the five stratified
# folds is the validation set, the other four are the training set. Looping
# over all folds here would only overwrite these variables, because the
# scaling and the models below run once, after the split.
*_, (train_index, val_index) = skf.split(final_train, processed_y)

X_train, X_val = final_train.iloc[train_index], final_train.iloc[val_index]
y_train, y_val = processed_y.iloc[train_index], processed_y.iloc[val_index]

# Fit the scaler on the training part only, then apply it to validation and test
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


To address the class imbalance, I set the class weights manually and pass them as sample weights during the fit. This gave me significantly better results than upsampling with SMOTE.

In [None]:
# Assuming binary classification
class_weights = {0: 1, 1: 3}  # Higher weight for minority class

# One weight per training row, looked up from the row's class; passed as sample_weight to fit
sample_weight = y_train.map(class_weights)

Started the tuning with a small grid and then moved to other methods. Medium max depth and min child weight were used to ensure we have no extremes of underfitting/overfitting.

In [None]:
# Hyperparameter tuning: a small grid, sampled at random
param_grid_x = {
    'learning_rate': [0.01,0.1],
    'max_depth':[5,6],
    'n_estimators': [200, 300],
    'min_child_weight': [5,6]
}

# Seed the model and the search so the tuning is repeatable, in line with the
# random_state=42 used by the other searches and models in this notebook
xgbmodel = xgb.XGBClassifier(random_state=42)
xgb_c = RandomizedSearchCV(xgbmodel, param_grid_x, cv=5, scoring='f1', random_state=42)
xgb_c.fit(X_train_scaled, y_train, sample_weight=sample_weight)  # Tuning with the weights

print("Tuned XGB Parameters: {}".format(xgb_c.best_params_))
print("Best score is {}".format(xgb_c.best_score_))

xg_params = xgb_c.best_params_

After the initial tuning, I made use of Bayesian Optimization and Optuna as well. The Optuna gave me better results on the lb but as expected, both gave better F1 than the simple grid.

In [None]:
# Bayesian Optimization process
def xgb_f1_eval(learning_rate, max_depth, n_estimators, min_child_weight):
    """Objective for the Bayesian optimiser: validation F1 of one XGBoost fit.

    The optimiser proposes floats, so max_depth, n_estimators and
    min_child_weight are truncated to integers. The model is trained on
    X_train_scaled / y_train with scale_pos_weight=3 and scored on
    X_val_scaled / y_val (notebook globals).

    Returns the F1 score on the validation split.
    """
    candidate = xgb.XGBClassifier(
        learning_rate=learning_rate,
        max_depth=int(max_depth),
        n_estimators=int(n_estimators),
        min_child_weight=int(min_child_weight),
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=3,  # Adjusting for class imbalance
    )
    candidate.fit(X_train_scaled, y_train)

    return f1_score(y_val, candidate.predict(X_val_scaled))

# Bounds of the search space (continuous; xgb_f1_eval truncates the integer parameters)
param_bounds = {
    'learning_rate': (0.01, 0.2),
    'max_depth': (5, 10),
    'n_estimators': (100, 500),
    'min_child_weight': (4, 10)
}

# 5 random starting points followed by 25 guided iterations
optimizer = BayesianOptimization(f=xgb_f1_eval, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=25)

# Best parameters, with the integer-valued ones truncated the same way as during the search
best_params = optimizer.max['params']
for int_param in ('max_depth', 'n_estimators', 'min_child_weight'):
    best_params[int_param] = int(best_params[int_param])
best_f1_score = optimizer.max['target']

best_params, best_f1_score

In [None]:
# Objective function for Optuna
def objective(trial):
    """Optuna objective: validation F1 of one XGBoost fit.

    Samples learning_rate, max_depth, n_estimators and min_child_weight from
    the trial, trains on X_train_scaled / y_train with scale_pos_weight=3 and
    scores on X_val_scaled / y_val (notebook globals).

    Returns the F1 score on the validation split.
    """
    # Parameters sampled by the trial
    sampled = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.2),
        max_depth=trial.suggest_int('max_depth', 5, 8),
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        min_child_weight=trial.suggest_int('min_child_weight', 5, 10),
    )
    # Parameters held constant across trials
    fixed = dict(
        scale_pos_weight=3,  # Adjust for class imbalance
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
    )

    classifier = xgb.XGBClassifier(**sampled, **fixed)
    classifier.fit(X_train_scaled, y_train)

    val_predictions = classifier.predict(X_val_scaled)
    return f1_score(y_val, val_predictions)

# Run 30 Optuna trials, maximising the validation F1 returned by objective()
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Best parameters and score found by the study
best_paramss = study.best_params
best_f1_score = study.best_value

print(f"Best Parameters: {best_paramss}")
print(f"Best F1 Score: {best_f1_score}")


In [None]:
# XGBoost model built from the Bayesian-optimisation parameters
xgb_model = xgb.XGBClassifier(random_state=42, **best_params)
xgb_model.fit(X_train_scaled, y_train, sample_weight=sample_weight)
y_pred = xgb_model.predict(X_test_scaled)

# Predictions on the training and validation splits
training_set = xgb_model.predict(X_train_scaled)
val_set = xgb_model.predict(X_val_scaled)

# Confusion matrix, accuracy and classification report for each split
reports = (
    ('Trained data matrix:', 'Training set:', y_train, training_set),
    ('Validation data matrix:', 'Validation set:', y_val, val_set),
)
for matrix_label, accuracy_label, truth, predicted in reports:
    print(matrix_label, confusion_matrix(truth, predicted))
    print(accuracy_label, accuracy_score(truth, predicted))
    print(classification_report(truth, predicted))

These results show a good balance between classes which is good to see considering the class imbalance. Just need to ensure that there was no data leakage anywhere.

Below shows that the CV score for F1 was 0.86. Given our standard deviation is less than 0.2 of our mean, we can conclude that the model's performance is stable.

In [None]:
# Stratified 5-fold cross-validation, run on the training split only
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Using F1 score
f1_scorer = make_scorer(f1_score)

# Perform cross-validation (the estimator is refit per fold; no sample weights are passed here)
cv_scores = cross_val_score(xgb_model, X_train_scaled, y_train, cv=kfold, scoring=f1_scorer)

# Output the scores
print(f"Cross-Validation F1 Scores: {cv_scores}")
print(f"Mean F1 Score: {cv_scores.mean():.4f}")
print(f"Standard Deviation of F1 Scores: {cv_scores.std():.4f}")

Adjusting the decision boundary but didn't get any difference.

In [None]:
# Adjusted threshold on the test set
# Predicted probability of class 1 for every test row
y_pred_proba = xgb_model.predict_proba(X_test_scaled)[:, 1]

# Label a row 1 when its probability is at least 0.45
threshold = 0.45
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)



In [None]:
# Predicted probability of class 1 for every validation row
y_pred_proba_val = xgb_model.predict_proba(X_val_scaled)[:, 1]

# Label a row 1 when its probability is at least 0.45
threshold = 0.45
y_pred_adjusted_val = (y_pred_proba_val >= threshold).astype(int)

# Evaluate the thresholded predictions on the validation split
print('Validation data matrix:',confusion_matrix(y_val,y_pred_adjusted_val))
print('Validation set:',accuracy_score(y_val,y_pred_adjusted_val))
print(classification_report(y_val, y_pred_adjusted_val))

# Feature Importance

In [None]:
# Feature importances of the fitted XGBoost model, labelled with the training columns and sorted descending
feature_importances = pd.Series(xgb_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(feature_importances)

In [None]:
# Horizontal bar chart of the 20 most important features
plt.figure(figsize=(12, 8))
sns.barplot(x=feature_importances.values[:20], y=feature_importances.index[:20], palette="viridis")  # Top 20 features
plt.title("Feature Importances from XGBoost")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.tight_layout()
plt.show()

# Tuning and Ensemble

To boost the results, I incorporated an ensemble approach adding Catboost and LGBM classifiers, and then using Stacking/Voting to make the predictions. In this situation the Voting Classifier produced better lb results. Did a simpler param tune for both of these but in future might also run Optuna on them to maximise accuracy.

In [None]:
# Params for catboost
param_grid_r = {
    'iterations': [100],               # Number of boosting iterations. Any higher and processing time is too long
    'learning_rate': [0.03, 0.15],
    'depth': [6, 8],
    'l2_leaf_reg': [3, 7],                  # L2 regularization strength
    'bagging_temperature': [3, 5],
}

# Sample 5 parameter combinations; the search is seeded so the sampled set is repeatable
catboost = CatBoostClassifier()
cat = RandomizedSearchCV(catboost, param_grid_r, n_iter=5, cv=5, scoring='f1', random_state=42)
cat.fit(X_train_scaled, y_train, sample_weight=sample_weight)

# The message names the model that was actually tuned (CatBoost)
print("Tuned CatBoost Parameters: {}".format(cat.best_params_))
print("Best score is {}".format(cat.best_score_))

cat_params = cat.best_params_

In [None]:
# Lightgbm params
param_grid_l = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6, 7],
    'n_estimators': [100, 200, 300, 500],
    'min_child_weight': [3, 5, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Search the LightGBM grid defined above (param_grid_l), not the XGBoost grid;
# the search is seeded so the sampled combinations are repeatable
lgbmodel = LGBMClassifier()
lgb_c = RandomizedSearchCV(lgbmodel, param_grid_l, cv=5, scoring='f1', random_state=42)
lgb_c.fit(X_train_scaled, y_train, sample_weight=sample_weight)

print("Tuned LGB Parameters: {}".format(lgb_c.best_params_))
print("Best score is {}".format(lgb_c.best_score_))

lg_params = lgb_c.best_params_

In [None]:
# The three ensemble members, each built from its tuned parameters
xgb_modell = xgb.XGBClassifier(**best_params)
cat_model = CatBoostClassifier(**cat_params, random_state=42)
lgb_model = LGBMClassifier(**lg_params)  # LightGBM model

members = (
    ('XGBoost', xgb_modell),
    ('Catboost', cat_model),
    ('Light GBM', lgb_model),
)

# Fit every member with the class-based sample weights
for _, member in members:
    member.fit(X_train_scaled, y_train, sample_weight=sample_weight)

# Validation F1 of every member
for name, model in members:
    y_val_pred = model.predict(X_val_scaled)
    print(f"{name} F1 Score: {f1_score(y_val, y_val_pred):.4f}")


In [None]:
# Soft-voting ensemble: averages the predicted probabilities, with XGBoost weighted a bit higher
voting_clf = VotingClassifier(
    estimators=[('xgb', xgb_modell), ('cat', cat_model), ('lgb', lgb_model)],
    voting='soft',
    weights=[0.4, 0.3, 0.3]
)

# VotingClassifier refits clones of its estimators, so the class-based sample
# weights must be passed here too; otherwise the ensemble is trained unweighted
voting_clf.fit(X_train_scaled, y_train, sample_weight=sample_weight)

# Evaluate on validation set
y_val_pred = voting_clf.predict(X_val_scaled)
print(f"VotingClassifier F1 Score: {f1_score(y_val, y_val_pred):.4f}")


In [None]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

# Stratified 5-fold cross-validation, run on the training split only
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Using F1 score
f1_scorer = make_scorer(f1_score)

# Perform cross-validation of the voting ensemble (no sample weights are passed here)
cv_scores = cross_val_score(voting_clf, X_train_scaled, y_train, cv=kfold, scoring=f1_scorer)

# Output the scores
print(f"Cross-Validation F1 Scores: {cv_scores}")
print(f"Mean F1 Score: {cv_scores.mean():.4f}")
print(f"Standard Deviation of F1 Scores: {cv_scores.std():.4f}")

Stacking didn't give me good results on this occasion.

In [None]:
# # StackingClassifier
# stacking_clf = StackingClassifier(
#     estimators=[('xgb', xgb_modell), ('cat', cat_model), ('lgb', lgb_model)],
#     final_estimator=xgb.XGBClassifier(random_state=42),
#     cv=5
# )
# stacking_clf.fit(X_train_scaled, y_train)

# # Evaluate on validation set
# y_val_pred = stacking_clf.predict(X_val_scaled)
# print(f"StackingClassifier F1 Score: {f1_score(y_val, y_val_pred):.4f}")


In [None]:
from sklearn.metrics import precision_recall_curve

# Predicted probability of class 1 for every validation row
y_val_probs = voting_clf.predict_proba(X_val_scaled)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_val, y_val_probs)

# F1 at every point of the curve; a point where precision + recall == 0 scores
# 0 instead of NaN, so it can never be picked by argmax
pr_sum = precision + recall
f1_scores = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)

# precision and recall hold one more entry than thresholds (the final point has
# no threshold), so only the first len(thresholds) scores are candidates
optimal_idx = f1_scores[:len(thresholds)].argmax()
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal Threshold: {optimal_threshold:.4f}")

# Predict with adjusted threshold.
# NOTE: the threshold is chosen and scored on the same validation split, so
# this F1 is an optimistic estimate.
y_val_pred = (y_val_probs >= optimal_threshold).astype(int)
print(f"Adjusted F1 Score: {f1_score(y_val, y_val_pred):.4f}")


In [None]:
# Default-threshold predictions of the ensemble on the test set
y_test_pred = voting_clf.predict(X_test_scaled)
# Predicted probability of class 1 for every test row
y_pred_probab = voting_clf.predict_proba(X_test_scaled)[:, 1]

# Label a row 1 when its probability reaches the threshold tuned on the validation split
threshold = optimal_threshold
y_pred_adjusted = (y_pred_probab >= threshold).astype(int)

In [None]:
# Predicted probability of class 1 for every validation row
y_pred_proba_valx = voting_clf.predict_proba(X_val_scaled)[:, 1]

# Label a row 1 when its probability reaches the tuned threshold
threshold = optimal_threshold
y_pred_adjusted_val = (y_pred_proba_valx >= threshold).astype(int)

# Evaluate the thresholded ensemble predictions on the validation split
print('Validation data matrix:',confusion_matrix(y_val,y_pred_adjusted_val))
print('Validation set:',accuracy_score(y_val,y_pred_adjusted_val))
print(classification_report(y_val, y_pred_adjusted_val))

# Submission

In [None]:
# Display the ID column of the test set
test['ID']

In [None]:
# Store the thresholded predictions as a real column: attribute-style
# assignment (test.Target = ...) sets an attribute and does not create a
# DataFrame column
test['Target'] = y_pred_adjusted

# Create submission DataFrame
submission = test[['ID', 'Target']].copy()
submission

In [None]:
# Write the submission to a CSV file, without the index column
submission.to_csv('adjusted_submission.csv', index = False)
