# Data Modelling
This study applies a logistic regression model to predict payment discrepancies, defined as cases where the actual paid amount differs from the expected paid amount by more than 10%. The aim is to identify which areas or units are most likely to experience such discrepancies.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Data simulation
Create 15,000 simulated records with several variables so that the data resemble the current dataset.
Each row represents one unit, with a `unit_id`, the total EFTSL, the study level (e.g. postgraduate), the funding cluster, the government contribution (`gov_contr`) and the expected payment (`expected_paid`).

In [2]:
# Seed NumPy's global RNG so the simulated dataset is identical on every run.
np.random.seed(42)
n = 15000

# Columns are filled one at a time, in a fixed order: with a seeded global RNG
# the order of the draws determines the values each column receives.
df = pd.DataFrame({"unit_id": [f"U{i:05d}" for i in range(1, n + 1)]})
df["EFTSL"] = np.random.uniform(0.125, 8.0, n)
df["gov_contr"] = np.random.uniform(6000, 24000, n)
df["Funding_Cluster"] = np.random.choice(
    ["Funding_Cluster_1", "Funding_Cluster_2", "Funding_Cluster_3", "Funding_Cluster_4"],
    size=n,
    p=[0.25, 0.30, 0.25, 0.20],
)
df["Unit"] = np.random.choice(
    ["Health", "Psychology", "Arts", "Business"],
    size=n,
    p=[0.25, 0.30, 0.25, 0.20],
)
df["level_type"] = np.random.choice(["UG", "PG"], size=n, p=[0.7, 0.3])

# Expected payment is the study load multiplied by the government contribution.
df["expected_paid"] = df["EFTSL"].mul(df["gov_contr"])
df.head(15)

Unnamed: 0,unit_id,EFTSL,gov_contr,Funding_Cluster,Unit,level_type,expected_paid
0,U00001,3.074503,14994.064387,Funding_Cluster_3,Psychology,PG,46099.302477
1,U00002,7.611875,19441.441905,Funding_Cluster_2,Business,PG,147985.828771
2,U00003,5.889452,16128.002052,Funding_Cluster_4,Business,UG,94985.098646
3,U00004,4.839436,7499.446496,Funding_Cluster_1,Business,UG,36293.088076
4,U00005,1.353647,9340.444262,Funding_Cluster_3,Psychology,UG,12643.662425
5,U00006,1.353457,9947.907313,Funding_Cluster_3,Arts,UG,13464.063272
6,U00007,0.582408,10641.556566,Funding_Cluster_4,Psychology,UG,6197.732421
7,U00008,6.946137,16177.261055,Funding_Cluster_2,Arts,UG,112369.473967
8,U00009,4.858781,15071.384037,Funding_Cluster_3,Health,UG,73228.550144
9,U00010,5.701072,9656.880691,Funding_Cluster_2,Arts,PG,55054.567768


# Simulate the noise and percentage
These are the columns used for modelling:
- `gov_contr`: government contribution, based on each funding cluster
- `expected_paid` = `gov_contr` * `EFTSL`
- `actual_paid`: actual payment from the agreement or a specific dataset (simulated here as the expected payment with bias and noise applied)
- `flag_error` = 1 when the actual and expected payments differ by more than 10% of the expected payment

Unit-level bias and random noise are added to make the data messier, so that the model has a realistic pattern to learn.

In [3]:
# Relative payment bias applied per unit and per study level
# (both study levels currently carry a bias of zero).
unit_bias = {"Health": 0.06, "Psychology": 0.20, "Arts": -0.02, "Business": 0.02}
level_bias = {"UG": 0.00, "PG": 0.00}

# One Gaussian perturbation per row (mean 0.0, standard deviation 0.06).
noise = np.random.normal(loc=0.0, scale=0.06, size=n)

# Combined per-row bias: unit effect plus study-level effect.
bias_vec = (df["Unit"].map(unit_bias) + df["level_type"].map(level_bias)).to_numpy()
df["actual_paid"] = df["expected_paid"].mul(1 + bias_vec + noise)

# Absolute deviation from the expected payment, as a fraction of the expected payment.
df["diff_pct"] = df["actual_paid"].sub(df["expected_paid"]).abs().div(df["expected_paid"])
# A row is flagged when the deviation is strictly greater than 10%.
df["flag_error"] = df["diff_pct"].gt(0.10).astype(int)

df.head(10)

Unnamed: 0,unit_id,EFTSL,gov_contr,Funding_Cluster,Unit,level_type,expected_paid,actual_paid,diff_pct,flag_error
0,U00001,3.074503,14994.064387,Funding_Cluster_3,Psychology,PG,46099.302477,60484.677186,0.312052,1
1,U00002,7.611875,19441.441905,Funding_Cluster_2,Business,PG,147985.828771,153867.808733,0.039747,0
2,U00003,5.889452,16128.002052,Funding_Cluster_4,Business,UG,94985.098646,96113.1816,0.011876,0
3,U00004,4.839436,7499.446496,Funding_Cluster_1,Business,UG,36293.088076,38838.759926,0.070142,0
4,U00005,1.353647,9340.444262,Funding_Cluster_3,Psychology,UG,12643.662425,14238.645703,0.126149,1
5,U00006,1.353457,9947.907313,Funding_Cluster_3,Arts,UG,13464.063272,13017.582733,0.033161,0
6,U00007,0.582408,10641.556566,Funding_Cluster_4,Psychology,UG,6197.732421,7534.620153,0.215706,1
7,U00008,6.946137,16177.261055,Funding_Cluster_2,Arts,UG,112369.473967,114972.207417,0.023162,0
8,U00009,4.858781,15071.384037,Funding_Cluster_3,Health,UG,73228.550144,78681.42184,0.074464,0
9,U00010,5.701072,9656.880691,Funding_Cluster_2,Arts,PG,55054.567768,49695.383063,0.097343,0


In [4]:
# Class balance of the target: number of rows per flag_error value
# (1 = deviation from the expected payment above 10%, 0 = otherwise).
df["flag_error"].value_counts()

flag_error
0    8975
1    6025
Name: count, dtype: int64

# Variables preprocessing
The X variables listed below are the independent variables, and the error flag (`flag_error`) is the dependent variable y.
Numeric columns are standardised with a standard scaler and categorical columns are one-hot encoded so that the model can read the data.

In [5]:
# Feature groups: numeric columns are standardised, categorical columns are one-hot encoded.
num_cols = ["EFTSL", "gov_contr"]
cat_cols = ["Funding_Cluster", "Unit", "level_type"]

# Predictors (numeric columns first, then categorical) and the binary target.
X = df[num_cols + cat_cols]
y = df["flag_error"]

# Any column not listed in a transformer is dropped from the model input.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        # Categories not seen during fit are encoded as all zeros instead of raising.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop",
)

# class_weight="balanced" reweights the two classes inversely to their frequency.
logit_model = LogisticRegression(
    solver="lbfgs",
    max_iter=2000,
    class_weight="balanced",
    random_state=42,
)

# Preprocessing and classifier are chained so both are fitted on training data only.
clf = Pipeline(steps=[("prep", preprocess), ("logit", logit_model)])

# Logistic Regression
This part fits the logistic regression model to the data and lists the most important features with their coefficients and odds ratios.

In [6]:
# 70/30 split, stratified on the target so both sets keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

# Fit the preprocessing + logistic regression pipeline and score the hold-out set.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Rebuild the column labels in the order the ColumnTransformer emits them:
# numeric columns first, then the one-hot encoded categorical columns.
ohe = clf.named_steps["prep"].named_transformers_["cat"]
cat_names = ohe.get_feature_names_out(cat_cols)
feature_names = np.r_[num_cols, cat_names]

# One coefficient per transformed column; exp(coef) is the corresponding odds ratio.
coefs = clf.named_steps["logit"].coef_.ravel()
odds = np.exp(coefs)

# Rank features by the magnitude of their coefficient, largest first.
coef_df = pd.DataFrame(
    {"feature": feature_names, "coef": coefs, "odds_ratio": odds}
)
coef_df = coef_df.sort_values(by="coef", key=np.abs, ascending=False)

print("\nTop features by absolute impact:")
print(coef_df.head(12).to_string(index=False))

              precision    recall  f1-score   support

           0       0.83      0.98      0.90      2692
           1       0.95      0.70      0.81      1808

    accuracy                           0.87      4500
   macro avg       0.89      0.84      0.85      4500
weighted avg       0.88      0.87      0.86      4500

[[2632   60]
 [ 540 1268]]

Top features by absolute impact:
                          feature      coef  odds_ratio
                  Unit_Psychology  3.478914   32.424475
                        Unit_Arts -1.528692    0.216819
                    Unit_Business -1.493634    0.224555
                      Unit_Health -0.520536    0.594202
Funding_Cluster_Funding_Cluster_4 -0.065062    0.937009
                        gov_contr  0.062572    1.064571
                    level_type_UG -0.060177    0.941598
                            EFTSL  0.048653    1.049856
                    level_type_PG -0.003771    0.996236
Funding_Cluster_Funding_Cluster_2  0.003424    1.003

# Machine Learning Model
This model uses XGBoost (Extreme Gradient Boosting), a powerful machine learning algorithm that builds an ensemble of decision trees to predict whether a payment record is likely to have a discrepancy. Unlike simpler models such as logistic regression, XGBoost combines many small trees that learn from previous errors, improving accuracy with each step.

# Data preparation
Same steps as for the previous regression model before the data are fed into the model.

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Same feature groups as for the logistic regression model.
num_cols = ["EFTSL", "gov_contr"]
cat_cols = ["Funding_Cluster", "Unit", "level_type"]

# Predictors (numeric columns first, then categorical) and the binary target.
X = df[num_cols + cat_cols]
y = df["flag_error"]

# 70/30 split, stratified on the target, with the same seed as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Standardise numeric columns and one-hot encode categorical ones; drop anything else.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop",
)

# Fit the preprocessing on the training rows only, then apply it unchanged to the test rows.
X_train_prep = preprocess.fit_transform(X_train)
X_test_prep = preprocess.transform(X_test)

In [None]:
# Install xgboost if it is not already available in this environment:
# %pip install xgboost

# XGBoost
The model is trained on the processed dataset (X_train_prep, y_train) and then tested on unseen data (X_test_prep) to evaluate its predictive performance.
The output includes a classification report, a ROC-AUC score, and a confusion matrix, which together show how well the model distinguishes between error and non-error cases. Overall, this model helps identify complex, nonlinear patterns in the data that may indicate a higher risk of payment discrepancies.

In [11]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Gradient-boosted tree classifier: 300 shallow trees (depth 4) with a small
# learning rate; each tree sees 80% of the rows and 80% of the columns.
# NOTE(review): the logistic regression above uses class_weight="balanced",
# while scale_pos_weight=1 here applies no extra weight to the positive class;
# confirm the difference is intended before comparing the two models.
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    scale_pos_weight=1,
    eval_metric="logloss",
    n_jobs=-1
)

xgb.fit(X_train_prep, y_train)
y_pred_xgb = xgb.predict(X_test_prep)

# Predicted probability of the positive class (flag_error == 1); ROC-AUC is
# computed from these scores rather than from the hard 0/1 predictions.
y_proba_xgb = xgb.predict_proba(X_test_prep)[:, 1]
roc_auc_xgb = float(roc_auc_score(y_test, y_proba_xgb))

print(classification_report(y_test, y_pred_xgb))
# The built-in round() is used instead of the .round() method: the method only
# exists on NumPy scalars and would fail if the metric returns a plain float.
print("ROC-AUC:", round(roc_auc_xgb, 3))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.83      0.98      0.90      2692
           1       0.95      0.70      0.81      1808

    accuracy                           0.87      4500
   macro avg       0.89      0.84      0.85      4500
weighted avg       0.88      0.87      0.86      4500

ROC-AUC: 0.869
Confusion Matrix:
 [[2631   61]
 [ 540 1268]]


In [12]:
import pandas as pd
import numpy as np

# Rebuild the column labels in the order the fitted ColumnTransformer emits them:
# numeric columns first, then the one-hot encoded categorical columns.
ohe = preprocess.named_transformers_["cat"]
cat_names = ohe.get_feature_names_out(cat_cols)
feature_names = np.r_[num_cols, cat_names]

# One importance score per transformed column, as reported by the fitted XGBoost model.
importance = xgb.feature_importances_

# Pair each label with its score and rank from most to least important.
feat_imp = pd.DataFrame({"Feature": feature_names, "Importance": importance})
feat_imp = feat_imp.sort_values("Importance", ascending=False)

print("\nTop 10 Feature Importances:")
print(feat_imp.head(10))



Top 10 Feature Importances:
                              Feature  Importance
9                     Unit_Psychology    0.804032
8                         Unit_Health    0.063062
7                       Unit_Business    0.054214
6                           Unit_Arts    0.041475
0                               EFTSL    0.005884
1                           gov_contr    0.005551
11                      level_type_UG    0.005077
4   Funding_Cluster_Funding_Cluster_3    0.004707
5   Funding_Cluster_Funding_Cluster_4    0.004460
2   Funding_Cluster_Funding_Cluster_1    0.004161
