In [80]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import joblib
import os

In [2]:
# Mount Google Drive at /content/drive so the project files stored under
# MyDrive can be read by path. The google.colab module is only importable
# inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [74]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Load dataset (kept in raw_df so the untouched input stays available for inspection)
DATA_PATH = "/content/drive/MyDrive/Credit_Scoring_Project/cleaned_data.csv"
raw_df = pd.read_csv(DATA_PATH)

# Drop irrelevant columns
df = raw_df.drop(columns=['Unnamed: 0', 'TransactionStartTime', 'bin','TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId'])

# Encode the target column.
# Series.map turns every value that is not a key of LABEL_MAP into NaN, and the
# dropna below would then discard those rows silently, so report them first.
LABEL_MAP = {'Good': 0, 'Bad': 1}
unexpected_labels = df.loc[
    df['Label'].notna() & ~df['Label'].isin(list(LABEL_MAP)), 'Label'
].unique()
if len(unexpected_labels) > 0:
    print(f"Warning: rows with unmapped Label values will be dropped: {list(unexpected_labels)}")
df['Label'] = df['Label'].map(LABEL_MAP)

# Remove rows with any missing value and say how many were lost
rows_before = len(df)
df = df.dropna()
print(f"Dropped {rows_before - len(df)} of {rows_before} rows containing missing values")

# One-Hot Encode categorical features with fewer categories
df = pd.get_dummies(df, columns=['CurrencyCode', 'ChannelId'], drop_first=True)

# Label Encode other categorical features.
# A separate encoder is fitted per column and kept in label_encoders so the
# category -> integer mapping can be inverted or re-applied to new data later
# (a single shared encoder only remembers the last column it was fitted on).
label_encoders = {}
for col in ['ProviderId', 'ProductId', 'ProductCategory']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Verify all columns are numeric
print(df.dtypes)

# Split features and target
# NOTE(review): every model in this notebook scores exactly 1.0 on the held-out
# set, which usually indicates target leakage. RFMS_Score, RFMS_Cluster and WoE
# look like they may have been derived from Label -- confirm how they were
# built and exclude them from X if so.
X = df.drop(columns=['Label'])
y = df['Label']

print(f"Features shape: {X.shape}, Target shape: {y.shape}")


CountryCode                   int64
ProviderId                    int64
ProductId                     int64
ProductCategory               int64
Amount                      float64
Value                         int64
PricingStrategy               int64
FraudResult                   int64
TotalTransactionAmount      float64
AverageTransactionAmount    float64
TransactionCount              int64
TransactionStdDev           float64
TransactionHour               int64
TransactionDay                int64
TransactionMonth              int64
TransactionYear               int64
Recency                       int64
Frequency                     int64
Monetary                    float64
Seasonality                   int64
RFMS_Score                  float64
RFMS_Cluster                  int64
Label                         int64
WoE                         float64
ChannelId_ChannelId_2          bool
ChannelId_ChannelId_3          bool
ChannelId_ChannelId_5          bool
dtype: object
Features shape

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,90859
1,4091


In [75]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in the train and test partitions; the fixed seed makes the split
# repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print("Class distribution in y_train after stratified split:")
print(y_train.value_counts())

Class distribution in y_train after stratified split:
Label
0    72687
1     3273
Name: count, dtype: int64


In [56]:
# imputer = SimpleImputer(strategy='mean') # Create an imputer with the desired strategy
# X_train = imputer.fit_transform(X_train) # Fit and transform on the training data
# X_test = imputer.transform(X_test) # Transform the testing data using the fitted imputer

In [58]:
# # Get indices of rows with missing values in y_train and y_test
# train_missing_indices = y_train[y_train.isnull()].index
# test_missing_indices = y_test[y_test.isnull()].index

# # Convert X_train and X_test back to DataFrames with original indices
# X_train = pd.DataFrame(X_train, index=y_train.index)
# X_test = pd.DataFrame(X_test, index=y_test.index)

# # Remove rows with missing target values from X_train, y_train, X_test, and y_test
# X_train = X_train[~X_train.index.isin(train_missing_indices)]
# y_train = y_train[~y_train.index.isin(train_missing_indices)]
# X_test = X_test[~X_test.index.isin(test_missing_indices)]
# y_test = y_test[~y_test.index.isin(test_missing_indices)]

In [89]:
# Train Logistic Regression and Random Forest models
def train_models(X_train, y_train):
    """
    Fit the two baseline classifiers on the training data.

    Parameters:
    - X_train, y_train: Training features and target.

    Returns:
    - Dictionary mapping each model's display name to its fitted estimator.
    """
    estimators = {
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    }
    # Fit each estimator in place, in insertion order.
    for estimator in estimators.values():
        estimator.fit(X_train, y_train)
    return dict(estimators)


models = train_models(X_train, y_train)

In [78]:
def tune_hyperparameters(model, param_grid, X_train, y_train, search_type='grid', cv=3, n_iter=10, random_state=42):
    """
    Tune hyperparameters for a given model using GridSearchCV or RandomizedSearchCV.

    Candidates are ranked by cross-validated ROC-AUC and all available CPU
    cores are used (n_jobs=-1).

    Parameters:
    - model: Base model to tune.
    - param_grid: Dictionary of hyperparameters to search.
    - X_train, y_train: Training data.
    - search_type: 'grid' for GridSearchCV, 'random' for RandomizedSearchCV.
    - cv: Number of cross-validation folds.
    - n_iter: Number of parameter settings sampled for RandomizedSearchCV.
    - random_state: Seed for the candidate sampling done by RandomizedSearchCV,
      so repeated runs pick the same candidates. Ignored for 'grid'.

    Returns:
    - best_model: Model with the best parameters.
    - best_params: Best hyperparameters.

    Raises:
    - ValueError: If search_type is neither 'grid' nor 'random'.
    """
    if search_type == 'grid':
        search = GridSearchCV(model, param_grid, cv=cv, scoring='roc_auc', verbose=1, n_jobs=-1)
    elif search_type == 'random':
        # Without a seed the sampled candidates (and therefore the "best"
        # parameters) change from one run to the next.
        search = RandomizedSearchCV(
            model,
            param_grid,
            n_iter=n_iter,
            cv=cv,
            scoring='roc_auc',
            verbose=1,
            n_jobs=-1,
            random_state=random_state,
        )
    else:
        raise ValueError("search_type must be 'grid' or 'random'")

    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_params_


In [84]:
# Evaluate models on test data
def evaluate_models(models, X_test, y_test):
    """
    Score each fitted binary classifier on the test set.

    Parameters:
    - models: Dictionary mapping a display name to a fitted estimator.
    - X_test, y_test: Held-out features and true labels.

    Returns:
    - Dictionary mapping each model name to a dict with the keys "Accuracy",
      "Precision", "Recall", "F1 Score" and "ROC-AUC". "ROC-AUC" is None when
      the model exposes neither predict_proba nor decision_function.
    """
    results = {}
    for name, model in models.items():
        y_pred = model.predict(X_test)

        # ROC-AUC needs a continuous score for the positive class: prefer the
        # class-1 probability and fall back to the decision function for
        # estimators that do not implement predict_proba.
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X_test)
        else:
            y_score = None

        results[name] = {
            "Accuracy": accuracy_score(y_test, y_pred),
            # zero_division=0 makes the "no positive predictions/labels" case
            # an explicit 0.0 instead of a warning.
            "Precision": precision_score(y_test, y_pred, zero_division=0),
            "Recall": recall_score(y_test, y_pred, zero_division=0),
            "F1 Score": f1_score(y_test, y_pred, zero_division=0),
            "ROC-AUC": roc_auc_score(y_test, y_score) if y_score is not None else None,
        }
    return results

# Evaluate and display results
results = evaluate_models(models, X_test, y_test)
print("Model Evaluation Results:")
for model_name, metrics in results.items():
    print(f"\n{model_name}")
    for metric, value in metrics.items():
        # evaluate_models may report ROC-AUC as None when a model provides no
        # probability scores; the ":.4f" format spec raises TypeError on None.
        formatted = "N/A" if value is None else f"{value:.4f}"
        print(f"{metric}: {formatted}")


Model Evaluation Results:

Logistic Regression
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC-AUC: 1.0000

Random Forest
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC-AUC: 1.0000


In [90]:
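# # Define the model and parameter grid
# # NOTE(review): this search is left disabled. Before re-enabling it, prune the
# # invalid combinations: per the scikit-learn docs 'liblinear' supports only
# # the 'l1'/'l2' penalties, and 'elasticnet' needs solver='saga' plus l1_ratio.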
# lr = LogisticRegression(max_iter=1000, random_state=42)
# lr_param_grid = {
#     'penalty': ['l1', 'l2', 'elasticnet', None],
#     'C': [0.01, 0.1, 1, 10, 100],
#     'solver': ['liblinear', 'saga']
# }

# # Tune hyperparameters
# best_lr, best_lr_params = tune_hyperparameters(lr, lr_param_grid, X_train, y_train, search_type='grid')
# print("Best Logistic Regression Parameters:", best_lr_params)


In [91]:
# Define the model and parameter grid
rf = RandomForestClassifier(random_state=42)
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Tune hyperparameters: a randomized search samples 20 of the 216 possible
# combinations instead of fitting the whole grid.
best_rf, best_rf_params = tune_hyperparameters(
    rf,
    rf_param_grid,
    X_train,
    y_train,
    search_type='random',
    n_iter=20,
)
print("Best Random Forest Parameters:", best_rf_params)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Random Forest Parameters: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10, 'bootstrap': False}


In [92]:
# Score the tuned forest on the same held-out set used for the baseline models.
tuned_models = {"Tuned Random Forest": best_rf}
rf_results = evaluate_models(tuned_models, X_test, y_test)
print("\nTuned Random Forest Results:", rf_results)


Tuned Random Forest Results: {'Tuned Random Forest': {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0, 'ROC-AUC': 1.0}}


In [87]:
# Save the best-performing model
def save_best_model(models, results, output_dir="models", metric="Accuracy"):
    """
    Persist the model with the highest score for the chosen metric.

    Parameters:
    - models: Dictionary mapping a model name to its fitted estimator.
    - results: Dictionary mapping a model name to its metrics dict.
    - output_dir: Directory the pickle is written to (created if missing).
    - metric: Key of the metrics dict used to rank the models. Defaults to
      "Accuracy"; on a heavily imbalanced target "ROC-AUC" or "F1 Score" is
      usually the more informative choice.

    Returns:
    - best_model_name: Name of the selected model (the first one wins a tie).
    - model_path: Path of the written .pkl file.

    Raises:
    - ValueError: If no model has a non-None value for metric.
    - KeyError: If a metrics dict lacks metric, or the selected model name is
      missing from models.
    """
    # Models whose score is None (metric not computable) cannot be ranked.
    scores = {
        name: model_metrics[metric]
        for name, model_metrics in results.items()
        if model_metrics[metric] is not None
    }
    if not scores:
        raise ValueError(f"no model has a usable '{metric}' score to compare")

    best_model_name = max(scores, key=scores.get)
    best_model = models[best_model_name]

    # Touch the filesystem only once the selection has succeeded.
    os.makedirs(output_dir, exist_ok=True)
    model_path = os.path.join(output_dir, f"{best_model_name.replace(' ', '_')}.pkl")
    joblib.dump(best_model, model_path)
    return best_model_name, model_path

# Persist whichever baseline model ranks first and report where it was written.
saved_baseline = save_best_model(models, results)
best_model_name, model_path = saved_baseline
print(f"\nBest model '{best_model_name}' saved to: {model_path}")



Best model 'Logistic Regression' saved to: models/Logistic_Regression.pkl


In [94]:
# Combine result for Random Forest only
tuned_key = "Tuned Random Forest"
all_results = {tuned_key: rf_results[tuned_key]}

# Save the tuned Random Forest: it is the only candidate handed to
# save_best_model, and the models dict must use the same key as all_results.
tuned_only = {tuned_key: best_rf}
best_model_name, model_path = save_best_model(tuned_only, all_results)
print(f"\nBest model '{best_model_name}' saved to: {model_path}")


Best model 'Tuned Random Forest' saved to: models/Tuned_Random_Forest.pkl
