## prepare data for modeling

In [1]:
# setup
import sys
import os
sys.path.append(os.path.abspath('../src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from utils import preprocess_data, validate_data

data_location = '../data/raw/psp_raw_data.xlsx'

### retrieve, preprocess and validate data
- feature selection based in previous analysis

In [2]:
df = pd.read_excel(data_location, index_col=0)

# Apply the preprocessing and validation function
df = preprocess_data(df)
print(validate_data(df))
display(df.head())

Validation complete and successful.
None


Unnamed: 0,country,card,3d_secured,is_peak_time,amount,psp,success
0,Germany,Visa,0,0,89,UK_Card,0
1,Germany,Visa,0,0,89,UK_Card,1
2,Germany,Diners,1,0,238,UK_Card,0
3,Germany,Diners,1,0,238,UK_Card,1
4,Austria,Diners,0,0,124,Simplecard,0


## baseline model 
- no handling imbalance, no hyperparameter tuning, etc

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Retrieve, preproccess, validate data
data = pd.read_excel(data_location, index_col=0)
processed_data = preprocess_data(data)
print(validate_data(processed_data))

Validation complete and successful.
None


In [4]:
processed_data.dtypes

country         object
card            object
3d_secured      object
is_peak_time    object
amount           int64
psp             object
success          int64
dtype: object

In [5]:


# Separate features and target
X = processed_data.drop(columns=['success'])  # Drop 'success' from features
y = processed_data['success'].astype(int)  # Use 'success' as the binary target

# Encode categorical features
X = pd.get_dummies(X, drop_first=True)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=processed_data['psp'], random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 2: Use OneVsRestClassifier with Logistic Regression
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))

# Train the model
model.fit(X_train, y_train)

# Step 3: Predict probabilities for success, stratified by PSP
y_pred_probs = model.predict_proba(X_test)

# Evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 4: Calculate odds for success
odds = y_pred_probs / (1 - y_pred_probs)

# Optional: Display the odds for a few samples
print("\nOdds of Success for the first 5 samples:")
print(odds[:5])


Accuracy: 0.7925014878000397

Classification Report:
               precision    recall  f1-score   support

           0       0.79      1.00      0.88      7973
           1       0.68      0.02      0.03      2109

    accuracy                           0.79     10082
   macro avg       0.74      0.51      0.46     10082
weighted avg       0.77      0.79      0.71     10082


Odds of Success for the first 5 samples:
[[6.12414265 0.16328816]
 [3.92997047 0.25445484]
 [5.88120358 0.17003322]
 [6.82554497 0.14650845]
 [6.58454249 0.15187084]]


In [6]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from collections import defaultdict

# Step 1: Create a dictionary to store PSP-specific models and results
psp_models = defaultdict(dict)

# Step 2: Process and split data for each PSP
for psp in processed_data['psp'].unique():
    print(f"Modeling for PSP: {psp}")
    
    # Filter data for the current PSP
    psp_data = processed_data[processed_data['psp'] == psp]
    
    # Separate features and target
    X_psp = psp_data.drop(columns=['psp', 'success'])
    y_psp = psp_data['success'].astype(int)
    
    # Encode categorical features
    X_psp = pd.get_dummies(X_psp, drop_first=True)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_psp, y_psp, test_size=0.2, random_state=42)
    
    # Step 3: Handle class imbalance using SMOTE
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    
    # Step 4: Train the OneVsRestClassifier with class weight adjustment
    class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)
    class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
    
    model = OneVsRestClassifier(
        LogisticRegression(max_iter=1000, class_weight=class_weight_dict)
    )
    model.fit(X_train, y_train)
    
    # Step 5: Evaluate the model
    y_pred = model.predict(X_test)
    y_pred_probs = model.predict_proba(X_test)
    
    print(f"Classification Report for PSP {psp}:\n", classification_report(y_test, y_pred))
    
    # Calculate odds
    odds = y_pred_probs / (1 - y_pred_probs)
    print(f"Odds of Success for PSP {psp}:\n", odds[:5])
    
    # Save model and results
    psp_models[psp]['model'] = model
    psp_models[psp]['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
    psp_models[psp]['odds'] = odds


Modeling for PSP: UK_Card
Classification Report for PSP UK_Card:
               precision    recall  f1-score   support

           0       0.84      0.61      0.71      4282
           1       0.24      0.51      0.32      1010

    accuracy                           0.59      5292
   macro avg       0.54      0.56      0.52      5292
weighted avg       0.73      0.59      0.64      5292

Odds of Success for PSP UK_Card:
 [[0.75480371 1.32484776]
 [1.15603106 0.86502866]
 [0.74934718 1.33449491]
 [0.53564794 1.86689788]
 [0.67157121 1.48904538]]
Modeling for PSP: Simplecard
Classification Report for PSP Simplecard:
               precision    recall  f1-score   support

           0       0.85      0.72      0.78      2100
           1       0.18      0.32      0.23       390

    accuracy                           0.66      2490
   macro avg       0.51      0.52      0.50      2490
weighted avg       0.75      0.66      0.69      2490

Odds of Success for PSP Simplecard:
 [[1.7343104

In [7]:
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from collections import defaultdict

# Create a dictionary to store PSP-specific models and results
psp_models = defaultdict(dict)

# DataFrame to store results for all PSPs
results_list = []

# Process and split data for each PSP
for psp in processed_data['psp'].unique():
    print(f"Modeling for PSP: {psp}")
    
    # Filter data for the current PSP
    psp_data = processed_data[processed_data['psp'] == psp]
    
    # Separate features and target
    X_psp = psp_data.drop(columns=['psp', 'success'])
    y_psp = psp_data['success'].astype(int)
    
    # Encode categorical features
    X_psp = pd.get_dummies(X_psp, drop_first=True)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_psp, y_psp, test_size=0.2, random_state=42)
    
    # Handle class imbalance using SMOTE
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    
    # Train the OneVsRestClassifier with class weight adjustment
    class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)
    class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
    
    model = OneVsRestClassifier(
        LogisticRegression(max_iter=1000, class_weight=class_weight_dict)
    )
    model.fit(X_train, y_train)
    
    # Predict probabilities on the test set
    y_pred_probs = model.predict_proba(X_test)
    
    # Prepare a DataFrame for this PSP's results
    psp_result = pd.DataFrame(X_test, columns=X_psp.columns)  # Include feature columns
    psp_result[f'{psp}_success_probability'] = y_pred_probs[:, 1]  # Success probability for this PSP
    results_list.append(psp_result)

# Combine results for all PSPs into a single DataFrame
combined_results = pd.concat(results_list, axis=0)

# Pivot probabilities into separate columns for each PSP
probabilities = combined_results.filter(like='_success_probability')
features = combined_results.drop(columns=probabilities.columns)

final_results = pd.concat([features.reset_index(drop=True), probabilities.reset_index(drop=True)], axis=1)

# Display first few rows
print("Results for Test Data (Input Features and Success Probabilities Per PSP):")
display(final_results.head())

Modeling for PSP: UK_Card
Modeling for PSP: Simplecard
Modeling for PSP: Moneycard
Modeling for PSP: Goldcard
Results for Test Data (Input Features and Success Probabilities Per PSP):


Unnamed: 0,amount,country_Germany,country_Switzerland,card_Master,card_Visa,3d_secured_1,is_peak_time_1,UK_Card_success_probability,Simplecard_success_probability,Moneycard_success_probability,Goldcard_success_probability
0,243,False,False,True,False,False,False,0.569864,,,
1,152,False,False,True,False,True,True,0.463815,,,
2,61,False,True,True,False,False,False,0.571642,,,
3,48,True,False,True,False,False,False,0.651191,,,
4,215,False,False,True,False,False,False,0.59824,,,
