# Importing required libraries


In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib
from imblearn.over_sampling import SMOTE


# Load the feature-engineered dataset


In [35]:
df = pd.read_csv('../data/rfms_with_default_estimator.csv')

In [36]:
# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Use Label Encoding for high-cardinality categorical columns
label_encoder = LabelEncoder()

In [37]:
# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Check if there are any categorical columns
if len(categorical_columns) > 0:
    # Use Label Encoding for high-cardinality categorical columns
    label_encoder = LabelEncoder()

    # Loop through the categorical columns and apply LabelEncoder where appropriate
    for col in categorical_columns:
        unique_vals = df[col].nunique()
        if unique_vals > 100:  # Threshold for high cardinality (adjust if necessary)
            print(f"Encoding high-cardinality column: {col} with {unique_vals} unique values.")
            df[col] = label_encoder.fit_transform(df[col])
        else:
            # For low-cardinality columns, use One-Hot Encoding
            df = pd.get_dummies(df, columns=[col], drop_first=True)
else:
    print("No categorical columns found.")

Encoding high-cardinality column: CustomerId with 3742 unique values.


In [6]:
df.columns

Index(['CustomerId', 'Recency', 'Frequency', 'Monetary', 'Default_Estimator',
       'RFMS'],
      dtype='object')

In [38]:
df.head()

Unnamed: 0,CustomerId,Recency,Frequency,Monetary,Default_Estimator,RFMS
0,0,2144,1,-10000.0,1,-7855.0
1,1,2144,1,-10000.0,1,-7855.0
2,2,2149,5,20000.0,0,22154.0
3,3,2086,11,4225.0,0,6322.0
4,4,2072,6,20000.0,0,22078.0


In [39]:
# # Define target and features
# X = df.drop(columns='Default_Estimator')  # Assuming 'Default_Estimator' is the target variable
# y = df['Default_Estimator']
# Define target and features
y = df['Default_Estimator']  # Target
X = df[['Recency', 'Frequency', 'Monetary']]  # Features

# Display the shape of X and y
print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)


Features (X) shape: (3742, 3)
Target (y) shape: (3742,)


In [40]:
# Standardize the numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [41]:
# Ensure target is categorical
y = df['Default_Estimator']

# Check the unique values of the target to ensure it's not continuous
print(y.unique())

[1 0]


In [42]:
# Assuming 'FraudResult' has a threshold (for example, above 0.5 is fraud, below is not):
y_binarized = (y > 0.5).astype(int)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_binarized, test_size=0.3, random_state=42)


In [44]:
# Fit models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
        "Decision Tree": DecisionTreeClassifier(),

}



In [45]:
# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 4: Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [46]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    
    print(f"Model: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    

Model: Logistic Regression
Accuracy: 1.0
ROC AUC Score: 1.0
Confusion Matrix:
 [[708   0]
 [  0  41]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       708
           1       1.00      1.00      1.00        41

    accuracy                           1.00       749
   macro avg       1.00      1.00      1.00       749
weighted avg       1.00      1.00      1.00       749

Model: Random Forest
Accuracy: 1.0
ROC AUC Score: 1.0
Confusion Matrix:
 [[708   0]
 [  0  41]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       708
           1       1.00      1.00      1.00        41

    accuracy                           1.00       749
   macro avg       1.00      1.00      1.00       749
weighted avg       1.00      1.00      1.00       749

Model: Decision Tree
Accuracy: 1.0
ROC AUC Score: 1.0
Confusion Matrix:
 [[708   0]
 [  0  41]]
Clas

# Train and Evaluate Models


In [47]:
# Step 5: Loop through each model, train, and evaluate it
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train the model on the SMOTE-balanced data
    model.fit(X_train_smote, y_train_smote)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model's performance
    print(f"Model: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


Training Logistic Regression...
Model: Logistic Regression
Accuracy: 1.0
ROC AUC Score: 1.0
Confusion Matrix:
 [[708   0]
 [  0  41]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       708
           1       1.00      1.00      1.00        41

    accuracy                           1.00       749
   macro avg       1.00      1.00      1.00       749
weighted avg       1.00      1.00      1.00       749


Training Random Forest...
Model: Random Forest
Accuracy: 1.0
ROC AUC Score: 1.0
Confusion Matrix:
 [[708   0]
 [  0  41]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       708
           1       1.00      1.00      1.00        41

    accuracy                           1.00       749
   macro avg       1.00      1.00      1.00       749
weighted avg       1.00      1.00      1.00       749


Training Decision Tree...
Model: Decisi

In [27]:
from sklearn.model_selection import cross_val_score

smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

model = LogisticRegression(max_iter=1000)

# Perform cross-validation on the SMOTE-balanced data
cv_scores = cross_val_score(model, X_smote, y_smote, cv=5, scoring='accuracy')

print("Cross-validation Accuracy scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


Cross-validation Accuracy scores: [0.99929379 1.         1.         1.         1.        ]
Mean CV Accuracy: 0.9998587570621469


In [28]:
# Save the best model (for API serving)
best_model = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42)
best_model.fit(X_train, y_train)
joblib.dump(best_model, 'best_model1.pkl')
# joblib.dump(scaler, 'scaler.pkl')

print("Modelling Completed.")

Modelling Completed.
