<a href="https://colab.research.google.com/github/habrev/Bati-bank/blob/task-4/notebooks/modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv("woe_binning.csv")

In [None]:
df.columns

Index(['index', 'TransactionId', 'CustomerId', 'Amount',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult',
       'TotalTransactionAmount', 'AverageTransactionAmount',
       'TotalTransactions', 'StdTransactionAmount', 'TransactionHour',
       'TransactionDay', 'TransactionMonth', 'TransactionYear',
       'ProductCategory_airtime', 'ProductCategory_data_bundles',
       'ProductCategory_financial_services', 'ProductCategory_movies',
       'ProductCategory_other', 'ProductCategory_ticket',
       'ProductCategory_transport', 'ProductCategory_tv',
       'ProductCategory_utility_bill', 'ChannelId_ChannelId_1',
       'ChannelId_ChannelId_2', 'ChannelId_ChannelId_3',
       'ChannelId_ChannelId_5', 'Recency', 'Frequency', 'Monetary',
       'RFMS_Label', 'User_Label', 'Good_Bad', 'Recency_bin', 'Frequency_bin',
       'Monetary_bin'],
      dtype='object')

In [None]:
# Select only numeric columns from the DataFrame
numeric_df = df.select_dtypes(include=[float, int])

# Calculate the correlation matrix for the numeric columns
correlation_matrix = numeric_df.corr()

# Get the absolute correlation values for the target column
corr_with_target = correlation_matrix['Good_Bad'].abs()

# Set a correlation threshold (e.g., 0.1) to select relevant features
threshold = 0.1

# Select features that have correlation above the threshold (excluding the target column itself)
selected_features = corr_with_target[corr_with_target > threshold].index.tolist()

# Ensure the 'Good_Bad' column is excluded from selected features
if 'Good_Bad' in selected_features:
    selected_features.remove('Good_Bad')

# Return the DataFrame with only the selected features and the target column
df = df[selected_features + ['Good_Bad']]

In [None]:
corr_with_target.sort_values(ascending=False).reset_index(name='Correlation with Risk_Label')

Unnamed: 0,index,Correlation with Risk_Label
0,RFMS_Label,1.0
1,Good_Bad,1.0
2,TransactionYear,0.456502
3,TransactionMonth,0.454499
4,Recency,0.420896
5,index,0.420122
6,Monetary,0.12139
7,TotalTransactionAmount,0.12139
8,PricingStrategy,0.111403
9,StdTransactionAmount,0.082434


In [None]:
df.columns

Index(['index', 'PricingStrategy', 'TotalTransactionAmount',
       'TransactionMonth', 'TransactionYear', 'Recency', 'Monetary',
       'RFMS_Label', 'Good_Bad'],
      dtype='object')

In [None]:
features = df[selected_features]
target = df['Good_Bad']

X = features
y = target

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

In [None]:
from sklearn.model_selection import train_test_split

# Assume 'features' is your features DataFrame and 'target' is your binary target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [None]:
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Fit and resample the training data
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Now use X_resampled and y_resampled for training your model

In [None]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

# Define hyperparameter grids for each model
param_grids = {
    'Logistic Regression': {'classifier__C': [0.01, 0.1, 1, 10, 100]},
    'Decision Tree': {'classifier__max_depth': [3, 5, 7, None]},
    'Random Forest': {'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [None, 5, 10]},
    'Gradient Boosting': {'classifier__learning_rate': [0.01, 0.1, 0.2], 'classifier__n_estimators': [50, 100, 200]}
}

# Define pipelines
pipelines = {name: Pipeline([
    ('scaler', StandardScaler()),  # Standardize the data
    ('classifier', model)  # The model to be trained
]) for name, model in models.items()}

# Set up GridSearchCV for each model pipeline
grid_searches = {
    name: GridSearchCV(estimator=pipeline, param_grid=param_grids[name], cv=5, scoring='accuracy')
    for name, pipeline in pipelines.items()
}

In [None]:
# Train each pipeline with the resampled dataset
for name, grid_search in grid_searches.items():
    grid_search.fit(X_resampled, y_resampled)
    print(f"{name} model trained with best parameters: {grid_search.best_params_}")

Logistic Regression model trained with best parameters: {'classifier__C': 0.01}
Decision Tree model trained with best parameters: {'classifier__max_depth': 3}
Random Forest model trained with best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 50}
Gradient Boosting model trained with best parameters: {'classifier__learning_rate': 0.01, 'classifier__n_estimators': 50}


In [None]:
# Initialize a dictionary to store performance metrics
performance_metrics = {}
y_probs = {}

for name, pipeline in grid_searches.items():
    y_pred = pipeline.predict(X_test)  # Predictions on the test set
    y_prob = pipeline.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class
    y_probs[name] = y_prob  # Store probabilities for ROC curve
    # Calculate metrics
    performance_metrics[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_prob)
    }

# # Display the performance metrics
# for model, metrics in performance_metrics.items():
#     print(f"{model}: {metrics}")


In [None]:
# Create a DataFrame from the performance metrics
performance_df = pd.DataFrame(performance_metrics).T
performance_df

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Logistic Regression,1.0,1.0,1.0,1.0,1.0
Decision Tree,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Gradient Boosting,1.0,1.0,1.0,1.0,1.0


In [None]:
import joblib

# Assuming 'grid_search' is your GridSearchCV object and it has been run
best_model = grid_search.best_estimator_

# Save the best model to a file
joblib.dump(best_model, 'best_model.pkl')

['best_model.pkl']