In [1]:
import os

In [2]:
%pwd

'/home/tousside/Documents/recrutement/cowrywise-customer-plan-abandonment/research'

In [3]:
os.chdir("../")

In [4]:
import pandas as pd

In [5]:
data = pd.read_csv("artifacts/data_transformation/train.csv")

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2113 entries, 0 to 2112
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   gender_id                 1756 non-null   float64
 1   risk_apetite              2113 non-null   int64  
 2   fraud_score               2113 non-null   int64  
 3   monthly_expense           2113 non-null   float64
 4   type                      2113 non-null   object 
 5   total_transactions        2113 non-null   int64  
 6   total_transaction_amount  2113 non-null   float64
 7   total_withdrawn_amount    2113 non-null   float64
 8   plan_abondonment          2113 non-null   int64  
dtypes: float64(4), int64(4), object(1)
memory usage: 148.7+ KB


# set categorical variables

In [7]:
# gender_id , and type as categorical variables
categorical_cols = ["gender_id", "type"]
data[categorical_cols] = data[categorical_cols].astype(str)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2113 entries, 0 to 2112
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   gender_id                 2113 non-null   object 
 1   risk_apetite              2113 non-null   int64  
 2   fraud_score               2113 non-null   int64  
 3   monthly_expense           2113 non-null   float64
 4   type                      2113 non-null   object 
 5   total_transactions        2113 non-null   int64  
 6   total_transaction_amount  2113 non-null   float64
 7   total_withdrawn_amount    2113 non-null   float64
 8   plan_abondonment          2113 non-null   int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 148.7+ KB


In [9]:
# Step 1: Separate features and target
X = data.drop(columns="plan_abondonment")
y = data["plan_abondonment"]

# TReat non values for gender_id

In [10]:

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier  # or any other model
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression

In [11]:
categorical_cols = ["gender_id", "type"]

# get numerical variables

In [12]:
X_train = data.drop(columns="plan_abondonment")
y_train = data["plan_abondonment"]

In [13]:
numeric_cols = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()

# Pipeline

In [14]:
# Step 3: ColumnTransformer for imputing
preprocessor = ColumnTransformer(transformers=[
    ("num", SimpleImputer(strategy="median"), numeric_cols),
    ("scaler", StandardScaler(), numeric_cols),
    ("cat", SimpleImputer(strategy="most_frequent"), categorical_cols)
], remainder="passthrough")  # Keep other columns (if any)


In [15]:
# Step 4: Function to rebuild DataFrame from transformed array
def to_dataframe(X_array):
    return pd.DataFrame(X_array, columns=numeric_cols + categorical_cols)

In [16]:
def array_to_str_keyed_dict_list(X):
    """
    Converts a 2D array into a list of dictionaries
    where keys are string representations of column indices.
    """
    return [dict(zip(map(str, range(X.shape[1])), row)) for row in X]


In [17]:
# Full pipeline
pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("to_dict", FunctionTransformer(array_to_str_keyed_dict_list)),
    ("vectorize", DictVectorizer()),
    ("clf", LogisticRegression(max_iter=1000))
])

In [18]:
pipeline.fit(X_train, y_train)

# Predict
y_pred = pipeline.predict(X_train)

In [19]:
from sklearn.metrics import accuracy_score

In [20]:
test_data = pd.read_csv("artifacts/data_transformation/test.csv")
test_data[categorical_cols] = test_data[categorical_cols].astype(str)
# Predict on test data
test_X = test_data.drop(columns="plan_abondonment")
test_y = test_data["plan_abondonment"]
y_pred_test = pipeline.predict(test_X)
# Evaluate on test data
test_acc = accuracy_score(test_y, y_pred_test)
print("Test Accuracy:", test_acc)

Test Accuracy: 0.5801418439716312


In [20]:
X_train.head()

Unnamed: 0,gender_id,risk_apetite,fraud_score,monthly_expense,type,total_transactions,total_transaction_amount,total_withdrawn_amount
0,,0,0,0.0,Investment,1,5000000.0,0.0
1,2.0,1,0,40000000.0,Savings,319,111000000.0,0.0
2,1.0,2,0,100000000.0,Investment,1,0.0,0.0
3,2.0,2,0,5000000.0,Savings,27,2000000.0,0.0
4,,0,0,100000000.0,Savings,5,10000000.0,0.0


In [21]:
X_train.columns

Index(['gender_id', 'risk_apetite', 'fraud_score', 'monthly_expense', 'type',
       'total_transactions', 'total_transaction_amount',
       'total_withdrawn_amount'],
      dtype='object')

In [25]:
X_train["fraud_score"].unique()

array([ 0, 15, 10, 20, 50])

In [21]:
# Random forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
# Full pipeline
pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("to_dict", FunctionTransformer(array_to_str_keyed_dict_list)),
    ("vectorize", DictVectorizer()),
    ("clf", RandomForestClassifier(random_state=42))
])

In [24]:
pipeline.fit(X_train, y_train)

# Predict
y_pred = pipeline.predict(X_train)

In [25]:
from sklearn.metrics import accuracy_score

In [26]:
acc = accuracy_score(y_train, y_pred)
print("Accuracy:", acc)

Accuracy: 0.972077614765736


In [27]:
test_data = pd.read_csv("artifacts/data_transformation/test.csv")
test_data[categorical_cols] = test_data[categorical_cols].astype(str)
# Predict on test data
test_X = test_data.drop(columns="plan_abondonment")
test_y = test_data["plan_abondonment"]
y_pred_test = pipeline.predict(test_X)
# Evaluate on test data
test_acc = accuracy_score(test_y, y_pred_test)
print("Test Accuracy:", test_acc)

Test Accuracy: 0.8312056737588652


# xgboost classifier

In [28]:
from xgboost import XGBClassifier

In [29]:
# Full pipeline
pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("to_dict", FunctionTransformer(array_to_str_keyed_dict_list)),
    ("vectorize", DictVectorizer()),
     ('classifier', XGBClassifier())
])

In [30]:
pipeline.fit(X_train, y_train)

# Predict
y_pred = pipeline.predict(X_train)

In [31]:
from sklearn.metrics import accuracy_score

In [32]:
acc = accuracy_score(y_train, y_pred)
print("Accuracy:", acc)

Accuracy: 0.9526739233317558


In [33]:
test_data = pd.read_csv("artifacts/data_transformation/test.csv")
test_data[categorical_cols] = test_data[categorical_cols].astype(str)
# Predict on test data
test_X = test_data.drop(columns="plan_abondonment")
test_y = test_data["plan_abondonment"]
y_pred_test = pipeline.predict(test_X)
# Evaluate on test data
test_acc = accuracy_score(test_y, y_pred_test)
print("Test Accuracy:", test_acc)

Test Accuracy: 0.8411347517730496


In [34]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss, confusion_matrix, classification_report,
    average_precision_score, cohen_kappa_score
)


In [None]:
test_data = pd.read_csv("artifacts/data_transformation/test.csv")
test_data[categorical_cols] = test_data[categorical_cols].astype(str)
# Predict on test data
test_X = test_data.drop(columns="plan_abondonment")
test_y = test_data["plan_abondonment"]
y_pred_test = pipeline.predict(test_X)
# Evaluate on test data

In [36]:
from sklearn.metrics import classification_report, roc_auc_score

print(classification_report(test_y, y_pred_test))
y_proba = pipeline.predict_proba(test_X)[:, 1]  # Get probabilities for the positive class
print("ROC AUC:", roc_auc_score(test_y, y_proba))  # use y_proba for probabilistic output


              precision    recall  f1-score   support

           0       0.79      0.83      0.81       289
           1       0.88      0.85      0.86       416

    accuracy                           0.84       705
   macro avg       0.84      0.84      0.84       705
weighted avg       0.84      0.84      0.84       705

ROC AUC: 0.913794250731967


# modular codes

In [64]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    model_params: dict
    target_column: str

In [65]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [66]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        model_params = self.params.XGBoost
        schema =  self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            train_data_path = config.train_data_path,
            test_data_path = config.test_data_path,
            model_name = config.model_name,
            model_params = model_params,
            target_column = schema.name
            
        )

        return model_trainer_config

In [67]:

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier  # or any other model
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression
import joblib

In [70]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config
    def array_to_str_keyed_dict_list(self, X):
        """
        Converts a 2D array into a list of dictionaries
        where keys are string representations of column indices.
        """
        return [dict(zip(map(str, range(X.shape[1])), row)) for row in X]

    
    def train(self):
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)


        train_x = train_data.drop([self.config.target_column], axis=1)
        test_x = test_data.drop([self.config.target_column], axis=1)
        train_y = train_data[[self.config.target_column]]
        test_y = test_data[[self.config.target_column]]
        
        # gender_id , and type as categorical variables
        categorical_cols = ["gender_id", "type"]
        train_x[categorical_cols] = train_x[categorical_cols].astype(str)
        
        numeric_cols = train_x.select_dtypes(include=["int64", "float64"]).columns.tolist()
        
        # Step 3: ColumnTransformer for imputing
        preprocessor = ColumnTransformer(transformers=[
            ("num", SimpleImputer(strategy="median"), numeric_cols),
            ("scaler", StandardScaler(), numeric_cols),
            ("cat", SimpleImputer(strategy="most_frequent"), categorical_cols)
        ], remainder="passthrough")  # Keep other columns (if any)

        pipeline = Pipeline([
            ("preprocessing", preprocessor),
            ("to_dict", FunctionTransformer(self.array_to_str_keyed_dict_list)),
            ("vectorize", DictVectorizer()),
            ('classifier', XGBClassifier(**self.config.model_params, random_state=42))
        ])

        
        pipeline.fit(train_x, train_y)

        joblib.dump(pipeline, os.path.join(self.config.root_dir, self.config.model_name))



In [71]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer_config = ModelTrainer(config=model_trainer_config)
    model_trainer_config.train()
except Exception as e:
    raise e

[2025-05-29 11:44:25,611: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-05-29 11:44:25,612: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-29 11:44:25,614: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-05-29 11:44:25,615: INFO: common: created directory at: artifacts]
[2025-05-29 11:44:25,615: INFO: common: created directory at: artifacts/model_trainer]
