# Loading The Data


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import os

trx_data= pd.read_csv('../data/trx_data.csv')
profile = pd.read_csv('../data/profile.csv')
train_label = pd.read_csv('../data/train_label.csv')

# **GenAI Integration Paper Project - CHATGPT and GITHUB Copilot**

# **Utilising CHATGPT**

## Data Analysis

In [2]:
print('Transaction data \n')
print(trx_data.info())

print('\nProfile data \n')
print(profile.info())

print('\nTrain label data \n')
print(train_label.info())

Transaction data 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2300515 entries, 0 to 2300514
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   transaction_time  object 
 1   user_id           int64  
 2   transaction_type  object 
 3   asset_type        object 
 4   gtv               float64
dtypes: float64(1), int64(1), object(3)
memory usage: 87.8+ MB
None

Profile data 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188427 entries, 0 to 188426
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   user_id                188427 non-null  int64  
 1   mobile_brand_name      188427 non-null  object 
 2   mobile_marketing_name  188427 non-null  object 
 3   age_in_year            188427 non-null  float64
 4   gender_name            188425 non-null  object 
 5   marital_status         84198 non-null   object 
 6   education_background   84198 

In [None]:
#Checking and validating The number of user_id

def checkuserid(dataframe):
    dfname = [name for name, obj in globals().items() if obj is dataframe]
    distinct_count = dataframe['user_id'].nunique()
    print(f"There are {distinct_count} user id in {dfname}")
    if dataframe['user_id'].nunique() == len(dataframe):
        print(f"No duplicated user_id found in {dfname}\n")
    else:
        print(f"There are duplicated user_id in {dfname}\n")

checkuserid(trx_data)
checkuserid(profile)
checkuserid(train_label)

There are 188427 user id in ['trx_data']
There are duplicated user_id in ['trx_data']

There are 188427 user id in ['profile']
No duplicated user_id found in ['profile']

There are 150741 user id in ['train_label']
No duplicated user_id found in ['train_label']



## Data Pre-processing & Feature Engineering

### Processing Transaction Data (trx_data)

In [None]:
# Assuming trx_data is a DataFrame containing transaction data

# Convert transaction_time to datetime if it's not already
trx_data['transaction_time'] = pd.to_datetime(trx_data['transaction_time'])

# Step 1: First Transaction (earliest)
first_transaction = trx_data.groupby('user_id')['transaction_time'].min().reset_index(name='first_transaction')

# Step 2: Last Transaction (latest)
last_transaction = trx_data.groupby('user_id')['transaction_time'].max().reset_index(name='last_transaction')

# Step 3: Recency of Transaction (difference between current date and last transaction)
current_date = pd.to_datetime('today')  # Get today's date
recency_of_transaction = pd.merge(first_transaction, last_transaction, on='user_id')
recency_of_transaction['recency_of_transaction'] = (current_date - recency_of_transaction['last_transaction']).dt.days

# Step 4: GTV Count
gtv_count = trx_data.groupby('user_id')['gtv'].count().reset_index(name='gtv_count')

# Step 5: GTV Max
gtv_max = trx_data.groupby('user_id')['gtv'].max().reset_index(name='gtv_max')

# Step 6: GTV Sum
gtv_sum = trx_data.groupby('user_id')['gtv'].sum().reset_index(name='gtv_sum')

# Step 7: GTV Mean
gtv_mean = trx_data.groupby('user_id')['gtv'].mean().reset_index(name='gtv_mean')

# Step 8: GTV Standard Deviation
gtv_std_dev = trx_data.groupby('user_id')['gtv'].std().reset_index(name='gtv_std_dev')
gtv_std_dev['gtv_std_dev'].fillna(0, inplace=True)

user_asset_type = trx_data[['user_id', 'asset_type']].drop_duplicates()
user_asset_type_ohe = pd.get_dummies(user_asset_type, columns=['asset_type'], prefix='asset_type')
user_asset_type_ohe.replace({True: 1, False:0}, inplace=True)
user_asset_type_ohe = user_asset_type_ohe.groupby('user_id').sum().reset_index()

#Merge all features together
final_trx_data = pd.merge(first_transaction, last_transaction, on='user_id')
final_trx_data = pd.merge(final_trx_data, recency_of_transaction[['user_id', 'recency_of_transaction']], on='user_id')
final_trx_data = pd.merge(final_trx_data, gtv_count, on='user_id')
final_trx_data = pd.merge(final_trx_data, gtv_max, on='user_id')
final_trx_data = pd.merge(final_trx_data, gtv_sum, on='user_id')
final_trx_data = pd.merge(final_trx_data, gtv_mean, on='user_id')
final_trx_data = pd.merge(final_trx_data, gtv_std_dev, on='user_id')
final_trx_data = pd.merge(final_trx_data, user_asset_type_ohe, on='user_id')

# Drop any duplicate columns (if necessary)
final_trx_data = final_trx_data.loc[:, ~final_trx_data.columns.duplicated()]

#The final processed transaction data
final_trx_data

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  gtv_std_dev['gtv_std_dev'].fillna(0, inplace=True)
  user_asset_type_ohe.replace({True: 1, False:0}, inplace=True)


Unnamed: 0,user_id,first_transaction,last_transaction,recency_of_transaction,gtv_count,gtv_max,gtv_sum,gtv_mean,gtv_std_dev,asset_type_crypto,asset_type_fx,asset_type_gold,asset_type_gss,asset_type_idss,asset_type_mfund,asset_type_stock_index
0,0,2022-04-04 13:47:11.664,2022-04-04 14:37:23.163,1083,3,17.848214,43.444643,14.481548,4.846524,1,0,1,0,0,0,0
1,1,2022-04-12 19:46:00.172,2022-05-29 08:26:49.429,1029,9,1679.078571,2438.003571,270.889286,528.277342,1,0,1,0,0,0,0
2,2,2022-05-27 19:26:01.962,2022-05-27 19:26:01.962,1030,1,162.785714,162.785714,162.785714,0.000000,0,0,1,0,0,0,0
3,3,2022-04-10 09:20:43.624,2022-05-09 09:55:17.403,1049,6,714.282143,3389.005357,564.834226,122.742497,1,1,0,1,0,0,0
4,4,2022-04-10 00:12:52.696,2022-04-13 16:52:29.426,1074,3,26.750000,59.955357,19.985119,5.858902,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188422,188422,2022-05-31 22:26:30.220,2022-05-31 22:26:30.220,1026,1,17.823214,17.823214,17.823214,0.000000,1,0,0,0,0,0,0
188423,188423,2022-05-31 22:03:27.902,2022-05-31 22:10:11.877,1026,2,1789.282143,3555.123214,1777.561607,16.575341,1,0,0,0,0,0,0
188424,188424,2022-05-31 22:44:52.030,2022-05-31 22:44:52.030,1026,1,877.842857,877.842857,877.842857,0.000000,0,0,1,0,0,0,0
188425,188425,2022-05-31 23:59:23.619,2022-05-31 23:59:23.619,1026,1,357.142857,357.142857,357.142857,0.000000,1,0,0,0,0,0,0


In [None]:
print(final_trx_data.count())

user_id                   188427
first_transaction         188427
last_transaction          188427
recency_of_transaction    188427
gtv_count                 188427
gtv_max                   188427
gtv_sum                   188427
gtv_mean                  188427
gtv_std_dev               188427
asset_type_crypto         188427
asset_type_fx             188427
asset_type_gold           188427
asset_type_gss            188427
asset_type_idss           188427
asset_type_mfund          188427
asset_type_stock_index    188427
dtype: int64


### Merging training label, user profile, and processed transaction data

In [None]:
merged_data = pd.merge(train_label,profile, on='user_id')
merged_data = pd.merge(merged_data,final_trx_data, on='user_id')

# Fill NaN in 'gender_name' with the most frequent value (mode)
mode_value = merged_data['gender_name'].mode()[0]
merged_data['gender_name'].fillna(mode_value, inplace=True)
encoder = LabelEncoder()
merged_data['gender_name'] = encoder.fit_transform(merged_data['gender_name'])


# Mobile Brand Name in Test and train data are not consistent, difficult to be used as the features, and the feature importance is considered low, hence we need to drop this
# # One-Hot Encoding for 'mobile_brand_name'
# mobile_brand_ohe = pd.get_dummies(merged_data['mobile_brand_name'], prefix='mobile_brand')
# mobile_brand_ohe.replace({True: 1, False:0}, inplace=True)

# # Merge with original DataFrame
# merged_data = pd.concat([merged_data, mobile_brand_ohe], axis=1)

#Merged Data to be trained
merged_data = merged_data.drop(['first_transaction','last_transaction','marital_status','education_background','income_level','occupation','mobile_marketing_name','mobile_brand_name'],axis = 1)
merged_data

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['gender_name'].fillna(mode_value, inplace=True)


Unnamed: 0,user_id,tgt,age_in_year,gender_name,recency_of_transaction,gtv_count,gtv_max,gtv_sum,gtv_mean,gtv_std_dev,asset_type_crypto,asset_type_fx,asset_type_gold,asset_type_gss,asset_type_idss,asset_type_mfund,asset_type_stock_index
0,92167,0,30.0,1,1041,4,1799.612500,1933.312500,483.328125,877.641633,1,0,1,0,0,0,0
1,9930,0,25.0,0,1058,1,16277.750000,16277.750000,16277.750000,0.000000,0,0,1,0,0,0,0
2,176146,0,44.0,0,1034,2,1800.960714,1834.007143,917.003571,1250.104180,0,0,1,0,0,0,0
3,54961,0,27.0,1,1068,1,1690.894643,1690.894643,1690.894643,0.000000,0,0,1,0,0,0,0
4,49665,1,19.0,1,1026,12,2265.935714,8519.478571,709.956548,863.119378,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150736,61841,0,23.0,1,1042,9,557.112500,1407.669643,156.407738,218.282598,1,1,0,0,0,0,0
150737,118260,0,54.0,1,1027,7,1777.976786,1983.362500,283.337500,659.479922,1,0,1,0,0,0,0
150738,151792,0,53.0,1,1042,4,35.714286,69.850000,17.462500,12.652935,1,0,1,0,0,0,0
150739,85382,0,37.0,0,1071,2,17.855357,26.923214,13.461607,6.213701,1,0,0,0,0,0,0


### Train data set Analysis

In [None]:
# Drop 'user_id' since it's just an identifier (not a feature)
features = merged_data.drop(columns=['user_id', 'tgt'])

# Target variable
target = merged_data['tgt']

# Split data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [None]:
import numpy as np
from collections import Counter

print("Train Labels Distribution:", Counter(y_train))
print("Test Labels Distribution:", Counter(y_test))

Train Labels Distribution: Counter({0: 87557, 1: 33035})
Test Labels Distribution: Counter({0: 21759, 1: 8390})


Train dataset is imbalanced (0s: 72.6%, 1s: 27.4%).

### Due to imbalance training dataset, increase the number of class 1 samples using SMOTE (Synthetic Minority Over-sampling Technique

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [None]:
print("Train Labels Distribution:", Counter(y_train_resampled))
print("Test Labels Distribution:", Counter(y_test))

Train Labels Distribution: Counter({1: 87557, 0: 87557})
Test Labels Distribution: Counter({0: 21759, 1: 8390})


## Checking Feature Importance via RandomForest Model Training

In [None]:


# Train RandomForest to get feature importance
model = RandomForestClassifier(n_estimators=300, random_state=42)
model.fit(X_train, y_train)

# Get feature importance
feature_importance = pd.DataFrame({'Feature': features.columns, 'Importance': model.feature_importances_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display top features
feature_importance


Unnamed: 0,Feature,Importance
2,recency_of_transaction,0.174719
5,gtv_sum,0.155643
4,gtv_max,0.13788
7,gtv_std_dev,0.126062
6,gtv_mean,0.124274
0,age_in_year,0.121336
3,gtv_count,0.106543
1,gender_name,0.013014
10,asset_type_gold,0.009819
9,asset_type_fx,0.007451


### Evaluate ROC AUC Score for Predicted probability from Random Forest Model

In [None]:
from sklearn.metrics import roc_auc_score

# Predict probabilities for the positive class (1)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Compute ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"ROC AUC Score: {roc_auc:.4f}")


ROC AUC Score: 0.8406


##Test Data Pre-Processing to be Fit into the Trained Model

In [None]:
test_data = pd.merge(profile,final_trx_data, on='user_id')

test_user_ids = test_data['user_id']
train_user_ids = train_label['user_id']

#Find the user_ids in test_data that are not in train_label
user_ids_not_in_train = test_user_ids[~test_user_ids.isin(train_user_ids)]

# Filter the test_data to include only these user_ids
test_data = test_data[test_data['user_id'].isin(user_ids_not_in_train)]


# Fill NaN in 'gender_name' with the most frequent value (mode).
mode_value = test_data['gender_name'].mode()[0]
test_data['gender_name'].fillna(mode_value, inplace=True)
encoder = LabelEncoder()
test_data['gender_name'] = encoder.fit_transform(test_data['gender_name'])

#Droo the features to match the train dataset for prediction using the trained model
test_data = test_data.drop(['first_transaction','last_transaction','marital_status','education_background','income_level','occupation','mobile_marketing_name','mobile_brand_name'],axis = 1)

test_data.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['gender_name'].fillna(mode_value, inplace=True)


Unnamed: 0,0
user_id,0
age_in_year,0
gender_name,0
recency_of_transaction,0
gtv_count,0
gtv_max,0
gtv_sum,0
gtv_mean,0
gtv_std_dev,0
asset_type_crypto,0


## Predict Probability Using RandomForest Model

In [None]:
# Drop 'user_id' since it's just an identifier (not a feature)
test_features = test_data.drop(columns=['user_id'])

# Make predictions using the trained model (class probabilities)
pred_prob = model.predict_proba(test_features)

# Make predictions using the trained model
test_data['pred_prob'] = pred_prob[:, 1]

test_data[['user_id', 'pred_prob']].to_csv('predictions.csv', index=False)

In [None]:
test_data[['user_id', 'pred_prob']]

Unnamed: 0,user_id,pred_prob
7,77148,0.846667
9,161575,0.036667
11,178165,0.003333
12,46124,0.706667
13,65175,0.296667
...,...,...
188398,46436,0.396667
188401,17542,0.250000
188402,188268,0.610000
188403,68016,0.556667


## Exploring XGBOOST Model for better roc_auc_score

In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# Convert to DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set parameters to use GPU
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'gpu_hist',  # Use GPU
    'gpu_id': 0  # Specify the GPU to use
}

# Train the model
model = xgb.train(params, dtrain, num_boost_round=100)

# Make predictions
y_pred_proba_xgb = model.predict(dtest)
roc_auc_xgb = roc_auc_score(y_test, y_pred_proba_xgb)

print(f"Optimized ROC AUC Score (XGBoost GPU): {roc_auc_xgb:.4f}")



    E.g. tree_method = "hist", device = "cuda"



Optimized ROC AUC Score (XGBoost GPU): 0.8462



    E.g. tree_method = "hist", device = "cuda"



In [None]:
# Drop 'user_id' since it's just an identifier (not a feature), drop 'pred_prob' from previous model prediction
test_features = test_data.drop(columns=['user_id','pred_prob'])

# Convert test_features to DMatrix
dtest_features = xgb.DMatrix(test_features)  # Convert to DMatrix

# Make predictions using the trained model (class probabilities)
pred_prob = model.predict(dtest_features) # Use dtest_features instead of test_features


# Make predictions using the trained model
test_data['pred_prob'] = pred_prob #[:, 1]  Remove slicing as pred_prob is already 1D

test_data[['user_id', 'pred_prob']].to_csv('new_predictions.csv', index=False)

In [None]:
test_data[['user_id', 'pred_prob']]

Unnamed: 0,user_id,pred_prob
7,77148,0.475010
9,161575,0.056450
11,178165,0.033531
12,46124,0.693181
13,65175,0.311825
...,...,...
188398,46436,0.326788
188401,17542,0.379281
188402,188268,0.679229
188403,68016,0.340383


#**Utilising GITHUB COPILOT**

##Training Dataset

In [None]:
# Drop 'user_id' since it's just an identifier (not a feature)
features = merged_data.drop(columns=['user_id', 'tgt'])

# Target variable
target = merged_data['tgt']

# Split data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

##Models Class

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

class LogisticRegressionModel:
    def __init__(self):
        self.coefficients = None
        self.intercept = None

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y, learning_rate=0.01, num_iterations=1000):
        num_samples, num_features = X.shape
        self.coefficients = np.zeros(num_features)
        self.intercept = 0

        for _ in range(num_iterations):
            linear_model = np.dot(X, self.coefficients) + self.intercept
            y_predicted = self.sigmoid(linear_model)

            # Gradient descent
            dw = (1 / num_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / num_samples) * np.sum(y_predicted - y)

            self.coefficients -= learning_rate * dw
            self.intercept -= learning_rate * db
    def predict_proba(self, X):
      """
      Predict probabilities for the input data.
      """
      linear_model = np.dot(X, self.coefficients) + self.intercept
      probabilities = self.sigmoid(linear_model)
      return probabilities

    def predict(self, X):
        linear_model = np.dot(X, self.coefficients) + self.intercept
        y_predicted = self.sigmoid(linear_model)
        y_predicted_class = [1 if i > 0.5 else 0 for i in y_predicted]
        return y_predicted_class


class RandomForestModel:
    def __init__(self, n_estimators=100, random_state=42):
        self.model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    def fit(self, X, y):
        self.model.fit(X, y)

    def predict_proba(self, X):
        return self.model.predict_proba(X)[:, 1]  # Return probabilities for the positive class

    def predict(self, X):
        return self.model.predict(X)


class XGBoostModel:
    def __init__(self, learning_rate=0.1, n_estimators=100, max_depth=3, random_state=42):
        self.model = XGBClassifier(learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth, random_state=random_state, use_label_encoder=False, eval_metric='auc')

    def fit(self, X, y):
        self.model.fit(X, y)

    def predict_proba(self, X):
        return self.model.predict_proba(X)[:, 1]  # Return probabilities for the positive class

    def predict(self, X):
        return self.model.predict(X)

## Evaluate Model Function


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model, X_test, y_test):
    # Make predictions
    predictions = model.predict(X_test)

    # Check if the model supports probability predictions for ROC AUC
    if hasattr(model, "predict_proba"):
        probabilities = model.predict_proba(X_test)  # Get probabilities for the positive class
        roc_auc = roc_auc_score(y_test, probabilities)
    else:
        roc_auc = None  # ROC AUC cannot be calculated if probabilities are not available

    # Calculate metrics
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    # Create a results dictionary
    results = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc
    }

    return results

## Train Logistic Regression Model


In [None]:
import joblib

# Logistic Regression
logistic_model = LogisticRegressionModel()
logistic_model.fit(X_train, y_train)
logistic_predictions = logistic_model.predict(X_test)
logistic_results = evaluate_model(logistic_model, X_test, y_test)
print("Logistic Regression Results:", logistic_results)
# joblib.dump(logistic_model,'../models/logistic_regression_model.pkl')


  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  return 1

Logistic Regression Results: {'accuracy': 0.7252645195528873, 'precision': 0.7420814479638009, 'recall': 0.019547079856972585, 'f1_score': 0.038090814075020324, 'roc_auc': np.float64(0.5086687732847219)}


### Train Random Forest Model

In [None]:
# Random Forest
rf_model = RandomForestModel(n_estimators=300)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_results = evaluate_model(rf_model, X_test, y_test)
print("Random Forest Results:", rf_results)
# joblib.dump(rf_model,'../models/random_forest_model.pkl')

Random Forest Results: {'accuracy': 0.8032438886861919, 'precision': 0.693117536140792, 'recall': 0.5257449344457688, 'f1_score': 0.5979395418191676, 'roc_auc': np.float64(0.8406442615144634)}


### Train XGBoost Model

In [None]:
# XGBoost
xgb_model = XGBoostModel(learning_rate=0.1, n_estimators=300)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
xgb_results = evaluate_model(xgb_model, X_test, y_test)
print("XGBoost Results:", xgb_results)
# joblib.dump(xgb_model,'../models/xgboost_model.pkl')

Parameters: { "use_label_encoder" } are not used.



XGBoost Results: {'accuracy': 0.8027795283425653, 'precision': 0.7549019607843137, 'recall': 0.43134684147794994, 'f1_score': 0.548998786407767, 'roc_auc': np.float64(0.8357610986228432)}


##Models Evaluation  Result

In [None]:
eval = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],
    'Accuracy': [logistic_results['accuracy'], rf_results['accuracy'], xgb_results['accuracy']],
    'Precision': [logistic_results['precision'], rf_results['precision'], xgb_results['precision']],
    'Recall': [logistic_results['recall'], rf_results['recall'], xgb_results['recall']],
    'f1_score': [logistic_results['f1_score'], rf_results['f1_score'], xgb_results['f1_score']],
    'ROC AUC': [logistic_results['roc_auc'], rf_results['roc_auc'], xgb_results['roc_auc']]
})

eval

Unnamed: 0,Model,Accuracy,Precision,Recall,f1_score,ROC AUC
0,Logistic Regression,0.725265,0.742081,0.019547,0.038091,0.508669
1,Random Forest,0.803244,0.693118,0.525745,0.59794,0.840644
2,XGBoost,0.80278,0.754902,0.431347,0.548999,0.835761


## Predict Model

In [None]:
test_data = pd.merge(profile,final_trx_data, on='user_id')

test_user_ids = test_data['user_id']
train_user_ids = train_label['user_id']

#Find the user_ids in test_data that are not in train_label
user_ids_not_in_train = test_user_ids[~test_user_ids.isin(train_user_ids)]

# Filter the test_data to include only these user_ids
test_data = test_data[test_data['user_id'].isin(user_ids_not_in_train)]


# Fill NaN in 'gender_name' with the most frequent value (mode).
mode_value = test_data['gender_name'].mode()[0]
test_data['gender_name'].fillna(mode_value, inplace=True)
encoder = LabelEncoder()
test_data['gender_name'] = encoder.fit_transform(test_data['gender_name'])

#Droo the features to match the train dataset for prediction using the trained model
test_data = test_data.drop(['first_transaction','last_transaction','marital_status','education_background','income_level','occupation','mobile_marketing_name','mobile_brand_name'],axis = 1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['gender_name'].fillna(mode_value, inplace=True)


In [None]:
test_data

Unnamed: 0,user_id,age_in_year,gender_name,recency_of_transaction,gtv_count,gtv_max,gtv_sum,gtv_mean,gtv_std_dev,asset_type_crypto,asset_type_fx,asset_type_gold,asset_type_gss,asset_type_idss,asset_type_mfund,asset_type_stock_index,pred_proba
7,77148,47.0,1,1044,7,607.878571,2938.273214,419.753316,241.635548,1,1,1,1,0,0,0,0.0
9,161575,51.0,0,1041,3,1796.391071,3610.673214,1203.557738,1019.125713,1,0,0,0,0,0,0,0.0
11,178165,44.0,0,1032,4,1787.500000,3613.912500,903.478125,1012.446800,1,0,0,0,0,0,0,0.0
12,46124,51.0,0,1029,24,311.221429,1893.830357,78.909598,74.748788,1,0,0,0,0,0,0,0.0
13,65175,48.0,0,1060,10,178.573214,527.569643,52.756964,49.896235,1,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188398,46436,43.0,0,1034,4,1001.630357,2003.260714,500.815179,381.580992,1,0,0,0,0,0,0,0.0
188401,17542,43.0,1,1067,3,694.628571,1836.580357,612.193452,72.260654,1,0,0,0,0,0,0,0.0
188402,188268,43.0,1,1026,2,624.992857,982.132143,491.066071,189.401077,1,0,0,0,0,0,0,0.0
188403,68016,43.0,1,1042,36,5365.917857,50924.210714,1414.561409,1131.177359,1,0,0,0,0,0,0,0.0


In [None]:
import pandas as pd
import joblib  # For loading the saved model

X_test = test_data.drop(['user_id','pred_proba'],axis = 1)

# Load the trained modela
# logistic_model = joblib.load('../models/logistic_regression_model.pkl')
# rf_model = joblib.load('../models/random_forest_model.pkl')
# xgb_model = joblib.load('../models/xgboost_model.pkl')

# print("Model loaded successfully!")

# Make predictions
logistic_model_predictions = logistic_model.predict_proba(X_test)
rf_model_predictions = rf_model.predict_proba(X_test)
xgb_model_predictions = xgb_model.predict_proba(X_test)





  return 1 / (1 + np.exp(-z))


## Save predictions to a CSV file

In [None]:
#Logistic_model_prediction
test_data['pred_proba'] = logistic_model_predictions
test_data[['user_id', 'pred_proba']].to_csv('data/test_logistic_model_predictions.csv', index=False)

In [None]:
#rf_model_prediction
test_data['pred_proba'] = rf_model_predictions
test_data[['user_id', 'pred_proba']].to_csv('data/test_rf_model_predictions.csv', index=False)

In [None]:
#xgb_model_prediction
test_data['pred_proba'] = xgb_model_predictions
test_data[['user_id', 'pred_proba']].to_csv('data/test_xgb_model_predictions.csv', index=False)