## Import library

In [75]:
# !pip install imbalanced-learn

In [76]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, r2_score as r2, RocCurveDisplay
from sklearn.metrics import f1_score, roc_auc_score, recall_score, accuracy_score, precision_score, roc_curve, precision_recall_curve, auc
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, StackingRegressor
from xgboost import XGBClassifier
from collections import Counter
import time
import datetime
import pytz
import h5py
import joblib

desired_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import random
random.seed(42)

## Define Variables

In [77]:
cols = ['avg_sent_time',
 'sent',
 'time_difference_mins',
 'avg_received_time',
 'total_txs',
 'avg_gas_fee',
 'unique_sent_addresses',
 'received',
 'unique_received_addresses', 'total_eth_received']

In [78]:
data_path = "/content/drive/MyDrive/Colab Notebooks/Dataset/address_data_combined.csv"

## Load Data

In [79]:
df = pd.read_csv(data_path, usecols = cols + ['address', 'flag'])

In [80]:
X_sample = df.drop(columns=['address', 'flag'])
y_sample = df['flag']

In [81]:
X_sample = X_sample.round(3)

## Resampling

In [83]:
counter = Counter(y_sample)
print(counter)

Counter({1: 94145, 0: 10643})


In [84]:
oversample = SMOTE(random_state=42)
X_resample, y_resample = oversample.fit_resample(X_sample, y_sample)

In [85]:
counter = Counter(y_resample)
print(counter)

Counter({1: 94145, 0: 94145})


## Training and Testing

### Split train and test data

In [86]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X_resample, y_resample, test_size=0.3, random_state=42, stratify=y_resample)

In [87]:
counter = Counter(y_train_full)
print(counter)

Counter({1: 65902, 0: 65901})


In [88]:
scaler = MinMaxScaler()

In [89]:
# Log transformation for skewed data and scaling
for c in cols:
    # Log transformation
    X_train_full[c] = X_train_full[c].apply(lambda x: np.log(x) if x > 0 else 0)
    X_test[c] = X_test[c].apply(lambda x: np.log(x) if x > 0 else 0)

# Scaling using only the training data to avoid data leakage
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

In [90]:
# Save the scaler for future use
joblib.dump(scaler, "/content/drive/MyDrive/Colab Notebooks/Dataset/scaler.pkl")

['/content/drive/MyDrive/Colab Notebooks/Dataset/scaler.pkl']

In [91]:
np.isnan(X_train_full_scaled).any()

False

In [92]:
X_train_full_scaled.shape

(131803, 10)

In [93]:
X_train_full.head()

Unnamed: 0,avg_sent_time,avg_received_time,time_difference_mins,sent,received,unique_received_addresses,unique_sent_addresses,avg_gas_fee,total_txs,total_eth_received
76630,9.023762,0.0,10.815521,1.791759,0.0,0.0,1.098612,-5.809143,1.791759,0.0
80150,6.565472,9.501068,12.881126,6.12905,1.609438,0.693147,3.850148,-4.961845,6.139885,5.447819
147118,11.724262,0.0,12.417409,0.693147,0.0,0.0,0.693147,-5.708238,0.693147,0.0
33192,10.399191,0.0,11.092338,0.693147,0.0,0.0,0.693147,-6.907755,1.098612,-0.742337
117319,10.930019,10.582379,12.414507,1.098612,0.693147,0.0,0.0,-7.670626,1.609438,-0.90634


In [94]:
X_train_full_scaled[0]

array([0.78521426, 0.40070596, 0.8024074 , 0.18092298, 0.        ,
       0.        , 0.11098578, 0.64006254, 0.18092298, 0.42210334])

In [95]:
X_train_full_scaled = np.round(X_train_full_scaled, 3)

In [96]:
X_train_full_scaled[0]

array([0.785, 0.401, 0.802, 0.181, 0.   , 0.   , 0.111, 0.64 , 0.181,
       0.422])

In [97]:
X_test_scaled[0]

array([0.29434402, 0.94440226, 0.85241455, 0.        , 0.        ,
       0.        , 0.        , 0.47758638, 0.06999056, 0.31629774])

In [98]:
X_test_scaled = np.round(X_test_scaled, 3)

In [99]:
X_test_scaled[0]

array([0.294, 0.944, 0.852, 0.   , 0.   , 0.   , 0.   , 0.478, 0.07 ,
       0.316])

### Config model

In [100]:
xgb_params = {'learning_rate': 0.05,
              'max_depth': 8,
              'n_estimators': 1000}

### Functions

In [101]:
def get_model():
    xgb = XGBClassifier(**xgb_params)
    return xgb

In [102]:
def evaluate_model(model, X, y):
    cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=42)
    scores = cross_val_score(model, X, y, scoring='f1', cv=cv, n_jobs=-1)
    return scores

In [103]:
def training_model(model, X, y):
    current_time = time.time()
    model.fit(X_train_full_scaled, y_train_full)
    training_time = time.time() - current_time

    current_time = time.time()
    y_pred = model.predict(X_test_scaled)
    prediction_time = time.time() - current_time

    return y_pred, training_time, prediction_time

### Training

In [104]:
xgb = get_model()

In [105]:
results= list()

scores = evaluate_model(xgb, X_train_full_scaled, y_train_full)
results.append(scores)

print('%.3f (%.3f)' % (np.mean(scores), np.std(scores)))

0.969 (0.001)


In [106]:
results

[array([0.9689123 , 0.97147092, 0.96994374, 0.96719482, 0.96799765])]

In [107]:
y_pred, training_time, prediction_time = training_model(xgb, X_train_full_scaled, y_train_full)

In [108]:
metrics_dict = {
        'Accuracy': accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'F1': f1_score,
        'ROC-AUC': roc_auc_score
    }

xgb_metrics = {metric: func(y_test, y_pred) for metric, func in metrics_dict.items()}
xgb_metrics["training_time"] = training_time
xgb_metrics["prediction_time"] = prediction_time

### Result Model

In [109]:
xgb_metrics

{'Accuracy': 0.9682227769221237,
 'Precision': 0.9454625075793304,
 'Recall': 0.9937683673830684,
 'F1': 0.9690137927462928,
 'ROC-AUC': 0.9682232291525171,
 'training_time': 18.985010385513306,
 'prediction_time': 2.606093168258667}

In [110]:
results_path = "/content/drive/MyDrive/Colab Notebooks/Dataset/model_results.h5"
with h5py.File(results_path, "w") as h5file:
    h5file.create_dataset("y_pred", data=y_pred)
    h5file.create_dataset("y_test", data=y_test)
    for metric, value in xgb_metrics.items():
        h5file.attrs[metric] = value

In [111]:
# print(f"Model results saved to {results_path}")
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.94      0.97     28244
           1       0.95      0.99      0.97     28243

    accuracy                           0.97     56487
   macro avg       0.97      0.97      0.97     56487
weighted avg       0.97      0.97      0.97     56487



In [112]:
joblib.dump(xgb, "/content/drive/MyDrive/Colab Notebooks/Dataset/xgb_model.pkl")

['/content/drive/MyDrive/Colab Notebooks/Dataset/xgb_model.pkl']

In [125]:
for value in xgb.predict(np.array([[0.294, 0.944, 0.853, 0., 0., 0., 0., 0.509, 0.070, 0.654]])):
    print(value)

0


from flask import Flask, jsonify, request
# Set up Flask API
app = Flask(__name__)

@app.route('/metrics', methods=['GET'])
def get_metrics():
    with h5py.File(results_path, "r") as h5file:
        metrics = {key: h5file.attrs[key] for key in h5file.attrs.keys()}
    return jsonify(metrics)

@app.route('/predictions', methods=['GET'])
def get_predictions():
    with h5py.File(results_path, "r") as h5file:
        y_pred = np.array(h5file['y_pred'])
        y_test = np.array(h5file['y_test'])
    predictions = {"y_pred": y_pred.tolist(), "y_test": y_test.tolist()}
    return jsonify(predictions)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.json
    input_df = pd.DataFrame(data, index=[0])
    
    # Load the scaler and model
    scaler = joblib.load("scaler.pkl")
    model = joblib.load("xgb_model.pkl")
    
    # Log transformation for skewed data and scaling
    for c in cols:
        input_df[c] = input_df[c].apply(lambda x: np.log(x) if x > 0 else 0)
    
    input_scaled = scaler.transform(input_df)
    prediction = model.predict(input_scaled)
    
    return jsonify({"prediction": int(prediction[0])})

if __name__ == '__main__':
    app.run(debug=True)

{
    "avg_sent_time": 0.5,
    "sent": 100,
    "time_difference_mins": 10,
    "avg_received_time": 0.3,
    "total_txs": 200,
    "avg_gas_fee": 0.0005,
    "unique_sent_addresses": 10,
    "received": 150,
    "unique_received_addresses": 20,
    "total_eth_received": 5.0
}