## Import Libraries

### Connect Drive

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

### Install Package

In [None]:
# !pip install scikeras==0.12

Collecting scikeras==0.12
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0


In [None]:
# !pip install imbalanced-learn



### Import Package

In [None]:
import time

import keras
import lightgbm as lgb
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from keras.constraints import max_norm as MaxNorm
from keras.models import Sequential
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import f1_score, precision_score, confusion_matrix, roc_auc_score, recall_score, accuracy_score
from sklearn.metrics import roc_curve, auc, classification_report, r2_score as r2, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.model_selection import KFold, RepeatedKFold, GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, PolynomialFeatures
from tqdm import tqdm


# Render matplotlib figures inline in the notebook output
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Default figure size for plots, unless a later cell overrides it
matplotlib.rcParams['figure.figsize'] = (11.0, 8.0)

# Seeds Python's built-in `random` module only; no NumPy seed is set in this cell
import random
random.seed(42)

# Silences every warning category for the rest of the session
import warnings
warnings.filterwarnings("ignore")

## Define Variables

In [None]:
# CSV file read by pd.read_csv in the "Load Data" section.
# NOTE(review): this is an absolute Google Drive path, while the drive.mount cell at the
# top of the notebook is commented out — confirm Drive is mounted before running.
data_path = "/content/drive/MyDrive/Colab Notebooks/Data/address_data_combined.csv"

In [None]:
# Feature columns to load from the CSV; the same list is log-transformed before scaling.
# NOTE(review): 'max value received ' ends with a space — presumably it mirrors the CSV header; verify.
columns = ['Avg min between sent tnx', 'Avg min between received tnx',
       'Time Diff between first and last (Mins)', 'Number of Tnx Error',
       'Unique Received From Addresses', 'min value received',
       'max value received ', 'avg val received', 'min val sent',
       'avg val sent', 'avg gas fee', 'total transactions (including tnx to create contract)',
       'total ether received', 'total ether balance']

## Load Data

In [None]:
# Columns read from the CSV: the feature columns in `columns` plus 'Address' and 'FLAG',
# which the split section drops from the features / uses as the target.
# `col` was referenced here without being defined anywhere above, so a fresh
# "Restart & Run All" stopped on this cell with a NameError.
col = ['Address', 'FLAG']
selected_columns = columns + col
df = pd.read_csv(data_path, usecols=selected_columns)

## Split train and test data

In [None]:
# Target is the 'FLAG' column; the features are every loaded column except 'Address' and 'FLAG'
y = df['FLAG']
X = df.drop(columns=['Address', 'FLAG'])

In [None]:
# Hold out 30% of the rows as the test set, stratified on y, with a fixed seed
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Initialize MinMaxScaler (unfitted here; it is fitted on the training split only,
# in the transformation cell that follows)
scaler = MinMaxScaler()

In [None]:
# Log transformation for skewed data and scaling.
# NOTE: this cell overwrites X_train_full / X_test, so re-running it without first
# re-running the split cell applies the log a second time.
X_train_full = X_train_full.copy()  # write to explicit copies rather than the split outputs
X_test = X_test.copy()
for frame in (X_train_full, X_test):
    # Vectorised form of "log(x) if x > 0 else 0": entries that are not strictly
    # positive (including NaN) are masked before the log and then set to 0.
    positive_only = frame[columns].where(frame[columns] > 0)
    frame[columns] = np.log(positive_only).fillna(0)

# Scaling using only the training data to avoid data leakage
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

In [None]:
# True if any entry of the scaled training matrix is NaN
np.any(np.isnan(X_train_full_scaled))

In [None]:
# Dimensions of the scaled training matrix
X_train_full_scaled.shape

In [None]:
# First rows of the training features (already log-transformed by the cell above, not yet scaled)
X_train_full.head()

## Config Model LightGBM

In [None]:
# Keyword arguments forwarded to lgb.LGBMClassifier by get_lgbm_model()
lightgbm_params = dict(
    bagging_fraction=0.95,
    bagging_freq=1,
    feature_fraction=0.95,
    learning_rate=0.2,
    max_bin=300,
    max_depth=6,
    min_gain_to_split=0,
    num_leaves=20,
)

## Functions

In [None]:
def get_lgbm_model():
    """Build a new, unfitted LightGBM classifier configured from `lightgbm_params`."""
    classifier = lgb.LGBMClassifier(**lightgbm_params)
    return classifier

In [None]:
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Draw the confusion matrix of `y_pred` against `y_true` as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        model_name: Name appended to the plot title.
    """
    counts = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(counts, annot=True, fmt="d", cmap="coolwarm", cbar=False, linewidths=.5, ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    plt.show()

In [None]:
def evualtution_model():
    """Cross-validate a fresh LightGBM classifier and return the per-fold F1 scores.

    Runs one repeat of 5-fold splitting (seed 42) over the module-level `X` and `y`,
    using all available cores.
    """
    splitter = RepeatedKFold(n_splits=5, n_repeats=1, random_state=42)
    return cross_val_score(
        estimator=get_lgbm_model(),
        X=X,
        y=y,
        scoring='f1',
        cv=splitter,
        n_jobs=-1,
    )

## Training

In [None]:
# Run the 5-fold cross-validation and display the F1 score of each fold
evualtution_model()

In [None]:
# Fitting and evaluating the LGBM model
start_time = time.time()
lgbm_model = get_lgbm_model()
lgbm_model.fit(X_train_full_scaled, y_train_full)
end_time = time.time()
training_seconds = end_time - start_time  # wall-clock duration of the fit above

# Test-set outputs: hard class labels, plus the probability of the second class,
# which is the same column the ROC plot cell later in the notebook uses
lgbm_predictions = lgbm_model.predict(X_test_scaled)
lgbm_probabilities = lgbm_model.predict_proba(X_test_scaled)[:, 1]

# Calculating evaluation metrics
metrics_dict = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1': f1_score,
    'ROC-AUC': roc_auc_score,
}
lgbm_metrics = {}
for metric, func in metrics_dict.items():
    # ROC-AUC measures ranking quality, so it is scored on probabilities; feeding it
    # hard 0/1 labels collapses the curve to a single operating point and
    # disagrees with the AUC shown in the ROC plot.
    scores = lgbm_probabilities if func is roc_auc_score else lgbm_predictions
    lgbm_metrics[metric] = func(y_test, scores)

# Show the computed metrics as the cell output
lgbm_metrics


## Test

## Result

In [None]:
# Train a fresh LightGBM classifier and draw its test-set confusion matrix
lgbm_model = get_lgbm_model().fit(X_train_full_scaled, y_train_full)
y_pred = lgbm_model.predict(X_test_scaled)
plot_confusion_matrix(y_true=y_test, y_pred=y_pred, model_name='LightGBM')

## Plot

In [None]:
# ROC curve of the fitted lgbm_model, built from its second-column probabilities on the test set
y_pred_prob = lgbm_model.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set(
    xlim=(0.0, 1.0),
    ylim=(0.0, 1.05),
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic (ROC)',
)
roc_ax.legend(loc='lower right')
plt.show()

In [None]:
# Switch seaborn to its "poster" plotting context for the plots that follow
sns.set_context("poster")

In [None]:
# In this notebook `plot_confusion_matrix` is the local (y_true, y_pred, model_name) helper,
# so calling it with an estimator plus cmap/colorbar raised a TypeError. The estimator-based
# plot is provided by ConfusionMatrixDisplay.from_estimator (the old
# sklearn.metrics.plot_confusion_matrix function was removed in scikit-learn 1.2).
disp = ConfusionMatrixDisplay.from_estimator(
    lgbm_model,
    X_test_scaled,
    y_test,
    cmap='cividis',
    colorbar=False,
)

In [None]:
# Scatter of the first two feature columns, coloured by the target.
# X is a pandas DataFrame, so positional column access must go through .iloc
# (X[:, 0] raises), and plt.show needs to be called, not just referenced.
scatter_fig, scatter_ax = plt.subplots()
scatter_ax.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y)
scatter_ax.set_xlabel(X.columns[0])
scatter_ax.set_ylabel(X.columns[1])
scatter_ax.set_title('First two features coloured by FLAG')
plt.show()

In [None]:
# Font size and figure size for the estimator-based ROC plot
plt.rcParams['font.size'] = 16
plt.rcParams['figure.figsize'] = (8, 5)
RocCurveDisplay.from_estimator(lgbm_model, X_test_scaled, y_test)
# Red dashed diagonal from (0, 0) to (1, 1) as a reference line
plt.plot([0, 1], [0, 1], 'r--')