# Predictive analysis – Machine learning model (ADASYN for data balancing + hard-voting ensemble)

# Importing Libraries

In [None]:
#TODO
!pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=90dd529f003da37de100b37462a4991f4ae60fcaf31e558edc4fd39f62aeb4bb
  Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [None]:
# Bitcoin Heist Data Analysis with ADASYN, and LIME

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import uniform, randint
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import ADASYN
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from lime import lime_tabular

# Exploratory Data Analysis

## Dataset loading

In [None]:
# Load the raw dataset; the relative path means the CSV must sit in the notebook's working directory.
df=pd.read_csv("BitcoinHeistData.csv")

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
white,2875284
paduaCryptoWall,12390
montrealCryptoLocker,9315
princetonCerber,9223
princetonLocky,6625
montrealCryptXXX,2419
montrealNoobCrypt,483
montrealDMALockerv3,354
montrealDMALocker,251
montrealSamSam,62


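Only a small fraction of the addresses are labelled as ransomware, so the classes are heavily imbalanced. This suits an anomaly-detection framing, and it is why the data is rebalanced before training.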

In [None]:
# Names of the object-dtype (string) columns.
categorical_cols=df.select_dtypes(include="object").columns

In [None]:
# Names of the numeric columns.
numerical_cols=df.select_dtypes(include="number").columns

In [None]:
df.isnull().sum()

Unnamed: 0,0
address,0
year,0
day,0
length,0
weight,0
count,0
looped,0
neighbors,0
income,0
label,0


Woah! **No null values** :)

In [None]:
df["address"].nunique()

2631095

In [None]:
# Keep an untouched copy of the raw frame before any columns are dropped.
old_df=df.copy()

# Feature Engineering

In [None]:
# Remove the address identifier and the date columns from the working frame.
# Built from the untouched `old_df` copy instead of dropping in place, so the cell
# can be re-run without a KeyError on columns that are already gone.
df = old_df.drop(columns=["address", "year", "day"])

In [None]:
# NOTE(review): this loop assigns every feature column back to itself, so it leaves
# the data unchanged. It looks like a leftover from a removed conversion step
# (e.g. a dtype cast) — confirm and delete if nothing is missing.
for col in df.columns[:-1]:
    df[col]=df[col]

In [None]:
# Feature matrix (every column except the target) and the multi-class target.
X, y = df.drop("label", axis=1), df["label"]

In [None]:
X.columns

Index(['length', 'weight', 'count', 'looped', 'neighbors', 'income'], dtype='object')

In [None]:
df.head()

Unnamed: 0,length,weight,count,looped,neighbors,income,label
0,18,0.008333,1,0,2,100050000.0,princetonCerber
1,44,0.000244,1,0,1,100000000.0,princetonLocky
2,0,1.0,1,0,2,200000000.0,princetonCerber
3,72,0.003906,1,0,2,71200000.0,princetonCerber
4,144,0.072848,456,0,1,200000000.0,princetonLocky


## Number of Instances

In [None]:
# Empty frame that collects the per-label summary statistics computed in the following cells.
new_df=pd.DataFrame()
# Group the rows by label once; the statistics cells reuse this object.
grouped=df.groupby("label")

In [None]:
# Number of rows per label.
new_df["num_of_instances"]=grouped.size()

## Average

In [None]:
df.columns

Index(['length', 'weight', 'count', 'looped', 'neighbors', 'income', 'label'], dtype='object')

In [None]:
# Per-label mean of every feature, computed in one pass and stored as <feature>_avg.
feature_means = grouped[list(X.columns)].mean()
for col in feature_means.columns:
    new_df[f"{col}_avg"] = feature_means[col]

In [None]:
new_df.shape

(29, 7)

In [None]:
new_df.head()

Unnamed: 0_level_0,num_of_instances,length_avg,weight_avg,count_avg,looped_avg,neighbors_avg,income_avg
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
montrealAPT,11,67.636364,0.707728,2047.0,734.090909,2.545455,371987300.0
montrealComradeCircle,1,144.0,0.051214,1241.0,0.0,2.0,203320000.0
montrealCryptConsole,7,43.428571,0.593306,831.714286,0.0,2.0,45463340.0
montrealCryptXXX,2419,47.447706,0.367505,791.848284,61.022323,2.011988,135534300.0
montrealCryptoLocker,9315,30.674396,0.888878,308.328824,100.981535,2.885346,1840825000.0


## Standard Deviation

In [None]:
# Per-label sample standard deviation of every feature, stored as <feature>_std.
# Uses the native groupby .std() rather than agg(np.std): pandas currently maps the
# NumPy callable to the same method but emits a FutureWarning because that mapping
# is going away. Labels with a single row have no sample std (NaN), hence fillna(0).
for col in X.columns:
    new_df[f"{col}_std"] = grouped[col].std().fillna(0)

  new_df[f"{col}_std"]=grouped[col].agg(np.std).fillna(0)
  new_df[f"{col}_std"]=grouped[col].agg(np.std).fillna(0)
  new_df[f"{col}_std"]=grouped[col].agg(np.std).fillna(0)
  new_df[f"{col}_std"]=grouped[col].agg(np.std).fillna(0)
  new_df[f"{col}_std"]=grouped[col].agg(np.std).fillna(0)
  new_df[f"{col}_std"]=grouped[col].agg(np.std).fillna(0)


## Minimum

In [None]:
# Per-label minimum of every feature, computed in one pass and stored as <feature>_min.
feature_minima = grouped[list(X.columns)].min()
for col in feature_minima.columns:
    new_df[f"{col}_min"] = feature_minima[col]

## Maximum

In [None]:
# Per-label maximum of every feature, computed in one pass and stored as <feature>_max.
feature_maxima = grouped[list(X.columns)].max()
for col in feature_maxima.columns:
    new_df[f"{col}_max"] = feature_maxima[col]

In [None]:
# Turn the "label" index into a regular column. The guard keeps the cell safe to
# re-run: a second reset_index() would otherwise add a spurious "index" column.
if "label" not in new_df.columns:
    new_df = new_df.reset_index()
new_df.head()

Unnamed: 0,label,num_of_instances,length_avg,weight_avg,count_avg,looped_avg,neighbors_avg,income_avg,length_std,weight_std,...,count_min,looped_min,neighbors_min,income_min,length_max,weight_max,count_max,looped_max,neighbors_max,income_max
0,montrealAPT,11,67.636364,0.707728,2047.0,734.090909,2.545455,371987300.0,73.145434,0.600596,...,1,0,1,57142857.0,144,1.666667,8076,8073,6,1088599000.0
1,montrealComradeCircle,1,144.0,0.051214,1241.0,0.0,2.0,203320000.0,0.0,0.0,...,1241,0,2,203320001.0,144,0.051214,1241,0,2,203320000.0
2,montrealCryptConsole,7,43.428571,0.593306,831.714286,0.0,2.0,45463340.0,68.747987,0.410909,...,1,0,2,30000000.0,144,1.0,3191,0,2,50300000.0
3,montrealCryptXXX,2419,47.447706,0.367505,791.848284,61.022323,2.011988,135534300.0,58.187904,0.434143,...,1,0,1,30000000.0,144,3.458951,9262,8489,15,1080000000.0
4,montrealCryptoLocker,9315,30.674396,0.888878,308.328824,100.981535,2.885346,1840825000.0,50.731602,1.555608,...,1,0,1,30000000.0,144,31.108593,6423,6418,79,445000000000.0


# Data Preprocessing

In [None]:
# Undersample the majority class: keep every non-white row and a fixed-size random
# sample of the white rows, then shuffle the combined frame.
# NOTE(review): 158587 is presumably chosen to bring the total to a round row count — confirm.
is_white = df["label"] == "white"
black_rows = df[~is_white]
white_sample = df[is_white].sample(n=158587, random_state=42)
df = pd.concat([black_rows, white_sample]).sample(frac=1, random_state=50)
df.head()

## Label Encoding

We do label encoding of:
> * White label: 0
> * Ransomware: 1

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Binary target: 0 = white, 1 = any ransomware family.
# Guarded on dtype so that re-running the cell does not compare the already encoded
# integers with 'white' and turn every label into 1.
if not pd.api.types.is_integer_dtype(df['label']):
    df['label'] = (df['label'] != 'white').astype(int)


In [None]:
df.head()

## Train Test Split

> * Training data: 65%
> * Test data: 35%

In [None]:
# Binary target first, then the feature matrix (all remaining columns).
y = df["label"]
X = df.drop(columns=["label"])

## Data balancing (ADASYN) and splitting into training and testing datasets

In [None]:
from sklearn.model_selection import train_test_split
# Split BEFORE resampling. Oversampling the full dataset first lets synthetic points
# derived from future test rows enter training, and puts synthetic rows in the test
# set, so the reported metrics would be optimistic. The test set stays untouched and
# keeps the real class ratio (stratify=y).
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.35, random_state=42, shuffle=True, stratify=y
)

# Balance the training portion only.
adasyn = ADASYN(random_state=42)
X_balanced, y_balanced = adasyn.fit_resample(X_train_raw, y_train_raw)

# The balanced data is the training set used by the rest of the notebook.
X_train, y_train = X_balanced, y_balanced

In [None]:
y_balanced.value_counts()

## Standardization

We fit the scaler on the training data only (**standardization**: zero mean and unit variance per feature) and then apply the same transform to the test data.

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply the
# same transform to the test features. Results are wrapped back into DataFrames
# so the original column names are kept.
scaler = StandardScaler()

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Training

**This might take a few minutes to run.**

In [None]:
# Define models
models = {
    'lr': LogisticRegression(solver='saga', max_iter=1000),
    # 'svc': SVC(probability=True, random_state=42),
    'nb': GaussianNB(),
    # 'rf': RandomForestClassifier(random_state=42),
    'xgb': XGBClassifier(random_state=42)
}

## Hyperparameter tuning

In [None]:
# Optimize XGBoost
param_grid_xgb = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.2),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3)
}

In [None]:
# Randomized search over the XGBoost space: 5 sampled candidates, 3-fold CV each.
# random_state fixes which candidates are drawn, so the selected model is the same
# on every Restart & Run All.
search_xgb = RandomizedSearchCV(
    XGBClassifier(random_state=42),
    param_distributions=param_grid_xgb,
    n_iter=5,
    cv=3,
    random_state=42,
)
search_xgb.fit(X_train_scaled, y_train)

# Swap the placeholder XGBoost model for the best configuration found.
models['xgb'] = search_xgb.best_estimator_

In [None]:
# Train voting classifier
voting_clf = VotingClassifier(
    estimators=[
        ('lr', models['lr']),
        ('nb', models['nb']),
        ('xgb', models['xgb'])
    ],
    voting='hard'
)
voting_clf.fit(X_train_scaled, y_train)



In [None]:
# import numpy as np
# from scipy.stats import uniform, randint
# from sklearn.model_selection import RandomizedSearchCV
# from xgboost import XGBClassifier

# # Define the XGBoost classifier
# model = XGBClassifier()

# # Define the hyperparameter search space
# param_space = {
#     'n_estimators': randint(100, 1000),
#     'max_depth': randint(1, 10),
#     'learning_rate': uniform(0.01, 0.3),
#     'subsample': uniform(0.6, 0.4),
#     'colsample_bytree': uniform(0.6, 0.4)
# }

# # Perform random search
# random_search = RandomizedSearchCV(model, param_distributions=param_space, n_iter=10, cv=5)
# random_search.fit(X_train_scaled, y_train)

# # Print the best parameters and score
# print("Best parameters found: ", random_search.best_params_)
# print("Best score: ", random_search.best_score_)

# Evaluation

We use the following metrics for evaluation:
> * accuracy
> * precision
> * recall
> * f1 score

In [None]:
# Evaluate voting classifier
results = {}
y_pred_voting = voting_clf.predict(X_test_scaled)
results['voting'] = {
    'accuracy': accuracy_score(y_test, y_pred_voting),
    'precision': precision_score(y_test, y_pred_voting),
    'recall': recall_score(y_test, y_pred_voting),
    'f1': f1_score(y_test, y_pred_voting)
}

In [None]:
# Print results
for name, metrics in results.items():
    print(f"\nResults for {name.upper()}:")
    for metric, value in metrics.items():
        print(f"{metric.capitalize()}: {value:.2f}")