In [2]:
import mlflow
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from statsmodels.stats.contingency_tables import mcnemar

from graph_reinforcement_learning_using_blockchain_data import config

config.load_dotenv()

[32m2025-05-09 12:01:24.074[0m | [1mINFO    [0m | [36mgraph_reinforcement_learning_using_blockchain_data.config[0m:[36m<module>[0m:[36m12[0m - [1mPROJ_ROOT path is: /Users/liamtessendorf/Programming/Uni/2_Master/4_FS25_Programming/graph-reinforcement-learning-using-blockchain-data[0m


True

## RF with GNN embeddings

In [2]:
# Locate both inputs first, then read them: the per-transaction feature table
# and the table of stringified embeddings keyed by transactionHash.
features_csv = (
    config.PROCESSED_DATA_DIR / "flashbots" / "Q2_2023" / "features_edges_without_0logs.csv"
)
embeddings_csv = config.FLASHBOTS_Q2_DATA_DIR / "embeddings_128.csv"

df_features = pd.read_csv(features_csv)
df_embeddings = pd.read_csv(embeddings_csv)

In [3]:
def parse_embedding(embedding_str):
    """Turn a stringified embedding into a 1-D float array.

    The input is expected to look like a printed NumPy vector: whitespace-separated
    numbers wrapped in square brackets, possibly broken over several lines.

    Parameters
    ----------
    embedding_str : str or numpy.ndarray
        Text form of the embedding. An ``ndarray`` is returned unchanged so that
        re-running the cell that overwrites the column with parsed arrays does
        not fail on already-parsed values.

    Returns
    -------
    numpy.ndarray
        The parsed values as a float array (or the input array itself).
    """
    # Already parsed (e.g. the conversion cell was executed twice): nothing to do.
    if isinstance(embedding_str, np.ndarray):
        return embedding_str

    # Join wrapped lines, then trim surrounding whitespace *before* removing the
    # brackets; otherwise a leading/trailing blank keeps strip("[]") from matching.
    single_line = embedding_str.replace("\n", " ").strip()
    numbers_only = single_line.strip("[]").strip()
    return np.fromstring(numbers_only, sep=" ")


# Replace the text column with parsed arrays, one element at a time.
df_embeddings["embeddings"] = df_embeddings["embeddings"].map(parse_embedding)

In [4]:
# One array per transaction; the first one fixes the embedding width (e.g. 128).
embedding_list = df_embeddings["embeddings"].tolist()
embedding_dim = embedding_list[0].shape[0]

# Spread every embedding component into its own emb_<i> column.
embedding_columns = [f"emb_{i}" for i in range(embedding_dim)]
embeddings_expanded = pd.DataFrame(data=embedding_list, columns=embedding_columns)

# Re-attach the join key positionally (raw values, so no index alignment happens).
embeddings_expanded["transactionHash"] = df_embeddings["transactionHash"].values

In [5]:
# Keep only transactions present in both the feature table and the embedding table.
df_merged = pd.merge(
    df_features,
    embeddings_expanded,
    on="transactionHash",
    how="inner",
)

In [6]:
# Identifier / position columns that are removed before modelling.
dropped_columns = ["transactionHash", "from", "to", "blockNumber", "transactionIndex"]
df = df_merged.drop(columns=dropped_columns)
df.head()

Unnamed: 0,gasUsed,cumulativeGasUsed,effectiveGasPrice,status,fee,num_logs,dummy_0xe9149e1b,dummy_0x8c5be1e5,dummy_0x7fcf532c,dummy_0xe1fffcc4,...,emb_118,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127
0,153529,10750253,19150328915,1,2940130847991035,3,0,1,0,0,...,0.703639,-0.676612,-0.518422,-0.524117,-0.732895,0.692418,-0.52364,-0.374066,-0.332854,-0.970089
1,123497,229900,139502435999,1,17228132338568503,7,0,0,0,0,...,-0.138172,0.087557,0.063972,0.171073,0.106778,-0.096407,0.173654,0.108045,0.091411,0.204592
2,46817,14885776,18825121161,1,881335697394537,1,0,0,0,0,...,0.703639,-0.676612,-0.518422,-0.524117,-0.732895,0.692418,-0.52364,-0.374066,-0.332854,-0.970089
3,32215,3160909,19660583885,1,633365709855275,1,0,0,0,0,...,0.703639,-0.676612,-0.518422,-0.524117,-0.732895,0.692418,-0.52364,-0.374066,-0.332854,-0.970089
4,153886,9279317,45282851253,1,6968396847919158,5,0,0,0,1,...,1.046832,-1.244183,-0.525696,-0.923329,-1.299295,0.577549,-0.721241,-1.12278,-1.086308,-0.922723


In [7]:
# Target is the "label" column; every remaining column is a model input.
target_column = "label"
y = df[target_column]
X = df.drop(columns=[target_column])

In [8]:
# 80/20 hold-out split with a fixed seed so the test rows are reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [3]:
# Point the MLflow client at the local tracking server.
mlflow.set_tracking_uri("http://127.0.0.1:8080")

# Each run stores its estimator under the "best_estimator" artifact path.
# Models are loaded in the order: gnn, gnn2, non_gnn.
model_uri_gnn = "runs:/7b43596526c04f28b691185b808233fb/best_estimator"
model_gnn = mlflow.sklearn.load_model(model_uri_gnn)

model_uri_gnn2 = "runs:/5cdb567944334deebb164cac401932f5/best_estimator"
model_gnn2 = mlflow.sklearn.load_model(model_uri_gnn2)

model_uri_non_gnn = "runs:/346e595d41db4b8ba68353a24e369c09/best_estimator"
model_non_gnn = mlflow.sklearn.load_model(model_uri_non_gnn)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
# Score the same held-out rows with both embedding-based models (first, then second).
gnn_pred, gnn_pred2 = (
    estimator.predict(X_test) for estimator in (model_gnn, model_gnn2)
)

## RF with standard features

In [13]:
# Read the feature table again, this time to be used without the embedding columns.
features_csv = (
    config.PROCESSED_DATA_DIR / "flashbots" / "Q2_2023" / "features_edges_without_0logs.csv"
)
df_features = pd.read_csv(features_csv)

In [14]:
# Identifier / position columns that are removed before modelling.
dropped_columns = ["transactionHash", "from", "to", "blockNumber", "transactionIndex"]
df = df_features.drop(columns=dropped_columns)

In [15]:
# Target is the "label" column; every remaining column is a model input.
target_column = "label"
y = df[target_column]
X = df.drop(columns=[target_column])

# Same split settings (20% test, seed 42) as used for the embedding-based data.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [16]:
# Baseline predictions on the standard-features test split
# (X_test here is the one reassigned by the split in the preceding cell).
rf_pred = model_non_gnn.predict(X_test)

## McNemar's Test

In [20]:
def mcnemar_test(y_true, y_pred_a, y_pred_b, exact=True):
    """Run McNemar's test on the paired predictions of two classifiers.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred_a, y_pred_b : array-like
        Predictions of classifier A and classifier B. Both must refer to the
        same instances, in the same order, as ``y_true``.
    exact : bool, default True
        Forwarded to statsmodels' ``mcnemar``. ``True`` uses the exact binomial
        distribution; ``False`` uses the chi-square approximation.

    Returns
    -------
    table : list of list of int
        ``[[n00, n01], [n10, n11]]`` where
        n00 = both correct, n01 = A correct / B wrong,
        n10 = A wrong / B correct, n11 = both wrong.
    statistic
        ``result.statistic`` as reported by statsmodels.
    pvalue
        ``result.pvalue`` as reported by statsmodels.

    Raises
    ------
    ValueError
        If the three inputs do not share the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred_a = np.asarray(y_pred_a)
    y_pred_b = np.asarray(y_pred_b)

    # Without this check a (n, 1) label column compared with (n,) predictions
    # broadcasts to an (n, n) grid and yields silently wrong counts.
    if not (y_true.shape == y_pred_a.shape == y_pred_b.shape):
        raise ValueError(
            "y_true, y_pred_a and y_pred_b must have the same shape, got "
            f"{y_true.shape}, {y_pred_a.shape} and {y_pred_b.shape}"
        )

    a_correct = y_pred_a == y_true
    b_correct = y_pred_b == y_true

    # Plain Python ints keep the printed table readable on every NumPy version.
    n00 = int(np.sum(a_correct & b_correct))
    n01 = int(np.sum(a_correct & ~b_correct))
    n10 = int(np.sum(~a_correct & b_correct))
    n11 = int(np.sum(~a_correct & ~b_correct))

    table = [[n00, n01], [n10, n11]]

    # Only the discordant cells (n01, n10) influence the test outcome.
    result = mcnemar(table, exact=exact)
    return table, result.statistic, result.pvalue

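The function compares both classifiers' predictions with the true label of each test instance and builds a 2×2 contingency table:
- n00: Number of instances where both classifiers are correct.
- n01: Number of instances where classifier A is correct and classifier B is wrong.
- n10: Number of instances where classifier A is wrong and classifier B is correct.
- n11: Number of instances where both classifiers are wrong.

Only the discordant counts n01 and n10 drive the test; with `exact=True` the reported statistic is min(n01, n10).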

In [21]:
# Paired comparison: classifier A = baseline model (rf_pred), classifier B = first
# embedding-based model (gnn_pred).
# NOTE(review): y_test comes from the standard-features split while gnn_pred was
# computed on the merged-features split; the pairing is only valid if both splits
# contain the same rows in the same order - confirm.
table, stat, pvalue = mcnemar_test(y_test, rf_pred, gnn_pred)
print("Contingency Table:", table)
print("McNemar's Test Statistic:", stat)
print("p-value:", pvalue)

Contingency Table: [[36264, 19], [277, 267]]
McNemar's Test Statistic: 19.0
p-value: 6.917112456951167e-60
