In [5]:
import mlflow
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from statsmodels.stats.contingency_tables import mcnemar

from graph_reinforcement_learning_using_blockchain_data import config

# Project helper from the package's `config` module; the cell output shows it
# returned True. Presumably it loads environment variables from a `.env` file
# before any paths or services are used — TODO confirm against `config`.
config.load_dotenv()

True

## RF with GNN embeddings

In [6]:
# Resolve both input locations first so the read calls below stay short.
features_csv = (
    config.PROCESSED_DATA_DIR / "flashbots" / "Q2_2023" / "features_edges_without_0logs.csv"
)
embeddings_csv = config.FLASHBOTS_Q2_DATA_DIR / "embeddings_128.csv"

# Hand-crafted transaction features, then the GNN embeddings stored as strings.
df_features = pd.read_csv(features_csv)
df_embeddings = pd.read_csv(embeddings_csv)

In [7]:
def parse_embedding(embedding_str):
    """Convert the textual form of an embedding vector into a 1-D float array.

    Accepts the bracketed, whitespace-separated form (possibly wrapped over
    several lines), e.g. ``"[ 1.04 -1.24\\n -0.52]"``, and also tolerates
    comma separators and whitespace around the brackets.

    Parameters
    ----------
    embedding_str : str
        Serialized vector, optionally enclosed in ``[`` ``]``.

    Returns
    -------
    numpy.ndarray
        One-dimensional ``float64`` array with the parsed values.
    """
    # Line breaks and commas are both treated as plain separators.
    flattened = embedding_str.replace("\n", " ").replace(",", " ")
    # Trim outer whitespace *before* removing the brackets; otherwise a trailing
    # space or newline would shield the closing "]" from `strip("[]")` and
    # np.fromstring would hit unparseable data.
    numbers_only = flattened.strip().strip("[]")
    embedding_array = np.fromstring(numbers_only, sep=" ")
    return embedding_array


# This cell overwrites its own input column, so keep it re-runnable: values that
# were already converted to arrays by a previous run are passed through
# unchanged, everything else is parsed exactly as before.
df_embeddings["embeddings"] = df_embeddings["embeddings"].apply(
    lambda value: value if isinstance(value, np.ndarray) else parse_embedding(value)
)

In [8]:
# One array per transaction; the first entry fixes the number of emb_* columns.
embedding_list = df_embeddings["embeddings"].to_list()
embedding_dim = len(embedding_list[0])

# Spread each vector over its own emb_<i> columns and re-attach the join key.
emb_column_names = [f"emb_{i}" for i in range(embedding_dim)]
embeddings_expanded = pd.DataFrame(embedding_list, columns=emb_column_names).assign(
    transactionHash=df_embeddings["transactionHash"].values
)

In [9]:
df_merged = df_features.merge(embeddings_expanded, how="inner", on="transactionHash")

# Predictions made on the split of `df_merged` are later scored against labels
# taken from a split of `df_features` (same test_size / random_state). That is
# only valid if the inner join kept every feature row, exactly once and in the
# original order, so fail loudly here instead of silently comparing
# mismatched rows in McNemar's test.
assert np.array_equal(
    df_merged["transactionHash"].to_numpy(),
    df_features["transactionHash"].to_numpy(),
), "df_merged rows are not aligned 1:1 with df_features; paired model comparison would be invalid"

In [10]:
# Remove identifier / position columns so only model inputs and the label remain.
# NOTE(review): the preview below still shows an "Unnamed: 0" column, presumably
# a CSV index that was saved with the features — confirm the loaded models were
# trained with it before removing it.
df = df_merged.drop(
    ["transactionHash", "from", "to", "blockNumber", "transactionIndex"],
    axis="columns",
)
df.head()

Unnamed: 0,gasUsed,cumulativeGasUsed,effectiveGasPrice,status,fee,num_logs,dummy_0x1c411e9a,dummy_0xe1fffcc4,dummy_0xddf252ad,dummy_0xe9149e1b,...,emb_118,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127
0,157316,2135061,50000000000,1,7865800000000000,6,1,1,1,0,...,1.046832,-1.244183,-0.525696,-0.923329,-1.299295,0.577549,-0.721241,-1.12278,-1.086308,-0.922723
1,48537,10131517,22000000000,1,1067814000000000,1,0,0,1,0,...,0.703639,-0.676612,-0.518422,-0.524117,-0.732895,0.692418,-0.52364,-0.374066,-0.332854,-0.970089
2,62833,5225102,19360969611,1,1216507803567963,2,0,0,1,0,...,0.703639,-0.676612,-0.518422,-0.524117,-0.732895,0.692418,-0.52364,-0.374066,-0.332854,-0.970089
3,172470,3799719,20100000000,1,3466647000000000,5,0,0,1,0,...,0.977257,-0.838083,-0.549349,-0.583719,-0.645372,0.355149,-0.283619,-0.650481,-0.292547,-0.898376
4,254032,2564577,22568554601,1,5733135062401232,11,0,0,1,0,...,-0.174104,0.145695,0.086302,0.184217,0.141733,-0.12955,0.21648,0.129744,0.140465,0.234399


In [11]:
# Target first, then every remaining column (tabular features + emb_*) as input.
y = df["label"]
X = df.drop(columns="label")

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# All models are fetched from the local MLflow tracking server.
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# First estimator evaluated on the embedding-augmented features.
model_uri_gnn = "runs:/788f48b93d7c4aba889560a516d21416/best_estimator"
model_gnn = mlflow.sklearn.load_model(model_uri_gnn)

# Second estimator evaluated on the embedding-augmented features.
model_uri_gnn2 = "runs:/5cdb567944334deebb164cac401932f5/best_estimator"
model_gnn2 = mlflow.sklearn.load_model(model_uri_gnn2)

# Baseline estimator evaluated on the standard features only.
model_uri_non_gnn = "runs:/dc223caf60224c66a105dbb2a9ef71e0/best_estimator"
model_non_gnn = mlflow.sklearn.load_model(model_uri_non_gnn)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [14]:
# Score the held-out, embedding-augmented rows with both GNN-feature models
# (evaluated in the same order as before: model_gnn, then model_gnn2).
gnn_pred, gnn_pred2 = (
    estimator.predict(X_test) for estimator in (model_gnn, model_gnn2)
)

## RF with standard features

In [15]:
# Read the same feature table again so this section starts from the plain,
# un-merged features.
features_csv = (
    config.PROCESSED_DATA_DIR / "flashbots" / "Q2_2023" / "features_edges_without_0logs.csv"
)
df_features = pd.read_csv(features_csv)

In [16]:
# Same identifier / position columns as removed in the embedding section.
df = df_features.drop(
    ["transactionHash", "from", "to", "blockNumber", "transactionIndex"],
    axis="columns",
)

In [17]:
# Target and inputs for the baseline (no emb_* columns here).
y = df["label"]
X = df.drop(columns="label")

# Identical test_size and random_state as in the embedding section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Baseline predictions: the model loaded from the "non_gnn" MLflow run, applied
# to the held-out split of the standard features built in this section.
rf_pred = model_non_gnn.predict(X_test)

## McNemar's Test

In [19]:
def mcnemar_test(y_true, y_pred_a, y_pred_b, exact=True, correction=True):
    """Compare two classifiers on the same test instances with McNemar's test.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred_a, y_pred_b : array-like
        Predictions of classifier A and classifier B for the same instances,
        in the same order as ``y_true``.
    exact : bool, default True
        Forwarded to ``statsmodels`` ``mcnemar``: ``True`` uses the exact
        binomial test, ``False`` the chi-square approximation.
    correction : bool, default True
        Forwarded to ``mcnemar``; continuity correction for the chi-square
        variant.

    Returns
    -------
    table : list[list[int]]
        ``[[n00, n01], [n10, n11]]`` with
        n00 = both correct, n01 = A correct / B wrong,
        n10 = A wrong / B correct, n11 = both wrong.
    statistic, pvalue
        As reported by ``mcnemar`` for that table.

    Raises
    ------
    ValueError
        If the three inputs do not have identical shapes.
    """
    y_true = np.asarray(y_true)
    y_pred_a = np.asarray(y_pred_a)
    y_pred_b = np.asarray(y_pred_b)

    # The test is paired: without this check, inputs of different shapes would
    # either broadcast or collapse to a scalar comparison and yield a
    # meaningless table instead of an error.
    if not (y_true.shape == y_pred_a.shape == y_pred_b.shape):
        raise ValueError(
            "y_true, y_pred_a and y_pred_b must have the same shape, got "
            f"{y_true.shape}, {y_pred_a.shape} and {y_pred_b.shape}"
        )

    a_correct = y_pred_a == y_true
    b_correct = y_pred_b == y_true

    # Plain Python ints keep the returned table readable when printed
    # (NumPy scalars print as np.int64(...) under NumPy 2).
    n00 = int(np.count_nonzero(a_correct & b_correct))
    n01 = int(np.count_nonzero(a_correct & ~b_correct))
    n10 = int(np.count_nonzero(~a_correct & b_correct))
    n11 = int(np.count_nonzero(~a_correct & ~b_correct))

    table = [[n00, n01], [n10, n11]]

    # Only the discordant cells (n01, n10) enter the test. The exact binomial
    # variant is the default; pass exact=False for the chi-square approximation.
    result = mcnemar(table, exact=exact, correction=correction)
    return table, result.statistic, result.pvalue

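The code compares the predictions of both classifiers with the true label of each test instance to build a 2×2 table:
- n00: Number of instances where both classifiers are correct.
- n01: Instances where classifier A is correct and classifier B is wrong.
- n10: Instances where classifier A is wrong and classifier B is correct.
- n11: Instances where both classifiers are wrong.

Only the discordant cells n01 and n10 carry information about which classifier is better; n00 and n11 do not affect the test. With `exact=True` the reported statistic is min(n01, n10), and the p-value comes from an exact binomial test on these two counts.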

In [20]:
# Classifier A = baseline RF, classifier B = RF with GNN embeddings, so
# table[0][1] counts "RF right, GNN wrong" and table[1][0] the reverse.
table, stat, pvalue = mcnemar_test(y_test, rf_pred, gnn_pred)

report_lines = (
    ("Contingency Table:", table),
    ("McNemar's Test Statistic:", stat),
    ("p-value:", pvalue),
)
for caption, value in report_lines:
    print(caption, value)

Contingency Table: [[35406, 53], [220, 336]]
McNemar's Test Statistic: 53.0
p-value: 2.3749759651453993e-25
