In [1]:
!pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import re
import pandas as pd
from collections import defaultdict

def get_transaction_count(line):
    pattern = r"transactions = \{([^}]+)\}"
    match = re.search(pattern, line)
    if match:
        return set(re.findall(r"Transaction ([a-fA-F0-9]+)", match.group(1)))
    return set()

def get_timestamp(line):
    pattern = r"^\d+\.\d+"
    match = re.match(pattern, line)
    return float(match.group(0)) if match else None

def get_node_name(line):
    pattern = r"Node ([A-Za-z0-9]+)"
    match = re.search(pattern, line)
    return match.group(1) if match else None

def count_unique_mempool_transactions(file_path, node_number):
    unique_transactions = set()
    with open(file_path, 'r') as file:
        for line in file:
            if f"Node {node_number}" in line and "from mempool" in line:
                unique_transactions.update(re.findall(r"Transaction ([a-fA-F0-9]+)", line))
    return len(unique_transactions)

def extract_slot(message_line):
    """
    Extracts the slot number from an externalize message log line.
    Looks for the pattern "slot <number>".
    """
    pattern = r"slot (\d+)"
    match = re.search(pattern, message_line)
    if match:
        return int(match.group(1))
    return None

def process_log_lines(file_path):
    """
    Extracts **all** SCPExternalize messages per node and stores their relevant details.
    A new column 'Slot' is added by parsing the slot number from the log message.
    """
    data = []

    with open(file_path, 'r') as file:
        for line in file:
            # Only consider lines that contain relevant externalization messages
            if ("appended SCPExternalize message for slot" not in line and
                "adopting externalized value for slot" not in line):
                continue
            
            node_name = get_node_name(line)
            timestamp = get_timestamp(line)
            transactions = get_transaction_count(line)
            slot = extract_slot(line)  # Extract slot directly from the log line

            if node_name:
                data.append({
                    "node name": node_name,
                    "Timestamp of finalisation": timestamp,
                    "Finalised transactions": transactions,
                    "Externalize message": line.strip(),
                    "Slot": slot
                })
    
    # Convert the collected data to a DataFrame
    df = pd.DataFrame(data)
    
    # If any row didn't have a slot parsed, you might want to drop or handle it:
    df = df.dropna(subset=["Slot"])

    # Count the number of finalized transactions for each externalize message
    df["No. of finalised transactions"] = df["Finalised transactions"].apply(len)
    
    # Compute total transactions for each node from mempool logs
    df["total_transactions"] = df["node name"].apply(lambda node: count_unique_mempool_transactions(file_path, node))
    
    # Calculate number of transactions not finalized for each node
    df["no. of transactions not finalised"] = df["total_transactions"] - df["No. of finalised transactions"]

    return df

# Example usage:
# file_path = 'path/to/your/simulator_events_log.txt'
# df = process_log_lines(file_path)
# print(df)


In [3]:
#file_path = ('scripts/logs/run_1/simulator_events_log.txt')
file_path = ('src/simulator_events_log.txt')
df = process_log_lines(file_path)

# df_sorted = df.sort_values(by='Timestamp of finalisation', ascending=True)

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)  # Prevent line wrapping
pd.set_option('display.colheader_justify', 'center')  # Center column headers
pd.set_option('display.float_format', '{:.4f}'.format)  # Format float values

df = df.sort_values(by='Slot', ascending=True)

display(df) 

Unnamed: 0,node name,Timestamp of finalisation,Finalised transactions,Externalize message,Slot,No. of finalised transactions,total_transactions,no. of transactions not finalised
0,GAM3VQ4PJQTSDSPBQYBROS36KU6JHKDEU4M5UJSAIFQS5E...,2.38,"{8dd7893e, 1384537f, dac90391, 109a0c10, 6999f...",2.38 - NODE - INFO - Node GAM3VQ4PJQTSDSPBQYBR...,1,16,8,-8
76,GCMSM2VFZGRPTZKPH5OABHGH4F3AVS6XTNJXDGCZ3MKCOS...,2.68,"{8dd7893e, 1384537f, dac90391, 109a0c10, 6999f...",2.68 - Node GCMSM2VFZGRPTZKPH5OABHGH4F3AVS6XTN...,1,16,3,-13
75,GARYGQ5F2IJEBCZJCBNPWNWVDOFK7IBOHLJKKSG2TMHDQK...,2.68,"{8dd7893e, 1384537f, dac90391, 109a0c10, 6999f...",2.68 - Node GARYGQ5F2IJEBCZJCBNPWNWVDOFK7IBOHL...,1,16,5,-11
74,GCCBIMY3S33HQCHNDECEOOFUSJLLZ2FOJRBWEFYCWPFAPU...,2.68,"{8dd7893e, 1384537f, dac90391, 109a0c10, 6999f...",2.68 - Node GCCBIMY3S33HQCHNDECEOOFUSJLLZ2FOJR...,1,16,4,-12
73,GD5JOWV72KZYUTAGFAWAZJDL7SWQHEU3OWDJXSKG2ZOU7A...,2.68,"{8dd7893e, 1384537f, dac90391, 109a0c10, 6999f...",2.68 - Node GD5JOWV72KZYUTAGFAWAZJDL7SWQHEU3OW...,1,16,2,-14
72,GAYMJTJ5XMCDNCJEJUCIQNRTMHAPPWGAEXKSQV3PRZWFBB...,2.68,"{8dd7893e, 1384537f, dac90391, 109a0c10, 6999f...",2.68 - Node GAYMJTJ5XMCDNCJEJUCIQNRTMHAPPWGAEX...,1,16,1,-15
71,GC5SXLNAM3C4NMGK2PXK4R34B5GNZ47FYQ24ZIBFDFOCU6...,2.68,"{8dd7893e, 1384537f, dac90391, 109a0c10, 6999f...",2.68 - Node GC5SXLNAM3C4NMGK2PXK4R34B5GNZ47FYQ...,1,16,2,-14
70,GA7TEPCBDQKI7JQLQ34ZURRMK44DVYCIGVXQQWNSWAEQR6...,2.68,"{8dd7893e, 1384537f, dac90391, 109a0c10, 6999f...",2.68 - Node GA7TEPCBDQKI7JQLQ34ZURRMK44DVYCIGV...,1,16,6,-10
69,GCXNILBNTEQ5ASHDO6BTCP7QIBNYDGWK5NMLIBZYTZTPX4...,2.68,"{8dd7893e, 1384537f, dac90391, 109a0c10, 6999f...",2.68 - Node GCXNILBNTEQ5ASHDO6BTCP7QIBNYDGWK5N...,1,16,5,-11
68,GDSGO56VH4QO533E65FKXLVYOFFZ2ABH2K7WJKMD7CJL5Y...,2.68,"{8dd7893e, 1384537f, dac90391, 109a0c10, 6999f...",2.68 - Node GDSGO56VH4QO533E65FKXLVYOFFZ2ABH2K...,1,16,4,-12


## Analyze Transaction Matches across slots

In [4]:
def analyze_transaction_matches(df):
    tx_occurrences = {}

    for idx, row in df.iterrows():
        node = row['node name']
        slot = row['Slot']
        # row['Finalised transactions'] is a set; iterate through each transaction hash
        for tx in row['Finalised transactions']:
            if tx not in tx_occurrences:
                tx_occurrences[tx] = set()
            tx_occurrences[tx].add((node, slot))
    
    duplicates = {tx: occ for tx, occ in tx_occurrences.items() if len(occ) > 1} 

    return duplicates


duplicates = analyze_transaction_matches(df)
print("Transactions that appear in more than one slot:")
for tx, occ in duplicates.items():
    print(f"Transaction {tx} appears in: {sorted(list(occ))}")


Transactions that appear in more than one slot:
Transaction 8dd7893e appears in: [('GA23HTSEJHCB54HAJW7VP7IWCXIEE474U2UMWYMEKZWFYAVEZBVBMZNN', 1), ('GA2LBNNBXPLJIGRGGL3OXZ2BFRTZJB5AEP7NTDIHPSMDGUFLHCCJBF6O', 1), ('GA2PU4UGMLSFUXGZATHPTDXXX7FOHBAQC57RSJCQUN72WFKTD6CEPQSF', 1), ('GA2SFCVSNYHL7C5YHR3DFIC75JFKBY4T7AH6A7HZRTGJFVAC25GNEIUZ', 1), ('GA3FLRTZLNMBXCQ2GG4W2CO2WXWGDDROCD3KVD5QYMYB5NXBUYMO2QXT', 1), ('GA3SCGZAMPXNGUULWFUOCMGXRDPLGIMWH3ZSDQFCNEZATIJC77ZW7Z3Y', 1), ('GA4Y3HGTGHSX4XASTABDUM6UIJHTYY6DBODESEKVCYPLOTRMMR2UB5K7', 1), ('GA5A7OIAFB4TCVJNPJRAQKBDAAIAC6EKCTNXVGF3F2KKFKDFOEE4DHV4', 1), ('GA5IMBV5AMJ6VAORQ6XOUNDMEMAS34DSKL5O6RSRWL6LR6F7EAZY5MB4', 1), ('GA5STBMV6QDXFDGD62MEHLLHZTPDI77U3PFOD2SELU5RJDHQWBR5NNK7', 1), ('GA7DV63PBUUWNUFAF4GAZVXU2OZMYRATDLKTC7VTCG7AU4XUPN5VRX4A', 1), ('GA7MEA44DK4WLPF4452CIS24ZOVLDVJN7Q3B4TK44MQXB6H6TKZIAPMC', 1), ('GA7TEPCBDQKI7JQLQ34ZURRMK44DVYCIGVXQQWNSWAEQR6KB4FMCBT7J', 1), ('GAAV2GCVFLNN522ORUYFV33E76VPC22E72S75AQ6MBR5V45Z5DWVPWEU', 1), ('GAAWTX

#### SHOW ALL TRANSACTIONS THAT APPEAR IN MULTIPLE SLOTS FOR THE SAME NODE

In [5]:
def analyze_node_slots_and_multislot_occurrences(df):
    """
    Analyzes the DataFrame to extract:
      - For each node, a dictionary mapping each Slot to the set of finalized transaction hashes 
        that were reported in that slot.
      - For each node, a dictionary mapping transaction hashes (that occur in multiple slots) to the
        set of slot numbers where they appear.
    
    Args:
        df (pandas.DataFrame): A DataFrame with at least the columns:
            - 'node name'
            - 'Slot'
            - 'Finalised transactions' (a set of transaction hashes)
    
    Returns:
        A tuple (node_slots, multi_occurrences) where:
          - node_slots is a dict: { node_name: { slot: set(transaction hashes), ... }, ... }
          - multi_occurrences is a dict: { node_name: { transaction_hash: set(slots) where it appears, ... }, ... }
    """
    # Build a dictionary that records for each node which transactions are finalized in each slot.
    node_slots = {}
    for idx, row in df.iterrows():
        node = row['node name']
        slot = row['Slot']
        txs = row['Finalised transactions']  # expected to be a set of transaction hashes
        if node not in node_slots:
            node_slots[node] = {}
        if slot not in node_slots[node]:
            node_slots[node][slot] = set()
        node_slots[node][slot].update(txs)
    
    # Now, for each node, gather transaction occurrences across slots.
    multi_occurrences = {}
    for node, slot_dict in node_slots.items():
        tx_occurrences = {}  # maps tx -> set of slots where it occurs
        for slot, txs in slot_dict.items():
            for tx in txs:
                if tx not in tx_occurrences:
                    tx_occurrences[tx] = set()
                tx_occurrences[tx].add(slot)
        # Filter to only transactions that appear in more than one slot.
        multi = {tx: slots for tx, slots in tx_occurrences.items() if len(slots) > 1}
        if multi:
            multi_occurrences[node] = multi

    return node_slots, multi_occurrences

# Example usage:
node_slots, multi_occurrences = analyze_node_slots_and_multislot_occurrences(df)

print("Per-node transaction sets by slot:")
for node, slot_info in node_slots.items():
    print(f"Node {node}:")
    for slot, txs in sorted(slot_info.items()):
        print(f"  Slot {slot}: {sorted(list(txs))}")

print("\nTransactions that appear for the same node in multiple slots:")
for node, tx_info in multi_occurrences.items():
    print(f"Node {node}:")
    for tx, slots in tx_info.items():
        print(f"  Transaction {tx} appears in slots: {sorted(list(slots))}")


Per-node transaction sets by slot:
Node GAM3VQ4PJQTSDSPBQYBROS36KU6JHKDEU4M5UJSAIFQS5EMDBFH6MGYB:
  Slot 1: ['109a0c10', '119e2440', '1384537f', '1d8a6eac', '1f2649f1', '20e1d5ad', '3ecfa79f', '52af9052', '5e0eb2ce', '6999fc4b', '8dd7893e', '994d99bf', 'c33808df', 'c983926f', 'cebd47a2', 'dac90391']
  Slot 2: ['1114d779', '348daaf', '41648f0f', '55eebbe0', '666d2d7c', '7bceeeb0', '819bf402', 'acc5e6db', 'b672d7b5']
  Slot 3: ['2c4561c2', '37a477f4', '41ae899f', '52468ff9', '8189b793', 'c3e1d286', 'c45ccf91', 'd4ef2642']
  Slot 4: ['10d7ad1f', '12feada1', '1a2225b9', '295daf58', '5099ed8', '5bafe6e9', '6162b535', '6fbaa3ad', '7358bd48', '744e968b', '849a51a9', '96726c1d', '9f79b7cf', 'c0be68ba', 'c20915d8', 'c2949bd7', 'c414a4cd', 'c5f75df1', 'cc058a1c', 'dd881562']
  Slot 5: ['10a87534', '1210aa8d', '13c9cffa', '152b8db3', '1661646b', '1919377d', '1d7591a4', '1e09e4b4', '1f495093', '2060e710', '28fde7ac', '2dd5338d', '3965c35f', '3bbfd4b7', '3ddb18e5', '3e1dceec', '46d0e178', '50b71666

In [6]:
import pandas as pd

def analyze_transaction_matches_with_msgs(df):
    """
    Find transactions that show up in more than one slot *and* record
    the first 3 words of the log messages that caused each appearance.
    
    Returns a dict:
      tx_hash -> {
         'occurrences': set of (node, slot),
         'msg_types':   set of first‑3‑word summaries
      }
    """
    tx_occurrences = {}   # tx -> set((node,slot))
    tx_msgtypes   = {}    # tx -> set(msg_type)
    
    for _, row in df.iterrows():
        node    = row['node name']
        slot    = row['Slot']
        txs     = row['Finalised transactions']
        msg     = row['Externalize message']
        # grab the first 3 words of the log line
        msg_type = " ".join(msg.split()[:10])
        
        for tx in txs:
            tx_occurrences.setdefault(tx, set()).add((node, slot))
            tx_msgtypes.setdefault(tx,   set()).add(msg_type)
    
    # now filter to only those that appear in >1 slot
    duplicates = {}
    for tx, occ in tx_occurrences.items():
        if len(occ) > 1:
            duplicates[tx] = {
                'occurrences': occ,
                'msg_types':   tx_msgtypes.get(tx, set())
            }
    return duplicates

# --- example usage ---
duplicates = analyze_transaction_matches_with_msgs(df)

print("Transactions that appear in more than one slot, with their first‐3‐word log types:")
for tx, info in duplicates.items():
    occ       = sorted(info['occurrences'])
    msg_types = sorted(info['msg_types'])
    print(f"- {tx}:")
    print(f"    slots:    {occ}")
    print(f"    messages: {msg_types}")


Transactions that appear in more than one slot, with their first‐3‐word log types:
- 8dd7893e:
    slots:    [('GA23HTSEJHCB54HAJW7VP7IWCXIEE474U2UMWYMEKZWFYAVEZBVBMZNN', 1), ('GA2LBNNBXPLJIGRGGL3OXZ2BFRTZJB5AEP7NTDIHPSMDGUFLHCCJBF6O', 1), ('GA2PU4UGMLSFUXGZATHPTDXXX7FOHBAQC57RSJCQUN72WFKTD6CEPQSF', 1), ('GA2SFCVSNYHL7C5YHR3DFIC75JFKBY4T7AH6A7HZRTGJFVAC25GNEIUZ', 1), ('GA3FLRTZLNMBXCQ2GG4W2CO2WXWGDDROCD3KVD5QYMYB5NXBUYMO2QXT', 1), ('GA3SCGZAMPXNGUULWFUOCMGXRDPLGIMWH3ZSDQFCNEZATIJC77ZW7Z3Y', 1), ('GA4Y3HGTGHSX4XASTABDUM6UIJHTYY6DBODESEKVCYPLOTRMMR2UB5K7', 1), ('GA5A7OIAFB4TCVJNPJRAQKBDAAIAC6EKCTNXVGF3F2KKFKDFOEE4DHV4', 1), ('GA5IMBV5AMJ6VAORQ6XOUNDMEMAS34DSKL5O6RSRWL6LR6F7EAZY5MB4', 1), ('GA5STBMV6QDXFDGD62MEHLLHZTPDI77U3PFOD2SELU5RJDHQWBR5NNK7', 1), ('GA7DV63PBUUWNUFAF4GAZVXU2OZMYRATDLKTC7VTCG7AU4XUPN5VRX4A', 1), ('GA7MEA44DK4WLPF4452CIS24ZOVLDVJN7Q3B4TK44MQXB6H6TKZIAPMC', 1), ('GA7TEPCBDQKI7JQLQ34ZURRMK44DVYCIGVXQQWNSWAEQR6KB4FMCBT7J', 1), ('GAAV2GCVFLNN522ORUYFV33E76VPC22E72S75AQ6MBR

In [7]:
def analyze_node_repeated_txs_with_msgs(df):
    """
    For each node, find transactions that are finalized
    in more than one slot.  For each such tx, record the
    slot *and* the first‑3‑words of the log message that
    reported it in that slot.
    
    Returns a dict:
      node -> {
        tx_hash -> [ (slot, msg_type), ... ]
      }
    """
    # intermediate store: node -> tx -> list of (slot,msg_type)
    temp = {}
    
    for _, row in df.iterrows():
        node = row['node name']
        slot = row['Slot']
        txs  = row['Finalised transactions']
        msg  = row['Externalize message']
        # first three words of the message
        msg_type = " ".join(msg.split()[:50])
        
        # initialize per‐node
        if node not in temp:
            temp[node] = {}
        
        for tx in txs:
            temp[node].setdefault(tx, []).append((slot, msg_type))
    
    # now filter to only those txes that appear in >1 distinct slot for each node
    result = {}
    for node, tx_dict in temp.items():
        repeated = {}
        for tx, slot_msgs in tx_dict.items():
            # collect distinct slots
            slots = { sm[0] for sm in slot_msgs }
            if len(slots) > 1:
                # keep all occurrences (slot,msg_type)
                repeated[tx] = slot_msgs
        if repeated:
            result[node] = repeated
    
    return result

# --- example usage ---
multi = analyze_node_repeated_txs_with_msgs(df)

for node, tx_info in multi.items():
    print(f"\nNode {node}:")
    for tx, occurrences in tx_info.items():
        print(f"  Transaction {tx!r} is in multiple slots:")
        for slot, msg_type in sorted(occurrences):
            print(f"    Slot {slot:>2} via “{msg_type}”")



Node GA5IMBV5AMJ6VAORQ6XOUNDMEMAS34DSKL5O6RSRWL6LR6F7EAZY5MB4:
  Transaction '13c9cffa' is in multiple slots:
    Slot  4 via “6.45 - NODE - INFO - Node GA5IMBV5AMJ6VAORQ6XOUNDMEMAS34DSKL5O6RSRWL6LR6F7EAZY5MB4 appended SCPExternalize message for slot 4 to its storage and state, message = SCPExternalize(ballot=SCPBallot(counter=17, value=[Value, hash = 8701179770744062840, state = State.init, transactions = {[Transaction 849a51a9 time = 3.8554], [Transaction 9f79b7cf time = 3.1825], [Transaction 5bafe6e9 time = 4.2076], [Transaction 6162b535 time =”
    Slot  5 via “8.58 - Node GA5IMBV5AMJ6VAORQ6XOUNDMEMAS34DSKL5O6RSRWL6LR6F7EAZY5MB4 adopting externalized value for slot 5: [Value, hash = -2605369243533702099, state = State.init, transactions = {[Transaction 3ddb18e5 time = 1.8666], [Transaction 1e09e4b4 time = 3.6969], [Transaction 1d7591a4 time = 6.0239], [Transaction 50b71666 time = 1.7645], [Transaction 46d0e178 time = 5.4685], [Transaction 2060e710 time = 4.4635], [Transaction”


Analyse duplicates within transactions

In [8]:
def check_duplicates_within_slots(df):
    duplicates_within = []

    for idx, row in df.iterrows():
        node = row['node name']
        slot = row['Slot']
        tx_list = list(row['Finalised transactions'])
        seen = set()
        duplicates = set()

        for tx in tx_list:
            if tx in seen:
                duplicates.add(tx)
            else:
                seen.add(tx)

        if duplicates:
            duplicates_within.append({
                'node': node,
                'slot': slot,
                'duplicates': list(duplicates)
            })

    return duplicates_within

duplicates_within_slots = check_duplicates_within_slots(df)
print("Duplicates within each transaction set per slot:")
for entry in duplicates_within_slots:
    print(f"Node: {entry['node']}, Slot: {entry['slot']}, Duplicates: {entry['duplicates']}")


Duplicates within each transaction set per slot:


ADD INTERLEDGER CHECKS

In [9]:
def calculate_inter_ledger_agreement_time(df):
    df = df.sort_values(by='Timestamp of finalisation')
    time_diffs = df['Timestamp of finalisation'].diff().dropna()
    
    return time_diffs.mean()

avg_time = calculate_inter_ledger_agreement_time(df)
print(f"Average Inter-Ledger Agreement Time: {avg_time}")

Average Inter-Ledger Agreement Time: 0.011870229007633588


In [10]:
final_experiment_df = df[[
    "sequence number",
    "Timestamp of finalisation",
    "No. of finalised transactions",
    "no. of transactions not finalised"
]]

display(final_experiment_df)

KeyError: "['sequence number'] not in index"

In [None]:
avg_difference = (final_experiment_df["no. of transactions not finalised"] - final_experiment_df["No. of finalised transactions"]).mean()

print(f"Average difference: {avg_difference}")

In [None]:
avg_finalised = final_experiment_df["No. of finalised transactions"].mean()
avg_total = (final_experiment_df["No. of finalised transactions"] + 
             final_experiment_df["no. of transactions not finalised"]).mean()

finalised_percentage = (avg_finalised / avg_total) * 100 if avg_total != 0 else 0

print(f"Percentage of finalised transactions vs total: {finalised_percentage:.2f}%")