In [247]:
!pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [248]:
import re
import pandas as pd
from collections import defaultdict

def get_transaction_count(line):
    pattern = r"transactions = \{([^}]+)\}"
    match = re.search(pattern, line)
    if match:
        return set(re.findall(r"Transaction ([a-fA-F0-9]+)", match.group(1)))
    return set()

def get_timestamp(line):
    pattern = r"^\d+\.\d+"
    match = re.match(pattern, line)
    return float(match.group(0)) if match else None

def get_node_name(line):
    pattern = r"Node ([A-Z0-9]+)"
    match = re.search(pattern, line)
    return match.group(1) if match else None

def count_unique_mempool_transactions(file_path, node_number):
    unique_transactions = set()
    with open(file_path, 'r') as file:
        for line in file:
            if f"Node {node_number}" in line and "from mempool" in line:
                unique_transactions.update(re.findall(r"Transaction ([a-fA-F0-9]+)", line))
    return len(unique_transactions)


def process_log_lines(file_path):
    node_data = defaultdict(lambda: {
        "Timestamp of finalisation": None, 
        "Finalised transactions": set(),  
        "Externalize messages": []
    })
    
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    for line in lines:
        if 'appended SCPExternalize message to its storage and state' not in line:
            continue
        
        node_name = get_node_name(line)
        timestamp = get_timestamp(line)
        transactions = get_transaction_count(line)
        
        if node_name:
            if node_data[node_name]["Timestamp of finalisation"] is None:
                node_data[node_name]["Timestamp of finalisation"] = timestamp
            node_data[node_name]["Finalised transactions"].update(transactions)
            node_data[node_name]["Externalize messages"].append(line.strip())
    
    df = pd.DataFrame.from_dict(node_data, orient='index')
    df.index.name = "sequence number"
    df = df.reset_index()
    
    df["No. of finalised transactions"] = df["Finalised transactions"].apply(len)
    
    df["total_transactions"] = df["sequence number"].apply(lambda node: count_unique_mempool_transactions(file_path, node))
    
    df["no. of transactions not finalised"] = df["total_transactions"] - df["No. of finalised transactions"]
    
    return df


In [249]:
file_path = 'src/simulator_events_log.txt'

df = process_log_lines(file_path)
df_sorted = df.sort_values(by='Timestamp of finalisation', ascending=True)

ADD INTERLEDGER CHECKS

In [250]:
def calculate_inter_ledger_agreement_time(df):
    df = df.sort_values(by='Timestamp of finalisation')
    time_diffs = df['Timestamp of finalisation'].diff().dropna()
    
    return time_diffs.mean()

avg_time = calculate_inter_ledger_agreement_time(df_sorted)
print(f"Average Inter-Ledger Agreement Time: {avg_time}")

Average Inter-Ledger Agreement Time: 1.2787499999999998


In [251]:
final_experiment_df = df_sorted[[
    "sequence number",
    "Timestamp of finalisation",
    "No. of finalised transactions",
    "no. of transactions not finalised"
]]

display(final_experiment_df)

Unnamed: 0,sequence number,Timestamp of finalisation,No. of finalised transactions,no. of transactions not finalised
0,21,8.06,12,35
1,38,13.72,3,45
2,18,14.25,16,24
3,30,19.89,8,39
4,9,20.28,3,45
5,32,20.63,2,51
6,56,21.28,3,45
7,28,26.02,1,50
8,25,29.45,6,40
9,33,30.04,5,43


In [252]:
avg_difference = (final_experiment_df["no. of transactions not finalised"] - final_experiment_df["No. of finalised transactions"]).mean()

print(f"Average difference: {avg_difference}")

Average difference: 28.816326530612244


In [253]:
avg_finalised = final_experiment_df["No. of finalised transactions"].mean()
avg_total = (final_experiment_df["No. of finalised transactions"] + 
             final_experiment_df["no. of transactions not finalised"]).mean()

finalised_percentage = (avg_finalised / avg_total) * 100 if avg_total != 0 else 0

print(f"Percentage of finalised transactions vs total: {finalised_percentage:.2f}%")

Percentage of finalised transactions vs total: 20.51%


# ADD CHECKS FOR FIST EXTERNALIZE

In [254]:
import re
import pandas as pd
from collections import defaultdict

def get_transaction_count(line):
    pattern = r"transactions = \{([^}]+)\}"
    match = re.search(pattern, line)
    if match:
        return set(re.findall(r"Transaction ([a-fA-F0-9]+)", match.group(1)))
    return set()

def get_timestamp(line):
    pattern = r"^\d+\.\d+"
    match = re.match(pattern, line)
    return float(match.group(0)) if match else None

def get_node_name(line):
    pattern = r"Node ([A-Za-z0-9]+)"
    match = re.search(pattern, line)
    return match.group(1) if match else None

def count_unique_mempool_transactions(file_path, node_number):
    unique_transactions = set()
    with open(file_path, 'r') as file:
        for line in file:
            if f"Node {node_number}" in line and "from mempool" in line:
                unique_transactions.update(re.findall(r"Transaction ([a-fA-F0-9]+)", line))
    return len(unique_transactions)

def process_log_lines(file_path):
    """
    Extracts the first SCPExternalize message per node and stores its relevant details.
    Ensures that once a node is added, it is never updated again.
    """
    node_data = {}

    with open(file_path, 'r') as file:
        for line in file:
            # Only consider lines that contain relevant externalization messages
            if ("appended SCPExternalize message to its storage and state" not in line and
                "adopting externalized value for slot" not in line):
                continue
            
            node_name = get_node_name(line)
            timestamp = get_timestamp(line)
            transactions = get_transaction_count(line)

            if node_name and node_name not in node_data:
                # If the node has never been recorded before, store its first occurrence
                node_data[node_name] = {
                    "Timestamp of finalisation": timestamp,
                    "Finalised transactions": transactions,
                    "Externalize message": line.strip()
                }
    
    # Convert to DataFrame
    df = pd.DataFrame.from_dict(node_data, orient='index')
    df.index.name = "node name"
    df = df.reset_index()

    # Count the number of finalized transactions per node
    df["No. of finalised transactions"] = df["Finalised transactions"].apply(len)

    # Compute total transactions for each node from mempool logs
    df["total_transactions"] = df["node name"].apply(lambda node: count_unique_mempool_transactions(file_path, node))

    # Calculate number of transactions not finalized
    df["no. of transactions not finalised"] = df["total_transactions"] - df["No. of finalised transactions"]

    return df

# Example usage:
# file_path = 'path/to/your/simulator_events_log.txt'
# df = process_log_lines(file_path)
# print(df)


In [255]:
file_path = 'src/simulator_events_log.txt'

df = process_log_lines(file_path)
df_sorted = df.sort_values(by='Timestamp of finalisation', ascending=True)

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)  # Prevent line wrapping
pd.set_option('display.colheader_justify', 'center')  # Center column headers
pd.set_option('display.float_format', '{:.4f}'.format)  # Format float values


display(df_sorted)


Unnamed: 0,node name,Timestamp of finalisation,Finalised transactions,Externalize message,No. of finalised transactions,total_transactions,no. of transactions not finalised
0,21,8.06,{2228996},8.06 - NODE - INFO - Node 21 appended SCPExter...,1,47,46
1,38,13.72,"{d66ce25c, 2228996, 258e61a1}",13.72 - NODE - INFO - Node 38 appended SCPExte...,3,48,45
2,18,14.25,{2228996},14.25 - NODE - INFO - Node 18 appended SCPExte...,1,40,39
3,30,19.89,"{2228996, 258e61a1}",19.89 - NODE - INFO - Node 30 appended SCPExte...,2,47,45
4,9,20.28,"{347fff85, 2228996, 258e61a1}",20.28 - NODE - INFO - Node 9 appended SCPExter...,3,48,45
5,32,20.63,"{2228996, 258e61a1}",20.63 - NODE - INFO - Node 32 appended SCPExte...,2,53,51
6,56,21.28,"{d66ce25c, 2228996, 258e61a1}",21.28 - NODE - INFO - Node 56 appended SCPExte...,3,48,45
7,28,26.02,{2228996},26.02 - NODE - INFO - Node 28 appended SCPExte...,1,51,50
8,25,29.45,"{d66ce25c, cacc09f7, 2228996, 347fff85, 258e61...",29.45 - NODE - INFO - Node 25 appended SCPExte...,6,46,40
9,33,30.04,"{d66ce25c, 2228996, 347fff85, 258e61a1, 24139ce1}",30.04 - NODE - INFO - Node 33 appended SCPExte...,5,48,43


In [256]:
import pandas as pd

def check_exact_matches(df):
    # Convert 'Finalised transactions' to sets
    def convert_to_set(val):
        if isinstance(val, str):
            return eval(val)  # Convert string representation to set
        elif isinstance(val, set):
            return val  # If it's already a set, return it
        else:
            return set()  # In case of any unexpected format
    
    # Convert the 'Finalised transactions' column to sets
    df['Finalised transactions'] = df['Finalised transactions'].apply(convert_to_set)
    
    # Create a dictionary to store matches
    matches = {}
    
    # Iterate through each row and compare 'Finalised transactions' with every other row
    for i, row_i in df.iterrows():
        node_i = row_i["node name"]
        transactions_i = row_i["Finalised transactions"]
        
        for j, row_j in df.iterrows():
            if i < j:  # Only compare each pair once
                node_j = row_j["node name"]
                transactions_j = row_j["Finalised transactions"]
                
                # Check if the transactions are exactly the same (size and content)
                if len(transactions_i) == len(transactions_j) and transactions_i == transactions_j:
                    if node_i not in matches:
                        matches[node_i] = []
                    matches[node_i].append(node_j)
    
    return matches

# Assuming df_sorted is your DataFrame
matches = check_exact_matches(df_sorted)

# Print the matches
for node, matched_nodes in matches.items():
    print(f"Node {node} has the exact same transactions as:")
    for matched_node in matched_nodes:
        print(f"  - {matched_node}")


Node 21 has the exact same transactions as:
  - 18
  - 28
  - 52
  - 5
  - 6
Node 38 has the exact same transactions as:
  - 56
  - 13
  - 39
Node 18 has the exact same transactions as:
  - 28
  - 52
  - 5
  - 6
Node 30 has the exact same transactions as:
  - 32
  - 0
  - 26
  - 16
Node 32 has the exact same transactions as:
  - 0
  - 26
  - 16
Node 56 has the exact same transactions as:
  - 13
  - 39
Node 28 has the exact same transactions as:
  - 52
  - 5
  - 6
Node 25 has the exact same transactions as:
  - 17
  - 45
  - 55
Node 48 has the exact same transactions as:
  - 57
  - 15
  - 24
Node 52 has the exact same transactions as:
  - 5
  - 6
Node 57 has the exact same transactions as:
  - 15
  - 24
Node 0 has the exact same transactions as:
  - 26
  - 16
Node 15 has the exact same transactions as:
  - 24
Node 5 has the exact same transactions as:
  - 6
Node 29 has the exact same transactions as:
  - 22
Node 17 has the exact same transactions as:
  - 45
  - 55
Node 45 has the exact 

In [257]:
def compute_matching_transactions(df):
    matching_counts = {}

    # Compare the transactions of each node with the other nodes
    for i, row_i in df.iterrows():
        node_i = row_i["node name"]
        transactions_i = row_i["Finalised transactions"]
        
        matching_transactions = {}

        for j, row_j in df.iterrows():
            if i != j:  # Don't compare the node with itself
                node_j = row_j["node name"]
                transactions_j = row_j["Finalised transactions"]
                
                # Find common transactions between node_i and node_j
                common_transactions = transactions_i.intersection(transactions_j)
                matching_transactions[node_j] = len(common_transactions)

        matching_counts[node_i] = matching_transactions

    return matching_counts

# Get the matching transaction counts
matching_counts = compute_matching_transactions(df)

# Display the results
for node, matches in matching_counts.items():
    print(f"Matching transactions for node {node}:")
    for other_node, count in matches.items():
        print(f"  - {other_node}: {count} matching transactions")

Matching transactions for node 21:
  - 38: 1 matching transactions
  - 18: 1 matching transactions
  - 30: 1 matching transactions
  - 9: 1 matching transactions
  - 32: 1 matching transactions
  - 56: 1 matching transactions
  - 28: 1 matching transactions
  - 25: 1 matching transactions
  - 33: 1 matching transactions
  - 48: 1 matching transactions
  - 53: 0 matching transactions
  - 52: 1 matching transactions
  - 57: 1 matching transactions
  - 0: 1 matching transactions
  - 15: 1 matching transactions
  - 2: 1 matching transactions
  - 43: 0 matching transactions
  - 5: 1 matching transactions
  - 36: 1 matching transactions
  - 44: 0 matching transactions
  - 29: 0 matching transactions
  - 17: 1 matching transactions
  - 47: 0 matching transactions
  - 49: 0 matching transactions
  - 7: 0 matching transactions
  - 27: 0 matching transactions
  - 45: 1 matching transactions
  - 13: 1 matching transactions
  - 37: 0 matching transactions
  - 41: 1 matching transactions
  - 51: 1 

## GIVEN MATCHING TRANSACTIONS, NOW FURTHER ANALYSIS USING QUORUM SETS AND THRESHOLD

QUORUM SETS AND THRESHOLD USED IN SIMULATOR FOR ROUND OF LUNHC

In [258]:
quorum_sets = {
                    "Alice": ["Bob", "Carol", "Dave"],
                    "Bob": ["Alice", "Carol", "Dave"],
                    "Carol": ["Alice", "Bob", "Dave"],
                    "Dave": ["Alice", "Bob", "Carol"],
                    "Elsie": ["Alice", "Bob", "Carol", "Dave"],
                    "Fred": ["Alice", "Bob", "Carol", "Dave"],
                    "Gwen": ["Alice", "Bob", "Carol", "Dave"],
                    "Hank": ["Alice", "Bob", "Carol", "Dave"],
                    "Inez": ["Elsie", "Fred", "Gwen", "Hank"],
                    "John": ["Elsie", "Fred", "Gwen", "Hank"]
}

quorum_thresholds = {
                    "Alice": 2, "Bob": 2, "Carol": 2, "Dave": 2,  # 2 out of 3 → 67%
                    "Elsie": 2, "Fred": 2, "Gwen": 2, "Hank": 2,  # 2 out of 4 → 50%
                    "Inez": 2, "John": 2  # 2 out of 4 → 50%
}
top_tier_nodes = ["Alice", "Bob", "Carol", "Dave"] # each depends on two of its neighbors for a quorum
middle_tier_nodes = ["Elsie", "Fred", "Gwen", "Hank"] # each depends on any two members of the top tier
bottom_tier_nodes = ["Inez", "John"] # each depends on any two members of the middle tier.


In [259]:
def compute_matching_transactions_by_tiers(df, quorum_sets, quorum_thresholds, top_tier_nodes, middle_tier_nodes, bottom_tier_nodes):
    matching_counts = {}

    # Iterate over the nodes in the DataFrame
    for i, row_i in df.iterrows():
        node_i = row_i["node name"]
        transactions_i = row_i["Finalised transactions"]

        matching_transactions = {
            "total": len(transactions_i),  # Total transactions for node_i
            "top_tier": {},
            "middle_tier": {},
            "bottom_tier": {}
        }

        # Compare node_i's transactions with others in their respective tiers
        for j, row_j in df.iterrows():
            if i != j:  # Don't compare the node with itself
                node_j = row_j["node name"]
                transactions_j = row_j["Finalised transactions"]

                # Find common transactions between node_i and node_j
                common_transactions = transactions_i.intersection(transactions_j)
                match_count = len(common_transactions)

                # Determine the tier for node_j based on its position in the quorum_sets
                if node_j in quorum_sets[node_i]:
                    if node_j in top_tier_nodes:
                        matching_transactions["top_tier"][node_j] = match_count
                    elif node_j in middle_tier_nodes:
                        matching_transactions["middle_tier"][node_j] = match_count
                    elif node_j in bottom_tier_nodes:
                        matching_transactions["bottom_tier"][node_j] = match_count

        matching_counts[node_i] = matching_transactions

    return matching_counts

In [260]:
matching_counts = compute_matching_transactions_by_tiers(df, quorum_sets, quorum_thresholds, top_tier_nodes, middle_tier_nodes, bottom_tier_nodes)

# Display the results
for node, matches in matching_counts.items():
    if node in top_tier_nodes:
        tier = "Top-tier"
    elif node in middle_tier_nodes:
        tier = "Middle-tier"
    elif node in bottom_tier_nodes:
        tier = "Bottom-tier"
    else:
        tier = "Unknown-tier"
    # Print the node, its tier, and total transactions before showing matches
    print(f"Matching transactions for node {node} in tier {tier}(Total: {matches['total']} transactions):")
    
    print(f"  - Top-tier matches:")
    for top_node, count in matches["top_tier"].items():
        print(f"    * {top_node}: {count} matching transactions")
    
    print(f"  - Middle-tier matches:")
    for middle_node, count in matches["middle_tier"].items():
        print(f"    * {middle_node}: {count} matching transactions")
    
    print(f"  - Bottom-tier matches:")
    for bottom_node, count in matches["bottom_tier"].items():
        print(f"    * {bottom_node}: {count} matching transactions")
    
    print()  # For readability between node results

KeyError: '21'

### DELAYS IN EXTERNALISATION FOR LEAD NODES VS OUTER NODES

In [None]:
import pandas as pd

def analyze_finalization_delay(df, top_tier, middle_tier, bottom_tier):
    """
    Analyzes the delay in finalization times across the three-tier structure.

    Args:
        df (pd.DataFrame): The DataFrame containing node names and finalization timestamps.
        top_tier (list): List of top-tier node names.
        middle_tier (list): List of middle-tier node names.
        bottom_tier (list): List of bottom-tier node names.

    Returns:
        dict: Contains average finalization times for each tier and delays between them.
    """
    top_times = df[df["node name"].isin(top_tier)]["Timestamp of finalisation"]
    middle_times = df[df["node name"].isin(middle_tier)]["Timestamp of finalisation"]
    bottom_times = df[df["node name"].isin(bottom_tier)]["Timestamp of finalisation"]

    top_avg = top_times.mean()
    middle_avg = middle_times.mean()
    bottom_avg = bottom_times.mean()

    delay_top_to_middle = middle_avg - top_avg
    delay_middle_to_bottom = bottom_avg - middle_avg
    delay_top_to_bottom = bottom_avg - top_avg

    return {
        "top_avg_finalization_time": top_avg,
        "middle_avg_finalization_time": middle_avg,
        "bottom_avg_finalization_time": bottom_avg,
        "delay_top_to_middle": delay_top_to_middle,
        "delay_middle_to_bottom": delay_middle_to_bottom,
        "delay_top_to_bottom": delay_top_to_bottom
    }


In [None]:
# Define groups
top_tier = ["Alice", "Bob", "Carol", "Dave"] # each depends on two of its neighbors for a quorum
middle_tier = ["Elsie", "Fred", "Gwen", "Hank"] # each depends on any two members of the top tier
bottom_tier = ["Inez", "John"] # each depends on any two members of the middle tier.

# Run analyses
finalization_delay = analyze_finalization_delay(df, top_tier, middle_tier, bottom_tier)

# Print results
print("Finalization Delay Analysis:", finalization_delay)


### TRANSACTION PROPAGATION IN NODES WITH NO LINKS (OR WEAK LINKS THROUGH PEERS)

In [None]:
from itertools import combinations
import pandas as pd

def transaction_overlap(df, group1, group2, group3):
    """
    Measures the percentage of matching transactions between three groups of nodes.
    Computes overlap separately for (g1-g2), (g1-g3), (g2-g3), and all three together.

    Args:
        df (pd.DataFrame): The DataFrame containing nodes and finalized transactions.
        group1 (list): First group of nodes (e.g., top tier).
        group2 (list): Second group of nodes (e.g., middle tier).
        group3 (list): Third group of nodes (e.g., bottom tier).

    Returns:
        dict: Contains average transaction match percentages for each pair of groups
              and overall across all three groups.
    """
    def compute_overlap(nodes1, nodes2):
        """Helper function to compute overlap between two sets of nodes."""
        total_pairs = 0
        total_match_percentage = 0

        for (node1, txs1), (node2, txs2) in combinations(
            zip(df["node name"], df["Finalised transactions"]), 2
        ):
             # cjecl that one noed is in group 'nodes1' and the other is in 'nodes2', not that they are part of the same group
            if node1 in nodes1 and node2 in nodes2:
                total_pairs += 1
                match_count = len(set(txs1).intersection(set(txs2))) # compare each matching transaction
                total_transactions = len(set(txs1).union(set(txs2))) # total transactions in both externalised values
                match_percentage = (match_count / total_transactions) if total_transactions else 0
                total_match_percentage += match_percentage

        return total_match_percentage / total_pairs if total_pairs else 0

    # Compute pairwise overlaps
    g1_g2_overlap = compute_overlap(group1, group2)
    g1_g3_overlap = compute_overlap(group1, group3)
    g2_g3_overlap = compute_overlap(group2, group3)

    # Compute overall overlap across all three groups
    all_groups_overlap = compute_overlap(group1 + group2, group3)

    return {
        "Top-Middle Overlap": g1_g2_overlap,
        "Top-Bottom Overlap": g1_g3_overlap,
        "Middle-Bottom Overlap": g2_g3_overlap,
        "Overall Three-Group Overlap": all_groups_overlap
    }


In [None]:
propagation_analysis = transaction_overlap(df, top_tier, middle_tier, bottom_tier)
print("Transaction Propagation Analysis:", propagation_analysis)

### EFFECT OF LOWER QUORUM THRESHOLD

In [None]:
def analyze_threshold_effect(df, quorum_thresholds):
    """
    Analyzes how different quorum thresholds affect transaction agreement.

    Args:
        df (pd.DataFrame): The DataFrame containing nodes and finalized transactions.
        quorum_thresholds (dict): A dictionary mapping nodes to their quorum thresholds.

    Returns:
        dict: Contains statistics on transaction agreement and threshold impact.
    """
    unique_transactions_per_node = df["Finalised transactions"].apply(len)
    avg_transactions = unique_transactions_per_node.mean()
    
    threshold_agreement = {
        node: {
            "threshold": quorum_thresholds[node],
            "transactions_finalized": df[df["node name"] == node]["No. of finalised transactions"].values[0]
        }
        for node in quorum_thresholds
    }
    threshold_df = pd.DataFrame(threshold_agreement)

    return  avg_transactions, threshold_df


In [None]:
avg_txs, threshold_df = analyze_threshold_effect(df, quorum_thresholds)
print("Threshold Effect Analysis:")
print("Average_transactions_finalized", avg_txs)
display(threshold_df)

# CONCLUSIONS FROM DATA ANALYSIS

Finalization Delays:

The Top tier finalized transactions the earliest on average, but the delay between Top-Middle and Middle-Bottom is relatively small.
Delay between Top and Bottom being negative (-7.7) suggests that nodes in the Top tier finalize faster than those in the Bottom tier.
The asynchronous behavior of the simulator means there is no global synchronization of node actions, which likely results in these delays. Synchronization points during the real SCP implementation are crucial to maintain consistency across nodes at different tiers, but the simulator misses this aspect.


Transaction Overlap:

The low transaction overlap indicates that transactions are not propagated efficiently between the tiers. In a real-world SCP implementation, once a value is finalized in one tier, it should propagate to the other tiers effectively to ensure a quicker consensus process. The simulator’s lack of synchronization likely hinders this propagation.


Effect of Quorum Thresholds:

The large variation in finalized transactions (1 to 61) indicates that quorum thresholds are having inconsistent effects across nodes. Some nodes seem to finalize transactions quicker than others, but overall, the average finalized transactions (26) indicate some nodes are not able to meet the quorum efficiently, possibly because lower thresholds result in early externalization without proper coordination