In [None]:
#Resolve the required imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # To visualize
from tabulate import tabulate #To create pretty tables
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [None]:

#read in the data
base_directory = "/home/jeanpierre/LibraMetrics/containersMetricsFiles/2020_07_24__01_40_18/"

files = [
    "merged/jp_mempool_process_incoming_transactions.csv",
    "merged/jp_consensus_process_proposal.csv",
    "merged/jp_consensus_process_new_round.csv",
]

# One frame per metrics file, in the same order as `files`.
# Column names are supplied here, so no header row is read from the CSVs.
data_frames = [
    pd.read_csv(base_directory + file_name, names=["txns", "duration"])
    for file_name in files
]

In [None]:
#Split the "jp_consensus_process_proposal.csv" into two parts,
#as the behaviour when #txns==0 and #txns>0 differs significantly
def split(arr, cond):
    """Partition `arr` with a boolean mask.

    Parameters:
        arr: object supporting boolean-mask indexing and `.copy()`
            (e.g. a pandas DataFrame/Series or a numpy array).
        cond: boolean mask aligned with `arr`.

    Returns:
        A tuple `(matching, rest)`: the entries where `cond` is true and the
        entries where it is false.

    Both parts are returned as independent copies so that callers can modify
    them in place without pandas flagging the write as a possible write to a
    slice of `arr` (SettingWithCopyWarning).
    """
    return arr[cond].copy(), arr[~cond].copy()

# `left` holds the proposals with zero transactions, `right` the remaining ones.
proposal_frame = data_frames[1]
left, right = split(proposal_frame, proposal_frame["txns"] == 0)

# Register the non-empty proposals as an additional data set with its own title.
data_frames += [right]
files += ["jp_consensus_process_proposal.csv WHERE #txns>0"]

In [None]:
#convert duration from microseconds to milliseconds
# NOTE: this rescales the frames in place, so running the cell twice scales twice.
for frame in [*data_frames, left]:
    frame["duration"] *= 0.001

In [None]:
#Remove outliers that are >{std_from_mean}*std from the mean of their "txns" group
std_from_mean = 3

mean_array = []  # per data set: mean duration for every txns value
std_array = []   # per data set: standard deviation of duration for every txns value
res_array = []   # per data set: the rows that survived the outlier filter

for frame in data_frames:
    grouped = frame.groupby("txns")

    mean = grouped.mean().reset_index()
    mean_array.append(mean)

    std = grouped.std().reset_index()
    std_array.append(std)

    std_mean = pd.merge(std, mean, on='txns', how='inner')
    std_mean.columns = ["txns", "std", "mean"]

    # Attach the group statistics to every row.
    # Resulting column order: txns, std, mean, duration.
    res = pd.merge(std_mean, frame, on='txns', how='inner')

    within_limit = np.abs(res["duration"] - res["mean"]) <= (std_from_mean * res["std"])
    # A group with a single observation has an undefined (NaN) standard deviation.
    # Comparing against NaN is always False, which used to discard such rows as
    # "outliers"; keep them instead, as there is nothing to compare them against.
    undefined_std = res["std"].isna()
    res = res[within_limit | undefined_std]
    res_array.append(res)

    print("Removed " + str(len(frame) - len(res)) + " outliers out of " + str(len(frame)) + " entries")

In [None]:

#Perform linear regression with R-squared score with pyplot
for frame_title, filtered, group_mean, group_std in zip(files, res_array, mean_array, std_array):
    # Select the columns by name rather than by position, so the fit does not
    # silently use the wrong column if the frame layout ever changes.
    # scikit-learn expects 2-D inputs, hence the reshape to a single column.
    X = filtered["txns"].values.reshape(-1, 1)
    Y = filtered["duration"].values.reshape(-1, 1)

    linear_regressor = LinearRegression()
    linear_regressor.fit(X, Y)
    Y_pred = linear_regressor.predict(X)
    r_squared = r2_score(Y, Y_pred)

    fig = plt.figure(figsize=(16,8))
    fig.suptitle(frame_title, fontsize=16)
    ax = fig.add_subplot()
    ax.set_xlabel('#Transactions')
    ax.set_ylabel('Duration (milliseconds)')

    ax.scatter(X, Y, marker='.', s=1, label='Data point')
    ax.plot(X, Y_pred, color='red', label='Linear regression model')
    ax.plot(group_mean["txns"], group_mean["duration"], label="Line through sample means")
    ax.plot(group_std["txns"], group_std["duration"], label="Standard Deviation")

    # Axes-relative coordinates: the score sits near the top-left corner of the plot.
    ax.text(0.1, 0.8, "$R^2={0:.3f}$".format(r_squared), verticalalignment='bottom', horizontalalignment='right',
        transform=ax.transAxes)
    # Attach the legend to this figure's axes instead of relying on pyplot's "current axes".
    ax.legend(loc='upper left')

In [None]:
#Linear regression plot with Seaborn, showing a 95% confidence interval
for frame_title, filtered, group_mean in zip(files, res_array, mean_array):
    # Upper panel: regression fit; lower (smaller) panel: residuals of that fit.
    fig, (ax_fit, ax_resid) = plt.subplots(
        nrows=2, figsize=(16,12), gridspec_kw={'height_ratios':[3,1], 'hspace':0.1}
    )

    sns.regplot(x='txns', y='duration', data=filtered, ax=ax_fit, ci=95, label='Data point',
                line_kws={"color": "red"}, scatter_kws={'s':1})
    # The residual scatter needs a label of its own: without one the legend call
    # below has no labelled artist to show.
    sns.residplot(x='txns', y='duration', data=filtered, ax=ax_resid, label='Residual', scatter_kws={'s':1})
    ax_fit.plot(group_mean["txns"], group_mean["duration"], label="Line through sample means")

    ax_fit.set_title(frame_title, fontsize=16)
    ax_fit.set_xlabel("")
    ax_fit.set_ylabel("Duration (milliseconds)")
    ax_fit.set_xlim(left=0)
    ax_fit.legend(loc='upper left')

    ax_resid.set_xlabel("#Transactions")
    ax_resid.set_ylabel("residuals")
    # Share the horizontal range of the upper panel so both panels line up.
    ax_resid.set_xlim(left=0, right=ax_fit.get_xlim()[1])
    ax_resid.legend(loc='upper left')

In [None]:
def create_four_dist_plots(title, data):
    """Draw four views of one sample in a 2x2 grid of panels.

    The panels are, row by row: distribution plot, boxplot, boxenplot and
    violinplot. Every panel gets the x-axis label "Duration (milliseconds)".

    Parameters:
        title: text used as the title of the whole figure.
        data: the sample to visualise (the notebook passes a pandas Series).

    Returns:
        None. The figure is created through pyplot and left for the notebook
        to render.
    """
    fig, ax = plt.subplots(2, 2, figsize=(16,8))
    fig.subplots_adjust(hspace=0.5)

    # NOTE(review): distplot is deprecated in recent seaborn releases; confirm the
    # installed version before swapping it for histplot/displot.
    sns.distplot(data, ax=ax[0, 0])
    # Pass the sample explicitly as `x`: a positional first argument is read as
    # `data` by newer seaborn releases, which changes how the sample is drawn.
    sns.boxplot(x=data, ax=ax[0, 1], fliersize=1, linewidth=1)
    sns.boxenplot(x=data, ax=ax[1, 0])
    sns.violinplot(x=data, ax=ax[1, 1], inner='quartile')

    fig.suptitle(title, fontsize=16)
    panel_titles = ("Distribution", "Boxplot", "Boxenplot", "Violinplot")
    # ax.flat walks the grid row by row, matching the order of panel_titles.
    for panel, panel_title in zip(ax.flat, panel_titles):
        panel.set_title(panel_title)
        panel.set_xlabel("Duration (milliseconds)")

In [None]:
# Durations of the proposals that carried no transactions.
create_four_dist_plots(
    title="jp_consensus_process_proposal.csv WHERE #txns==0",
    data=left["duration"],
)

In [None]:
def get_counts_per_timewindow(X, nr_bins):
    """Compute the event rate in `nr_bins` equally wide windows over [0, max(X)].

    Parameters:
        X: event positions (e.g. times) as a pandas Series, numpy array or
            list of numbers. NaN entries are ignored.
        nr_bins: number of windows to divide the range [0, max(X)] into.

    Returns:
        A tuple `(X_res, Y_res)` of two lists with one entry per window:
        `X_res` holds the centre of each window and `Y_res` the number of
        events in that window divided by the window width. Both lists are
        empty when `nr_bins` is smaller than 1, when `X` has no non-NaN
        entries, or when the largest value is not positive.
    """
    values = np.asarray(X, dtype=float)
    values = values[~np.isnan(values)]
    if nr_bins < 1 or values.size == 0:
        return [], []

    end_time = values.max()
    if end_time <= 0:
        # A zero-width range cannot be divided into windows.
        return [], []

    part = end_time / nr_bins
    last_bin = nr_bins - 1

    X_res = []
    Y_res = []
    for x in range(nr_bins):
        lower = part * x
        X_res.append(lower + (0.5 * part))

        in_window = values >= lower
        if x < last_bin:
            in_window &= values < (part * (x + 1))
        # The last window has no upper bound so that the largest value itself
        # (which sits exactly on the right edge) is counted as well.
        Y_res.append(int(np.count_nonzero(in_window)) / part)
    return X_res, Y_res

In [None]:
#Load in both client txns start time and end times
ac_dataframe = pd.read_csv(base_directory + "merged/jp_ac_client_transaction.csv", names=["address", "sequence_number", "timestamp"])

#Parse blockstore into a dataframe
# Each line is read as: a timestamp, followed by comma-separated
# "address;sequence_number" entries; double quotes are stripped.
dictionary = {"address":[], "sequence_number":[], "timestamp":[]}
with open(base_directory + "container0/jp_blockstore_process_block.csv") as f:
    for raw_line in f:
        parts = raw_line.rstrip("\r\n").replace('"', '').split(",")

        timestamp = parts[0]
        for entry in parts[1:]:
            if not entry:
                # Nothing between two commas (or after a trailing comma).
                continue
            acc_seq = entry.split(";")
            dictionary["address"].append(acc_seq[0])
            dictionary["sequence_number"].append(acc_seq[1])
            dictionary["timestamp"].append(timestamp)

blockstore_dataframe = pd.DataFrame(dictionary)
blockstore_dataframe[['sequence_number', 'timestamp']] = blockstore_dataframe[['sequence_number', 'timestamp']].apply(pd.to_numeric)

#left join to add commit times
# Transactions without a matching blockstore entry keep NaN as commit time.
result = pd.merge(ac_dataframe, blockstore_dataframe, how='left', on=['address', 'sequence_number'], suffixes=['_submit', '_commit'])

#calculate the txn_latency and the commit time relative to the first submit, then add both to the result dataframe
txn_latency = result['timestamp_commit'] - result['timestamp_submit']
txn_rel_commit_time = (result['timestamp_commit'] - result['timestamp_submit'].min())/1000
result['txn_latency(ms)'] = txn_latency
result['txn_rel_commit_time'] = txn_rel_commit_time

result = result.sort_values(by=['timestamp_submit'], ignore_index=True)

# Series.max() skips NaN, whereas the builtin max() gives order-dependent
# results once uncommitted transactions (NaN) are present.
total_time = result['txn_rel_commit_time'].max()
# Only transactions that were actually committed count towards the throughput.
committed_txns = result['timestamp_commit'].notna().sum()
avg_throughput = committed_txns / total_time
print("Average Throughput: " + str(avg_throughput))

#Create a plot with 2 Y-axis, txn_latency and txn_throughput
fig, ax1 = plt.subplots(figsize=(16,8))
ax2 = ax1.twinx()
# The latency scatter for the left axis is currently disabled:
#sns.scatterplot(x=result['txn_rel_commit_time'], y=result['txn_latency(ms)'], label='Data point', size=1, ax=ax1)

# Throughput is averaged over windows of roughly this many units of the time axis.
window_length = 5
# Derive the number of windows from this cell's own data (not from a variable
# left behind by an earlier cell) and always use at least one window.
nr_windows = max(1, int(total_time / window_length))
X, Y = get_counts_per_timewindow(result['txn_rel_commit_time'], nr_windows)
ax2.plot(X, Y, color='red', label='Throughput (txn/s)')
ax2.set_ylim(bottom=0, top=max(Y)+10)

ax1.set_title("Txn Latency and Throughput", fontsize=16)
ax1.set_xlabel('Time (s)')
ax1.set_ylabel('Txn commit time (ms)', color='blue')
ax2.set_ylabel('Throughput (txn/s)', color='red')
# ax1 only has a legend when something labelled was drawn on it;
# get_legend() returns None otherwise.
latency_legend = ax1.get_legend()
if latency_legend is not None:
    latency_legend.remove()
ax2.legend();

In [None]:
#Visualize txn_latency in distributions
latency_ms = result['txn_latency(ms)']
create_four_dist_plots("Transaction Latency", latency_ms)
# Summary statistics as a one-column table (rendered as the cell output).
latency_ms.to_frame().describe()