In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sqlite3 as sq
from datetime import datetime, timedelta, date
from IPython.display import Markdown, display, HTML
import matplotlib.ticker as mtick

In [None]:
# Protocol labels considered in this notebook
protocols = [
    'QUIC',
    'TLS',
    'HTTPS',
    'TCP',
    'UDP',
]
# Default font size for every matplotlib figure created below
plt.rcParams['font.size'] = 18

In [None]:
# Load the measurement dataset from its parquet file into a DataFrame
df = pd.read_parquet(
    'web.performance/web.performance.parquet.gzip',
)

In [None]:
# Fixed ordering of the vantage points, used when iterating over them
vp_order = [
    'US West',
    'South America East',
    'Europe Central',
    'Asia Pacific Southeast',
    'Asia Pacific Northeast',
    'Africa South',
]

In [None]:
# Derived columns, each computed element-wise from existing columns of `df`.

# loadEventStart relative to the transferred / decoded payload size
df['timeByTransferSize'] = df['loadEventStart'].div(df['transferSize'])
df['timeByDecodedSize'] = df['loadEventStart'].div(df['decodedBodySize'])

# gaps between the last lookup end, first contentful paint and load event start
df['difLookupFcp'] = df['firstContentfulPaint'].sub(df['max_lookup_end'])
df['difFcpEnd'] = df['loadEventStart'].sub(df['firstContentfulPaint'])
df['difLookupEnd'] = df['loadEventStart'].sub(df['elapsed'])

# time between sending the request and the start of the response
df['serverRTT'] = df['responseStart'].sub(df['requestStart'])

In [None]:
# Rank the domains by their mean number of queries (ascending).
# `df_new` keeps the per-domain means, `domain_order` the resulting domain ranking.
df_new = (
    df[['domain', 'queries']]
    .groupby('domain')
    .mean()
    .sort_values('queries')
)
domain_order = df_new.index.to_list()

In [None]:
# Difference between DoUDP and DoQ/DoT/DoH/DoTCP
#
# For every (vantage point, server, domain) combination the median FCP
# (firstContentfulPaint) and PLT (loadEventStart) of each protocol is expressed
# as a percentage increase over the median of the UDP measurements.
diff_dict2 = {'domain': [], 'vantagePoint': [], 'server': [], 'QUIC-PLT': [], 'QUIC-FCP': [], 'TLS-PLT': [], 'TLS-FCP': [],
             'HTTPS-PLT': [], 'HTTPS-FCP': [], 'TCP-PLT': [], 'TCP-FCP': [], 'RTT_over_10ms': []}

for vp in vp_order:
    df_vp = df[df['vantagePoint'] == vp]
    for server in df_vp['server'].unique():
        df_server = df_vp[df_vp['server'] == server]
        for domain in domain_order:
            # copy, because the timestamp column is overwritten below
            df_domain = df_server[df_server['domain'] == domain].copy()

            start_date = date(2022, 4, 18)
            end_date = date(2022, 4, 24)
            delta = timedelta(days=2)
            df_domain['timestamp'] = pd.to_datetime(df_domain['timestamp']).dt.date

            # we remove iterations with faulty measurements
            while start_date <= end_date:
                df_timeframe = df_domain[(df_domain['timestamp'] >= start_date) & (df_domain['timestamp'] < (start_date+delta))]
                df_timeframe = df_timeframe[df_timeframe['protocol'].isin(['UDP', 'QUIC', 'TLS', 'HTTPS', 'TCP'])]
                # NOTE(review): the check inspects loadEventEnd while the medians
                # below use loadEventStart -- confirm this is intended.
                timings = df_timeframe[['firstContentfulPaint', 'loadEventEnd']]

                # A window is faulty if any timing is 0 or missing. Missing values
                # must be detected with isna(): `np.nan in some_list` is never true
                # for NaNs coming out of `.tolist()`, because NaN != NaN.
                if timings.eq(0.0).to_numpy().any() or timings.isna().to_numpy().any():
                    df_domain = df_domain[(df_domain['timestamp'] < start_date) | (df_domain['timestamp'] >= (start_date+delta))]
                start_date += delta

            df_udp = df_domain[df_domain['protocol'] == 'UDP']
            if not df_udp.empty:
                # udp baseline
                median_plt_udp = df_udp['loadEventStart'].median()
                median_fcp_udp = df_udp['firstContentfulPaint'].median()
                median_rtt = df_udp['avg_lookup_end'].median()
                # add PLT and FCP increase
                for protocol in ['QUIC', 'TLS', 'HTTPS', 'TCP']:
                    df_protocol = df_domain[df_domain['protocol'] == protocol]
                    median_plt_protocol = df_protocol['loadEventStart'].median()
                    median_fcp_protocol = df_protocol['firstContentfulPaint'].median()

                    # percentage increase relative to the UDP median
                    increase_plt = (median_plt_protocol/(median_plt_udp/100))-100
                    increase_fcp = (median_fcp_protocol/(median_fcp_udp/100))-100

                    diff_dict2[f"{protocol}-PLT"].append(increase_plt)
                    diff_dict2[f"{protocol}-FCP"].append(increase_fcp)

                # add vantage point, server and domain information
                diff_dict2['vantagePoint'].append(vp)
                diff_dict2['domain'].append(domain)
                diff_dict2['server'].append(server)
                # add information if resolver rtt is over 10 ms
                diff_dict2['RTT_over_10ms'].append(median_rtt > 10)

# one row per (vantage point, server, domain); columns renamed to the Do* labels
diff_df2 = pd.DataFrame.from_dict(diff_dict2).rename(
    columns={"QUIC-PLT": "DoQ-PLT", "QUIC-FCP": "DoQ-FCP", "TLS-PLT": "DoT-PLT", "TLS-FCP": "DoT-FCP",
             "HTTPS-PLT": "DoH-PLT", "HTTPS-FCP": "DoH-FCP", "TCP-PLT": "DoTCP-PLT", "TCP-FCP": "DoTCP-FCP"})

In [None]:
# Difference between DoQ and DoH/DoT/DoUDP
#
# For every (vantage point, server, domain) combination the median FCP
# (firstContentfulPaint) and PLT (loadEventStart) of each protocol is expressed
# as a percentage increase over the median of the QUIC (DoQ) measurements.
diff_dict3 = {'domain': [], 'vantagePoint': [], 'server': [], 'HTTPS-PLT': [], 'HTTPS-FCP': [], 'TLS-PLT': [], 'TLS-FCP': [], 'UDP-PLT': [], 'UDP-FCP': [], 'RTT': []}

for vp in vp_order:
    df_vp = df[df['vantagePoint'] == vp]
    for server in df_vp['server'].unique():
        df_server = df_vp[df_vp['server'] == server]
        for domain in domain_order:
            # copy, because the timestamp column is overwritten below
            df_domain = df_server[df_server['domain'] == domain].copy()

            start_date = date(2022, 4, 18)
            end_date = date(2022, 4, 24)
            delta = timedelta(days=2)
            df_domain['timestamp'] = pd.to_datetime(df_domain['timestamp']).dt.date

            # we remove iterations with faulty measurements
            while start_date <= end_date:
                df_timeframe = df_domain[(df_domain['timestamp'] >= start_date) & (df_domain['timestamp'] < (start_date+delta))]
                df_timeframe = df_timeframe[df_timeframe['protocol'].isin(['UDP', 'QUIC', 'TLS', 'HTTPS', 'TCP'])]
                # NOTE(review): the check inspects loadEventEnd while the medians
                # below use loadEventStart -- confirm this is intended.
                timings = df_timeframe[['firstContentfulPaint', 'loadEventEnd']]

                # A window is faulty if any timing is 0 or missing. Missing values
                # must be detected with isna(): `np.nan in some_list` is never true
                # for NaNs coming out of `.tolist()`, because NaN != NaN.
                if timings.eq(0.0).to_numpy().any() or timings.isna().to_numpy().any():
                    df_domain = df_domain[(df_domain['timestamp'] < start_date) | (df_domain['timestamp'] >= (start_date+delta))]
                start_date += delta

            df_doq = df_domain[df_domain['protocol'] == 'QUIC']
            if not df_doq.empty:
                # doq baseline
                median_plt_doq = df_doq['loadEventStart'].median()
                median_fcp_doq = df_doq['firstContentfulPaint'].median()

                # add vantage point, server and domain information
                diff_dict3['vantagePoint'].append(vp)
                diff_dict3['domain'].append(domain)
                diff_dict3['server'].append(server)

                # add PLT and FCP increase
                for protocol in ['HTTPS', 'TLS', 'UDP']:
                    df_protocol = df_domain[df_domain['protocol'] == protocol]
                    median_plt_protocol = df_protocol['loadEventStart'].median()
                    median_fcp_protocol = df_protocol['firstContentfulPaint'].median()

                    # percentage increase relative to the DoQ median
                    increase_plt = (median_plt_protocol/(median_plt_doq/100))-100
                    increase_fcp = (median_fcp_protocol/(median_fcp_doq/100))-100

                    diff_dict3[f"{protocol}-PLT"].append(increase_plt)
                    diff_dict3[f"{protocol}-FCP"].append(increase_fcp)

                # add information of resolver rtt (smallest min_lookup_end of the UDP rows)
                df_dou = df_domain[df_domain['protocol'] == 'UDP']
                min_rtt = df_dou['min_lookup_end'].min()
                diff_dict3['RTT'].append(min_rtt)

# one row per (vantage point, server, domain); columns renamed to the Do* labels
diff_df3 = pd.DataFrame.from_dict(diff_dict3).rename(
    columns={"HTTPS-PLT": "DoH-PLT", "HTTPS-FCP": "DoH-FCP", "TLS-PLT": "DoT-PLT", "TLS-FCP": "DoT-FCP",
             "UDP-PLT": "DoUDP-PLT", "UDP-FCP": "DoUDP-FCP"})

# Functions

In [None]:
def calculateCDF(items, bins=500):
    """Approximate the cumulative distribution of ``items`` via a histogram.

    Parameters
    ----------
    items : array-like
        Sample values; forwarded unchanged to ``np.histogram``.
    bins : int or sequence, default 500
        Bin specification forwarded to ``np.histogram``.

    Returns
    -------
    tuple of (ndarray, ndarray)
        The right edge of every bin and the cumulative share of samples
        falling at or below that edge.
    """
    occurrences, edges = np.histogram(items, bins=bins)
    # normalise the counts to shares, then accumulate them
    shares = occurrences / occurrences.sum()
    cumulative = shares.cumsum()
    # drop the leftmost edge so both arrays have one entry per bin
    return edges[1:], cumulative

def find_nearest(array, value):
    """Return the index of the entry of ``array`` that is closest to ``value``.

    ``array`` is converted with ``np.asarray``; on ties the first matching
    index wins (``argmin`` semantics).
    """
    distances = np.abs(np.asarray(array) - value)
    return distances.argmin()
    
def draw_grid_line(axis, horizontal, mark, xs, ys):
    """Draw a dotted black guide from both axes to a point on the curve (xs, ys).

    Parameters
    ----------
    axis : matplotlib Axes
        Axes to draw on.
    horizontal : bool
        If true, ``mark`` is a y value and the curve point whose y is nearest
        to it is used; otherwise ``mark`` is an x value and the curve point
        whose x is nearest to it is used.
    mark : float
        Position of the guide on the chosen axis.
    xs, ys : sequence
        Coordinates of the curve.

    Horizontal segments start at x = -20, vertical segments at y = 0.
    """
    dotted = {'colors': 'k', 'linestyles': 'dotted'}

    if horizontal:
        curve_x = xs[find_nearest(ys, mark)]
        axis.hlines([mark], -20, curve_x, **dotted)
        axis.vlines([curve_x], 0, mark, **dotted)
        return

    curve_y = ys[find_nearest(xs, mark)]
    axis.vlines([mark], 0, curve_y, **dotted)
    axis.hlines([curve_y], -20, mark, **dotted)
        
def create_comparison(data, ax, metric='PLT', baseline='DoUDP', compare_to='DoQ', compare_to_2=None, showleg=False):
    """Draw one panel of the cluster plot: CDF(s) of relative ``metric`` changes.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'<compare_to>-<metric>'`` column and, if
        ``compare_to_2`` is given, a ``'<compare_to_2>-<metric>'`` column.
    ax : matplotlib Axes
        Axes to draw on; ticks, limits, spines and background are set here.
    metric : str
        Column suffix selecting the metric (e.g. ``'PLT'`` or ``'FCP'``).
    baseline : str
        Accepted for call-site readability only; it is not used by this function.
    compare_to : str
        Protocol label whose CDF is drawn and whose CDF value at 0 % determines
        the background shade of the panel.
    compare_to_2 : str, optional
        Second protocol label; if given, its CDF is drawn as well.
    showleg : bool
        Keep and restyle the legend if true, otherwise remove it.
    """
    def _finite_values(column):
        # Only finite samples can be binned: drops NaN as well as +/-inf.
        values = data[column].to_numpy(dtype=float)
        return values[np.isfinite(values)]

    palette = sns.color_palette("tab10")

    # for the cluster plot
    if compare_to_2:
        plot_a, plot_n = calculateCDF(_finite_values(f'{compare_to_2}-{metric}'), bins=400)
        # label follows the argument instead of a hard-coded protocol name
        sns.lineplot(x=plot_a, y=plot_n, ax=ax, linewidth=3, label=compare_to_2, color=palette[4])

    plot_x, plot_y = calculateCDF(_finite_values(f'{compare_to}-{metric}'), bins=400)
    zero = find_nearest(plot_x, 0)
    sns.lineplot(x=plot_x, y=plot_y, ax=ax, linewidth=3, label=compare_to, color=palette[2])
    # dashed reference line at 0 % change
    ax.vlines([0], -0.1, 1, 'k', linestyles='dashed')

    # background shade taken from the `binary` colormap at the CDF value at 0 %
    ax.set_facecolor(plt.cm.binary((plot_y[zero]/1.1)))   # or /1.25, values >1.25 lose too much contrast
    ax.tick_params(left=True, bottom=True, labelsize=17)
    ax.yaxis.set_ticks(np.arange(0.0, 1.01, 0.5))
    ax.xaxis.set_ticks(np.arange(-20, 21, 20))
    ax.tick_params(which='minor', bottom=True, labelsize=17, length=6, width=1)
    ax.set_ylim(0.0, 1.0)
    ax.set_xlim(-25, 25)
    ax.xaxis.set_major_formatter(mtick.PercentFormatter())
    ax.grid(False)
    ax.spines[['bottom', 'top', 'left', 'right']].set_visible(False)

    if showleg:
        leg = ax.legend(loc="best", fontsize='small', framealpha=0.8, handlelength=1)
        for line in leg.get_lines():
            line.set_linewidth(3.0)
    else:
        # no legend may exist if nothing was drawn; only remove one that is there
        leg = ax.get_legend()
        if leg is not None:
            leg.remove()

## Plots

In [None]:
# Two-panel figure: CDFs of the relative FCP (left) and PLT (right) change of
# DoQ/DoT/DoH/DoTCP compared to the DoUDP baseline stored in `diff_df2`.
fig, (ax1, ax2) = plt.subplots(
    ncols=2,
    figsize=(16, 6),
    gridspec_kw={'wspace': 0.2, 'hspace': 0.1, 'width_ratios': [5, 6]},
    sharex=True,
)

# One entry per panel:
#   (axes, metric suffix, keep the legend?, guide lines as (horizontal, mark, protocol))
panels = [
    ### left plot ###
    (ax1, 'FCP', False, [
        (True, 0.39, 'DoQ'),
        (True, 0.39, 'DoH'),
        (True, 0.8, 'DoQ'),
        (True, 0.8, 'DoH'),
        (False, 0, 'DoQ'),
    ]),
    ### right plot ###
    (ax2, 'PLT', True, [
        (False, 15, 'DoH'),
        (False, 15, 'DoQ'),
    ]),
]

for panel_ax, metric, keep_legend, guides in panels:
    # CDF coordinates per protocol, kept for the guide lines drawn further down
    xs, ys = {}, {}
    for protocol in ['DoQ', 'DoT', 'DoH', 'DoTCP']:
        column = f'{protocol}-{metric}'
        changes = diff_df2[[column]].copy()
        changes = changes[changes[column] < np.inf]
        changes = changes[~np.isnan(changes)]
        xs[protocol], ys[protocol] = calculateCDF(changes, bins=400)
        sns.lineplot(x=xs[protocol], y=ys[protocol], label=protocol, ax=panel_ax, linewidth=4)

    # Labels
    panel_ax.set_ylabel("CDF", fontsize=33)

    # Y axis
    panel_ax.set_ylim(0.0, 1.01)
    panel_ax.yaxis.set_ticks(np.arange(0.0, 1.01, 0.2))
    panel_ax.yaxis.set_minor_locator(mtick.AutoMinorLocator())

    # X axis
    panel_ax.set_xlim(-20, 80)
    panel_ax.xaxis.set_ticks(np.arange(-20, 81, 20))
    panel_ax.xaxis.set_minor_locator(mtick.AutoMinorLocator())
    panel_ax.xaxis.set_major_formatter(mtick.PercentFormatter())

    panel_ax.tick_params(which='major', left=True, bottom=True, labelsize=27, width=3, length=9)
    panel_ax.tick_params(which='minor', left=True, bottom=True, width=1, length=6)

    # Legend (built for both panels, but only kept where requested)
    leg = panel_ax.legend(loc="lower right", fontsize='x-large')
    for line in leg.get_lines():
        line.set_linewidth(5.0)
    if not keep_legend:
        panel_ax.get_legend().remove()

    # "grid": dotted guides to selected points of the curves
    panel_ax.grid(False)
    for horizontal, mark, protocol in guides:
        draw_grid_line(panel_ax, horizontal, mark, xs[protocol], ys[protocol])

    panel_ax.spines['right'].set_visible(False)
    panel_ax.spines['top'].set_visible(False)


fig.show()
fig.savefig('udp-baseline-fcp-plt.pdf', bbox_inches='tight')


In [None]:
# Cluster plot: one panel per (vantage point, domain) showing the PLT change of
# DoH and DoUDP relative to the DoQ baseline stored in `diff_df3`.

# Row order of the vantage points and the short label shown next to each row
vp_rows = ['Europe Central', 'Asia Pacific Northeast', 'US West', 'Africa South', 'Asia Pacific Southeast', 'South America East']
row_labels = ['EU', 'AS', 'NA', 'AF', 'OC', 'SA']

# Grid shape follows the data instead of a hard-coded 6 x 10, so a different
# number of domains or vantage points does not raise an IndexError.
n_rows, n_cols = len(vp_rows), len(domain_order)
# squeeze=False keeps `axs` two-dimensional even for a single row or column
fig, axs = plt.subplots(n_rows, n_cols, figsize=(22, 13), sharex=True, sharey=True, squeeze=False)

for i, domain in enumerate(domain_order):
    for j, vp in enumerate(vp_rows):
        ax = axs[j, i]
        data = diff_df3[(diff_df3['vantagePoint'] == vp) & (diff_df3['domain'] == domain)].copy()
        showleg = False
        if i == n_cols - 1 and j == n_rows - 1:  # bottom right plot
            ax.annotate('DoUDP', (-16, 0.3), ha='left', rotation=75, color='purple')
            ax.annotate('DoH', (2.5, 0.3), ha='left', rotation=75, color='darkgreen')
        create_comparison(data, ax, metric='PLT', baseline='DoQ', compare_to='DoH', compare_to_2='DoUDP', showleg=showleg)

# Column titles: second-to-last label of the domain name plus its mean query count
column_titles = [
    f"{domain_name.split('.')[-2]} ({int(mean_queries)})"
    for domain_name, mean_queries in df_new['queries'].items()
]

for ax, col in zip(axs[0], column_titles):
    ax.set_title(col, size='medium')

for ax, row in zip(axs[:, 0], row_labels):
    ax.set_ylabel(row, rotation=0, size='large', labelpad=0)
    ax.yaxis.set_label_coords(-0.5, 0.395)

fig.tight_layout()
plt.savefig("doq-baseline-cluster-annot.pdf", bbox_inches='tight')
plt.show()

# Sample Sizes

In [None]:
# Number of measurement rows per vantage point (rows) and protocol (columns)
(
    df
    .groupby(['vantagePoint', 'protocol'])
    .size()
    .unstack()
)

In [None]:
# Sum of the `queries` column per vantage point (rows) and protocol (columns)
(
    df
    .groupby(['vantagePoint', 'protocol'])[['queries']]
    .sum()
    .unstack()
)