In [2]:
import pandas as pd
import sqlite3
import datetime
import numpy as np
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import PercentFormatter
import math

from urllib.parse import urlparse, parse_qs
# Global plotting configuration for every figure drawn later in the notebook.
# NOTE: this dpi value is overwritten with 100 further down in this same cell.
mpl.rcParams['figure.dpi'] = 200
# Eight hex colours installed below as the default seaborn palette
# (presumably the Okabe-Ito palette, going by the variable name).
okabe_ito = ["#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"]
sns.set_context('paper')
sns.set_palette(sns.color_palette(okabe_ito))
#https://stackoverflow.com/a/45846841
def human_format(num):
    """Format a number with three significant digits and a K/M/B/T suffix.

    The value is first rounded to three significant digits, then divided by
    1000 until it is below 1000 or the largest suffix ('T') is reached.
    Trailing zeros and a dangling decimal point are stripped.

    Values of 1e15 and above keep the 'T' suffix (e.g. ``1e15 -> '1000T'``)
    instead of indexing past the end of the suffix list, and infinite values
    no longer loop forever.
    """
    suffixes = ['', 'K', 'M', 'B', 'T']
    num = float('{:.3g}'.format(num))
    magnitude = 0
    # Stop at the last suffix so the lookup below can never go out of range.
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0
    return '{}{}'.format('{:f}'.format(num).rstrip('0').rstrip('.'), suffixes[magnitude])
def make_cdf(series):
    """Return the empirical CDF of `series`.

    The result is indexed by the distinct values in ascending order and holds
    the cumulative share of observations up to and including each value.
    """
    shares = series.value_counts(normalize=True)
    ordered_shares = shares.sort_index()
    return ordered_shares.cumsum()
def make_pdf(series):
    """Return the empirical distribution of `series`.

    The result is indexed by the distinct values in ascending order and holds
    the share of observations equal to each value.
    """
    shares = series.value_counts(normalize=True)
    return shares.sort_index()
from mpl_toolkits.axes_grid1 import make_axes_locatable
from datetime import datetime
mpl.rcParams['figure.dpi'] = 100
import pickle

In [26]:
# Load all measurements and every DNS lookup whose status is not NOERROR.
conn = sqlite3.connect('web-performance-youtube-merged.db')
try:
    measurements = pd.read_sql_query("SELECT * FROM measurements", conn)
    # Bind the status as a parameter: a double-quoted "NOERROR" is an identifier
    # in standard SQL and only works as a string through a SQLite legacy quirk.
    failed_lookups = pd.read_sql_query(
        "SELECT * FROM lookups WHERE status <> ?", conn, params=("NOERROR",)
    )
finally:
    # Release the database handle even if one of the queries fails.
    conn.close()

In [27]:
# Parse the timestamp column into datetimes and keep the calendar day in a
# separate `date` column, which later cells use as a grouping key.
parsed_timestamps = pd.to_datetime(measurements['timestamp'])
measurements['timestamp'] = parsed_timestamps
measurements['date'] = parsed_timestamps.dt.date

In [28]:
def count_cache_warming(agg):
    """Classify which run types occur among the `cacheWarming` flags in `agg`.

    A flag equal to 1 counts as a cache-warming run, a flag equal to 0 as a
    measurement run. Returns 'cw+msm', 'cw', 'nothing', or
    'something went completely wrong' (a measurement without cache warming).
    """
    flags = list(agg)
    saw_cw = any(flag == 1 for flag in flags)
    saw_msm = any(flag == 0 for flag in flags)
    # Keyed by (cache warming seen, measurement seen).
    verdicts = {
        (True, True): 'cw+msm',
        (True, False): 'cw',
        (False, False): 'nothing',
        (False, True): 'something went completely wrong',
    }
    return verdicts[(saw_cw, saw_msm)]
# Per (vantagePoint, domain, server, protocol, date): number of distinct run
# ids and which run types (cache warming / measurement) are present.
sanity_check = (
    measurements
    .groupby(['vantagePoint', 'domain', 'server', 'protocol', 'date'], as_index=False)
    .agg(
        num_msm_id=('msm_id', pd.Series.nunique),
        sanity_check=('cacheWarming', count_cache_warming),
    )
)

In [29]:
# Map the raw `error` text to a short category stored in `error_`.
# Each entry is (prefix of the raw error message, category label); rows whose
# message matches none of the prefixes keep the category "none".
error_prefix_labels = [
    ("failed driver.get() for cache warming ###", "cw: driver.get"),
    ("failed driver.get() ###", "msm: driver.get"),
    ("failed loading player for cache warming ###", "cw: player load"),
    ("failed loading player ###", "msm: player load"),
    ("failed switching selenium focus to youtube iframe or monitoring loop ###", "msm: playback"),
]
measurements['error_'] = "none"
for error_prefix, error_label in error_prefix_labels:
    # na=False: a missing error message must count as "no match". Without it
    # startswith yields NaN, which np.where treats as True.
    has_prefix = measurements.error.str.startswith(error_prefix, na=False)
    measurements['error_'] = np.where(has_prefix, error_label, measurements['error_'])
measurements['error_'].value_counts()

none                237470
cw: player load      15316
msm: playback          861
cw: driver.get          84
msm: player load        59
msm: driver.get         24
Name: error_, dtype: int64

In [30]:
def _load_pickle(path):
    """Read one pickled object from `path`, closing the file afterwards."""
    # NOTE: pickle can execute arbitrary code; only load files you produced yourself.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Collections of measurement ids flagged by earlier analysis steps; they are
# matched against `msm_id` when the error categories are refined.
servfail_during_msm = _load_pickle("servfail_during_msm.pickle")
no_start_play_event_during_msm = _load_pickle("no_start_play_event_during_msm.pickle")
no_quality_change_event_during_msm = _load_pickle("no_quality_change_event_during_msm.pickle")
googlevideo_changed_during_msm = _load_pickle("googlevideo_changed_during_msm.pickle")


In [31]:
# Refine measurement runs (cacheWarming == 0) that still have no error category.
# Order matters: a run keeps the first category that applies, because every
# step only touches rows whose category is still 'none'.
msm_failure_sets = (
    (servfail_during_msm, "msm: dns error"),
    (no_start_play_event_during_msm, "msm: no play event"),
    (no_quality_change_event_during_msm, "msm: no quality event"),
    (googlevideo_changed_during_msm, "msm: googlevideo changed during playback"),
)
for flagged_ids, flagged_label in msm_failure_sets:
    unlabeled_msm = (measurements.cacheWarming == 0) & (measurements.error_ == 'none')
    measurements['error_'] = np.where(
        unlabeled_msm & (measurements.msm_id.isin(flagged_ids)),
        flagged_label,
        measurements['error_'],
    )

In [32]:
measurements['error_'].value_counts()

none                                        235829
cw: player load                              15316
msm: playback                                  861
msm: googlevideo changed during playback       826
msm: dns error                                 525
msm: no play event                             283
cw: driver.get                                  84
msm: player load                                59
msm: driver.get                                 24
msm: no quality event                            7
Name: error_, dtype: int64

In [33]:
# One row per (vantagePoint, domain, server, protocol, date) holding the run ids
# and the error categories of its runs, joined in timestamp order.
sanity_check_three = (
    measurements
    .sort_values(['timestamp', 'cacheWarming'])
    .groupby(['vantagePoint', 'domain', 'server', 'protocol', 'date'], as_index=False)
    .agg(
        msm_ids=('msm_id', pd.Series.unique),
        errors=('error_', lambda run_errors: ', '.join(run_errors.values)),
    )
)

In [34]:
# A lone 'none' means the group holds a single error-free run and nothing else;
# relabel it so it is not mistaken for a complete 'none, none' pair.
sanity_check_three['errors'] = sanity_check_three['errors'].mask(
    sanity_check_three['errors'] == 'none',
    'none, msm: failed to even run',
)

In [35]:
sanity_check.sanity_check.value_counts()

cw+msm    119206
cw         15402
Name: sanity_check, dtype: int64

In [100]:
len(sanity_check_three)

134608

In [36]:
sanity_check_three.errors.value_counts()

none, none                                        116621
cw: player load                                    15316
none, msm: playback                                  861
none, msm: googlevideo changed during playback       826
none, msm: dns error                                 525
none, msm: no play event                             283
cw: driver.get                                        84
none, msm: player load                                59
none, msm: driver.get                                 24
none, msm: no quality event                            7
none, msm: failed to even run                          2
Name: errors, dtype: int64

In [37]:
sanity_check_three.groupby('protocol').size()

protocol
https    26921
quic     26922
tcp      26922
tls      26922
udp      26921
dtype: int64

In [101]:
sanity_check_three[sanity_check_three.errors != 'none, none'].groupby('vantagePoint').size().sort_values()

vantagePoint
South America East        2288
US West                   2391
Asia Pacific Northeast    2452
Europe Central            2477
Asia Pacific Southeast    2510
US East                   2557
Africa South              3312
dtype: int64

In [38]:
sanity_check_three[sanity_check_three.errors != 'none, none'].groupby('protocol').size()

protocol
https    3469
quic     3413
tcp      3631
tls      3405
udp      4069
dtype: int64

In [39]:
# Outcome counts restricted to the listed outcomes, all of which start with an
# error-free first run.
sanity_check_three.loc[
    sanity_check_three['errors'].isin([
        'none, none',
        'none, msm: googlevideo changed during playback',
        'none, msm: dns error',
        'none, msm: no play event',
        'none, msm: no quality event',
        'none, msm: failed to even run',
    ]),
    'errors',
].value_counts()

none, none                                        116621
none, msm: googlevideo changed during playback       826
none, msm: dns error                                 525
none, msm: no play event                             283
none, msm: no quality event                            7
none, msm: failed to even run                          2
Name: errors, dtype: int64

In [40]:
# Number of groups per protocol whose outcome is one of the listed values.
sanity_check_three.loc[
    sanity_check_three['errors'].isin([
        'none, none',
        'none, msm: googlevideo changed during playback',
        'none, msm: dns error',
        'none, msm: no play event',
        'none, msm: no quality event',
        'none, msm: failed to even run',
    ])
].groupby('protocol').size()

protocol
https    23770
quic     23801
tcp      23468
tls      23843
udp      23382
dtype: int64

In [41]:
# Same selection as the listed outcomes without 'none, none': groups per
# protocol whose measurement outcome is one of the listed failure categories.
sanity_check_three.loc[
    sanity_check_three['errors'].isin([
        'none, msm: googlevideo changed during playback',
        'none, msm: dns error',
        'none, msm: no play event',
        'none, msm: no quality event',
        'none, msm: failed to even run',
    ])
].groupby('protocol').size()

protocol
https    318
quic     292
tcp      177
tls      326
udp      530
dtype: int64

In [103]:
sanity_check_three[sanity_check_three.errors == 'none, none'].groupby('vantagePoint').size().sort_values(ascending=False)

vantagePoint
South America East        16932
US West                   16849
Europe Central            16792
Asia Pacific Southeast    16759
Asia Pacific Northeast    16748
US East                   16623
Africa South              15918
dtype: int64

In [42]:
sanity_check_three[sanity_check_three.errors == 'none, none'].groupby('protocol').size()

protocol
https    23452
quic     23509
tcp      23291
tls      23517
udp      22852
dtype: int64

In [48]:
sanity_check_three.groupby('protocol').errors.value_counts()

protocol  errors                                        
https     none, none                                        23452
          cw: player load                                    2963
          none, msm: googlevideo changed during playback      191
          none, msm: playback                                 176
          none, msm: no play event                             76
          none, msm: dns error                                 48
          none, msm: player load                                7
          none, msm: driver.get                                 3
          cw: driver.get                                        2
          none, msm: no quality event                           2
          none, msm: failed to even run                         1
quic      none, none                                        23509
          cw: player load                                    2944
          none, msm: googlevideo changed during playback      170
          none, msm

In [47]:
sanity_check_three.errors.unique()

array(['cw: player load', 'none, none',
       'none, msm: googlevideo changed during playback',
       'none, msm: playback', 'none, msm: dns error',
       'none, msm: player load', 'cw: driver.get',
       'none, msm: no play event', 'none, msm: driver.get',
       'none, msm: no quality event', 'none, msm: failed to even run'],
      dtype=object)

In [55]:
sanity_check_three[sanity_check_three.errors == 'none, msm: failed to even run']

Unnamed: 0,vantagePoint,domain,server,protocol,date,msm_ids,errors
72415,Europe Central,lqiN98z6Dak,188.68.59.46,tls,2022-04-23,054af061-eada-2266-e54c-7cba3fa35c36,"none, msm: failed to even run"
124923,US West,aqz-KE-bpKQ,94.140.15.16,https,2022-04-23,fe6b59ad-78fa-0549-a0cf-516f3d59ef95,"none, msm: failed to even run"


In [56]:
def cw_msm_errors(grouped):
    """Summarise the cache-warming and measurement runs of one group.

    Parameters
    ----------
    grouped : pandas.DataFrame
        Rows of one group; the columns `cacheWarming`, `msm_id` and `error_`
        are read. Only the first row of each run type is inspected.

    Returns
    -------
    pandas.Series
        Indexed by 'sanity_check', 'cw_id', 'msm_id'. `cw_id` / `msm_id` are
        the ids of the cache-warming (cacheWarming == 1) and measurement
        (cacheWarming == 0) run, or "" when that run is absent. `msm_id` also
        stays "" when cache warming failed and no measurement followed.

    A group without any cache-warming row used to raise IndexError; it is now
    reported through a dedicated 'no cache warming ...' message.
    """
    cw_rows = grouped[grouped.cacheWarming == 1]
    msm_rows = grouped[grouped.cacheWarming == 0]

    ret = {'sanity_check': "", 'cw_id': "", 'msm_id': ""}

    has_msm = len(msm_rows) > 0
    if has_msm:
        ret['msm_id'] = msm_rows.msm_id.values[0]
        msm_error = msm_rows.error_.values[0]

    if len(cw_rows) == 0:
        # No cache-warming run at all: report it instead of crashing on .values[0].
        if has_msm:
            ret['sanity_check'] = "no cache warming but ran measurement: " + msm_error
        else:
            ret['sanity_check'] = "no cache warming and no measurement"
        return pd.Series(ret, index=['sanity_check', 'cw_id', 'msm_id'])

    ret['cw_id'] = cw_rows.msm_id.values[0]
    cw_error = cw_rows.error_.values[0]

    if cw_error != 'none':
        if has_msm:
            ret['sanity_check'] = "cache warming error but also ran measurement: " + cw_error + " -- " + msm_error
        else:
            ret['sanity_check'] = "cache warming error: " + cw_error
    elif has_msm:
        # The message is emitted for successful measurements too ("measurement error: none").
        ret['sanity_check'] = "measurement error: " + msm_error
    else:
        ret['sanity_check'] = "cache warming successful but no measurement"

    return pd.Series(ret, index=['sanity_check', 'cw_id', 'msm_id'])
# Apply the per-group summary to every (vantagePoint, domain, server, protocol,
# date) combination and display the result.
sanity_check_two = (
    measurements
    .groupby(['vantagePoint', 'domain', 'server', 'protocol', 'date'], as_index=False)
    .apply(cw_msm_errors)
)
sanity_check_two

Unnamed: 0,vantagePoint,domain,server,protocol,date,sanity_check,cw_id,msm_id
0,Africa South,aqz-KE-bpKQ,101.32.27.77,https,2022-04-18,cache warming error: cw: player load,8e6cb9f9-48f3-b924-bb34-305dd99ce2c1,
1,Africa South,aqz-KE-bpKQ,101.32.27.77,https,2022-04-19,cache warming error: cw: player load,fbc7c693-7982-3fa8-1216-3cd2aefecffd,
2,Africa South,aqz-KE-bpKQ,101.32.27.77,https,2022-04-20,cache warming error: cw: player load,8f4c0da3-2296-4ee9-6d0f-4df5fcc596bf,
3,Africa South,aqz-KE-bpKQ,101.32.27.77,https,2022-04-21,cache warming error: cw: player load,3eec96b6-2bdc-b9b1-d6f5-75f9dde8d8e8,
4,Africa South,aqz-KE-bpKQ,101.32.27.77,https,2022-04-22,cache warming error: cw: player load,9805c13a-d8fa-09c2-eb2d-037df3f8ffc4,
...,...,...,...,...,...,...,...,...
134603,US West,lqiN98z6Dak,98.154.23.186,udp,2022-04-20,measurement error: none,ed446cae-d639-c866-534a-6e1f925b2f55,ad4bc260-91c8-2d64-1a7e-646eef762650
134604,US West,lqiN98z6Dak,98.154.23.186,udp,2022-04-21,measurement error: none,3f83bec9-3d7b-d194-4da1-4ec7f35ece6d,49660942-439c-a6ae-f871-905a908abd0e
134605,US West,lqiN98z6Dak,98.154.23.186,udp,2022-04-22,measurement error: none,574abb53-e752-6d96-13a8-9524f583fd83,8ce9a9d3-8872-47e6-914c-5e10b4e65b53
134606,US West,lqiN98z6Dak,98.154.23.186,udp,2022-04-23,measurement error: none,72baa7a5-2a39-0f78-8370-afad8365b0f2,ce9bd00e-7a9e-2355-1968-fa8c177f36b3


In [57]:
measurements[measurements.error_ == 'none'].cacheWarming.value_counts()

1    119208
0    116621
Name: cacheWarming, dtype: int64

In [58]:
# Distinct cache-warming runs per protocol, successful or not. The column was
# labelled `successful_msms` although no error filter is applied.
(
    measurements[measurements.cacheWarming == 1]
    .groupby('protocol', as_index=False)
    .agg(total_msms=('msm_id', pd.Series.nunique))
    .sort_values('protocol')
)

Unnamed: 0,protocol,successful_msms
0,https,26921
1,quic,26922
2,tcp,26922
3,tls,26922
4,udp,26921


In [59]:
# Distinct failed runs (any error category) per protocol. The column was
# labelled `successful_msms` although it counts runs with an error.
(
    measurements[(measurements.error_ != 'none')]
    .groupby('protocol', as_index=False)
    .agg(failed_msms=('msm_id', pd.Series.nunique))
    .sort_values('protocol')
)

Unnamed: 0,protocol,successful_msms
0,https,3468
1,quic,3413
2,tcp,3631
3,tls,3404
4,udp,4069


In [60]:
# Distinct measurement runs (cacheWarming == 0) per protocol, successful or
# not. The column was labelled `successful_msms` although no error filter is applied.
(
    measurements[(measurements.cacheWarming == 0)]
    .groupby('protocol', as_index=False)
    .agg(total_msms=('msm_id', pd.Series.nunique))
    .sort_values('protocol')
)

Unnamed: 0,protocol,successful_msms
0,https,23955
1,quic,23975
2,tcp,23650
3,tls,24051
4,udp,23575


In [61]:
# Distinct error-free measurement runs (cacheWarming == 0) per protocol.
(
    measurements
    .loc[(measurements['error_'] == 'none') & (measurements['cacheWarming'] == 0)]
    .groupby('protocol', as_index=False)
    .agg(successful_msms=('msm_id', pd.Series.nunique))
    .sort_values('protocol')
)

Unnamed: 0,protocol,successful_msms
0,https,23452
1,quic,23509
2,tcp,23291
3,tls,23517
4,udp,22852


In [62]:
# Distinct failed measurement runs (cacheWarming == 0) per protocol. The column
# was labelled `successful_msms` although it counts runs with an error.
(
    measurements[(measurements.error_ != 'none') & (measurements.cacheWarming == 0)]
    .groupby('protocol', as_index=False)
    .agg(failed_msms=('msm_id', pd.Series.nunique))
    .sort_values('protocol')
)

Unnamed: 0,protocol,successful_msms
0,https,503
1,quic,466
2,tcp,359
3,tls,534
4,udp,723


In [63]:
# Distinct error-free cache-warming runs (cacheWarming == 1) per protocol.
(
    measurements
    .loc[(measurements['error_'] == 'none') & (measurements['cacheWarming'] == 1)]
    .groupby('protocol', as_index=False)
    .agg(successful_msms=('msm_id', pd.Series.nunique))
    .sort_values('protocol')
)

Unnamed: 0,protocol,successful_msms
0,https,23956
1,quic,23975
2,tcp,23650
3,tls,24052
4,udp,23575


In [64]:
# Distinct failed cache-warming runs (cacheWarming == 1) per protocol. The
# column was labelled `successful_msms` although it counts runs with an error.
(
    measurements[(measurements.error_ != 'none') & (measurements.cacheWarming == 1)]
    .groupby('protocol', as_index=False)
    .agg(failed_msms=('msm_id', pd.Series.nunique))
    .sort_values('protocol')
)

Unnamed: 0,protocol,successful_msms
0,https,2965
1,quic,2947
2,tcp,3272
3,tls,2870
4,udp,3346


In [65]:
# Distinct runs with any error category, per protocol.
(
    measurements
    .loc[measurements['error_'] != 'none']
    .groupby('protocol', as_index=False)
    .agg(failed_msms=('msm_id', pd.Series.nunique))
)

Unnamed: 0,protocol,failed_msms
0,https,3468
1,quic,3413
2,tcp,3631
3,tls,3404
4,udp,4069


In [66]:
# Distinct error-free runs (cache warming and measurement together) per
# protocol, fewest first.
(
    measurements
    .loc[measurements['error_'] == 'none']
    .groupby('protocol', as_index=False)
    .agg(successful_msms=('msm_id', pd.Series.nunique))
    .sort_values('successful_msms')
)

Unnamed: 0,protocol,successful_msms
4,udp,46427
2,tcp,46941
0,https,47408
1,quic,47484
3,tls,47569


In [67]:
# Distinct runs with any error category, per vantage point.
(
    measurements
    .loc[measurements['error_'] != 'none']
    .groupby('vantagePoint', as_index=False)
    .agg(failed_msms=('msm_id', pd.Series.nunique))
)

Unnamed: 0,vantagePoint,failed_msms
0,Africa South,3312
1,Asia Pacific Northeast,2452
2,Asia Pacific Southeast,2510
3,Europe Central,2476
4,South America East,2288
5,US East,2557
6,US West,2390


In [68]:
# Distinct runs whose error category starts with 'cw', per vantage point.
(
    measurements
    .loc[measurements['error_'].str.startswith('cw')]
    .groupby('vantagePoint', as_index=False)
    .agg(failed_msms=('msm_id', pd.Series.nunique))
)

Unnamed: 0,vantagePoint,failed_msms
0,Africa South,2347
1,Asia Pacific Northeast,2138
2,Asia Pacific Southeast,2148
3,Europe Central,2235
4,South America East,2077
5,US East,2299
6,US West,2156


In [69]:
# Distinct runs whose error category starts with 'msm', per vantage point.
(
    measurements
    .loc[measurements['error_'].str.startswith('msm')]
    .groupby('vantagePoint', as_index=False)
    .agg(failed_msms=('msm_id', pd.Series.nunique))
)

Unnamed: 0,vantagePoint,failed_msms
0,Africa South,965
1,Asia Pacific Northeast,314
2,Asia Pacific Southeast,362
3,Europe Central,241
4,South America East,211
5,US East,258
6,US West,234


In [70]:
# Distinct error-free runs per vantage point, fewest first.
(
    measurements
    .loc[measurements['error_'] == 'none']
    .groupby('vantagePoint', as_index=False)
    .agg(successful_msms=('msm_id', pd.Series.nunique))
    .sort_values('successful_msms')
)

Unnamed: 0,vantagePoint,successful_msms
0,Africa South,32801
5,US East,33504
1,Asia Pacific Northeast,33810
3,Europe Central,33826
2,Asia Pacific Southeast,33880
6,US West,33933
4,South America East,34075


In [71]:
# For every server with at least one failed run: the number of distinct vantage
# points on which a failure was recorded.
server_failure_vp_count = (
    measurements
    .loc[measurements['error_'] != 'none']
    .groupby(['server'], as_index=False)
    .agg(unique_vps=('vantagePoint', pd.Series.nunique))
)
server_failure_vp_count


Unnamed: 0,server,unique_vps
0,101.32.27.77,7
1,103.105.98.141,4
2,103.123.108.197,7
3,103.124.95.99,3
4,103.172.17.225,5
...,...,...
249,94.140.14.49,1
250,94.140.14.59,1
251,94.140.15.15,5
252,94.140.15.16,1


In [72]:
server_failure_vp_count.unique_vps.value_counts()

1    71
2    64
7    43
3    36
4    19
5    12
6     9
Name: unique_vps, dtype: int64

In [73]:
# Servers whose failures were all recorded on exactly one vantage point.
servers_that_fail_on_single_vm = (
    server_failure_vp_count
    .loc[server_failure_vp_count['unique_vps'] == 1, 'server']
    .unique()
    .tolist()
)

In [74]:
# Failed runs of the single-vantage-point servers, counted per vantage point.
measurements.loc[
    (measurements['error_'] != 'none')
    & (measurements['server'].isin(servers_that_fail_on_single_vm)),
    'vantagePoint',
].value_counts()

Africa South              76
Asia Pacific Southeast    14
Europe Central            14
US East                   14
US West                   13
South America East         6
Asia Pacific Northeast     1
Name: vantagePoint, dtype: int64

In [75]:
# Error categories of the single-vantage-point servers as seen from Africa South.
measurements.loc[
    (measurements['error_'] != 'none')
    & (measurements['server'].isin(servers_that_fail_on_single_vm))
    & (measurements['vantagePoint'] == "Africa South"),
    'error_',
].value_counts()

msm: googlevideo changed during playback    54
cw: player load                             14
msm: playback                                4
cw: driver.get                               2
msm: dns error                               1
msm: no quality event                        1
Name: error_, dtype: int64

In [76]:
# Error categories of the single-vantage-point servers as seen from Europe Central.
measurements.loc[
    (measurements['error_'] != 'none')
    & (measurements['server'].isin(servers_that_fail_on_single_vm))
    & (measurements['vantagePoint'] == "Europe Central"),
    'error_',
].value_counts()

msm: googlevideo changed during playback    9
msm: dns error                              3
msm: playback                               2
Name: error_, dtype: int64

In [77]:
# Error categories of the single-vantage-point servers as seen from US East.
measurements.loc[
    (measurements['error_'] != 'none')
    & (measurements['server'].isin(servers_that_fail_on_single_vm))
    & (measurements['vantagePoint'] == "US East"),
    'error_',
].value_counts()

msm: googlevideo changed during playback    6
msm: dns error                              3
msm: playback                               3
cw: player load                             1
msm: no quality event                       1
Name: error_, dtype: int64

In [78]:
# Error categories of the single-vantage-point servers as seen from Asia Pacific Northeast.
measurements.loc[
    (measurements['error_'] != 'none')
    & (measurements['server'].isin(servers_that_fail_on_single_vm))
    & (measurements['vantagePoint'] == "Asia Pacific Northeast"),
    'error_',
].value_counts()

msm: googlevideo changed during playback    1
Name: error_, dtype: int64

In [79]:
# Servers with failures recorded on 7 distinct vantage points.
servers_that_fail_on_every_vm = (
    server_failure_vp_count
    .loc[server_failure_vp_count['unique_vps'] == 7, 'server']
    .unique()
    .tolist()
)

In [80]:
# Failed runs of the servers that fail on every vantage point, per vantage point.
measurements.loc[
    (measurements['error_'] != 'none')
    & (measurements['server'].isin(servers_that_fail_on_every_vm)),
    'vantagePoint',
].value_counts()

Europe Central            2083
Africa South              2082
US West                   2074
Asia Pacific Southeast    2047
US East                   2038
Asia Pacific Northeast    1990
South America East        1948
Name: vantagePoint, dtype: int64

In [81]:
# Failed runs of the servers that fail on every vantage point, per error category.
measurements.loc[
    (measurements['error_'] != 'none')
    & (measurements['server'].isin(servers_that_fail_on_every_vm)),
    'error_',
].value_counts()

cw: player load                             13297
msm: googlevideo changed during playback      268
msm: dns error                                251
msm: no play event                            250
msm: playback                                  76
msm: player load                               52
cw: driver.get                                 51
msm: driver.get                                16
msm: no quality event                           1
Name: error_, dtype: int64

In [82]:
measurements[measurements.error == ''].server.nunique()

260

In [83]:
measurements[measurements.error != ''].server.nunique()

126

In [84]:
# Runs per protocol, restricted to servers that have at least one run with an
# empty raw error message.
measurements.loc[
    measurements['server'].isin(
        measurements.loc[measurements['error'] == '', 'server'].unique()
    ),
    'protocol',
].value_counts()

tls      49163
quic     49087
https    49066
tcp      48762
udp      48686
Name: protocol, dtype: int64

In [85]:
measurements.dtypes

msm_id                          object
py_time                          int64
js_time                        float64
resource_time_origin           float64
protocol                        object
server                          object
domain                          object
vantagePoint                    object
timestamp               datetime64[ns]
suggested_quality               object
player_width                     int64
player_height                    int64
start_time                       int64
play_time                        int64
video_ids                       object
cacheWarming                     int64
error                           object
date                            object
error_                          object
dtype: object

In [86]:
# Distinct runs with an empty raw error message per (server, protocol,
# vantagePoint), fewest first.
(
    measurements
    .loc[measurements['error'] == '']
    .groupby(['server', 'protocol', 'vantagePoint'], as_index=False)
    .agg(num_msm=('msm_id', pd.Series.nunique))
    .sort_values('num_msm')
)

Unnamed: 0,server,protocol,vantagePoint,num_msm
4347,185.254.18.242,tls,Africa South,1
6888,47.107.121.125,udp,Asia Pacific Southeast,1
4993,199.101.171.125,https,South America East,2
6861,47.103.26.225,tls,Asia Pacific Southeast,2
863,119.3.92.152,tls,Asia Pacific Southeast,2
...,...,...,...,...
3096,172.104.183.19,udp,Asia Pacific Northeast,28
3095,172.104.183.19,udp,Africa South,28
3094,172.104.183.19,tls,US West,28
3092,172.104.183.19,tls,South America East,28


In [128]:
# Distinct error-free measurement runs (cacheWarming == 0) per
# (server, protocol, vantagePoint); each combination appears at most once.
successful_msms = (
    measurements
    .loc[(measurements['error_'] == 'none') & (measurements['cacheWarming'] == 0)]
    .groupby(['server', 'protocol', 'vantagePoint'], as_index=False)
    .agg(num_msm=('msm_id', pd.Series.nunique))
)

In [129]:
successful_msms[successful_msms.num_msm < 14].server.nunique()

230

In [131]:
# For every vantage point, collect the servers that have at least
# `min_msm_full` successful measurements for EVERY protocol seen in
# `successful_msms`.
# `successful_msms` holds at most one row per (server, protocol, vantagePoint),
# so counting the protocols that reach the threshold is equivalent to probing
# each triple separately, without one full-frame scan per triple.
# NOTE(review): confirm 14 is the intended threshold for "full" results.
min_msm_full = 14
required_protocols = successful_msms.protocol.nunique()
enough_msms = successful_msms[successful_msms.num_msm >= min_msm_full]
protocols_covered = enough_msms.groupby(['vantagePoint', 'server']).protocol.nunique()

resolvers_with_full_results = {vp: set() for vp in successful_msms.vantagePoint.unique()}
for vp, server in protocols_covered[protocols_covered == required_protocols].index:
    resolvers_with_full_results[vp].add(server)

In [132]:
# Servers that qualify on every vantage point: the intersection of the
# per-vantage-point sets.
# The previous loop used "the running set is empty" to detect the first
# iteration, so an intersection that became empty was silently restarted from
# the next vantage point's set.
per_vp_full_sets = [
    resolvers_with_full_results[vp] for vp in successful_msms.vantagePoint.unique()
]
resolvers_with_full_results_set = (
    set.intersection(*per_vp_full_sets) if per_vp_full_sets else set()
)
resolvers_with_full_results_set

{'109.205.178.178',
 '13.250.108.212',
 '138.2.91.167',
 '150.230.99.64',
 '16.162.25.97',
 '164.90.199.170',
 '168.138.36.90',
 '176.103.134.134',
 '176.103.134.149',
 '185.244.173.222',
 '188.68.45.12',
 '192.46.231.38',
 '193.123.252.172',
 '194.67.121.139',
 '209.17.118.5',
 '37.114.32.22',
 '37.114.32.44',
 '85.214.195.112',
 '85.235.65.70',
 '86.238.66.147',
 '92.38.241.36',
 '94.140.14.140',
 '94.140.14.15'}

In [133]:
len(resolvers_with_full_results_set)

23

In [134]:
# Same selection as for the "full" results but with a lower threshold: servers
# that have at least `min_msm_half` successful measurements for every protocol,
# first per vantage point and then on all vantage points.
# `successful_msms` holds at most one row per (server, protocol, vantagePoint),
# so one groupby replaces the per-triple scans.
# NOTE(review): confirm 7 is the intended threshold for "half" results.
min_msm_half = 7
required_protocols = successful_msms.protocol.nunique()
enough_msms = successful_msms[successful_msms.num_msm >= min_msm_half]
protocols_covered = enough_msms.groupby(['vantagePoint', 'server']).protocol.nunique()

resolvers_with_half_results = {vp: set() for vp in successful_msms.vantagePoint.unique()}
for vp, server in protocols_covered[protocols_covered == required_protocols].index:
    resolvers_with_half_results[vp].add(server)

# Intersect across vantage points. The previous loop treated an empty running
# set as "first iteration" and restarted the intersection once it became empty.
per_vp_half_sets = list(resolvers_with_half_results.values())
resolvers_with_half_results_set = (
    set.intersection(*per_vp_half_sets) if per_vp_half_sets else set()
)
print(resolvers_with_half_results_set)
print(len(resolvers_with_half_results_set))

{'82.118.227.235', '176.103.134.34', '85.192.48.70', '45.129.2.32', '51.75.162.101', '173.82.68.70', '5.135.233.74', '107.191.51.151', '67.205.154.232', '192.26.105.29', '46.137.195.96', '104.238.154.123', '172.104.183.19', '176.103.130.132', '5.181.48.19', '23.95.209.33', '185.177.218.107', '185.200.34.171', '185.180.206.110', '3.9.78.39', '193.201.126.42', '47.243.165.176', '188.68.45.12', '46.232.251.76', '104.244.79.105', '43.154.154.162', '109.205.178.178', '173.255.211.77', '188.165.4.138', '15.165.113.96', '20.24.142.152', '8.210.148.24', '140.238.36.23', '37.114.32.22', '45.79.151.58', '94.140.14.141', '202.61.207.68', '180.93.137.1', '98.154.23.186', '85.235.65.70', '50.116.59.251', '167.99.236.104', '64.112.126.239', '185.17.3.188', '132.226.232.79', '176.103.130.149', '86.238.66.147', '192.3.73.139', '188.227.86.112', '146.59.226.118', '51.195.116.94', '94.140.14.49', '94.140.14.14', '107.172.103.160', '176.103.130.150', '192.3.253.106', '150.230.103.62', '185.175.57.120', '

In [137]:
# Distinct error-free measurement runs per (server, protocol, vantagePoint,
# domain); the later per-video loop iterates over `domain` as the video id.
successful_msms_video_id = (
    measurements
    .loc[(measurements['error_'] == 'none') & (measurements['cacheWarming'] == 0)]
    .groupby(['server', 'protocol', 'vantagePoint', 'domain'], as_index=False)
    .agg(num_msm=('msm_id', pd.Series.nunique))
)

In [142]:
# Per video: servers that have at least `min_msm_per_video` successful
# measurements for every protocol on every vantage point.
# `successful_msms_video_id` holds at most one row per (server, protocol,
# vantagePoint, domain), so one groupby per video replaces the per-triple scans.
# Vantage points and protocols are taken from `successful_msms`, as before.
# NOTE(review): confirm 3 is the intended per-video threshold.
min_msm_per_video = 3
required_protocols = successful_msms.protocol.nunique()
for video_id in successful_msms_video_id.domain.unique():
    print(video_id)
    video_rows = successful_msms_video_id[
        (successful_msms_video_id.domain == video_id)
        & (successful_msms_video_id.num_msm >= min_msm_per_video)
    ]
    protocols_covered = video_rows.groupby(['vantagePoint', 'server']).protocol.nunique()

    resolvers_with_half_results = {vp: set() for vp in successful_msms.vantagePoint.unique()}
    for vp, server in protocols_covered[protocols_covered == required_protocols].index:
        if vp in resolvers_with_half_results:
            resolvers_with_half_results[vp].add(server)

    # Intersect across vantage points. The previous loop treated an empty running
    # set as "first iteration" and restarted the intersection once it became empty.
    per_vp_sets = list(resolvers_with_half_results.values())
    resolvers_with_half_results_set = set.intersection(*per_vp_sets) if per_vp_sets else set()
    print(resolvers_with_half_results_set)
    print(len(resolvers_with_half_results_set))

aqz-KE-bpKQ
{'82.118.227.235', '176.103.134.34', '85.192.48.70', '45.129.2.32', '51.75.162.101', '173.82.68.70', '5.135.233.74', '107.191.51.151', '67.205.154.232', '192.26.105.29', '46.137.195.96', '104.238.154.123', '172.104.183.19', '176.103.130.132', '5.181.48.19', '23.95.209.33', '185.177.218.107', '185.200.34.171', '185.180.206.110', '3.9.78.39', '193.201.126.42', '47.243.165.176', '188.68.45.12', '46.232.251.76', '104.244.79.105', '43.154.154.162', '109.205.178.178', '173.255.211.77', '188.165.4.138', '15.165.113.96', '20.24.142.152', '8.210.148.24', '140.238.36.23', '37.114.32.22', '45.79.151.58', '94.140.14.141', '202.61.207.68', '180.93.137.1', '98.154.23.186', '85.235.65.70', '50.116.59.251', '167.99.236.104', '64.112.126.239', '185.17.3.188', '132.226.232.79', '176.103.130.149', '86.238.66.147', '192.3.73.139', '188.227.86.112', '146.59.226.118', '51.195.116.94', '94.140.14.49', '94.140.14.14', '107.172.103.160', '176.103.130.150', '192.3.253.106', '150.230.103.62', '185.17

In [93]:
measurements.msm_id.nunique()

253814

In [95]:
measurements.server.nunique()

279

In [107]:
# For every server: the number of distinct vantage points with at least one
# error-free measurement run (cacheWarming == 0).
successful_msms_by_resolver_vp_count = (
    measurements
    .loc[(measurements['cacheWarming'] == 0) & (measurements['error_'] == 'none')]
    .groupby(['server'], as_index=False)
    .agg(unique_vps=('vantagePoint', pd.Series.nunique))
)

In [117]:
len(successful_msms_by_resolver_vp_count[successful_msms_by_resolver_vp_count.unique_vps == 7])

234

In [118]:
successful_msms_by_resolver_vp_count[successful_msms_by_resolver_vp_count.unique_vps == 7]

Unnamed: 0,server,unique_vps
0,103.105.98.141,7
1,103.123.108.197,7
2,103.124.95.99,7
3,103.172.17.225,7
4,103.172.17.226,7
...,...,...
252,94.140.14.49,7
253,94.140.14.59,7
254,94.140.15.15,7
255,94.140.15.16,7


In [109]:
successful_msms_by_resolver_vp_count[successful_msms_by_resolver_vp_count.unique_vps == 1]

Unnamed: 0,server,unique_vps
17,106.12.119.30,1
27,123.195.161.145,1
180,39.105.150.214,1
187,45.12.19.254,1
220,51.89.26.38,1


In [111]:
measurements[measurements.server == '106.12.119.30'].error_.value_counts()

cw: player load                             434
none                                          9
msm: dns error                                1
msm: googlevideo changed during playback      1
msm: no play event                            1
Name: error_, dtype: int64

In [112]:
measurements[measurements.server == '123.195.161.145'].error_.value_counts()

cw: player load    420
none               140
Name: error_, dtype: int64

In [113]:
measurements[measurements.server == '39.105.150.214'].error_.value_counts()

none                                        280
cw: player load                             203
msm: no play event                          157
msm: googlevideo changed during playback    101
msm: player load                             12
msm: dns error                                3
msm: playback                                 1
Name: error_, dtype: int64

In [114]:
measurements[measurements.server == '45.12.19.254'].error_.value_counts()

none    20
Name: error_, dtype: int64

In [115]:
measurements[measurements.server == '51.89.26.38'].error_.value_counts()

cw: player load    420
none               140
Name: error_, dtype: int64

In [120]:
measurements[measurements.server == '103.232.207.2'].error_.value_counts()

none                                        965
msm: googlevideo changed during playback      9
msm: dns error                                6
Name: error_, dtype: int64

In [126]:
sanity_check_three[sanity_check_three.server == '103.232.207.2'].errors.value_counts()

none, none                                        475
none, msm: googlevideo changed during playback      9
none, msm: dns error                                6
Name: errors, dtype: int64