In [1]:
import geopy.distance
import pandas as pd
import numpy as np

In [2]:
# Load the route-hop table and the per-IP information table from their CSV files.
routes_df, IP_df = (
    pd.read_csv(csv_path) for csv_path in ('routes_df.csv', 'IP_df.csv')
)

In [3]:
# Show which fields are available for each entry in IP_df.
IP_df.columns

Index(['IP', 'as', 'city', 'country', 'countryCode', 'isp', 'lat', 'lon',
       'org', 'region', 'regionName', 'status', 'timezone', 'zip', 'message',
       'asn'],
      dtype='object')

In [4]:
# Calculate minimum estimates of the distance traveled along each route,
# to be used as a lower bound on the propagation delay

In [8]:
# Estimate how far each route travels geographically.
#
# For every route (rows of routes_df sharing a 'route_num'), walk its hops in
# the order they appear in the frame, look each hop's IP up in IP_df and sum the
# geodesic distance (km) between consecutive located hops that sit at different
# coordinates.  A hop is skipped and counted in `num_missed` when its IP is
# '???', when the IP has no entry in IP_df, or when the entry's status is 'fail'.
#
# Results:
#   link_distances_df  - one row per traversed link (repeated links included):
#                        'link_distance' in km and 'coords', the string form of
#                        the sorted coordinate pair.  'IPs' is declared but is
#                        never filled in.
#   route_distances_df - one row per route: 'length' (number of hop rows),
#                        'num_missed', 'route_distance' (km) and 'route_num'.
#
# NOTE(review): hops are not sorted by 'order' here; this assumes the rows of
# each route are already stored in hop order - TODO confirm.

# Index IP_df by address once so each hop is a direct lookup instead of a full
# scan; keeping the first duplicate mirrors the previous `.iloc[0]` selection.
ip_info = IP_df.drop_duplicates(subset='IP').set_index('IP')

link_cache = {}      # sorted (coords, coords) pair -> distance in km
link_records = []    # rows for link_distances_df
route_records = []   # rows for route_distances_df

# Group by the bare column name: a one-element list makes newer pandas yield
# 1-tuples such as (1,) as the group key.
for route_num, route in routes_df.groupby('route_num'):
    previous_coords = None
    route_distance = 0
    num_missed = 0

    for hop_ip in route['IP']:
        if hop_ip == '???' or hop_ip not in ip_info.index:
            num_missed += 1
            continue
        ip_row = ip_info.loc[hop_ip]
        if ip_row['status'] == 'fail':
            num_missed += 1
            continue

        # Plain floats keep str(coords) free of numpy scalar reprs.
        current_coords = (float(ip_row['lat']), float(ip_row['lon']))

        # The first located hop has no predecessor, and consecutive hops at the
        # same coordinates contribute no distance.
        if previous_coords is None or previous_coords == current_coords:
            previous_coords = current_coords
            continue

        # Sorting the endpoints makes A->B and B->A share one cache entry.
        link_key = tuple(sorted((previous_coords, current_coords)))
        if link_key not in link_cache:
            link_cache[link_key] = geopy.distance.distance(previous_coords, current_coords).km
        link_distance = link_cache[link_key]

        route_distance += link_distance
        link_records.append({'link_distance': link_distance, 'coords': str(list(link_key))})
        previous_coords = current_coords

    route_records.append({'route_num': route_num,
                          'route_distance': route_distance,
                          'length': len(route),
                          'num_missed': num_missed})

# Build each frame once from the collected records; appending to a DataFrame
# inside the loop is quadratic and DataFrame.append no longer exists in pandas 2.
link_distances_df = pd.DataFrame(link_records, columns=['link_distance', 'coords', 'IPs'])
route_distances_df = pd.DataFrame(
    route_records, columns=['length', 'num_missed', 'route_distance', 'route_num'])
            

In [9]:
# Preview the first ten link rows.
link_distances_df.head(10)

Unnamed: 0,link_distance,coords,IPs
0,3908.468575,"[(32.7405, -117.0935), (40.7234, -74.0054)]",
1,3908.437847,"[(32.7405, -117.0935), (40.7128, -74.006)]",
2,5.514623,"[(25.7617, -80.1918), (25.7975, -80.23)]",
3,4114.157343,"[(25.7617, -80.1918), (37.2414, -121.782999999...",
4,2371.489087,"[(32.7767, -96.79700000000001), (37.5272, -122...",
5,1528.811423,"[(37.5272, -122.26100000000001), (39.6004, -10...",
6,4135.345541,"[(37.5272, -122.26100000000001), (40.7234, -74...",
7,52.89755,"[(37.2414, -121.78299999999999), (37.5272, -12...",
8,3875.612975,"[(40.7128, -74.0059), (47.6062, -122.332000000...",
9,3908.446296,"[(32.7405, -117.0935), (40.7128, -74.0059)]",


In [10]:
# Highest route number present in routes_df.  Series.max() runs vectorised and
# skips NaN, unlike the builtin max() iterating over the Series.
routes_df['route_num'].max()

303

In [11]:
# Preview the first ten per-route rows.
route_distances_df.head(10)

Unnamed: 0,length,num_missed,route_distance,route_num
0,16.0,11.0,0.0,1.0
1,14.0,9.0,7816.906422,2.0
2,15.0,10.0,4119.671965,3.0
3,11.0,4.0,3900.30051,4.0
4,14.0,9.0,4188.243091,5.0
5,14.0,9.0,11692.497118,6.0
6,14.0,9.0,4176.090469,7.0
7,15.0,11.0,4106.786224,8.0
8,16.0,4.0,22518.764435,9.0
9,18.0,10.0,14698.290697,10.0


In [12]:
# Share of each route's hops that were not counted as missed:
# (length - num_missed) / length.  Despite the column name this is a fraction
# in [0, 1], not a percentage.  Computed column-wise instead of row by row.
# The 'percentage_instact' spelling is kept because later cells refer to this
# exact column name.
route_distances_df['percentage_instact'] = (
    (route_distances_df['length'] - route_distances_df['num_missed'])
    / route_distances_df['length']
)

In [13]:
# Complement of the intact share: 1 - percentage_instact.
route_distances_df['percentage_missing'] = route_distances_df['percentage_instact'].rsub(1)

In [14]:
# Preview the per-route rows with the two new share columns.
route_distances_df.head(10)

Unnamed: 0,length,num_missed,route_distance,route_num,percentage_instact,percentage_missing
0,16.0,11.0,0.0,1.0,0.3125,0.6875
1,14.0,9.0,7816.906422,2.0,0.357143,0.642857
2,15.0,10.0,4119.671965,3.0,0.333333,0.666667
3,11.0,4.0,3900.30051,4.0,0.636364,0.363636
4,14.0,9.0,4188.243091,5.0,0.357143,0.642857
5,14.0,9.0,11692.497118,6.0,0.357143,0.642857
6,14.0,9.0,4176.090469,7.0,0.357143,0.642857
7,15.0,11.0,4106.786224,8.0,0.266667,0.733333
8,16.0,4.0,22518.764435,9.0,0.75,0.25
9,18.0,10.0,14698.290697,10.0,0.444444,0.555556


In [15]:
# Give the derived columns their final names (the distance column gains its
# unit), then preview the result.
column_renames = {
    'percentage_missing': 'percent_missing',
    'percentage_instact': 'percent_complete',
    'route_distance': 'route_distance(km)',
}
route_distances_df = route_distances_df.rename(columns=column_renames)
route_distances_df.head(10)

Unnamed: 0,length,num_missed,route_distance(km),route_num,percent_complete,percent_missing
0,16.0,11.0,0.0,1.0,0.3125,0.6875
1,14.0,9.0,7816.906422,2.0,0.357143,0.642857
2,15.0,10.0,4119.671965,3.0,0.333333,0.666667
3,11.0,4.0,3900.30051,4.0,0.636364,0.363636
4,14.0,9.0,4188.243091,5.0,0.357143,0.642857
5,14.0,9.0,11692.497118,6.0,0.357143,0.642857
6,14.0,9.0,4176.090469,7.0,0.357143,0.642857
7,15.0,11.0,4106.786224,8.0,0.266667,0.733333
8,16.0,4.0,22518.764435,9.0,0.75,0.25
9,18.0,10.0,14698.290697,10.0,0.444444,0.555556


In [16]:
# Number of routes summarised.
route_distances_df.shape[0]

303

In [17]:
# Routes where more than 80% of the hops were not missed.
mostly_complete = route_distances_df['percent_complete'].gt(0.8)
route_distances_df[mostly_complete]

Unnamed: 0,length,num_missed,route_distance(km),route_num,percent_complete,percent_missing
14,20.0,1.0,12419.470639,15.0,0.950000,0.050000
16,17.0,2.0,15120.282726,17.0,0.882353,0.117647
17,19.0,0.0,15890.383019,18.0,1.000000,0.000000
18,9.0,1.0,12956.788646,19.0,0.888889,0.111111
19,14.0,2.0,31672.558432,20.0,0.857143,0.142857
21,15.0,1.0,24711.825145,22.0,0.933333,0.066667
23,16.0,1.0,27036.896299,24.0,0.937500,0.062500
24,19.0,0.0,58234.033942,25.0,1.000000,0.000000
25,20.0,3.0,24461.216497,26.0,0.850000,0.150000
26,19.0,0.0,57152.992934,27.0,1.000000,0.000000


In [18]:
# Look at the first ten link rows again before filtering.
link_distances_df.head(10)

Unnamed: 0,link_distance,coords,IPs
0,3908.468575,"[(32.7405, -117.0935), (40.7234, -74.0054)]",
1,3908.437847,"[(32.7405, -117.0935), (40.7128, -74.006)]",
2,5.514623,"[(25.7617, -80.1918), (25.7975, -80.23)]",
3,4114.157343,"[(25.7617, -80.1918), (37.2414, -121.782999999...",
4,2371.489087,"[(32.7767, -96.79700000000001), (37.5272, -122...",
5,1528.811423,"[(37.5272, -122.26100000000001), (39.6004, -10...",
6,4135.345541,"[(37.5272, -122.26100000000001), (40.7234, -74...",
7,52.89755,"[(37.2414, -121.78299999999999), (37.5272, -12...",
8,3875.612975,"[(40.7128, -74.0059), (47.6062, -122.332000000...",
9,3908.446296,"[(32.7405, -117.0935), (40.7128, -74.0059)]",


In [19]:
# Keep only links with a strictly positive length.
has_positive_length = link_distances_df['link_distance'].gt(0)
link_distances_df = link_distances_df[has_positive_length]

In [20]:
# Number of links left after the filter.
link_distances_df.shape[0]

1790

In [21]:
# Number of routes with no missed hops at all (percent_complete exactly 1).
int(route_distances_df['percent_complete'].eq(1).sum())

16

In [22]:
# Show which fields are recorded for each row of routes_df.
routes_df.columns

Index(['IP', 'Loss', 'Snt', 'Last', 'Avg', 'Best', 'Wrst', 'StDev', 'time',
       'order', 'route_num'],
      dtype='object')

In [23]:
def get_end_latency(row, routes=None):
    """Return the 'Avg' value recorded for the last hop of a route.

    The last hop is the row of the hop table whose 'route_num' equals
    ``row['route_num']`` and whose 'order' equals ``row['length'] - 1``.
    If several rows match, the first one is used.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'route_num' and 'length'.
    routes : pandas.DataFrame, optional
        Hop table with 'route_num', 'order' and 'Avg' columns.  Defaults to
        the notebook-level ``routes_df``.

    Returns
    -------
    The 'Avg' entry of the matching hop.

    Raises
    ------
    IndexError
        If no hop with the expected order exists for the route.
    """
    if routes is None:
        routes = routes_df

    last_order = row['length'] - 1
    is_last_hop = (routes['order'] == last_order) & (routes['route_num'] == row['route_num'])
    last_hop_avg = routes.loc[is_last_hop, 'Avg']
    if last_hop_avg.empty:
        # Same exception type as the bare `.iloc[0]`, but saying which route is affected.
        raise IndexError(
            f"no hop with order {last_order} found for route {row['route_num']}")
    return last_hop_avg.iloc[0]

In [24]:
# Attach each route's last-hop 'Avg' value as the 'end_latency' column.
end_latencies = route_distances_df.apply(get_end_latency, axis=1)
route_distances_df['end_latency'] = end_latencies

In [25]:
# Number of routes whose end latency is strictly positive.
int(route_distances_df['end_latency'].gt(0).sum())

136

In [26]:
# Routes with a positive end latency that are also at least 80% complete.
has_latency = route_distances_df['end_latency'].gt(0)
mostly_complete = route_distances_df['percent_complete'].ge(.8)
int((has_latency & mostly_complete).sum())

70

In [33]:
def _first_row(frame, mask):
    """Return the first row of ``frame`` selected by ``mask``, or None if none match."""
    matches = frame.loc[mask]
    return None if matches.empty else matches.iloc[0]


def get_birds_eye_distance(row, routes=None, ip_info=None):
    """Return the geodesic distance in km between a route's start and end hops.

    The start hop is the row with ``order == 1`` and the end hop the row with
    ``order == row['length'] - 1``, both restricted to ``row['route_num']``.
    Each hop's IP is looked up in ``ip_info`` to obtain its ('lat', 'lon').

    NOTE(review): the end hop uses ``length - 1``, which suggests 'order' is
    0-based, so ``order == 1`` would be the second hop of the route - confirm
    that skipping the hop at order 0 is intentional.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'route_num' and 'length'.
    routes : pandas.DataFrame, optional
        Hop table with 'route_num', 'order' and 'IP' columns.  Defaults to the
        notebook-level ``routes_df``.
    ip_info : pandas.DataFrame, optional
        IP table with 'IP', 'status', 'lat' and 'lon' columns.  Defaults to the
        notebook-level ``IP_df``.

    Returns
    -------
    The distance in km, or -1 when it cannot be determined: an endpoint hop is
    absent, its IP is '???', the IP has no entry in ``ip_info``, or the entry's
    status is 'fail'.
    """
    if routes is None:
        routes = routes_df
    if ip_info is None:
        ip_info = IP_df

    on_route = routes['route_num'] == row['route_num']
    start_row = _first_row(routes, on_route & (routes['order'] == 1))
    end_row = _first_row(routes, on_route & (routes['order'] == row['length'] - 1))
    # A missing endpoint hop is reported with the same sentinel as an
    # unlocatable one instead of surfacing an IndexError.
    if start_row is None or end_row is None:
        return -1
    if start_row['IP'] == '???' or end_row['IP'] == '???':
        return -1

    IP_start_row = _first_row(ip_info, ip_info['IP'] == start_row['IP'])
    IP_end_row = _first_row(ip_info, ip_info['IP'] == end_row['IP'])
    if IP_start_row is None or IP_end_row is None:
        return -1
    if IP_start_row['status'] == 'fail' or IP_end_row['status'] == 'fail':
        return -1

    coords_1 = (IP_start_row['lat'], IP_start_row['lon'])
    coords_2 = (IP_end_row['lat'], IP_end_row['lon'])
    return geopy.distance.distance(coords_1, coords_2).km

In [34]:
# Attach the start-to-end straight-line distance of each route (-1 where unavailable).
birds_eye_km = route_distances_df.apply(get_birds_eye_distance, axis=1)
route_distances_df['birds_eye_distance(km)'] = birds_eye_km

In [35]:
# Routes with a positive bird's-eye distance that are also at least 80% complete.
has_birds_eye = route_distances_df['birds_eye_distance(km)'].gt(0)
mostly_complete = route_distances_df['percent_complete'].ge(0.8)
int((has_birds_eye & mostly_complete).sum())

66

In [36]:
# Persist the per-link distances, without the index column.
link_distances_df.to_csv(path_or_buf='link_distances_df.csv', index=False)

In [37]:
# Persist the per-route summary, without the index column.
route_distances_df.to_csv(path_or_buf='route_distances_df.csv', index=False)

In [32]:
# Number of hops at order 0 whose IP is not the '???' placeholder.
is_first_hop = routes_df['order'].eq(0)
has_address = routes_df['IP'].ne('???')
int((is_first_hop & has_address).sum())

173