In [22]:
import requests

import json
import re
from pandas.io.json import json_normalize
import pandas as pd

from datetime import datetime, timedelta

import time

### Methods

In [23]:
## TODO
# The function does not distinguish between an API error and missing values:
# in both cases the affected attributes are reported as 'Na'.
# Placeholder stored for every attribute that cannot be read from the report.
_MISSING = 'Na'

# Output key -> path of nested keys inside the measurement JSON.
# The order of the entries defines the key order of the returned dict.
# Additional attributes from the measurement JSON can be added here.
_REPORT_FIELDS = (
    ("page_title", ('test_keys', 'control', 'http_request', 'title')),
    ("probe_platform", ('annotations', 'platform')),
    ("probe_network_type", ('annotations', 'network_type')),
    ("dns_ip", ('test_keys', 'client_resolver')),
    ("agent", ('test_keys', 'agent')),
    # Determined through OONI test logic
    ("blocking", ('test_keys', 'blocking')),
    # Raw test results
    ("dns_experiment_failure", ('test_keys', 'dns_experiment_failure')),
    ("http_experiment_failure", ('test_keys', 'http_experiment_failure')),
    # Status of the control measurement
    # (results may be inconsistent because the control measurement failed)
    ("control_failure", ('test_keys', 'control_failure')),
    ("control_http_failure", ('test_keys', 'control', 'http_request', 'failure')),
    ("control_dns_failure", ('test_keys', 'control', 'dns', 'failure')),
    # DNS response same as in control
    ("dns_consistency", ('test_keys', 'dns_consistency')),
    # Same headers as in control
    ("headers_match", ('test_keys', 'headers_match')),
    ("body_length_match", ('test_keys', 'body_length_match')),
)

# Raised by subscripting when a key is absent or an intermediate value is
# None / not a container.
_LOOKUP_ERRORS = (KeyError, IndexError, TypeError)


def _fetch_report(report_url, retry_wait, timeout):
    """Download and decode the report JSON; return None if both attempts fail."""
    for attempt in range(2):
        if attempt:
            # Wait before the single retry
            time.sleep(retry_wait)
        try:
            response = requests.get(report_url, timeout=timeout)
            if response.status_code == 200:
                return response.json()
        except (requests.RequestException, ValueError):
            # Connection problem, timeout or undecodable body: counts as a
            # failed attempt instead of aborting the whole download run.
            continue
    return None


def _lookup(report, path):
    """Follow `path` through nested containers; return 'Na' if it cannot be followed."""
    node = report
    try:
        for key in path:
            node = node[key]
    except _LOOKUP_ERRORS:
        return _MISSING
    return node


def get_report_outcome(report_url, retry_wait=10, timeout=60):
    """Fetch one measurement report and extract a flat dict of selected attributes.

    The report is requested from `report_url`; if the first attempt does not
    yield a decodable HTTP 200 response, one more attempt is made after
    `retry_wait` seconds.

    Parameters
    ----------
    report_url : str
        URL of the measurement JSON file.
    retry_wait : int or float, optional
        Seconds to sleep before the retry (default 10).
    timeout : int or float, optional
        Timeout in seconds passed to `requests.get` (default 60).

    Returns
    -------
    dict
        Always contains the same keys in the same order: the ones listed in
        `_REPORT_FIELDS` followed by "http_status_code" (the response codes of
        all requests joined with ';'). An attribute that is absent from the
        report is stored as 'Na'. If the report could not be fetched at all,
        every value is 'Na', so a download failure is not distinguishable
        from missing values in the result.
    """
    report = _fetch_report(report_url, retry_wait, timeout)

    ret = {name: _lookup(report, path) for name, path in _REPORT_FIELDS}

    try:
        # Response codes of each request, delimited with ';'
        entries = report['test_keys']['requests']
        ret['http_status_code'] = ';'.join(str(entry['response']['code']) for entry in entries)
    except _LOOKUP_ERRORS:
        ret['http_status_code'] = _MISSING

    ## TODO
    # possible to extract more fields from the result JSON

    return ret

# OONI `web_connectivity` Test
### Test Description: https://github.com/ooni/spec/blob/master/nettests/ts-017-web-connectivity.md

In [None]:
# Download all `web_connectivity` measurements of one country from the OONI API,
# one day at a time, and combine the API metadata of every measurement with the
# attributes extracted from its report into the DataFrame `ret`.

url = "https://api.ooni.io/api/v1/measurements"


##############################################
##### Adjust Start and End Time here:
##############################################
start = datetime.strptime("2020-02-01", "%Y-%m-%d")
end = datetime.strptime("2020-02-08", "%Y-%m-%d")
# Watch out for enough memory in case large periods of measurements are pulled

params = {
        "test_name" : "web_connectivity",  
        "probe_cc" : "GB",
        "limit" : "10000000"
        }

# Upper bound on consecutive retries of the same day after an HTTP 503,
# so that a permanently unavailable API cannot stall the loop forever.
MAX_503_RETRIES = 5

# One single-row DataFrame per measurement. They are concatenated once after
# the loop instead of growing a DataFrame row by row.
measurement_frames = []
retries_503 = 0
day = start


# Only get data for 1 day and not whole timeframe to avoid API timeout
while day <= end:
    # Set timeframe to get data
    day_label = day.strftime("%Y-%m-%d")
    params['since'] = day_label + "T00:00:00"
    params['until'] = day_label + "T23:59:59"

    print("## Getting data for " + day_label)
    print(datetime.now().strftime("%H:%M:%S"))

    response = requests.get(url, params=params)

    if response.status_code == 200:
        response_json = response.json()

        if response_json['metadata']['count'] > 0:
            # iterate through list of all reports
            for r in response_json['results']:
                outcome = get_report_outcome(r['measurement_url'])
                # Merge Metadata with test outcome (both frames have the single index 0)
                tmp_df = pd.merge(json_normalize(r), json_normalize(outcome), left_index=True, right_index=True)
                measurement_frames.append(tmp_df)

    # API Timeout
    elif response.status_code == 504:
        print("API Timeout.")
        print(datetime.now().strftime("%H:%M:%S"))

    # Service unavailable: wait and retry the same day, a limited number of times
    elif response.status_code == 503 and retries_503 < MAX_503_RETRIES:
        retries_503 += 1
        print('Server Error 503 waiting 5 sec')

        time.sleep(5)
        print('Retry API call.')
        continue

    # Server Error
    elif response.status_code == 500:
        # The day is skipped; no wait or retry is performed
        print('Server Error 500')
        print(datetime.now().strftime("%H:%M:%S"))

    else:
        # Also reached for a 503 once the retries are exhausted
        print("Bad API response - Status Code: " + str(response.status_code))

    day = day + timedelta(days=1)
    retries_503 = 0

    # Every frame holds exactly one measurement, so the list length is the row count
    print("Already " + str(len(measurement_frames)) + " measurements loaded.")

# Empty DataFrame instead of None when nothing could be loaded
ret = pd.concat(measurement_frames) if measurement_frames else pd.DataFrame()

ret.head()

In [25]:
# Derived columns computed from the measured URL stored in `input`.

# True when the URL starts with "https"
ret['https'] = ret['input'].map(lambda url: re.match(r"^https.*", url) is not None)

# Reduce the URL to its host part: drop the leading "http://" / "https://",
# an optional "www." prefix and everything from the first remaining "/" on
ret['domain'] = ret['input'].map(lambda url: re.sub(r'^https?://(www\.)?|/.*/?$', '', url))

In [None]:
# Memory usage of `ret` in MB (decimal: bytes / 1000 / 1000).
# deep=True makes pandas measure the contents of object (string) columns too;
# the default shallow measurement does not inspect them, so it under-reports
# a frame that mostly holds URLs and identifiers.
sum(ret.memory_usage(deep=True)) / 1000 / 1000

In [26]:
# Number of loaded measurements, followed by a preview of the first five rows
print(ret.shape[0])
ret.head(5)

10


Unnamed: 0,anomaly,confirmed,failure,input,measurement_id,measurement_start_time,measurement_url,probe_asn,probe_cc,report_id,...,http_experiment_failure,control_failure,control_http_failure,control_dns_failure,dns_consistency,headers_match,body_length_match,http_status_code,https,domain
0,False,False,False,http://khilafah.net/,temp-id-379164652,2020-02-01T21:46:42Z,https://api.ooni.io/api/v1/measurement/temp-id...,AS35228,GB,20200201T214637Z_AS35228_2EjGZ76gChblMZzgZsbl3...,...,,generic_timeout_error,Na,Na,,,,200,False,khilafah.net
0,False,False,False,http://www.rotten.com/,temp-id-379164651,2020-02-01T21:46:32Z,https://api.ooni.io/api/v1/measurement/temp-id...,AS35228,GB,20200201T214628Z_AS35228_IE22qw037SVvhTsiH2ozk...,...,,,generic_timeout_error,,consistent,,,200;302,False,rotten.com
0,False,False,False,http://www.protest.net/,temp-id-379164551,2020-02-01T21:44:09Z,https://api.ooni.io/api/v1/measurement/temp-id...,AS35228,GB,20200201T214315Z_AS35228_vR5hb82GJdlgWz7lVM2Za...,...,,,,,consistent,False,True,200,False,protest.net
0,False,False,False,http://www.ohchr.org/english/bodies/hrcouncil/,temp-id-379164552,2020-02-01T21:44:09Z,https://api.ooni.io/api/v1/measurement/temp-id...,AS35228,GB,20200201T214315Z_AS35228_vR5hb82GJdlgWz7lVM2Za...,...,,,,,consistent,True,True,404;302,False,ohchr.org
0,False,False,False,https://www.truecaller.com/,temp-id-379164549,2020-02-01T21:44:07Z,https://api.ooni.io/api/v1/measurement/temp-id...,AS35228,GB,20200201T214315Z_AS35228_vR5hb82GJdlgWz7lVM2Za...,...,,,,,consistent,True,True,200,True,truecaller.com


In [None]:
# Write all collected measurements to a CSV file, without the index column.
# The file name is fixed: adjust it by hand (e.g. add the timeframe that was
# downloaded) so that exports of different timeframes do not overwrite each other.
ret.to_csv('OONI GB WEBCON.csv',index=False)