## Part 0: Set-up

In [1]:
%load_ext autoreload 
%autoreload 2

import pandas as pd
from tqdm import tqdm
import data

## Part 1: Initial data collection
Download the data (and extract a random sample of URLs).

Data comes from the CrUX dataset. More information can be found at https://github.com/zakird/crux-top-lists

In [None]:
# Build the working frame: fetch the dataset and turn its index into regular column(s).
df = (
    data.get_dataset()
    .reset_index()
)
# Distinct values found in the `rank` column.
ranks = df.loc[:, 'rank'].unique()
print(df)

## Part 2: Searching for AUPs

In [None]:
# Long-running cell: start it, sit back, and wait.
# Filter `df` with the rank value 10000, then hand the filtered frame to the AUP search.
# NOTE(review): whether rank_filter keeps rank == 10000 or rank <= 10000 is not visible
# here -- confirm against data.rank_filter.
rank_df = data.rank_filter(df, 10000)
data.get_aups_in_bucket(rank_df)

In [2]:
# Read the two saved AUP URL lists (rank 1000 and rank 5000 files) and stack them.
df_1k, df_5k = map(
    pd.read_csv,
    ('../data/urls/aups-rank1000.csv', '../data/urls/aups-rank5000.csv'),
)
url_df = pd.concat((df_1k, df_5k))

# Collapse to one row per distinct `aup`, keeping the minimum of every other column.
uniq_url_df = (
    url_df
    .groupby('aup')
    .min()
    .reset_index()
)

print(uniq_url_df)

                                                  aup  index
0     https://abadis.ir/entofa/acceptable-use-policy/   2314
1   https://about.gitlab.com/handbook/people-group...   1145
2   https://about.realestate.com.au/acceptable-use...   2111
3   https://assets.publishing.service.gov.uk/gover...    301
4                         https://aws.amazon.com/aup/     43
..                                                ...    ...
67  https://www.slideshare.net/gfair3/acceptable-u...    585
68  https://www.studocu.com/en-us/document/grand-c...     21
69  https://www.twitch.tv/p/en/legal/channel-point...   1367
70  https://www.verizon.com/about/terms-conditions...   4072
71  https://www.xfinity.com/corporate/customers/po...   1091

[72 rows x 2 columns]


In [6]:
# Call data.get_aup_content for every unique AUP URL, carrying on past individual failures.
# Each failure is reported and also recorded here so it can be inspected or retried later.
failed_aups = []  # list of (aup, idx, exception)

# Walk the two needed columns directly instead of indexing row-by-row with .iloc;
# `total` keeps the progress bar's length, since the row iterator has no len().
aup_rows = uniq_url_df[['aup', 'index']].itertuples(index=False, name=None)
for aup, idx in tqdm(aup_rows, total=len(uniq_url_df)):
    # tqdm.write emits the line without garbling the active progress bar.
    tqdm.write(f'{aup} {idx}')

    try:
        data.get_aup_content(aup, idx)
    except Exception as e:
        # Deliberate best-effort: one bad URL must not abort the whole run.
        tqdm.write(f'on {aup}, exception occurred {e}')
        failed_aups.append((aup, idx, e))

https://www.barnesandnoble.com/w/acceptable-use-policy-a-complete-guide-2020-edition-gerardus-blokdyk/1136391212 3930


## Part 3: Longitudinal data
Use the Wayback Machine API (https://archive.org/help/wayback_api.php) or the standalone Wayback CDX Server API (https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server#field-order) to query snapshots of each URL.

## Part 4: Scraping content from URLs