## Part 0: Set-up

In [1]:
%load_ext autoreload 
%autoreload 2

import pandas as pd
from multiprocessing import Process
import data

## Part 1: Initial data collection
Download the data (and extract a random sample of urls).

Data comes from the CrUX dataset. More information can be found at https://github.com/zakird/crux-top-lists

In [2]:
# Build the working dataframe: load the dataset via the project `data` module,
# then move its index into an ordinary 'index' column.
df = data.get_dataset()
df = df.reset_index()

# Distinct values found in the 'rank' column.
ranks = pd.unique(df['rank'])

print(df)

         index                               origin     rank
0            0                https://www.globo.com     1000
1            1          https://www.rightmove.co.uk     1000
2            2                   https://mobcup.net     1000
3            3           https://www.sahibinden.com     1000
4            4                https://tamilyogi.dog     1000
...        ...                                  ...      ...
999995  999995  https://www.studysmartwithchris.com  1000000
999996  999996  https://www.radiomarcabarcelona.com  1000000
999997  999997               https://www.vehikit.fr  1000000
999998  999998            https://www.zoomtekno.com  1000000
999999  999999             https://taujenudvaras.lt  1000000

[1000000 rows x 3 columns]


## Part 2: Searching for AUPs

In [3]:
# run code, sit back, and wait ...
# NOTE(review): both calls below are commented out, so executing this cell is a no-op.
# Presumably the search is long-running and was disabled after its results were saved
# to disk -- TODO confirm before re-enabling.
# rank_df = data.rank_filter(df, 10000)
# data.get_aups_in_bucket(rank_df)

In [5]:
# Read the per-rank AUP url CSVs for the first three rank values and stack them
# into a single frame.
url_df = pd.concat(
    [
        pd.read_csv(f'../data/aup-urls/crux-aups-rank{rank}.csv')
        for rank in ranks[:3]
    ]
)

# Collapse to one row per AUP url, keeping the minimum of every other column
# for that url.
uniq_url_df = (
    url_df
    .groupby('aup')
    .min()
    .reset_index()
)

print(uniq_url_df)

                                                   aup  index
0      https://abadis.ir/entofa/acceptable-use-policy/   2314
1    https://about.gitlab.com/handbook/people-group...   1145
2    https://about.realestate.com.au/acceptable-use...   2111
3    https://accessories.three.co.uk/pages/acceptab...   8546
4    https://answers.microsoft.com/en-us/msoffice/f...   9700
..                                                 ...    ...
111  https://www.virginmedia.com/legal/acceptable-u...   6688
112  https://www.vodafone.co.uk/terms-and-condition...   7289
113  https://www.xfinity.com/corporate/customers/po...   1091
114  https://www.yumpu.com/en/document/view/2343917...   6963
115     https://zapier.com/legal/acceptable-use-policy   6236

[116 rows x 2 columns]


In [6]:
# Call data.get_aup_content once per unique AUP url, each in its own child
# process. The parent waits at most 10 seconds for the child and then
# terminates it unconditionally before moving on to the next url.
for idx, aup in zip(uniq_url_df['index'], uniq_url_df['aup']):
    # Output file is named after the url's 'index' value, zero-padded to 4 digits.
    fname = f'../data/crux-aups/{str(idx).zfill(4)}-current.txt'

    proc = Process(target=data.get_aup_content, args=[aup, fname])
    proc.start()
    proc.join(timeout=10)
    proc.terminate()

In [8]:
# URLs already collected from the CrUX search; used to skip duplicates below.
crux_urls = uniq_url_df['aup'].values

# Read one URL per line. Each line is stripped because iterating a text file
# keeps the trailing newline, which would make the duplicate check below never
# match and would pass a newline-terminated URL to the fetcher. The `with`
# block also closes the file handle.
with open('../data/aup-urls/googlesearch-aups.txt') as url_file:
    googlesearch_urls = [line.strip() for line in url_file]

# `idx` is the line position in the file, so output file names stay stable even
# when some lines are skipped.
for idx, aup in enumerate(googlesearch_urls):
    # Skip blank lines and URLs that the CrUX pass already covers.
    if not aup or aup in crux_urls:
        continue

    fname = f'../data/googlesearch-aups/{str(idx).zfill(3)}-current.txt'
    # Run the fetch in a child process: wait at most 10 seconds, then terminate
    # it unconditionally before moving on.
    p = Process(target=data.get_aup_content, args=[aup, fname])
    p.start()
    p.join(timeout=10)
    p.terminate()

## Part 3: Longitudinal data
Use the Wayback Machine API (https://archive.org/help/wayback_api.php) or the standalone Wayback CDX Server API (https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server#field-order) to query snapshots of each URL.

## Part 4: Scraping content from URLs