## Part 0: Set-up

In [1]:
import requests
import os
import gzip
import pandas as pd
import random
from googlesearch import search
from tqdm import tqdm
import csv 
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from io import BytesIO

random.seed(0)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Part 1: Initial data collection
Download the data (and extract a random sample of urls).

Data comes from the CrUX dataset. More information can be found at https://github.com/zakird/crux-top-lists

In [2]:
# download crux dataset (if needed) and read into a dataframe
def get_dataset(fname='current.csv'):
  """Return the CrUX top-sites list as a DataFrame, downloading it if needed.

  Args:
    fname: local path of the cached CSV. When the file does not exist the
      current global list is downloaded and stored there.

  Returns:
    pandas.DataFrame read from `fname`.

  Raises:
    requests.HTTPError: if the download responds with an error status.
  """
  gzip_magic = b'\x1f\x8b'

  if not os.path.isfile(fname):
    print('downloading data...')
    url = 'https://raw.githubusercontent.com/zakird/crux-top-lists/main/data/global/current.csv.gz'
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # the payload is already a gzip archive: decompress it once instead of
    # re-compressing it and shelling out to `gzip -d`
    payload = response.content
    if payload[:2] == gzip_magic:
      payload = gzip.decompress(payload)

    # write to a temporary name first so an interrupted download never leaves
    # a truncated file that later runs would mistake for a valid cache
    tmp_name = f'{fname}.part'
    with open(tmp_name, 'wb') as f:
      f.write(payload)
    os.replace(tmp_name, fname)

  # caches written by earlier versions of this function may still be gzipped;
  # sniff the magic bytes rather than guessing with try/except
  with open(fname, 'rb') as f:
    is_gzipped = f.read(2) == gzip_magic
  return pd.read_csv(fname, compression='gzip' if is_gzipped else None)

# returns a new dataframe of only websites with specified rank
def rank_filter(df, rank):
  """Select the rows of `df` whose 'rank' column equals `rank`.

  The original row labels are preserved in the returned frame.
  """
  in_bucket = df['rank'].eq(rank)
  return df.loc[in_bucket]

In [3]:
# load the CrUX list; reset_index() adds an explicit 'index' column holding each row's position
df = get_dataset().reset_index(drop=False)
# distinct rank values present in the list
ranks = pd.unique(df['rank'])
print(df)

downloading data...
         index                               origin     rank
0            0                https://www.globo.com     1000
1            1          https://www.rightmove.co.uk     1000
2            2                   https://mobcup.net     1000
3            3           https://www.sahibinden.com     1000
4            4                https://tamilyogi.dog     1000
...        ...                                  ...      ...
999995  999995  https://www.studysmartwithchris.com  1000000
999996  999996  https://www.radiomarcabarcelona.com  1000000
999997  999997               https://www.vehikit.fr  1000000
999998  999998            https://www.zoomtekno.com  1000000
999999  999999             https://taujenudvaras.lt  1000000

[1000000 rows x 3 columns]


## Part 2: Searching for AUPs

In [7]:
# try removing trivial components of url
# for simplicity, any component consisting of 3 or fewer characters
def nontrivial_cmp(orig_url, aup_url):
  """Decide whether `aup_url` looks like an acceptable use policy of `orig_url`.

  Two conditions must hold:
    1. the hostnames share at least one non-trivial label (longer than three
       characters, which drops 'www', 'com', 'net', ...);
    2. the candidate url contains 'aup' or 'acceptable-use-policy'.

  Only hostnames are compared, so dotted text in the path of `aup_url` cannot
  fake a domain match, and both http:// and https:// urls are handled.
  Matching is case-insensitive.

  Args:
    orig_url: origin of the site being investigated.
    aup_url: candidate url (e.g. a search result).

  Returns:
    True when both conditions hold, False otherwise.
  """
  from urllib.parse import urlsplit  # local import: not part of the notebook's import cell

  def host_labels(url):
    # urlsplit only recognises the host when a scheme or a leading '//' is present
    target = url if '://' in url else f'//{url}'
    try:
      host = urlsplit(target).hostname or ''  # .hostname is already lower-cased
    except ValueError:  # malformed netloc, e.g. an unbalanced IPv6 bracket
      host = ''
    return [label for label in host.split('.') if len(label) > 3]

  orig_labels = set(host_labels(orig_url))
  shares_label = any(label in orig_labels for label in host_labels(aup_url))

  lowered = aup_url.lower()
  mentions_policy = 'aup' in lowered or 'acceptable-use-policy' in lowered
  return shares_label and mentions_policy

# take top google search results and see if any contain an aup
# search-width: n
def get_search_results(url, n=5, pause=0):
  """Return the first of the top `n` Google results that looks like an AUP of `url`.

  Args:
    url: origin to search for.
    n: search width, i.e. how many results to inspect.
    pause: value forwarded to `search` as its `pause` argument. The default of 0
      keeps the previous behaviour; NOTE a pause of 0 may lead to
      429 Too Many Requests, so raise it for long runs.

  Returns:
    The first result accepted by `nontrivial_cmp`, or None if there is none.
  """
  search_str = f'{url} acceptable use policy'
  for result in search(search_str, num=n, stop=n, pause=pause):
    if nontrivial_cmp(url, result):
      return result
  return None

# get aup's in the bucket specified by a given rank
# don't worry about duplicates here, remove them later in a dataframe
def get_aups_in_bucket(partial_df, flush_every=1000):
  """Search for an AUP for every row of `partial_df` and store hits in a CSV.

  Results go to 'aups-rank{rank}.csv' (columns: index, aup), where `rank` is
  taken from the first row. The file is recreated on every call. Hits are
  appended in batches so progress survives an interruption, and whatever is
  left after the last full batch is written when the loop finishes.

  Args:
    partial_df: frame with 'index', 'origin' and 'rank' columns, expected to
      hold a single rank bucket.
    flush_every: number of searches between writes to disk (must be positive).
  """
  if len(partial_df) == 0:
    print('empty bucket, nothing to search')
    return

  rank = partial_df.iloc[0]['rank']
  out_path = f'aups-rank{rank}.csv'

  # create a new file to store aup urls (newline='' as required by the csv module)
  with open(out_path, 'w', newline='') as f:
    csv.writer(f).writerow(['index', 'aup'])

  def flush(rows):
    # append one batch of (index, aup) pairs and report how many were found
    print(f' found {len(rows)} new aups')
    with open(out_path, 'a', newline='') as f:
      csv.writer(f).writerows(rows)

  aups = []
  for i in tqdm(range(len(partial_df))):
    row = partial_df.iloc[i]
    idx = row['index']
    url = row['origin']

    # run google search for relevant aup
    result = get_search_results(url)
    if result:
      aups.append((idx, result))

    # print progress update and write to file after every `flush_every` searches
    if i % flush_every == 0 and i != 0:
      flush(aups)
      aups = []

  # hits collected after the last periodic flush used to be silently dropped
  if aups:
    flush(aups)

In [None]:
# Search for AUPs in one rank bucket; only the rank-5000 bucket is run here.
# NOTE(review): the original note read "run everything up to rank 50,000 (first four buckets)" --
# presumably this cell is re-run with a different rank for each bucket; confirm.
rank_df = rank_filter(df, 5000)
get_aups_in_bucket(rank_df)

## Part 3: Longitudinal data
Use the Wayback Machine API (https://archive.org/help/wayback_api.php) or the standalone Wayback CDX Server API (https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server#field-order) to query snapshots of each url.

In [None]:
# query the Wayback CDX Server API
# this provides more complex support, including returning all available snapshots
def query_cdx(url):
  """Query the Wayback CDX Server API for the captures of `url`.

  The target url is passed through `params` so that any '?', '&' or '#' it
  contains is percent-encoded instead of corrupting the CDX query string.

  Args:
    url: the page whose captures are requested.

  Returns:
    The decoded JSON response: a list of rows with the fields timestamp,
    digest, length, original. An empty list is returned when the response
    body is empty.

  Raises:
    requests.HTTPError: if the API responds with an error status.
  """
  params = {
    'url': url,
    'fl': 'timestamp,digest,length,original',
    'output': 'json',
  }
  response = requests.get('http://web.archive.org/cdx/search/cdx', params=params, timeout=60)
  response.raise_for_status()

  # an empty body is not valid JSON; treat it as "no rows"
  if not response.text.strip():
    return []
  return response.json()

# get urls of all wayback machine snapshots from a current url
def get_snapshots(url):
  """Build Wayback Machine snapshot urls for every capture row of `url`.

  Returns:
    (snapshots, timestamps): two parallel lists, one entry per row returned
    by `query_cdx` after the first (header) row.
  """
  captures = query_cdx(url)[1:]  # first row is the header

  snapshots, timestamps = [], []
  for capture in captures:
    # field 0 is the timestamp, field 3 is the original url
    stamp, original = capture[0], capture[3]
    timestamps.append(stamp)
    snapshots.append(f'http://web.archive.org/web/{stamp}/{original}')
  return snapshots, timestamps

## Part 4: Scraping content from URLs

In [None]:
def get_aup_content(aup_url, index, timestamp='current'):
  """Download `aup_url`, extract its text and save it to a .txt file.

  HTML pages are stripped of non-content elements before the text is taken;
  PDFs are read page by page. Any other content type is reported and yields
  an empty text. The cleaned text is written to
  './{index zero-padded to 6 digits}-{timestamp}.txt'.

  Args:
    aup_url: url of the policy (live page or archived snapshot).
    index: identifier used as the file name prefix.
    timestamp: label used as the file name suffix.
  """
  response = requests.get(aup_url, timeout=30)
  padded_index = str(index).zfill(6)
  # the header may be absent; compare case-insensitively
  raw_content_type = response.headers.get('Content-Type', '')
  content_type = raw_content_type.lower()
  text = ""

  if 'text/html' in content_type:
    # name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed
    soup = BeautifulSoup(response.content, 'html.parser')
    elems_to_rm = ['style', 'script', 'head', 'title', 'nav', 'header', 'footer', 'button', 'a']
    for s in soup(elems_to_rm):
      s.extract()
    text = soup.get_text()

  elif 'pdf' in content_type:
    # src: https://wellsr.com/python/read-pdf-files-with-python-using-pypdf2/
    pdf = PdfReader(BytesIO(response.content))
    # guard against pages for which no text is returned
    text = ''.join(page.extract_text() or '' for page in pdf.pages)

  else:
    print('unrecognized content type', raw_content_type)
    # TODO return error

  # strip leading/trailing space and drop blank lines
  lines = (line.strip() for line in text.splitlines())
  cleantext = '\n'.join(line for line in lines if line)

  # write to file (explicit encoding: policies routinely contain non-ASCII text)
  with open(f'./{padded_index}-{timestamp}.txt', 'w', encoding='utf-8') as f:
    f.write(cleantext)