part 1

In [None]:
import requests
import os
import gzip
import pandas as pd

# download crux dataset (if needed) and read into a dataframe
def get_dataset(fname='current.csv'):
  """Load the CrUX top-sites list into a DataFrame, downloading it if needed.

  Parameters
  ----------
  fname : str
      Local path of the CSV file. If no file exists at this path, the gzipped
      list is downloaded from the crux-top-lists GitHub repository,
      decompressed in memory and written to `fname` as plain CSV.

  Returns
  -------
  pandas.DataFrame
      The parsed contents of `fname`.

  Raises
  ------
  requests.HTTPError
      If the download answers with an error status.
  """
  if not os.path.isfile(fname):
    print('downloading data...')
    url = 'https://raw.githubusercontent.com/zakird/crux-top-lists/main/data/global/current.csv.gz'
    response = requests.get(url)
    # fail loudly instead of caching an error page as the dataset
    response.raise_for_status()

    # The payload is already gzip-compressed, so decompress it in memory and
    # store plain CSV. Writing it through gzip.open() would compress it a
    # second time, and shelling out to `gzip -d` is not portable.
    with open(fname, 'wb') as f:
      f.write(gzip.decompress(response.content))

  # A file cached by an earlier version of this function may still hold gzip
  # data despite its .csv name; detect that from the gzip magic bytes instead
  # of relying on a bare except.
  with open(fname, 'rb') as f:
    is_gzipped = f.read(2) == b'\x1f\x8b'
  return pd.read_csv(fname, compression='gzip' if is_gzipped else None)


# load the crux list into a dataframe (get_dataset downloads it if 'current.csv' is missing)
df = get_dataset()

In [None]:
import random
# NOTE(review): no seed value is passed, so the samples drawn below are not
# reproducible across runs -- pass a fixed integer here if they should be
random.seed()

# returns a new dataframe of only websites with specified rank
def rank_filter(df, rank):
  """Return the rows of `df` whose 'rank' column equals `rank`."""
  has_rank = df['rank'] == rank
  return df[has_rank]

# extract a random sample of n urls
def get_sample(df, n, save=None):
  """Draw `n` distinct random urls from the 'origin' column of `df`.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with an 'origin' column holding the urls.
  n : int
      Sample size. (Previously this argument was overwritten with 100 inside
      the function, so the caller's value was ignored.)
  save : str, optional
      If given, the sampled urls are also written to this file, one per line.

  Returns
  -------
  list
      The sampled urls, in the order they were drawn.

  Raises
  ------
  ValueError
      If `n` is larger than the number of rows in `df` (from random.sample).
  """
  population_size = len(df)
  sample_idxs = random.sample(range(population_size), n)
  origins = df['origin']
  sample_urls = [origins.iloc[i] for i in sample_idxs]

  # if save parameter is provided, save urls to specified output file
  if save:
    with open(save, 'w') as f:
      f.write('\n'.join(sample_urls))
  return sample_urls


# keep only the websites in the rank-1000 bucket
df1000 = rank_filter(df, rank=1000)
print(df1000)

# draw one sample from the full list and one from the rank-1000 bucket,
# saving each to its own text file
urls = get_sample(df, n=100, save='urls.txt')
urls1000 = get_sample(df1000, n=100, save='urls1000.txt')

part 2

In [None]:
%pip install beautifulsoup4 
%pip install google

from googlesearch import search
from tqdm import tqdm

def print_search_results(url, result):
  """Print `url` and `result` on separate lines, followed by a blank line."""
  print(url, result, '', sep='\n')

# take top google search results and see if any contain an aup
# search-width: n
def get_search_results(url, condition=(lambda x, y: 'aup' in y or 'acceptable-use-policy' in y), n=5):
  """Return the first search hit for `url` that satisfies `condition`.

  Runs `search` for '<url> acceptable use policy' (num=n, stop=n, pause=1) and
  tests the hits in order with condition(url, hit). The default condition
  accepts a hit that contains 'aup' or 'acceptable-use-policy'.

  Returns the first accepted hit, or None when no hit is accepted.
  """
  query = f'{url} acceptable use policy'
  hits = search(query, num=n, stop=n, pause=1)
  # next() stops consuming hits as soon as one passes the condition
  return next((hit for hit in hits if condition(url, hit)), None)


# example: look up an aup for the first five sampled urls
aups = set()
for url in urls[:5]:
  result = get_search_results(url)
  if not result:
    continue
  aups.add(result)
  print_search_results(url, result)

print(f'{len(aups)} results found')

In [None]:
import csv 

# get aup's in the bucket specified by a given rank
# don't worry about duplicates here, remove them later in a dataframe
def get_aups_in_bucket(partial_df):
  """Search for an aup url for every row of `partial_df` and save the hits.

  Hits are written to 'aups-rank{rank}.csv' (columns: index, aup), where
  `rank` is read from the first row of `partial_df`. The file is created
  fresh on every call and appended to after every 1000 searches, plus once
  more at the end so hits found after the last checkpoint are not lost.

  Parameters
  ----------
  partial_df : pandas.DataFrame
      Rows to search. Reads the 'rank', 'index' and 'origin' columns.
      NOTE(review): 'index' is read as a regular column, so the frame
      presumably went through reset_index() upstream -- confirm with caller.

  Raises
  ------
  IndexError
      If `partial_df` is empty (the rank cannot be determined).
  """
  rank = partial_df.iloc[0]['rank']
  out_path = f'aups-rank{rank}.csv'

  # create a new file to store aup urls
  # newline='' leaves line endings to the csv module (avoids blank rows on Windows)
  with open(out_path, 'w', newline='') as f:
    csv.writer(f).writerow(['index', 'aup'])

  def append_rows(rows):
    # append one batch of (index, aup) rows to the output file
    with open(out_path, 'a', newline='') as f:
      csv.writer(f).writerows(rows)

  aups = []
  for i in tqdm(range(len(partial_df))):
    row = partial_df.iloc[i]
    idx = row['index']
    url = row['origin']

    # run google search for relevant aup
    result = get_search_results(url)
    if result:
      aups.append((idx, result))

    # print progress update and write to file after every 1000 searches
    if i % 1000 == 0 and i != 0:
      print(f' found {len(aups)} new aups')
      append_rows(aups)
      aups = []

  # flush the hits collected since the last checkpoint; without this, any
  # input that does not end exactly on a checkpoint loses its final batch
  if aups:
    append_rows(aups)

part 3

In [None]:
# query the Wayback Availability JSON API
# limitation: only returns a single snapshot (most recent or closest to query timestamp)
def query_wayback(url):
  """Query the Wayback Availability JSON API for `url`.

  Returns the decoded JSON response. Raises requests.HTTPError if the API
  answers with an error status.
  """
  # Pass the target through `params` so requests percent-encodes it; when it
  # is interpolated raw, any '&', '#' or '?' inside `url` corrupts the query.
  response = requests.get('https://archive.org/wayback/available', params={'url': url})
  response.raise_for_status()
  return response.json()

# query the Wayback CDX Server API
# this provides more complex support, including returning all available snapshots
def query_cdx(url):
  """Query the Wayback CDX Server API for the snapshots of `url`.

  Requests the fields timestamp, digest, length and original as JSON and
  returns the decoded response. Raises requests.HTTPError if the API answers
  with an error status.
  """
  # Pass the values through `params` so requests percent-encodes them; when
  # `url` is interpolated raw, any '&', '#' or '?' inside it corrupts the query.
  params = {
    'url': url,
    'fl': 'timestamp,digest,length,original',
    'output': 'json',
  }
  response = requests.get('http://web.archive.org/cdx/search/cdx', params=params)
  response.raise_for_status()
  return response.json()


# example query results
# NOTE: `aups` must already exist in the kernel and be non-empty for this cell to run
aups = list(aups)
x = aups[0]
print(query_wayback(x))
# one cdx row per line
print('\n'.join(str(row) for row in query_cdx(x)))

In [None]:
# get urls of all wayback machine snapshots from a current url
def get_snapshots(url):
  """Build the Wayback Machine snapshot urls for `url`.

  Returns a pair (snapshots, timestamps): one snapshot url and one timestamp
  per row returned by query_cdx, in the same order.
  """
  rows = query_cdx(url)[1:]  # first row is the header

  snapshots, timestamps = [], []
  for row in rows:
    # row[0] is the timestamp and row[3] is the original url
    timestamp, original = row[0], row[3]
    snapshots.append(f'http://web.archive.org/web/{timestamp}/{original}')
    timestamps.append(timestamp)
  return snapshots, timestamps


# example: list every snapshot url of `x`, one per line
snapshots, _ = get_snapshots(x)
print(*snapshots, sep='\n')

part 4

In [1]:
%pip install PyPDF2

from bs4 import BeautifulSoup
import requests
from PyPDF2 import PdfReader
from io import BytesIO
import random # TODO remove later

## below code can be replaced by using requests
# from urllib.request import Request, urlopen
# import ssl
# ctx = ssl.create_default_context() # pass this into urlopen as context=ctx
# ctx.set_ciphers('DEFAULT')

# aup = 'https://www.whitman.edu/technology-services/policies/acceptable-use-policy'
# aup = 'https://www.kean.edu/media/computer-related-acceptable-use-policy' # pdf... TT

# combining part 3 and 4 to test scraping from Wayback Machine:
aup = 'http://web.archive.org/web/20230518110433/https://aws.amazon.com/aup/' # snapshot
# random placeholder id in 0..1000000 (both ends included)
index = random.randrange(1000001)

def get_aup_content(aup_url, index, timestamp='current'):
  """Download an aup page (HTML or PDF), extract its text and save it to disk.

  The cleaned text is written to '../data/aups/{index}-{timestamp}.txt', with
  `index` zero-padded to six digits. That directory must already exist.

  Parameters
  ----------
  aup_url : str
      Url of the page or PDF to scrape.
  index : int
      Identifier used as the file name prefix.
  timestamp : str
      Suffix of the file name; defaults to 'current'.

  Notes
  -----
  If the response is neither HTML nor PDF, a message is printed and no file
  is written.
  """
  response = requests.get(aup_url)
  padded_index = str(index).zfill(6)
  # a response without a Content-Type header is treated as unrecognized
  content_type = response.headers.get('Content-Type', '')

  if 'text/html' in content_type:
    # name the parser explicitly so the output does not depend on which
    # parsers happen to be installed
    soup = BeautifulSoup(response.content, 'html.parser')
    # page chrome and non-content elements to drop before extracting text
    elems_to_rm = ['style', 'script', 'head', 'title', 'nav', 'header', 'footer', 'button', 'a']
    for elem in soup(elems_to_rm):
      elem.extract()
    text = soup.get_text()

  elif 'pdf' in content_type:
    # src: https://wellsr.com/python/read-pdf-files-with-python-using-pypdf2/
    pdf = PdfReader(BytesIO(response.content))
    # `or ''` guards against a page that yields no text
    text = ''.join(page.extract_text() or '' for page in pdf.pages)

  else:
    print('unrecognized content type', content_type)
    # nothing to extract: do not leave an empty file behind
    return None

  # strip leading/trailing space and drop blank lines
  lines = (line.strip() for line in text.splitlines())
  cleantext = '\n'.join(line for line in lines if line)

  # write to file; utf-8 so non-ASCII text does not fail on platforms whose
  # default encoding cannot represent it
  fname = f'../data/aups/{padded_index}-{timestamp}.txt'
  with open(fname, 'w', encoding='utf-8') as f:
    f.write(cleantext)


# example: scrape the snapshot url above and save its text under the random index
get_aup_content(aup, index)


[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


### miscellaneous

In [None]:
## scraping urls with selenium

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
# NOTE: machine-specific Chrome location -- adjust for the machine running this cell
options.binary_location = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

search_str = 'acceptable use policy'
try:
    driver.get(f"https://www.google.com/search?q={search_str}")
    page_source = driver.page_source
finally:
    # always shut the browser down, even when loading the page fails
    driver.quit()

soup = BeautifulSoup(page_source, 'html.parser')

# Deliberately not named `search`: that would rebind the googlesearch
# `search()` function that get_search_results calls and break it for the
# rest of the kernel session.
result_divs = soup.find_all('div', class_="yuRUbf")
for h in result_divs:
    # skip result containers that have no link
    if h.a is not None:
        print(h.a.get('href'))

In [None]:
## url parsing and string matching attempts D:

import urllib.parse
import socket

# sample input for the get_ip example below
url = 'https://google.com'

def get_ip(url):
  """Print the hostname of `url` and the IPv4 address it resolves to.

  Returns
  -------
  str
      The resolved IPv4 address.

  Raises
  ------
  ValueError
      If `url` contains no hostname (e.g. the scheme is missing).
  socket.gaierror
      If the hostname cannot be resolved.
  """
  # .hostname drops the userinfo and port that .netloc keeps
  # ('user@host:8080'), which gethostbyname cannot resolve
  hostname = urllib.parse.urlparse(url).hostname
  if not hostname:
    raise ValueError(f'no hostname found in url: {url!r}')
  print(hostname)
  ip = socket.gethostbyname(hostname)
  print(ip)
  return ip


# example: resolve the sample url defined above
get_ip(url)

### master csv for final dataset

In [21]:
import pandas as pd
import os
import csv

# Build ../data/master.csv: one row per scraped aup text file, linking the
# file back to the url it was scraped from and to its location on GitHub.
datapath = 'data/final_data/'
cruxpath = f'../{datapath}crux/'
googlepath = f'../{datapath}googlesearch/'
github_url = f'https://github.com/kyeling/cse256-aup-project/tree/main/{datapath}'

with open('../data/master.csv', newline='', mode='w') as f:
    w = csv.writer(f)
    w.writerow(['id', 'source', 'url', 'filepath', 'link-to-filepath', 'sector'])

    # crux files: the id before the first '-' in the file name is looked up
    # in the 'index' column of crux-aups.csv
    crux_df = pd.read_csv('../data/aup-urls/crux-aups.csv')
    crux_df = crux_df.set_index('index')
    # sorted() makes the row order reproducible (os.listdir order is arbitrary)
    for fname in sorted(os.listdir(cruxpath)):
        if fname == '.gitignore': continue
        # `file_id` rather than `id`, so the builtin id() is not shadowed in the kernel
        file_id = fname.split('-', 1)[0]
        aup_url = crux_df.loc[int(file_id)]['aup']
        # the link must include the 'crux/' subfolder the file lives in;
        # 'sector' is left empty so every row matches the 6-column header
        w.writerow([file_id, 'crux', aup_url, fname, f'{github_url}crux/{fname}', ''])

    # googlesearch files: the id is the line number (0-based) in googlesearch-aups.txt
    with open('../data/aup-urls/googlesearch-aups.txt') as url_file:
        googlesearch_list = url_file.readlines()
    for fname in sorted(os.listdir(googlepath)):
        if fname == '.gitignore': continue
        file_id = fname.split('-', 1)[0]
        aup_url = googlesearch_list[int(file_id)].replace('\n', '')
        w.writerow([file_id, 'googlesearch', aup_url, fname, f'{github_url}googlesearch/{fname}', ''])