## Part 0: Set-up

In [None]:
%load_ext autoreload 
%autoreload 2

import pandas as pd
from multiprocessing import Process
import data

## Part 1: Initial data collection
Data comes from the CrUX dataset. More information can be found at https://github.com/zakird/crux-top-lists

In [None]:
# Build the working dataframe from the project's dataset loader.
df = data.get_dataset()
# reset_index() turns the existing index into an ordinary column.
df = df.reset_index()

# Distinct values of the 'rank' column; later cells use the first three.
ranks = df['rank'].unique()

print(df)

## Part 2: Searching for AUPs
Get the data in batches by rank (1k, 5k, 10k)

In [None]:
# Long-running step: run the code, sit back, and wait ...
# For each of the first three rank values, narrow df with data.rank_filter and
# hand the result to data.get_aups_in_bucket.
# NOTE(review): presumably this is what produces the
# ../data/aup-urls/crux-aups-rank{r}.csv files read afterwards -- confirm in data.py.
for bucket_rank in ranks[:3]:
    data.get_aups_in_bucket(data.rank_filter(df, bucket_rank))

In [None]:
# Stack the per-rank AUP url files (first three rank values) into one dataframe
# that the scraping cells work from.
url_df = pd.concat(
    pd.read_csv(f'../data/aup-urls/crux-aups-rank{r}.csv')
    for r in ranks[:3]
)

# One row per distinct 'aup' url: group on the url, keep the minimum of every
# other column within each group, then turn 'aup' back into a regular column.
uniq_url_df = url_df.groupby('aup').min()
uniq_url_df = uniq_url_df.reset_index()

print(uniq_url_df)

In [None]:
# scrape aup contents
# Each url is handled in its own child process so that a call which does not
# return within 10 seconds can be abandoned instead of stalling the whole loop.
for idx, aup in zip(uniq_url_df['index'], uniq_url_df['aup']):
    # output file is named after the zero-padded (4 digit) value of the 'index' column
    fname = f'../data/crux-aups/{str(idx).zfill(4)}-current.txt'

    p = Process(target=data.get_aup_content, args=(aup, fname))
    p.start()
    p.join(timeout=10)

    if p.is_alive():
        # still running after the timeout: stop it, then join again so the
        # terminated child is reaped rather than left behind as a zombie
        p.terminate()
        p.join()

In [None]:
# scrape aup contents of aup urls retrieved through google search
crux_urls = uniq_url_df['aup'].values
# set built once for O(1) duplicate checks inside the loop
known_crux_urls = set(crux_urls)

# One url per line. Strip the line ending: with the raw '\n' still attached a
# url never compares equal to an entry of crux_urls (so nothing was ever
# skipped) and the newline was passed along to the scraper as part of the url.
with open('../data/aup-urls/googlesearch-aups.txt') as url_file:
    googlesearch_urls = [line.strip() for line in url_file]

# idx is the 0-based line number in googlesearch-aups.txt; it is kept for
# skipped entries too so that output file names keep pointing at the right line.
for idx, aup in enumerate(googlesearch_urls):
    # skip blank lines and urls already scraped from the CrUX list
    if not aup or aup in known_crux_urls:
        continue

    fname = f'../data/googlesearch-aups/{str(idx).zfill(3)}-current.txt'
    p = Process(target=data.get_aup_content, args=(aup, fname))
    p.start()
    p.join(timeout=10)

    if p.is_alive():
        # still running after the 10 second timeout: stop it, then join again
        # so the terminated child is reaped
        p.terminate()
        p.join()

## Obtaining the master CSV file for the final dataset

In [None]:
import pandas as pd
import os
import csv

# Locations of the final dataset: local directories (relative to this notebook)
# and the matching GitHub tree url. Both are derived from datapath, which ends
# with '/', so github_url ends with '/' as well.
datapath = 'data/final_data/'
cruxpath = f'../{datapath}crux/'
googlepath = f'../{datapath}googlesearch/'
github_url = f'https://github.com/kyeling/cse256-aup-project/tree/main/{datapath}'

# Write one row per scraped file to ../data/master.csv, mapping the numeric
# prefix of each file name back to the url it was scraped from.
with open('../data/master.csv', newline='', mode='w') as f:
    w = csv.writer(f)
    # NOTE(review): the rows below supply five values, so 'sector' is left
    # unfilled -- presumably it is filled in afterwards; confirm.
    w.writerow(['id', 'source', 'url', 'filepath', 'link-to-filepath', 'sector'])

    # CrUX files: the prefix is a value of the 'index' column of crux-aups.csv
    crux_df = pd.read_csv('../data/aup-urls/crux-aups.csv')
    crux_df = crux_df.set_index('index')
    # sorted(): os.listdir returns names in arbitrary order; sorting keeps the
    # generated csv reproducible between runs
    for fname in sorted(os.listdir(cruxpath)):
        if fname == '.gitignore':
            continue
        file_id, _ = fname.split('-')
        url = crux_df.loc[int(file_id), 'aup']
        # github_url already ends with '/': adding another one produced '//' in the link
        w.writerow([file_id, 'crux', url, fname, f'{github_url}crux/{fname}'])

    # Google-search files: the prefix is the 0-based line number in googlesearch-aups.txt
    with open('../data/aup-urls/googlesearch-aups.txt') as url_file:
        googlesearch_list = url_file.readlines()
    for fname in sorted(os.listdir(googlepath)):
        if fname == '.gitignore':
            continue
        file_id, _ = fname.split('-')
        # readlines() keeps the line ending; drop it before writing the url
        url = googlesearch_list[int(file_id)].rstrip('\n')
        w.writerow([file_id, 'googlesearch', url, fname, f'{github_url}googlesearch/{fname}'])