In [53]:
import pandas as pd
from bs4 import BeautifulSoup
from meilisearch import Client
from tqdm import tqdm
import urllib.request
import json
import concurrent.futures
from urllib.error import HTTPError
import os
import pdfplumber

# List of domain names
domain_names = [
    "clubofbudapest.org",
    "club-of-budapest.com",
    "clubofbudapest.com",
    "club-of-budapest.org",
    "club-of-budapest.it",
    "club-of-budapest.de",
    "clubofbudapest.de",
    "clubofbudapest.fw.hu",
    "clubofbudapest.cz",
    "cobusa.org",
    "clubofbudapest.nl",
    "clubofbudapest.ca",
    "clubofbudapest.org.au",
    "budapestklub.hu",
    "clubofbudapest.hu",
    "budapestklub.matav.hu",
    "clubdebudapest.org",
    "club-de-budapest.asso.fr",
    "iwc.org.hu",
    "worldshiftnetwork.org",
    "globalspirit.org"
]

opener = urllib.request.build_opener(
    urllib.request.ProxyHandler(
        {
            'http': 'http://brd-customer-hl_6448c571-zone-zone1:d0rsfd3f67nx@brd.superproxy.io:22225',
            'https': 'http://brd-customer-hl_6448c571-zone-zone1:d0rsfd3f67nx@brd.superproxy.io:22225'
            }
        )
)

def fetch_pages(domain):
    url = f'http://web.archive.org/cdx/search/cdx?url={domain}&output=json&matchType=domain&fl=timestamp,original,mimetype,statuscode'
    response = opener.open(url)
    return response

def process_domains(domain_names):
    all_pages = []
    domains_dict = {}  

    print(f"Total domains to be fetched: {len(domain_names)}...")

    for domain in tqdm(domain_names, desc="Fetching domains"):
        response = fetch_pages(domain)
        
        if response.status == 200:
            response_json = json.load(response)
            domains_dict[domain] = response_json[1:] # skip first row

    print(f"Total fetched domains: {len(domains_dict)}.")
    print(f"Total pages of all domains: {sum([len(rows) for rows in domains_dict.values()])}.")

    return domains_dict

def step_1():
    domains_dict = process_domains(domain_names)

    if not os.path.exists('../data'):
        os.makedirs('../data')
    
    all_domains_pages = None

    for domain, domain_pages in domains_dict.items():
        # skip first row

        domain_pages_df = pd.DataFrame(domain_pages, columns=['timestamp', 'original', 'mimetype', 'statuscode'])
        domain_pages_df['domain'] = domain  
        domain_pages_df.to_csv(f'../data/{domain}.csv', index=False)

    for domain, domain_pages in domains_dict.items():    
        all_domains_pages = pd.concat([pd.read_csv(f'../data/{domain}.csv') for domain in domain_names])

    all_domains_pages.to_csv('../data/all_domains_pages.csv', index=False)

    return domains_dict
   
domains_dict = step_1() 
df = pd.read_csv('../data/all_domains_pages.csv')

Total domains to be fetched: 21...


Fetching domains: 100%|██████████| 21/21 [00:21<00:00,  1.01s/it]


Total fetched domains: 21.
Total pages of all domains: 70972.


21

In [55]:
df = pd.read_csv('../data/all_domains_pages.csv').head()
# put domains_dict in a dataframe
# df = pd.DataFrame()
# for domain in domains_dict:
#     df = df.append(pd.DataFrame(domains_dict[domain], columns=['timestamp', 'original', 'mimetype', 'statuscode']))

# df.head()

In [90]:
df = None
download_folder = '../data/pdfs'
all_domains_pages_df = None
results_pickle = '../results/pages_content.pkl'
results_csv = '../results/pages_content.csv'
# all_domains_pages_csv = '../data/all_domains_pages.csv'
all_domains_pages_csv = '../data/test.csv'

def read_pdf_content(pdf_path):
    pdf_content = ''
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            pdf_content += page.extract_text() + ' '
    return pdf_content

def fetch_pdf_content(original_content, pdf_url):
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    filename = os.path.join(download_folder, pdf_url.split('/')[-1])

    if not os.path.exists(filename):
        with open(filename, 'wb') as f:
            f.write(original_content)

    return read_pdf_content(filename)

def fetch_html_content(original_content):
    soup = BeautifulSoup(original_content, 'html.parser')
    text = soup.get_text()    
    return text 

def fetch_content(wayback_machine_url):
    try:
        response = opener.open(wayback_machine_url)
        if response.status == 200:
            return response.read()
    except HTTPError as e:
        print(f"HTTP Error: {e} for URL: {wayback_machine_url}")
        
    return None
   
def process_domain(domain, rows):  
    print(f"{domain} ({len(rows)}):")
    domain_pages = fetch_domain_pages(domain, rows)    
    print(f"{domain}: {(len(rows) - len(domain_pages))} of {len(rows)} fetched.")
    save_domain_pages(domain, domain_pages)
    return domain_pages

def step_2():
    pages = pd.read_csv(all_domains_pages_csv)

    print(f"Total pages: {len(pages)}.")

    pages_content_list = []
    for row in tqdm(pages.itertuples(), desc="Fetching pages for each domain", total=len(pages)):    
        if row.statuscode != 200:
            continue

        print(f"{row.timestamp}: {row.original}")
        wayback_machine_url = f'http://web.archive.org/web/{row.timestamp}/{row.original}'

        if row.mimetype == 'application/pdf':
            wayback_machine_pdf_url = f'http://web.archive.org/web/{row.timestamp}if_/{row.original}'
            original_content = fetch_content(wayback_machine_pdf_url)
            if original_content is None:
                continue
            text = fetch_pdf_content(original_content, row.url)
        else:
            original_content = fetch_content(wayback_machine_url)
            if original_content is None:
                continue
            text = fetch_html_content(original_content)   

        pages_content_list.append({
            'domain': row.domain,
            'timestamp': row.timestamp,
            'wayback_machine_url': wayback_machine_url,
            'url': row.original,
            'text': text,
            'original_content': original_content,
            'mimetype': row.mimetype,
            'statuscode': row.statuscode
        })

    pages_content = pd.DataFrame(pages_content_list)
    pages_content['id'] = range(1, len(pages_content) + 1)
    return pages_content

pages_content = step_2()  
pages_content.to_pickle(results_pickle)
pages_content.to_csv(results_csv, escapechar='\\')
pages_content.head()

Total pages: 2.


Fetching pages for each domain:   0%|          | 0/2 [00:00<?, ?it/s]


AttributeError: 'Pandas' object has no attribute 'url'

In [66]:
pages_content.head()

Unnamed: 0,domain,timestamp,wayback_machine_url,url,text,original_content,mimetype,statuscode,id
0,clubofbudapest.org,20041020184009,http://web.archive.org/web/20041020184009/http...,http://www.clubofbudapest.org:80/BestProjects/...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSUPPORT\n\n\n\n\...,"b'<html>\n\n<head><script src=""//archive.org/i...",text/html,200,1
1,clubofbudapest.org,20160323233557,http://web.archive.org/web/20160323233557/http...,http://clubofbudapest.org/clubofbudapest/attac...,THE FUJI DECLARATION\nAwakening The Divine Spa...,b'%PDF-1.3\n%\xc4\xe5\xf2\xe5\xeb\xa7\xf3\xa0\...,application/pdf,200,2


In [85]:
def step_3():
    if not os.path.exists(results_pickle):
        print("'{results_pickle}' not found. Please run step 2 first.")
        return

    df = pd.read_pickle(results_pickle)
    df = df[['id', 'domain', 'timestamp', 'wayback_machine_url', 'url', 'text', 'mimetype', 'statuscode']]

    # Ingest the dataframe into a MeiliSearch index in batches of 100
    client = Client('http://localhost:7700', 'masterKey')
    client.delete_index('pages')
    index = client.index('pages')

    batch_size = 1000
    documents = df.to_dict(orient='records')
    total_documents = len(documents)

    # use tqdm to show progress bar and add total_documents to index in batches of batch_size
    for i in tqdm(range(0, total_documents, batch_size), desc="Adding {total_documents} documents to index"):
        index.add_documents(documents[i:i+batch_size])    

    print("Indexing completed.")

step_3() 

Adding {total_documents} documents to index:   0%|          | 0/1 [00:00<?, ?it/s]

Adding {total_documents} documents to index: 100%|██████████| 1/1 [00:00<00:00, 47.83it/s]

Indexing completed.



