In [133]:
import concurrent.futures
import hashlib
import json
import os
import urllib.request
from io import BytesIO
from typing import Dict, List, Optional, Any, Tuple
from urllib.error import HTTPError

import pandas as pd
import pdfplumber
from bs4 import BeautifulSoup
from meilisearch import Client
from pydantic import BaseModel
from tqdm import tqdm

class BaseContent(BaseModel):
    """Text extracted from one archived document (HTML page or PDF)."""

    # Plain text of the document.
    text: str
    # Document title, if one could be determined. The explicit default keeps
    # the field optional under pydantic v2 as well, where a bare Optional[...]
    # annotation without a default is a *required* field.
    title: Optional[str] = None

class Page(BaseModel):
    """One capture row of the Wayback Machine CDX listing.

    The fields mirror, in order, the columns requested by ``fetch_pages``
    through the ``fl=`` query parameter. All values are kept as strings.
    """

    # Capture timestamp as returned by the CDX API; used to build
    # https://web.archive.org/web/<timestamp>/<original> URLs.
    timestamp: str
    # Original URL of the captured resource.
    original: str
    # MIME type of the capture; only 'text/html' and 'application/pdf'
    # are handled by process_page.
    mimetype: str
    # HTTP status code of the capture, as text (the query filters on 200).
    statuscode: str
    # Digest column of the CDX row (the listing is collapsed on this column).
    digest: str
    # Length column of the CDX row - presumably a byte size; TODO confirm.
    length: str

class DomainPages(BaseModel):
    """All CDX capture rows found for one domain."""

    # Domain name that was queried.
    domain: str
    # Capture rows returned for that domain.
    pages: List[Page]

class PageContent(BaseModel):
    """Searchable document built from one archived page.

    Instances are created by ``process_page`` and serialised into the
    Meilisearch index by ``create_index``.
    """

    # Document id; left unset when the page is processed and assigned later
    # by create_index. The explicit default keeps construction without an id
    # valid under pydantic v2, where Optional[...] alone means "required".
    id: Optional[int] = None
    # Title extracted from the document.
    title: str
    # Domain the page was listed under.
    domain: str
    # Capture timestamp string as returned by the CDX API.
    timestamp: str
    # Same instant as `timestamp`, as integer seconds since the epoch.
    unix_timestamp: Optional[int] = None
    # Wayback Machine URL of the capture.
    wayback_machine_url: str
    # Original URL of the page.
    url: str
    # Extracted plain text.
    text: str
    # MIME type of the capture.
    mimetype: str

class DomainPagesContent(BaseModel):
    """Extracted page contents grouped by the domain they belong to."""

    # Domain name the pages were listed under.
    domain: str
    # Successfully extracted pages for that domain.
    pages_contents: List[PageContent]

# Domains whose archived captures are listed by fetch_all_domain_pages.
domain_names = [
    "clubofbudapest.org.au",
    "clubofbudapest.fw.hu",
    "clubofbudapest.de",
    "iwc.org.hu",
    "clubofbudapest.hu",
    "budapestklub.matav.hu",
    "clubofbudapest.cz",
    "clubofbudapest.ca",
    "club-of-budapest.de",
    "cobusa.org",
    "clubdebudapest.org",
    "worldshiftnetwork.org",
    "globalspirit.org",
    "club-of-budapest.org",
    "budapestklub.hu",
    "club-de-budapest.asso.fr",
    "club-of-budapest.com",
    "club-of-budapest.it",
    "clubofbudapest.com",
    "clubofbudapest.org"
]
# Disabled override with a two-domain subset - presumably for quick trial
# runs; uncommenting it replaces the full list above.
# domain_names = [
#     "clubofbudapest.fw.hu",
#     "clubofbudapest.de",
# ]

# Proxy credentials must not be stored in the notebook. The proxy URL
# (e.g. "http://user:password@host:port") is read from the WAYBACK_PROXY_URL
# environment variable instead. When the variable is unset, ProxyHandler(None)
# falls back to urllib's default of reading the <scheme>_proxy environment
# variables, so requests go out directly unless the environment says otherwise.
# NOTE(review): the credentials that were previously hard-coded here should be
# treated as leaked and rotated.
proxy_url = os.environ.get('WAYBACK_PROXY_URL')

# Shared opener used for every HTTP request made by this notebook.
opener = urllib.request.build_opener(
    urllib.request.ProxyHandler(
        {'http': proxy_url, 'https': proxy_url} if proxy_url else None
    )
)

# Placeholders for data frames; nothing in the visible cells assigns them.
df = None
all_domains_pages_df = None

# Local folders (relative to the notebook's working directory).
data_folder = '../data'
download_folder = data_folder + '/pdfs'

# Meilisearch connection settings. Host and API key can be overridden through
# the environment so real keys never need to be written into the notebook;
# the fallbacks are the values of a local development instance.
meilisearch_host = os.environ.get('MEILISEARCH_HOST', 'http://127.0.0.1:7700')
index_name = 'club-of-budapest-6'
meilisearch_api_key = os.environ.get('MEILISEARCH_API_KEY', 'masterKey')

In [134]:
from typing import List, Optional
import json

def fetch_pages(domain: str) -> Tuple[str, Optional[BytesIO]]:
    """Download the Wayback Machine CDX capture listing for ``domain``.

    The query asks for JSON output, collapses rows on the digest column,
    matches the whole domain and keeps only status-200 captures whose MIME
    type is ``text/html`` or ``application/pdf``.

    Args:
        domain: Domain name to look up.

    Returns:
        ``(domain, body)`` where ``body`` is an in-memory binary stream holding
        the raw JSON response (suitable for ``json.load``), or ``(domain, None)``
        when the request failed.
    """
    url = f'https://web.archive.org/cdx/search/cdx?url={domain}&output=json&collapse=digest&matchType=domain&fl=timestamp,original,mimetype,statuscode,digest,length&filter=statuscode:200&filter=mimetype:text/html|application/pdf'
    try:
        # Read the body here so the connection is closed before returning;
        # the caller then works on a buffered copy instead of a live socket.
        with opener.open(url) as response:
            body = response.read()
    except Exception as exc:
        # Best effort per domain: report and let the caller skip it
        # (it already checks for a None response).
        print(f"Could not fetch the page list for '{domain}': {exc}")
        return domain, None
    return domain, BytesIO(body)

def convert_json_to_page(json_data: List[List[str]]) -> List[Page]:
    """Turn a decoded CDX JSON listing into ``Page`` objects.

    Args:
        json_data: Decoded response; the first row is skipped (it is not
            converted), every following row is read positionally.

    Returns:
        One ``Page`` per row after the first, in the same order.
    """
    data_rows = json_data[1:]
    return [
        Page(
            timestamp=row[0],
            original=row[1],
            mimetype=row[2],
            statuscode=row[3],
            digest=row[4],
            length=row[5],
        )
        for row in data_rows
    ]

def fetch_all_domain_pages() -> List[DomainPages]:
    """Fetch the CDX capture listing of every domain in ``domain_names``.

    Requests run concurrently in a thread pool. A domain whose request fails,
    or whose response is not valid JSON, is reported and skipped instead of
    aborting the whole run.

    Returns:
        One ``DomainPages`` per successfully fetched domain, sorted by number
        of pages from smallest to largest.
    """
    all_domain_pages = []

    print(f"Total domains to be fetched: {len(domain_names)}...")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Remember which domain each future belongs to so that failures can
        # be reported by name.
        future_to_domain = {
            executor.submit(fetch_pages, domain): domain for domain in domain_names
        }

        for future in tqdm(concurrent.futures.as_completed(future_to_domain), desc="Fetching domains", total=len(future_to_domain)):
            try:
                domain, response = future.result()
            except Exception as exc:
                print(f"Skipping domain '{future_to_domain[future]}': {exc}")
                continue

            if response is None:
                continue

            try:
                response_json = json.load(response)
            except (ValueError, OSError) as exc:
                # Empty or non-JSON body, or a read error on the stream.
                print(f"Skipping domain '{domain}': unreadable response ({exc})")
                continue
            finally:
                # Release the underlying connection/buffer in every case.
                response.close()

            pages = convert_json_to_page(response_json)
            all_domain_pages.append(DomainPages(domain=domain, pages=pages))

    # Order domains by page count, smallest first.
    all_domain_pages.sort(key=lambda domain_pages: len(domain_pages.pages))

    print(f"Total fetched domains: {len(all_domain_pages)}.")
    print(f"Total pages fetched: {sum(len(domain_pages.pages) for domain_pages in all_domain_pages)}.")

    return all_domain_pages

# Fetch the capture listing for every configured domain (network requests).
all_domain_pages = fetch_all_domain_pages()


Fetching domains:   0%|          | 0/20 [00:00<?, ?it/s]

Fetching domains: 100%|██████████| 20/20 [00:02<00:00,  7.59it/s]


In [135]:

def read_pdf_content(pdf_path: str) -> Optional[BaseContent]:
    """Extract text and title from a PDF file on disk.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted content, or ``None`` when the file cannot be opened or
        parsed. The title comes from the PDF metadata and falls back to the
        file name when the metadata has none.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            metadata = pdf.metadata or {}
            # PDF Info dictionaries normally capitalise the key ("Title");
            # the lowercase lookup is kept for backward compatibility.
            pdf_title = metadata.get('Title') or metadata.get('title') or ''
            # extract_text() may return None/'' for pages without a text
            # layer; such pages contribute nothing instead of failing the PDF.
            pdf_content = ''.join(
                (page.extract_text() or '') + ' ' for page in pdf.pages
            )
    except Exception:
        # Best effort: an unreadable PDF is skipped by the caller.
        return None

    pdf_title = str(pdf_title).strip() or os.path.basename(pdf_path)
    return BaseContent(text=pdf_content, title=pdf_title)

def fetch_pdf_content(original_content: bytes, pdf_url: str) -> Optional[BaseContent]:
    """Save a downloaded PDF into ``downloads/`` and extract its content.

    The file name combines a short digest of the bytes with the last path
    segment of the URL. Two different PDFs that share a name therefore no
    longer overwrite or shadow each other, while identical content is still
    written only once.

    Args:
        original_content: Raw bytes of the PDF.
        pdf_url: URL the PDF came from; only used to derive a readable name.

    Returns:
        The extracted content, or ``None`` if the PDF cannot be parsed.
    """
    download_folder = 'downloads'
    # exist_ok avoids the check-then-create race when called concurrently.
    os.makedirs(download_folder, exist_ok=True)

    # Last path segment without query string; URLs ending in '/' would
    # otherwise yield an empty name (i.e. the directory itself).
    basename = pdf_url.split('?', 1)[0].rstrip('/').rsplit('/', 1)[-1] or 'document.pdf'
    digest = hashlib.sha1(original_content).hexdigest()[:12]
    filename = os.path.join(download_folder, f'{digest}_{basename}')

    if not os.path.exists(filename):
        with open(filename, 'wb') as f:
            f.write(original_content)

    return read_pdf_content(filename)


def fetch_pdf_content_on_the_fly(content: bytes, url: str) -> Optional[BaseContent]:
    """Extract text and title from PDF bytes without writing them to disk.

    Args:
        content: Raw bytes of the PDF.
        url: URL the PDF came from; its last path segment is used as the
            title when the PDF metadata has none.

    Returns:
        The extracted content, or ``None`` when the bytes cannot be parsed
        as a PDF.
    """
    try:
        with pdfplumber.open(BytesIO(content)) as pdf:
            # extract_text() may return None/'' for pages without a text
            # layer; treat those as empty instead of failing the whole PDF.
            pdf_content = ' '.join(page.extract_text() or '' for page in pdf.pages)
            metadata = pdf.metadata or {}
            # PDF Info dictionaries normally capitalise the key ("Title");
            # the lowercase lookup is kept for backward compatibility.
            pdf_title = metadata.get('Title') or metadata.get('title') or ''
    except Exception:
        # Best effort: an unreadable PDF is skipped by the caller.
        return None

    pdf_title = str(pdf_title).strip() or url.split('/')[-1]
    return BaseContent(text=pdf_content, title=pdf_title)

def fetch_html_content(original_content: bytes) -> Optional[BaseContent]:
    """Extract the visible text and the title from an HTML document.

    Args:
        original_content: Raw HTML bytes.

    Returns:
        Content whose text has every run of whitespace (including newlines)
        collapsed to a single space, and whose title is the ``<title>`` text,
        or an empty string when the document has no title.
    """
    soup = BeautifulSoup(original_content, 'html.parser')

    # The previous str.replace(' +', ' ') looked for the literal two characters
    # " +" rather than acting as a regex, so whitespace was never collapsed.
    text = ' '.join(soup.get_text().split())

    title_tag = soup.title
    if title_tag is None:
        title = ''
    else:
        # .string is None when <title> is empty or has nested markup;
        # fall back to the tag's concatenated text in that case.
        title = str(title_tag.string or title_tag.get_text())

    return BaseContent(text=text, title=title)

def fetch_content(wayback_machine_url: str) -> Optional[bytes]:
    """Download the raw body of a URL through the shared ``opener``.

    Args:
        wayback_machine_url: URL to download.

    Returns:
        The response body when the request succeeds with status 200,
        otherwise ``None`` (any other status, network error or invalid URL).
    """
    try:
        # The context manager closes the connection even if read() fails.
        with opener.open(wayback_machine_url) as response:
            if response.status == 200:
                return response.read()
    except Exception:
        # Best effort: a page that cannot be downloaded is skipped. Unlike a
        # bare `except:`, this does not swallow KeyboardInterrupt/SystemExit.
        return None
    return None
   
def process_page(page: Page, domain: str, min_text_length: int = 500) -> Optional[PageContent]:
    """Download one archived capture and turn it into a ``PageContent``.

    Args:
        page: CDX row describing the capture.
        domain: Domain the capture was listed under.
        min_text_length: Captures whose extracted text is shorter than this
            many characters are discarded.

    Returns:
        The extracted page, or ``None`` when the MIME type is unsupported, the
        download or extraction fails, or the text is too short.
    """
    wayback_machine_url = f'https://web.archive.org/web/{page.timestamp}/{page.original}'
    # The "if_" variant is the URL actually downloaded for extraction.
    wayback_machine_content_url = f'https://web.archive.org/web/{page.timestamp}if_/{page.original}'

    # Reject unsupported types before spending a network request on them.
    if page.mimetype not in ('application/pdf', 'text/html'):
        return None

    original_content = fetch_content(wayback_machine_content_url)
    if original_content is None:
        return None

    if page.mimetype == 'application/pdf':
        content = fetch_pdf_content_on_the_fly(original_content, page.original)
    else:
        content = fetch_html_content(original_content)

    if content is None or content.text is None or len(content.text) < min_text_length:
        return None

    # CDX timestamps are 14-digit yyyyMMddHHmmss strings; an explicit format
    # avoids relying on pandas' format guessing. The naive pandas Timestamp is
    # interpreted as UTC by .timestamp(), as before.
    unix_timestamp = int(pd.to_datetime(page.timestamp, format='%Y%m%d%H%M%S').timestamp())

    return PageContent(
        # The extractors may yield a None title, which the `title: str`
        # field would reject; store an empty title instead.
        title=content.title or '',
        domain=domain,
        timestamp=page.timestamp,
        unix_timestamp=unix_timestamp,
        wayback_machine_url=wayback_machine_url,
        url=page.original,
        text=content.text,
        mimetype=page.mimetype
    )

def fetch_domain_pages_content(all_domain_pages: List[DomainPages], use_threads: bool = False) -> List[DomainPagesContent]:
    """Download and extract the content of every listed page, domain by domain.

    Args:
        all_domain_pages: Capture listings to process.
        use_threads: When true, the pages of each domain are processed
            concurrently in a thread pool; otherwise one after another.

    Returns:
        One ``DomainPagesContent`` per input domain, in input order. Pages that
        could not be processed are left out; the remaining pages keep the
        order of the listing in both modes.
    """
    def process_safely(page: Page, domain: str) -> Optional[PageContent]:
        # One broken page must not abort a crawl of thousands of pages.
        try:
            return process_page(page, domain)
        except Exception as exc:
            # tqdm.write prints without corrupting the progress bar.
            tqdm.write(f"Skipping '{page.original}': {exc}")
            return None

    domain_pages_content_list = []

    for domain_pages in all_domain_pages:
        domain = domain_pages.domain
        pages = domain_pages.pages
        progress_options = {
            'desc': f"Fetching pages for domain '{domain}'",
            'total': len(pages),
        }

        if use_threads:
            with concurrent.futures.ThreadPoolExecutor() as executor:
                # executor.map yields results in submission order, which keeps
                # the output (and the ids assigned later) reproducible.
                outcomes = executor.map(lambda page: process_safely(page, domain), pages)
                results = list(tqdm(outcomes, **progress_options))
        else:
            results = [process_safely(page, domain) for page in tqdm(pages, **progress_options)]

        domain_pages_content_list.append(
            DomainPagesContent(
                domain=domain,
                pages_contents=[result for result in results if result is not None],
            )
        )

    return domain_pages_content_list

def create_index(all_domain_pages_content: List[DomainPagesContent], index_name: str) -> None:
    """Create a Meilisearch index and load every extracted page into it.

    Args:
        all_domain_pages_content: Extracted pages grouped by domain.
        index_name: Name of the index to create and fill.

    Side effects:
        Assigns consecutive ids starting at 1 to the given ``PageContent``
        objects, in the order they are encountered, and prints progress.
    """
    # Connect and request the index with 'id' as primary key.
    search_client = Client(meilisearch_host, meilisearch_api_key)
    search_client.create_index(index_name, {'primaryKey': 'id'})

    target_index = search_client.index(index_name)
    target_index.update_filterable_attributes(['unix_timestamp', 'domain'])

    # Flatten the per-domain groups into one list of documents.
    documents = []
    for entry in all_domain_pages_content:
        print(f"Adding {len(entry.pages_contents)} pages from domain '{entry.domain}' to Meilisearch index...")
        documents.extend(entry.pages_contents)

    # Number the documents 1..N.
    for doc_id, document in enumerate(documents, start=1):
        document.id = doc_id

    # Serialise to plain dictionaries and send everything in one request.
    target_index.add_documents([document.dict() for document in documents])

    print(f"Total pages added to Meilisearch index: {len(documents)}.")


In [136]:
# Sequential variant, left disabled in favour of the threaded run below:
# domain_pages_content_list = fetch_domain_pages_content(all_domain_pages)
# print("domain_pages_content_list", len(domain_pages_content_list))

# Download and extract every listed page, using a thread pool per domain.
threaded_domain_pages_content_list = fetch_domain_pages_content(all_domain_pages, use_threads=True)
print("threaded_domain_pages_content_list", len(threaded_domain_pages_content_list))

Fetching pages for domain 'clubofbudapest.org.au':   0%|          | 0/16 [00:00<?, ?it/s]

Fetching pages for domain 'clubofbudapest.org.au': 100%|██████████| 16/16 [00:02<00:00,  6.49it/s]
Fetching pages for domain 'clubofbudapest.de': 100%|██████████| 18/18 [00:02<00:00,  7.21it/s]
Fetching pages for domain 'clubofbudapest.fw.hu': 100%|██████████| 31/31 [00:05<00:00,  6.02it/s]
Fetching pages for domain 'clubofbudapest.hu': 100%|██████████| 164/164 [00:13<00:00, 12.45it/s]
Fetching pages for domain 'clubofbudapest.ca': 100%|██████████| 182/182 [00:18<00:00,  9.91it/s]
Fetching pages for domain 'budapestklub.matav.hu': 100%|██████████| 209/209 [00:13<00:00, 15.20it/s]
Fetching pages for domain 'iwc.org.hu': 100%|██████████| 270/270 [00:45<00:00,  5.96it/s]
Fetching pages for domain 'cobusa.org': 100%|██████████| 313/313 [00:30<00:00, 10.24it/s]
Fetching pages for domain 'club-de-budapest.asso.fr': 100%|██████████| 335/335 [00:22<00:00, 14.95it/s]
Fetching pages for domain 'club-of-budapest.de': 100%|██████████| 371/371 [00:25<00:00, 14.28it/s]
Fetching pages for domain 'glo

In [137]:
# Load the extracted pages into Meilisearch.
# NOTE(review): this targets the index "club-of-budapest", not the configured
# module-level index_name ('club-of-budapest-6') - confirm which is intended.
create_index(threaded_domain_pages_content_list, index_name="club-of-budapest")