In [168]:
import pandas as pd
from bs4 import BeautifulSoup
from meilisearch import Client
from tqdm import tqdm
import urllib.request
import json
import concurrent.futures
from urllib.error import HTTPError
import os
import pdfplumber

# List of domain names
domain_names = [
    "clubofbudapest.org.au",
    "clubofbudapest.fw.hu",
    "clubofbudapest.de",
    "iwc.org.hu",
    "clubofbudapest.hu",
    "budapestklub.matav.hu",
    "clubofbudapest.cz",
    "clubofbudapest.ca",
    "club-of-budapest.de",
    "cobusa.org",
    "clubdebudapest.org",
    "worldshiftnetwork.org",
    "globalspirit.org",
    "club-of-budapest.org",
    "budapestklub.hu",
    "club-de-budapest.asso.fr",
    "club-of-budapest.com",
    "club-of-budapest.it",
    "clubofbudapest.com",
    "clubofbudapest.org"
]

opener = urllib.request.build_opener(
    urllib.request.ProxyHandler(
        {
            'http': 'http://brd-customer-hl_6448c571-zone-zone1:d0rsfd3f67nx@brd.superproxy.io:22225',
            'https': 'http://brd-customer-hl_6448c571-zone-zone1:d0rsfd3f67nx@brd.superproxy.io:22225'
            }
        )
)

def fetch_pages(domain):
    url = f'http://web.archive.org/cdx/search/cdx?url={domain}&output=json&collapse=digest&matchType=domain&fl=timestamp,original,mimetype,statuscode,digest,length&filter=statuscode:200&filter=mimetype:text/html|application/pdf'
    response = opener.open(url)
    return response

def process_domains(domain_names):
    all_pages = []
    domains_dict = {}  

    print(f"Total domains to be fetched: {len(domain_names)}...")

    for domain in tqdm(domain_names, desc="Fetching domains"):
        response = fetch_pages(domain)
        
        if response.status == 200:
            response_json = json.load(response)
            domains_dict[domain] = response_json[1:] # skip first row

    print(f"Total fetched domains: {len(domains_dict)}.")
    print(f"Total pages of all domains: {sum([len(rows) for rows in domains_dict.values()])}.")

    return domains_dict

def step_1():
    domains_dict = process_domains(domain_names)

    if not os.path.exists('../data2'):
        os.makedirs('../data2')
    
    all_domains_pages = None

    for domain, domain_pages in domains_dict.items():
        # skip first row

        domain_pages_df = pd.DataFrame(domain_pages, columns=['timestamp', 'original', 'mimetype', 'statuscode', 'digest', 'length'])

        # filter out non-html and pdf pages
        domain_pages_df = domain_pages_df[domain_pages_df['mimetype'].isin(['text/html', 'application/pdf'])]

        # filter out non-200 pages
        domain_pages_df = domain_pages_df[domain_pages_df['statuscode'] == '200']

        domain_pages_df['domain'] = domain  
        domain_pages_df.to_csv(f'../data2/{domain}.csv', index=False)

    for domain, domain_pages in domains_dict.items():    
        all_domains_pages = pd.concat([pd.read_csv(f'../data2/{domain}.csv') for domain in domain_names])

    all_domains_pages.to_csv('../data2/all_domains_pages.csv', index=False)

    return domains_dict
   
domains_dict = step_1() 
df = pd.read_csv('../data2/all_domains_pages.csv')

Total domains to be fetched: 20...


Fetching domains: 100%|██████████| 20/20 [00:35<00:00,  1.78s/it]


Total fetched domains: 20.
Total pages of all domains: 16167.


In [146]:
df
# put domains_dict in a dataframe
# df = pd.DataFrame()
# for domain in domains_dict:
#     df = df.append(pd.DataFrame(domains_dict[domain], columns=['timestamp', 'original', 'mimetype', 'statuscode']))

# df.head()

Unnamed: 0,timestamp,original,mimetype,statuscode,domain
0,20080724001059,http://www.clubofbudapest.org.au/,text/html,200,clubofbudapest.org.au
1,20081012032643,http://www.clubofbudapest.org.au/,text/html,200,clubofbudapest.org.au
2,20090328135026,http://www.clubofbudapest.org.au:80/,text/html,200,clubofbudapest.org.au
3,20090428045449,http://www.clubofbudapest.org.au:80/,text/html,200,clubofbudapest.org.au
4,20090529024051,http://www.clubofbudapest.org.au:80/,text/html,200,clubofbudapest.org.au
...,...,...,...,...,...
30998,20070617130605,http://www.clubofbudapest.org:80/You%20Can%20C...,text/html,200,clubofbudapest.org
30999,20190915075804,http://ervinlaszlo.clubofbudapest.org/,text/html,200,clubofbudapest.org
31000,20210620135141,http://welt.www.clubofbudapest.org/,text/html,200,clubofbudapest.org
31001,20210620135141,http://welt.www.clubofbudapest.org/,text/html,200,clubofbudapest.org


In [156]:
df = None
data_folder = '../data2'
download_folder = data_folder + '/pdfs'
all_domains_pages_df = None
results_folder = '../results4'

domain_names = [
    "clubofbudapest.org.au",
    "clubofbudapest.fw.hu",
    "clubofbudapest.de",
    "iwc.org.hu",
    "clubofbudapest.hu",
    "budapestklub.matav.hu",
    "clubofbudapest.cz",
    "clubofbudapest.ca",
    "club-of-budapest.de",
    "cobusa.org",
    "clubdebudapest.org",
    "worldshiftnetwork.org",
    "globalspirit.org",
    "club-of-budapest.org",
    "budapestklub.hu",
    "club-de-budapest.asso.fr",
    "club-of-budapest.com",
    "club-of-budapest.it",
    "clubofbudapest.com",
    "clubofbudapest.org"
]


def read_pdf_content(pdf_path):
    pdf_content = ''
    pdf_title = ''
    try:
        with pdfplumber.open(pdf_path) as pdf:
            pdf_title = pdf.metadata['title']
            for page in pdf.pages:
                pdf_content += page.extract_text() + ' '
    except:
        # print(f"Error reading PDF: {pdf_path}")
        return None
    return pdf_content, pdf_title

def fetch_pdf_content(original_content, pdf_url):
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    filename = os.path.join(download_folder, pdf_url.split('/')[-1])

    if not os.path.exists(filename):
        with open(filename, 'wb') as f:
            f.write(original_content)

    return read_pdf_content(filename)

def fetch_html_content(original_content):
    soup = BeautifulSoup(original_content, 'html.parser')
    text = soup.get_text()    
    title = soup.title.string if soup.title else ''
    return text, title

def fetch_content(wayback_machine_url):
    try:
        response = opener.open(wayback_machine_url)
        if response.status == 200:
            return response.read()
    except:
        # print(f"HTTP Error: {e} for URL: {wayback_machine_url}")
        return None
    return None
   
def process_domain(domain, rows):  
    domain_pages = fetch_domain_pages(domain, rows)    
    print(f"{domain}: {(len(rows) - len(domain_pages))} of {len(rows)} fetched.")
    save_domain_pages(domain, domain_pages)
    return domain_pages

def step_2(domain, pages):
    print(f"{domain} with {len(pages)} pages.")

    pages_content_list = []
    for row in tqdm(pages.itertuples(), desc="Fetching pages", total=len(pages)):    
        if row.statuscode != '200' and row.statuscode != 200:
            continue

        wayback_machine_url = f'http://web.archive.org/web/{row.timestamp}/{row.original}'
        wayback_machine_content_url = f'http://web.archive.org/web/{row.timestamp}if_/{row.original}'

        if row.mimetype == 'application/pdf':
            original_content = fetch_content(wayback_machine_content_url)
            if original_content is None:
                continue
            text, title = fetch_pdf_content(original_content, row.original)
        elif row.mimetype == 'text/html':
            original_content = fetch_content(wayback_machine_content_url)
            if original_content is None:
                continue
            text, title = fetch_html_content(original_content)   
        else:
            continue
        
        if text is None or len(text) < 500:
            continue

        pages_content_list.append({
            'title': title,
            'domain': row.domain,
            'timestamp': row.timestamp,
            'wayback_machine_url': wayback_machine_url,
            'url': row.original,
            'original_content': original_content
            'text': text,
            'mimetype': row.mimetype
        })

    pages_content = pd.DataFrame(pages_content_list)
    return pages_content

def group_by_domain():
    domain_pages = {}

    if nog os.path.exists(results_folder):
        os.makedirs(results_folder)

    for domain in domain_names:
        if os.path.exists(results_folder + '/' + domain + '.pkl'):
            continue
        pages = pd.read_csv(data_folder + '/' + domain + '.csv')
        domain_pages[domain] = pages

    # order domain_pages by length of pages starting with the shortest
    domain_pages = {k: v for k, v in sorted(domain_pages.items(), key=lambda item: len(item[1]))}
    
    # init meilisearch
    meilisearch_host = 'http://127.0.0.1:7700'  # Replace with your Meilisearch host
    index_name = 'club-of-budapest-4' 
    meilisearch_api_key = 'masterKey'
    client = Client(meilisearch_host, meilisearch_api_key)
    client.create_index(index_name)
    
    # iterate over the first 3 domains
    for domain, pages in domain_pages.items():
        pages_content = step_2(domain, pages)   
        # add id column to first column
        pages_content.insert(0, 'id', range(0, len(pages_content)))

        # # add datetime column and convert timestamp to datetime format
        # pages_content['datetime'] = pd.to_datetime(pages_content['timestamp'], format='%Y%m%d%H%M%S')

        # # convert datetime to string
        # pages_content['datetime'] = pages_content['datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')

        # # remove timestamp column
        # pages_content = pages_content.drop(columns=['timestamp'])

        # replace \n with space and remove double spaces
        # if pages_content has column ['text']
        if 'text' in pages_content.columns:
            pages_content['text'] = pages_content['text'].str.replace('\n', ' ').str.replace(' +', ' ')

        pages_content.to_pickle(results_folder + '/' + domain + '.pkl')     

        index = client.index(index_name)
        index.add_documents(pages_content.to_dict('records'))
        print(f"Domain {domain} added to Meilisearch index.")
group_by_domain()

cobusa.org with 1178 pages.


Fetching pages: 100%|██████████| 1178/1178 [24:16<00:00,  1.24s/it] 


Domain cobusa.org added to Meilisearch index.
globalspirit.org with 1253 pages.


Fetching pages:  93%|█████████▎| 1170/1253 [21:38<02:21,  1.70s/it]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Fetching pages:  93%|█████████▎| 1171/1253 [21:39<02:12,  1.61s/it]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Fetching pages:  94%|█████████▎| 1172/1253 [21:41<02:12,  1.64s/it]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Fetching pages:  94%|█████████▎| 1173/1253 [21:43<02:09,  1.62s/it]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Fetching pages:  94%|█████████▎| 1174/1253 [21:44<02:07,  1.61s/it]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Fetching pages:  94%|█████████▍| 1175/1253 [21:46<02:14,  1.73s/it]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Fetching pages:  94%|█████████▍| 1176/1253 [21:47<02:01,  1.58s/it]Some characters could not b

Domain globalspirit.org added to Meilisearch index.
budapestklub.hu with 1263 pages.


Fetching pages: 100%|██████████| 1263/1263 [25:44<00:00,  1.22s/it] 


Domain budapestklub.hu added to Meilisearch index.
clubofbudapest.com with 1520 pages.


Fetching pages: 100%|██████████| 1520/1520 [51:21<00:00,  2.03s/it]  


Domain clubofbudapest.com added to Meilisearch index.
clubdebudapest.org with 1782 pages.


Fetching pages: 100%|██████████| 1782/1782 [34:36<00:00,  1.17s/it] 


Domain clubdebudapest.org added to Meilisearch index.
club-of-budapest.org with 1812 pages.


Fetching pages: 100%|██████████| 1812/1812 [25:54<00:00,  1.17it/s] 


Domain club-of-budapest.org added to Meilisearch index.
worldshiftnetwork.org with 2349 pages.


Fetching pages: 100%|██████████| 2349/2349 [00:09<00:00, 237.75it/s]


KeyError: 'text'

In [162]:

domain_pages = {}

# read all pkl files in results3 folder into meilisearch
for domain in domain_names:
    if os.path.exists(results_folder + '/' + domain + '.pkl'):
        domain_pages[domain] = pd.read_pickle(results_folder + '/' + domain + '.pkl')

meilisearch_host = 'http://127.0.0.1:7700'  # Replace with your Meilisearch host
index_name = 'club-of-budapest-4' 
meilisearch_api_key = 'masterKey'
client = Client(meilisearch_host, meilisearch_api_key)
client.create_index(index_name)

for domain, pages in domain_pages.items():
    index = client.index(index_name)
    index.add_documents(pages.to_dict('records'))
    print(f"Domain {domain} added to Meilisearch index.")

Domain clubofbudapest.org.au added to Meilisearch index.
Domain clubofbudapest.fw.hu added to Meilisearch index.
Domain clubofbudapest.de added to Meilisearch index.
Domain iwc.org.hu added to Meilisearch index.
Domain clubofbudapest.hu added to Meilisearch index.
Domain budapestklub.matav.hu added to Meilisearch index.
Domain clubofbudapest.cz added to Meilisearch index.
Domain clubofbudapest.ca added to Meilisearch index.
Domain club-of-budapest.de added to Meilisearch index.
Domain cobusa.org added to Meilisearch index.
Domain clubdebudapest.org added to Meilisearch index.
Domain globalspirit.org added to Meilisearch index.
Domain club-of-budapest.org added to Meilisearch index.
Domain budapestklub.hu added to Meilisearch index.
Domain clubofbudapest.com added to Meilisearch index.


In [160]:
pickled_df

Unnamed: 0,id


In [None]:

index = client.index(index_name)
index.add_documents(pickled_df.to_dict('records'))
print(f"Domain {domain} added to Meilisearch index.")
