In [42]:
import os, requests, json
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import csv
import pandas as pd

## CommonCrawl test

In [3]:
# For parsing URLs:
from urllib.parse import quote_plus
from bs4 import BeautifulSoup

In [None]:
# when missing, install trafilatura module for easy text extraction from web pages
!pip install trafilatura 

In [5]:
import trafilatura

In [6]:
def search_cc_index(url, index_name):
    """
    Search the Common Crawl Index for a given URL.

    This function queries the Common Crawl Index API to find records related to the specified URL. 
    It uses the index specified by `index_name` to retrieve the data and returns a list of JSON objects, 
    each representing a record from the index.

    Arguments:
        url (str): The URL to search for in the Common Crawl Index.
        index_name (str): The name of the Common Crawl Index to search (e.g., "CC-MAIN-2024-10").

    Returns:
        list: A list of JSON objects representing records found in the Common Crawl Index. 
              Returns None if the request fails or no records are found.

    Example:
        >>> search_cc_index("example.com", "CC-MAIN-2024-10")
        [{...}, {...}, ...]
    """
    encoded_url = quote_plus(url)
    index_url = f'http://index.commoncrawl.org/{index_name}-index?url={encoded_url}&output=json'
    response = requests.get(index_url)

    if response.status_code == 200:
        records = response.text.strip().split('\n')
        return [json.loads(record) for record in records]
    else:
        return None



def fetch_single_record(warc_record_filename, offset, length):
    """
    Fetch a single WARC record from Common Crawl.

    Arguments:
        record {dict} -- A dictionary containing the WARC record details.

    Returns:
        bytes or None -- The raw content of the response if found, otherwise None.
    """
    
    s3_url = f'https://data.commoncrawl.org/{warc_record_filename}'

    # Define the byte range for the request
    byte_range = f'bytes={offset}-{offset + length - 1}'

    # Send the HTTP GET request to the S3 URL with the specified byte range
    response = requests.get(
        s3_url,
        headers={'Range': byte_range},
        stream=True
    )

    if response.status_code == 206:
        # Use `stream=True` in the call to `requests.get()` to get a raw byte stream,
        # because it's gzip compressed data
        stream = ArchiveIterator(response.raw)
        for warc_record in stream:
            if warc_record.rec_type == 'response':
                return warc_record.content_stream().read()
    else:
        print(f"Failed to fetch data: {response.status_code}")
    
    return None


def append_df_row_to_pickle(row, pickle_file):
    """
    Append a row to a DataFrame stored in a pickle file.
    
    Arguments:
        row {pd.Series} -- The row to be appended to the DataFrame.
        pickle_file {str} -- The path to the pickle file where the DataFrame is stored.
    """
    # Check if the pickle file exists
    if os.path.exists(pickle_file):
        # Load the existing DataFrame from the pickle file
        df = pd.read_pickle(pickle_file)
    else:
        # If the file doesn't exist, create a new DataFrame
        df = pd.DataFrame(columns=row.index)

    # Append the new row to the DataFrame
    df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    
    # Save the updated DataFrame back to the pickle file
    df.to_pickle(pickle_file)


def load_processed_indices(pickle_file):
    """
    Load processed indices from a pickle file to check previously processed records.

    Arguments:
        pickle_file {str} -- The path to the pickle file where the DataFrame is stored.
    
    Returns:
        Set of processed indices.
    """
    if os.path.exists(pickle_file):
        df = pd.read_pickle(pickle_file)
        # Assuming 'index' column is in the DataFrame and contains indices of processed records
        processed_indices = set(df['index'].unique())
        print(f"Loaded {len(processed_indices)} processed indices from {pickle_file}")
        return processed_indices
    else:
        print(f"No processed indices found. Pickle file '{pickle_file}' does not exist.")
        return set()

In [16]:
# The URL you want to look up in the Common Crawl index
target_url = 'https://www.slotzuylen.nl/*'  # Replace with your target URL

# list of indexes https://commoncrawl.org/get-started
#indexes  = ['CC-MAIN-2024-46','CC-MAIN-2024-30','CC-MAIN-2024-26']
indexes  = ['CC-MAIN-2024-30', 'CC-MAIN-2024-33','CC-MAIN-2024-38','CC-MAIN-2024-42','CC-MAIN-2024-46','CC-MAIN-2024-51']

record_dfs = []

# Fetch each index and store into a dataframe
for index_name in indexes:
    print('Running: ', index_name)
    records = search_cc_index(target_url,index_name)
    record_df = pd.DataFrame(records)
    record_df['index_name'] = index_name
    record_dfs.append(record_df)

# Combine individual dataframes
all_records_df = pd.concat(record_dfs)
all_records_df = all_records_df.sort_values(by='index_name', ascending=False)
all_records_df = all_records_df.reset_index()

# Create columns where to store data later
all_records_df['success_status'] = 'not processed'
all_records_df['html'] = ''

Running:  CC-MAIN-2024-30
Running:  CC-MAIN-2024-33
Running:  CC-MAIN-2024-38
Running:  CC-MAIN-2024-42
Running:  CC-MAIN-2024-46
Running:  CC-MAIN-2024-51


In [17]:
all_records_df.to_csv('all_records_df.csv', index=False)

In [18]:
df = all_records_df

In [20]:
# safeguareds
# If pickle file exists, check for processed items
pickle_file = 'commcrawl_indeed.pkl'
processed_indices = load_processed_indices(pickle_file)
#if processed_indices:
#    # Remove processed items
#    df = df[~df['index'].isin(processed_indices)]

# Create storage for later
successful = set()
results = {}

# Keep track of each row processed
i = 0 
perc = 0
n_records = len(df)
print(f"Found {n_records} records for {target_url}")
mod = int(n_records * 0.01)

No processed indices found. Pickle file 'commcrawl_indeed.pkl' does not exist.
Found 415 records for https://www.slotzuylen.nl/*


In [21]:
mod

4

In [22]:
from warcio.archiveiterator import ArchiveIterator

# Reset index to help with looping
df.reset_index(drop=True,inplace=True)


for i in range(len(df)):
    # Print every 1% process
    if i % mod == 0: 
        print(f'{i} of {n_records}: {perc}%')
        perc += 1

    record_url = df.loc[i, 'url']

    # Fetch only URLs that were not processed
    # If it was already processed, skip URL 
    # (Helps speeding if you only need one version of the HTML, not its history)
    if not record_url in successful:
        length = int(df.loc[i, 'length'])
        offset = int(df.loc[i, 'offset'])
        warc_record_filename = df.loc[i, 'filename']
        result = fetch_single_record(warc_record_filename, offset, length)
        
        if not result:
            df.loc[i,'success_status'] = 'invalid warc'
        else:
            df.loc[i,'success_status'] = 'success'
            df.loc[i,'html'] = result
    else: 
        df.loc[i,'success_status'] = 'previously processed'

    # Add to pickle file
    append_df_row_to_pickle(df.loc[i, :], pickle_file)

0 of 415: 0%
Failed to fetch data: 503
Failed to fetch data: 503
4 of 415: 1%
Failed to fetch data: 503
Failed to fetch data: 503
8 of 415: 2%
Failed to fetch data: 503
12 of 415: 3%
Failed to fetch data: 503
16 of 415: 4%
Failed to fetch data: 503
Failed to fetch data: 503
20 of 415: 5%
24 of 415: 6%
28 of 415: 7%
32 of 415: 8%
36 of 415: 9%
40 of 415: 10%
44 of 415: 11%
48 of 415: 12%
52 of 415: 13%
56 of 415: 14%
60 of 415: 15%
64 of 415: 16%
68 of 415: 17%
72 of 415: 18%
76 of 415: 19%
80 of 415: 20%
84 of 415: 21%
88 of 415: 22%
92 of 415: 23%
96 of 415: 24%
Failed to fetch data: 503
Failed to fetch data: 503
100 of 415: 25%
104 of 415: 26%
Failed to fetch data: 503
Failed to fetch data: 503
108 of 415: 27%
112 of 415: 28%
Failed to fetch data: 503
116 of 415: 29%
120 of 415: 30%
124 of 415: 31%
128 of 415: 32%
132 of 415: 33%
136 of 415: 34%
140 of 415: 35%
Failed to fetch data: 503
144 of 415: 36%
148 of 415: 37%
152 of 415: 38%
156 of 415: 39%
160 of 415: 40%
164 of 415: 41%
16

In [23]:
commoncrawl_data = pd.read_pickle(pickle_file)
commoncrawl_data[
    ['url','filename','index_name','success_status','html']
    ].head()

Unnamed: 0,url,filename,index_name,success_status,html
0,https://slotzuylen.nl/zakelijk/,crawl-data/CC-MAIN-2024-51/segments/1733066127...,CC-MAIN-2024-51,success,"b' <!doctype html>\r\n<html lang=""nl-NL"">\r..."
1,https://slotzuylen.nl/agenda/rondleiding-in-de...,crawl-data/CC-MAIN-2024-51/segments/1733066127...,CC-MAIN-2024-51,success,"b' <!doctype html>\r\n<html lang=""nl-NL"">\r..."
2,https://slotzuylen.nl/agenda/rondleiding-gehei...,crawl-data/CC-MAIN-2024-51/segments/1733066127...,CC-MAIN-2024-51,invalid warc,
3,https://slotzuylen.nl/agenda/overzicht/?relate...,crawl-data/CC-MAIN-2024-51/segments/1733066127...,CC-MAIN-2024-51,invalid warc,
4,https://slotzuylen.nl/agenda/opera-in-de-kaste...,crawl-data/CC-MAIN-2024-51/segments/1733066127...,CC-MAIN-2024-51,invalid warc,


In [37]:
m1 = commoncrawl_data[~commoncrawl_data[['html']].duplicated()]

In [40]:
m1

Unnamed: 0,index,urlkey,timestamp,url,mime,mime-detected,status,digest,length,offset,filename,languages,encoding,redirect,truncated,index_name,success_status,html
0,55,"nl,slotzuylen)/zakelijk",20241202071107,https://slotzuylen.nl/zakelijk/,text/html,text/html,200,OUJIY3WSHKFHBS3N4BJKO47EE6HF6DIJ,22620,507005272,crawl-data/CC-MAIN-2024-51/segments/1733066127...,nld,UTF-8,,,CC-MAIN-2024-51,success,"b' <!doctype html>\r\n<html lang=""nl-NL"">\r..."
1,27,"nl,slotzuylen)/agenda/rondleiding-in-de-steige...",20241202073900,https://slotzuylen.nl/agenda/rondleiding-in-de...,text/html,text/html,200,GSBFRQJEQXYYGWZQ57JGUEAW5NPWQRJC,23550,501964632,crawl-data/CC-MAIN-2024-51/segments/1733066127...,nld,UTF-8,,,CC-MAIN-2024-51,success,"b' <!doctype html>\r\n<html lang=""nl-NL"">\r..."
2,25,"nl,slotzuylen)/agenda/rondleiding-geheimen-van...",20241202064906,https://slotzuylen.nl/agenda/rondleiding-gehei...,text/html,text/html,200,CGLX2FHSIHEQYOZ6DOLBCVFSVB5337FG,23595,495038699,crawl-data/CC-MAIN-2024-51/segments/1733066127...,nld,UTF-8,,,CC-MAIN-2024-51,invalid warc,
5,22,"nl,slotzuylen)/agenda/muizenuitje/2023-10-17",20241202084045,https://slotzuylen.nl/agenda/muizenuitje/2023-...,text/html,text/html,200,3FBM4RJB4IJ2PCVJ2FRPW66DLMSYGNCM,23853,505705735,crawl-data/CC-MAIN-2024-51/segments/1733066127...,nld,UTF-8,,,CC-MAIN-2024-51,success,"b' <!doctype html>\r\n<html lang=""nl-NL"">\r..."
6,21,"nl,slotzuylen)/agenda/maand?related_series=13057",20241202064142,https://slotzuylen.nl/agenda/maand/?related_se...,text/html,text/html,200,SEZENN4N5PCAG2UPWAQ6UHJ7SSHMNZEC,17535,504430132,crawl-data/CC-MAIN-2024-51/segments/1733066127...,nld,UTF-8,,,CC-MAIN-2024-51,success,"b' <!doctype html>\r\n<html lang=""nl-NL"">\r..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,67,"nl,slotzuylen)/faq-huisregels",20240724055529,https://slotzuylen.nl/faq-huisregels/,text/html,text/html,200,7GJTGNXHGM7JLZB3OYLPMX3YRDFN5BO4,21171,449266506,crawl-data/CC-MAIN-2024-30/segments/1720763518...,nld,UTF-8,,,CC-MAIN-2024-30,success,"b' <!doctype html>\r\n<html lang=""nl-NL"">\r..."
408,68,"nl,slotzuylen)/huisregels-verhuur",20240724051135,https://slotzuylen.nl/huisregels-verhuur/,text/html,text/html,200,ZIKBSUYZ3BJ4TRIY7ABZTVJVFJQNYYMB,19561,469186209,crawl-data/CC-MAIN-2024-30/segments/1720763518...,nld,UTF-8,,,CC-MAIN-2024-30,success,"b' <!doctype html>\r\n<html lang=""nl-NL"">\r..."
409,69,"nl,slotzuylen)/kasteel",20240724063620,https://slotzuylen.nl/kasteel/,text/html,text/html,200,QMI2FCLQ2JMN6EB66IKEMIHP2D7IPJ34,21479,467946356,crawl-data/CC-MAIN-2024-30/segments/1720763518...,nld,UTF-8,,,CC-MAIN-2024-30,success,"b' <!doctype html>\r\n<html lang=""nl-NL"">\r..."
412,72,"nl,slotzuylen)/koetshuis-cafe",20240724063910,https://slotzuylen.nl/koetshuis-cafe/,text/html,text/html,200,3UJUXSCESO4R32RXRCOMF3FUKQWJDW3Q,21751,460389922,crawl-data/CC-MAIN-2024-30/segments/1720763518...,nld,UTF-8,,,CC-MAIN-2024-30,success,"b' <!doctype html>\r\n<html lang=""nl-NL"">\r..."


In [41]:
len(commoncrawl_data)

415

### TRAFILATURA text extraction

In [29]:
webpage = commoncrawl_data.html[0]
trafilatura.extract(webpage)

'Vergaderen\nWil je onder het systeemplafond van het kantoor vandaan komen met een vergadering, of juist mensen van over het hele land centraal bijeenbrengen zonder webcams? Slot Zuylen is de plek om te vergaderen.\nVan een bestuursvergadering tot congres of symposium, er is van alles mogelijk. Daarnaast zijn alle faciliteiten aanwezig en de centrale ligging met eigen parkeerplaats, vlak bij de A2, maakt het de ideale plek voor een bijeenkomst.\nPresenteren\nSlot Zuylen is een uitzonderlijke plek om een congres of symposium te organiseren. De zalen zorgen voor een onvergetelijke ervaring en bieden ruimte aan vele gasten. Tal van bedrijven kozen al voor de statige omgeving van Slot Zuylen als het decor van hun presentatie of prijsuitreiking. Van boekpresentaties, tot internationale literaire prijzen, we hebben het allemaal eens in huis gehad. Schroom daarom niet en neem contact met ons op via\nverhuur@slotzuylen.nl.\nLunch en diner\nVoor een zakelijke lunch of diner op een exclusieve lo

In [None]:
downloaded = trafilatura.fetch_url('https://github.blog/2019-03-29-leader-spotlight-erin-spiceland/')
trafilatura.extract(downloaded)