In [1]:
import os, requests, json
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import csv
import pandas as pd

## CommonCrawl Data

In [2]:
# For parsing URLs:
from urllib.parse import quote_plus
from bs4 import BeautifulSoup

In [None]:
# when missing, install trafilatura module for easy text extraction from web pages
!pip install trafilatura 

In [3]:
import trafilatura

In [4]:
def search_cc_index(url, index_name):
    """
    Search the Common Crawl Index for a given URL.

    This function queries the Common Crawl Index API to find records related to the specified URL. 
    It uses the index specified by `index_name` to retrieve the data and returns a list of JSON objects, 
    each representing a record from the index.

    Arguments:
        url (str): The URL to search for in the Common Crawl Index.
        index_name (str): The name of the Common Crawl Index to search (e.g., "CC-MAIN-2024-10").

    Returns:
        list: A list of JSON objects representing records found in the Common Crawl Index. 
              Returns None if the request fails or no records are found.

    Example:
        >>> search_cc_index("example.com", "CC-MAIN-2024-10")
        [{...}, {...}, ...]
    """
    encoded_url = quote_plus(url)
    index_url = f'http://index.commoncrawl.org/{index_name}-index?url={encoded_url}&output=json'
    response = requests.get(index_url)

    if response.status_code == 200:
        records = response.text.strip().split('\n')
        return [json.loads(record) for record in records]
    else:
        return None



def fetch_single_record(warc_record_filename, offset, length):
    """
    Fetch a single WARC record from Common Crawl.

    Arguments:
        record {dict} -- A dictionary containing the WARC record details.

    Returns:
        bytes or None -- The raw content of the response if found, otherwise None.
    """
    
    s3_url = f'https://data.commoncrawl.org/{warc_record_filename}'

    # Define the byte range for the request
    byte_range = f'bytes={offset}-{offset + length - 1}'

    # Send the HTTP GET request to the S3 URL with the specified byte range
    response = requests.get(
        s3_url,
        headers={'Range': byte_range},
        stream=True
    )

    if response.status_code == 206:
        # Use `stream=True` in the call to `requests.get()` to get a raw byte stream,
        # because it's gzip compressed data
        stream = ArchiveIterator(response.raw)
        for warc_record in stream:
            if warc_record.rec_type == 'response':
                return warc_record.content_stream().read()
    else:
        print(f"Failed to fetch data: {response.status_code}")
    
    return None


def append_df_row_to_pickle(row, pickle_file):
    """
    Append a row to a DataFrame stored in a pickle file.
    
    Arguments:
        row {pd.Series} -- The row to be appended to the DataFrame.
        pickle_file {str} -- The path to the pickle file where the DataFrame is stored.
    """
    # Check if the pickle file exists
    if os.path.exists(pickle_file):
        # Load the existing DataFrame from the pickle file
        df = pd.read_pickle(pickle_file)
    else:
        # If the file doesn't exist, create a new DataFrame
        df = pd.DataFrame(columns=row.index)

    # Append the new row to the DataFrame
    df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    
    # Save the updated DataFrame back to the pickle file
    df.to_pickle(pickle_file)


def load_processed_indices(pickle_file):
    """
    Load processed indices from a pickle file to check previously processed records.

    Arguments:
        pickle_file {str} -- The path to the pickle file where the DataFrame is stored.
    
    Returns:
        Set of processed indices.
    """
    if os.path.exists(pickle_file):
        df = pd.read_pickle(pickle_file)
        # Assuming 'index' column is in the DataFrame and contains indices of processed records
        processed_indices = set(df['index'].unique())
        print(f"Loaded {len(processed_indices)} processed indices from {pickle_file}")
        return processed_indices
    else:
        print(f"No processed indices found. Pickle file '{pickle_file}' does not exist.")
        return set()

In [21]:
# load the list of URLs per country to search for in CC
# Country code: change here between 'NL' and 'UK'
cc_list = ['NL', 'UK', 'DE', 'FR']
''' when it works, 
    transform this code 
    into a for loop
    to loop through the cc_list
'''
url_list = pd.read_csv('./url_lists/'+cc_list[0]+'_urls.csv')[cc_list[0]+' domains'].values.tolist()

In [23]:
len(url_list)

53

In [5]:
# Import json data from Aipfy scraping into 4 separate dataframes

df0=pd.read_json(gdrive_path+cc_list[0]+'_dataset_website-content-crawler.json')
# select only two columns for analysis: url and text
df0=df0[['url','text']]

df1=pd.read_json(gdrive_path+cc_list[1]+'_dataset_website-content-crawler.json')
# select only two columns for analysis: url and text
df1=df1[['url','text']]

df2=pd.read_json(gdrive_path+cc_list[2]+'_dataset_website-content-crawler.json')
# select only two columns for analysis: url and text
df2=df2[['url','text']]

df3=pd.read_json(gdrive_path+cc_list[3]+'_dataset_website-content-crawler.json')
# select only two columns for analysis: url and text
df3=df3[['url','text']]

df0.head()

IndentationError: expected an indented block (1379649709.py, line 5)

In [None]:
# create a mapping of dataframes which could be addressed in a loop
df_dict = {'0':df0, '1':df1, '2':df2, '3':df3}

# then, loop through the df_dict to update each dataframe
for k, v in df_dict.items():

List of all Common Crawl indexes since 2018: 
- **2024:** 'CC-MAIN-2024-51', 'CC-MAIN-2024-46', 'CC-MAIN-2024-42', 'CC-MAIN-2024-38', 'CC-MAIN-2024-33', 'CC-MAIN-2024-30', 'CC-MAIN-2024-26', 'CC-MAIN-2024-22', 'CC-MAIN-2024-18', 'CC-MAIN-2024-10' 
- **2023:** CC-MAIN-2023-50 // CC-MAIN-2023-40 // CC-MAIN-2023-23 // CC-MAIN-2023-14 // CC-MAIN-2023-06 //
- **2022:** CC-MAIN-2022-49 // CC-MAIN-2022-40 // CC-MAIN-2022-33 // CC-MAIN-2022-27 // CC-MAIN-2022-21 // CC-MAIN-2022-05
- **2021:** CC-MAIN-2021-49 // CC-MAIN-2021-43 // CC-MAIN-2021-39 // CC-MAIN-2021-31 // CC-MAIN-2021-25 // CC-MAIN-2021-21 // CC-MAIN-2021-17 // CC-MAIN-2021-10 // CC-MAIN-2021-04 //
- **2020:** CC-MAIN-2020-50 // CC-MAIN-2020-45 // CC-MAIN-2020-40 // CC-MAIN-2020-34 // CC-MAIN-2020-29 // CC-MAIN-2020-24 // CC-MAIN-2020-16 // CC-MAIN-2020-10 // CC-MAIN-2020-05 //
- **2019:** CC-MAIN-2019-51 // CC-MAIN-2019-47 // CC-MAIN-2019-43 // CC-MAIN-2019-39 // CC-MAIN-2019-35 // CC-MAIN-2019-30 // CC-MAIN-2019-26 // CC-MAIN-2019-22 // CC-MAIN-2019-18 // CC-MAIN-2019-13 // CC-MAIN-2019-09 // CC-MAIN-2019-04
- **2018:** 'CC-MAIN-2018-51', 'CC-MAIN-2018-47', 'CC-MAIN-2018-43', 'CC-MAIN-2018-39', 'CC-MAIN-2018-34', 'CC-MAIN-2018-30', 'CC-MAIN-2018-26', 'CC-MAIN-2018-22', 'CC-MAIN-2018-17', 'CC-MAIN-2018-13', 'CC-MAIN-2018-09', 'CC-MAIN-2018-05'

### Test one website data

In [42]:
# list of indexes https://commoncrawl.org/get-started
#indexes  = ['CC-MAIN-2024-46','CC-MAIN-2024-30','CC-MAIN-2024-26']
indexes = ['CC-MAIN-2024-51', 'CC-MAIN-2024-46', 'CC-MAIN-2024-42', 'CC-MAIN-2024-38', 'CC-MAIN-2024-33', 
           'CC-MAIN-2024-30', 'CC-MAIN-2024-26', 'CC-MAIN-2024-22', 'CC-MAIN-2024-18', 'CC-MAIN-2024-10']

record_dfs = []
target_url = url_list[33]
print("searching all records for ", target_url, " in indexes")

# Fetch each index and store into a dataframe
for index_name in indexes:
    print('Running: ', index_name)
    records = search_cc_index(target_url,index_name)
    record_df = pd.DataFrame(records)
    record_df['index_name'] = index_name
    record_dfs.append(record_df)

# Combine individual dataframes
all_records_df = pd.concat(record_dfs)
all_records_df = all_records_df.sort_values(by='index_name', ascending=False)
all_records_df = all_records_df.reset_index()

# Create columns where to store data later
all_records_df['success_status'] = 'not processed'
all_records_df['html'] = ''

searching all records for  https://www.middachten.nl  in indexes
Running:  CC-MAIN-2024-51
Running:  CC-MAIN-2024-46
Running:  CC-MAIN-2024-42
Running:  CC-MAIN-2024-38
Running:  CC-MAIN-2024-33
Running:  CC-MAIN-2024-30
Running:  CC-MAIN-2024-26
Running:  CC-MAIN-2024-22
Running:  CC-MAIN-2024-18
Running:  CC-MAIN-2024-10


In [45]:
# safeguareds
# If pickle file exists, check for processed items
pickle_file = 'commcrawl_indeed.pkl'
processed_indices = load_processed_indices(pickle_file)
#if processed_indices:
#    # Remove processed items
#    df = df[~df['index'].isin(processed_indices)]

# Create storage for later
successful = set()
results = {}

# Keep track of each row processed
i = 0 
perc = 0
n_records = len(all_records_df)
print(f"Found {n_records} records")

No processed indices found. Pickle file 'commcrawl_indeed.pkl' does not exist.
Found 15 records


In [48]:
from warcio.archiveiterator import ArchiveIterator

# Reset index to help with looping
all_records_df.reset_index(drop=True,inplace=True)


for i in range(len(all_records_df)):
    record_url = all_records_df.loc[i, 'url']

    # Fetch only URLs that were not processed
    # If it was already processed, skip URL 
    # (Helps speeding if you only need one version of the HTML, not its history)
    if not record_url in successful:
        length = int(all_records_df.loc[i, 'length'])
        offset = int(all_records_df.loc[i, 'offset'])
        warc_record_filename = all_records_df.loc[i, 'filename']
        result = fetch_single_record(warc_record_filename, offset, length)
        
        if not result:
            all_records_df.loc[i,'success_status'] = 'invalid warc'
        else:
            all_records_df.loc[i,'success_status'] = 'success'
            all_records_df.loc[i,'html'] = result
    else: 
        all_records_df.loc[i,'success_status'] = 'previously processed'

    # Add to pickle file
    append_df_row_to_pickle(all_records_df.loc[i, :], pickle_file)

In [64]:
commoncrawl_data = pd.read_pickle(pickle_file)
commoncrawl_data[
    ['url','filename','index_name','success_status','html']
    ]

Unnamed: 0,url,filename,index_name,success_status,html
0,https://www.middachten.nl,crawl-data/CC-MAIN-2024-51/segments/1733066127...,CC-MAIN-2024-51,success,"b'<!DOCTYPE html>\n<html lang=""nl-NL"" >\n\n<he..."
1,https://www.middachten.nl/,crawl-data/CC-MAIN-2024-51/segments/1733066127...,CC-MAIN-2024-51,success,"b'<!DOCTYPE html>\n<html lang=""nl-NL"" >\n\n<he..."
2,https://www.middachten.nl,crawl-data/CC-MAIN-2024-46/segments/1730477395...,CC-MAIN-2024-46,success,b'<!DOCTYPE html><html\nlang=nl-NL ><head><sty...
3,https://www.middachten.nl/,crawl-data/CC-MAIN-2024-46/segments/1730477395...,CC-MAIN-2024-46,success,b'<!DOCTYPE html><html\nlang=nl-NL ><head><sty...
4,https://www.middachten.nl,crawl-data/CC-MAIN-2024-42/segments/1727944253...,CC-MAIN-2024-42,success,b'<!DOCTYPE html><html\nlang=nl-NL ><head><sty...
5,https://www.middachten.nl/,crawl-data/CC-MAIN-2024-42/segments/1727944253...,CC-MAIN-2024-42,success,b'<!DOCTYPE html><html\nlang=nl-NL ><head><sty...
6,https://www.middachten.nl,crawl-data/CC-MAIN-2024-38/segments/1725700651...,CC-MAIN-2024-38,success,b'<!DOCTYPE html><html\nlang=nl-NL ><head><sty...
7,https://www.middachten.nl/,crawl-data/CC-MAIN-2024-38/segments/1725700651...,CC-MAIN-2024-38,success,b'<!DOCTYPE html><html\nlang=nl-NL ><head><sty...
8,https://www.middachten.nl/,crawl-data/CC-MAIN-2024-38/segments/1725700651...,CC-MAIN-2024-38,success,b'<!DOCTYPE html><html\nlang=nl-NL ><head><sty...
9,https://www.middachten.nl,crawl-data/CC-MAIN-2024-26/segments/1718198861...,CC-MAIN-2024-26,success,"b'<!DOCTYPE html>\n<html lang=""nl-NL"" >\n\n<he..."


In [65]:
commoncrawl_data.filename.unique()

array(['crawl-data/CC-MAIN-2024-51/segments/1733066127466.39/warc/CC-MAIN-20241202094452-20241202124452-00418.warc.gz',
       'crawl-data/CC-MAIN-2024-51/segments/1733066127466.39/warc/CC-MAIN-20241202094452-20241202124452-00165.warc.gz',
       'crawl-data/CC-MAIN-2024-46/segments/1730477395538.95/warc/CC-MAIN-20241114194152-20241114224152-00418.warc.gz',
       'crawl-data/CC-MAIN-2024-46/segments/1730477395538.95/warc/CC-MAIN-20241114194152-20241114224152-00165.warc.gz',
       'crawl-data/CC-MAIN-2024-42/segments/1727944253701.40/warc/CC-MAIN-20241010190240-20241010220240-00418.warc.gz',
       'crawl-data/CC-MAIN-2024-42/segments/1727944253701.40/warc/CC-MAIN-20241010190240-20241010220240-00165.warc.gz',
       'crawl-data/CC-MAIN-2024-38/segments/1725700651303.70/warc/CC-MAIN-20240910161250-20240910191250-00418.warc.gz',
       'crawl-data/CC-MAIN-2024-38/segments/1725700651303.70/warc/CC-MAIN-20240910161250-20240910191250-00165.warc.gz',
       'crawl-data/CC-MAIN-2024-38/segme

In [66]:
webpage = commoncrawl_data.html[1]
trafilatura.extract(webpage)

"Welkom op Middachten\nAl 800 jaar een gastvrij landgoed in bedrijf.\nKasteel Middachten opent ook dit jaar haar deuren voor een onvergetelijke kerstervaring voor jong en oud. Laat je meevoeren in een betoverend winterwonderland vol kerstbomen, fonkelende lichtjes, schitterende decoraties en nog veel meer! Lees verder…\nDeze website maakt gebruik van cookies. Door op 'Accepteren' te klikken, ga je akkoord met het gebruik van alle cookies op deze website."

In [67]:
def extract_text_from_html(html):
  try:
    text = trafilatura.extract(html)
    return text
  except:
    return None

commoncrawl_data['text'] = commoncrawl_data['html'].apply(extract_text_from_html)

In [68]:
commoncrawl_data.text[0]

"Welkom op Middachten\nAl 800 jaar een gastvrij landgoed in bedrijf.\nKasteel Middachten opent ook dit jaar haar deuren voor een onvergetelijke kerstervaring voor jong en oud. Laat je meevoeren in een betoverend winterwonderland vol kerstbomen, fonkelende lichtjes, schitterende decoraties en nog veel meer! Lees verder…\nDeze website maakt gebruik van cookies. Door op 'Accepteren' te klikken, ga je akkoord met het gebruik van alle cookies op deze website."

In [69]:
commoncrawl_data.text.unique()

array(["Welkom op Middachten\nAl 800 jaar een gastvrij landgoed in bedrijf.\nKasteel Middachten opent ook dit jaar haar deuren voor een onvergetelijke kerstervaring voor jong en oud. Laat je meevoeren in een betoverend winterwonderland vol kerstbomen, fonkelende lichtjes, schitterende decoraties en nog veel meer! Lees verder…\nDeze website maakt gebruik van cookies. Door op 'Accepteren' te klikken, ga je akkoord met het gebruik van alle cookies op deze website.",
       "Deze website maakt gebruik van cookies. Door op 'Accepteren' te klikken, ga je akkoord met het gebruik van alle cookies op deze website.",
       "Welkom op Middachten\nAl 800 jaar een gastvrij landgoed in bedrijf.\nBezoek ons prachtige kasteel, de schitterende tuinen en de expositie ‘In Volle Vaart’ tijdens Open Monumentendag in het 2de weekend van september!\nDeze website maakt gebruik van cookies. Door op 'Accepteren' te klikken, ga je akkoord met het gebruik van alle cookies op deze website.",
       "Welkom op Mid

In [71]:
document = commoncrawl_data.text.tolist()

file_path = 'midd' + '.txt'

with open(file_path, 'w') as f:
    for text in document:
      f.write(text + '\n')

## 2024 Data

In [25]:
# list of indexes https://commoncrawl.org/get-started
#indexes  = ['CC-MAIN-2024-46','CC-MAIN-2024-30','CC-MAIN-2024-26']
indexes = ['CC-MAIN-2024-51', 'CC-MAIN-2024-46', 'CC-MAIN-2024-42', 'CC-MAIN-2024-38', 'CC-MAIN-2024-33', 'CC-MAIN-2024-30', 'CC-MAIN-2024-26', 'CC-MAIN-2024-22', 'CC-MAIN-2024-18', 'CC-MAIN-2024-10']

record_dfs = []

# Fetch each index and store into a dataframe
for index_name in indexes:
    print('Running: ', index_name)
    for i in range((len(url_list)-1)):
        records = search_cc_index(url_list[i],index_name)
        record_df = pd.DataFrame(records)
        record_df['index_name'] = index_name
        record_dfs.append(record_df)

# Combine individual dataframes
all_records_df = pd.concat(record_dfs)
all_records_df = all_records_df.sort_values(by='index_name', ascending=False)
all_records_df = all_records_df.reset_index()

# Create columns where to store data later
all_records_df['success_status'] = 'not processed'
all_records_df['html'] = ''

Running:  CC-MAIN-2024-51
Running:  CC-MAIN-2024-46
Running:  CC-MAIN-2024-42
Running:  CC-MAIN-2024-38
Running:  CC-MAIN-2024-33
Running:  CC-MAIN-2024-30
Running:  CC-MAIN-2024-26
Running:  CC-MAIN-2024-22
Running:  CC-MAIN-2024-18
Running:  CC-MAIN-2024-10


In [26]:
# SKIP
# or run if needed
all_records_df.to_csv('all_records_df.csv', index=False)

In [33]:
df = all_records_df

In [34]:
# safeguareds
# If pickle file exists, check for processed items
pickle_file = 'commcrawl_indeed.pkl'
processed_indices = load_processed_indices(pickle_file)
#if processed_indices:
#    # Remove processed items
#    df = df[~df['index'].isin(processed_indices)]

# Create storage for later
successful = set()
results = {}

# Keep track of each row processed
i = 0 
perc = 0
n_records = len(df)
print(f"Found {n_records} records")
mod = int(n_records * 0.01)

No processed indices found. Pickle file 'commcrawl_indeed.pkl' does not exist.
Found 405 records


In [35]:
from warcio.archiveiterator import ArchiveIterator

# Reset index to help with looping
df.reset_index(drop=True,inplace=True)


for i in range(len(df)):
    # Print every 1% process
    if i % mod == 0: 
        print(f'{i} of {n_records}: {perc}%')
        perc += 1

    record_url = df.loc[i, 'url']

    # Fetch only URLs that were not processed
    # If it was already processed, skip URL 
    # (Helps speeding if you only need one version of the HTML, not its history)
    if not record_url in successful:
        length = int(df.loc[i, 'length'])
        offset = int(df.loc[i, 'offset'])
        warc_record_filename = df.loc[i, 'filename']
        result = fetch_single_record(warc_record_filename, offset, length)
        
        if not result:
            df.loc[i,'success_status'] = 'invalid warc'
        else:
            df.loc[i,'success_status'] = 'success'
            df.loc[i,'html'] = result
    else: 
        df.loc[i,'success_status'] = 'previously processed'

    # Add to pickle file
    append_df_row_to_pickle(df.loc[i, :], pickle_file)

0 of 405: 0%
4 of 405: 1%
8 of 405: 2%
12 of 405: 3%
16 of 405: 4%
20 of 405: 5%
24 of 405: 6%
28 of 405: 7%
32 of 405: 8%
36 of 405: 9%
40 of 405: 10%
44 of 405: 11%
48 of 405: 12%
52 of 405: 13%
56 of 405: 14%
60 of 405: 15%
64 of 405: 16%
68 of 405: 17%
72 of 405: 18%
76 of 405: 19%
80 of 405: 20%
84 of 405: 21%
88 of 405: 22%
92 of 405: 23%
96 of 405: 24%
100 of 405: 25%
104 of 405: 26%
108 of 405: 27%
112 of 405: 28%
116 of 405: 29%
120 of 405: 30%
124 of 405: 31%
128 of 405: 32%
132 of 405: 33%
136 of 405: 34%
140 of 405: 35%
144 of 405: 36%
148 of 405: 37%
152 of 405: 38%
156 of 405: 39%
160 of 405: 40%
164 of 405: 41%
168 of 405: 42%
172 of 405: 43%
176 of 405: 44%
180 of 405: 45%
184 of 405: 46%
188 of 405: 47%
192 of 405: 48%
196 of 405: 49%
200 of 405: 50%
204 of 405: 51%
208 of 405: 52%
212 of 405: 53%
216 of 405: 54%
220 of 405: 55%
224 of 405: 56%
228 of 405: 57%
232 of 405: 58%
236 of 405: 59%
240 of 405: 60%
244 of 405: 61%
248 of 405: 62%
252 of 405: 63%
256 of 405: 64

In [37]:
commoncrawl_data = pd.read_pickle(pickle_file)
commoncrawl_data[
    ['url','filename','index_name','success_status','html']
    ].head()

Unnamed: 0,url,filename,index_name,success_status,html
0,https://www.artland.top/,crawl-data/CC-MAIN-2024-51/segments/1733066127...,CC-MAIN-2024-51,success,"b'<!DOCTYPE html>\n<html lang=""nl"">\n <head..."
1,http://www.sypesteyn.nl/,crawl-data/CC-MAIN-2024-51/segments/1733066449...,CC-MAIN-2024-51,success,"b'\n<!doctype html>\n<html lang=""en"">\n<head>\..."
2,https://www.kasteeltuinassumburg.nl/,crawl-data/CC-MAIN-2024-51/segments/1733066035...,CC-MAIN-2024-51,success,"b'<!doctype html>\n<html lang=""nl-NL"" prefix=""..."
3,https://ruinevanbrederode.nl/,crawl-data/CC-MAIN-2024-51/segments/1733066329...,CC-MAIN-2024-51,success,"b'<!DOCTYPE html>\n<html class=""avada-html-lay..."
4,https://ruinevanbrederode.nl/,crawl-data/CC-MAIN-2024-51/segments/1733066047...,CC-MAIN-2024-51,success,"b'<!DOCTYPE html>\n<html class=""avada-html-lay..."


In [38]:
m1 = commoncrawl_data[~commoncrawl_data[['url']].duplicated()]

In [39]:
m1

Unnamed: 0,index,index_name,urlkey,timestamp,url,mime,mime-detected,status,digest,length,offset,filename,languages,encoding,redirect,success_status,html
0,0,CC-MAIN-2024-51,"top,artland)/",20241202145148,https://www.artland.top/,text/html,text/html,200,NUVW44YE5H2XIA7V2WXICTC6HDTOUXHO,8415,614889314,crawl-data/CC-MAIN-2024-51/segments/1733066127...,"nld,eng",UTF-8,,success,"b'<!DOCTYPE html>\n<html lang=""nl"">\n <head..."
1,0,CC-MAIN-2024-51,"nl,sypesteyn)/",20241208192644,http://www.sypesteyn.nl/,text/html,text/html,200,HYMVK23MO7EFTRVES2TUSQAPB6GRGZIS,1188,51684207,crawl-data/CC-MAIN-2024-51/segments/1733066449...,eng,UTF-8,,success,"b'\n<!doctype html>\n<html lang=""en"">\n<head>\..."
2,0,CC-MAIN-2024-51,"nl,kasteeltuinassumburg)/",20241201181716,https://www.kasteeltuinassumburg.nl/,text/html,text/html,200,J3UBTSYVKOUR6NRLDODRYHL4DZYFDIT3,59300,791873947,crawl-data/CC-MAIN-2024-51/segments/1733066035...,nld,UTF-8,,success,"b'<!doctype html>\n<html lang=""nl-NL"" prefix=""..."
3,0,CC-MAIN-2024-51,"nl,ruinevanbrederode)/",20241205031650,https://ruinevanbrederode.nl/,text/html,text/html,200,536Y6Q2N3GNVKSCC4Y3APGH4JQPREPAQ,12932,460813328,crawl-data/CC-MAIN-2024-51/segments/1733066329...,nld,UTF-8,,success,"b'<!DOCTYPE html>\n<html class=""avada-html-lay..."
5,0,CC-MAIN-2024-51,"nl,kasteelvalkenburg)/",20241214111044,https://www.kasteelvalkenburg.nl/,text/html,text/html,200,UZEMGNM6TRMNMPPWRJGXKR3NJ7XJQUB3,31023,747772457,crawl-data/CC-MAIN-2024-51/segments/1733066124...,"nld,eng",UTF-8,,success,"b'<!doctype html>\n<html lang=""nl-NL"">\n<head>..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,1,CC-MAIN-2024-10,"nl,kasteelheeze)/",20240303094635,http://www.kasteelheeze.nl/,text/html,text/html,301,VCGGKYHEN445ZIZ2DO54OTBRT2E237RK,567,2357294,crawl-data/CC-MAIN-2024-10/segments/1707947476...,,,https://www.kasteelheeze.nl/,success,b'<html>\r\n<head><title>301 Moved Permanently...
391,2,CC-MAIN-2024-10,"nl,museummartena)/",20240226064857,https://museummartena.nl/,text/html,text/html,301,3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ,723,16640626,crawl-data/CC-MAIN-2024-10/segments/1707947474...,,,http://www.museummartena.nl/,invalid warc,
393,0,CC-MAIN-2024-10,"nl,slotzuylen)/",20240301205802,http://slotzuylen.nl/,text/html,text/html,301,LGGT76I5HYSS6HQT36GPANELE4H7FWR7,569,2133700,crawl-data/CC-MAIN-2024-10/segments/1707947475...,,,https://slotzuylen.nl/,success,b'<html>\r\n<head><title>301 Moved Permanently...
402,0,CC-MAIN-2024-10,"nl,museummartena)/",20240226064857,http://museummartena.nl/,text/html,text/html,301,HLNR6AWVWYCU3YAENY3HYHLIPNWN66X7,560,1358492,crawl-data/CC-MAIN-2024-10/segments/1707947474...,,,https://museummartena.nl/,success,b'<html>\r\n<head><title>301 Moved Permanently...


In [40]:
webpage = commoncrawl_data.html[0]
trafilatura.extract(webpage)

'Artland Kasteel Strijthagen, Landgraaf\nArtland is sinds 2016 een internationaal kunstcentrum gelegen in complex Strijthagen. In de Hoeve van Strijthagen worden exposities en muziek evenementen georganiseerd, is een galerie met oude, moderne en hedendaagse beeldende kunst en een open atelier. Er vinden workshops en rondleidingen plaats. Rondom het complex is een beeldschoon vijverpark met het Beeldenpark.\nVolg Artland op Instagram @kasteelstrijthagen en @artland.to.do\n© ArtLand.top. All rights reserved. Kasteel Strijthagen. Strijthagen. Rouenhof 17-21, Landgraaf, Limburg. Nederland. Sitemap.'

## 2018 Data

In [None]:
indexes_2018 = ['CC-MAIN-2018-51', 'CC-MAIN-2018-47', 'CC-MAIN-2018-43', 'CC-MAIN-2018-39', 'CC-MAIN-2018-34', 'CC-MAIN-2018-30', 'CC-MAIN-2018-26', 'CC-MAIN-2018-22', 'CC-MAIN-2018-17', 'CC-MAIN-2018-13', 'CC-MAIN-2018-09', 'CC-MAIN-2018-05']

record_dfs = []

# Fetch each index and store into a dataframe
for index_name in indexes_2018:
    print('Running: ', index_name)
    for i in range((len(url_list)-1)):
        records = search_cc_index(url_list[i],index_name)
        record_df = pd.DataFrame(records)
        record_df['index_name'] = index_name
        record_dfs.append(record_df)

# Combine individual dataframes
all_records_df = pd.concat(record_dfs)
all_records_df = all_records_df.sort_values(by='index_name', ascending=False)
all_records_df = all_records_df.reset_index()

# Create columns where to store data later
all_records_df['success_status'] = 'not processed'
all_records_df['html'] = ''