In [2]:
!pip install boto3 findspark py4j



In [3]:
import boto3
from io import BytesIO
import gzip
import tempfile

In [4]:
#init spark
# Import the findspark module 
import findspark
import os
# Initialize via the full spark path
findspark.init("/opt/apache-spark")
# Import the SparkSession and SQLContext modules
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

# Build the SparkSession
spark = SparkSession.builder \
   .master("local[*]") \
   .appName("CommonCrawl") \
   .config("spark.executor.memory", "4gb") \
   .getOrCreate()
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)
# Main entry point for Spark functionality. A SparkContext represents the
# connection to a Spark cluster, and can be used to create :class:`RDD` and
# broadcast variables on that cluster.      
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/31 11:05:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [20]:
import os
import boto3
from io import BytesIO
import gzip
import requests

def get_s3_file_content(bucket, dataset, key, data_dir='data'):
    # Ensure the data directory exists
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    
    # Define the local path for the file
    local_path = os.path.join(data_dir, key.replace('/', '_'))
    
    # Function to download and uncompress the file
    def download_and_uncompress_file():
        # Initialize S3 client
        s3 = boto3.client('s3')
        
        # Get the compressed file from S3
        compressed_file = BytesIO(s3.get_object(Bucket=bucket, Key=key)['Body'].read())
        
        # Uncompress the file
        uncompressed_file = gzip.GzipFile(None, 'rb', fileobj=compressed_file).read()
        
        # Save the uncompressed file locally
        with open(local_path, 'wb') as f:
            f.write(uncompressed_file)
        
        return uncompressed_file
    
        # Function to download and uncompress the file
    def download_and_uncompress_file_noaws():
        # Construct the URL
        url = f"https://data.commoncrawl.org/{key}"
        
        # Download the file
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes
        
        # Get the compressed file from the response
        compressed_file = BytesIO(response.content)

        if ".gz" in key:
            # Uncompress the file
            uncompressed_file = gzip.GzipFile(None, 'rb', fileobj=compressed_file).read()
        else:
            uncompressed_file = compressed_file.read()
        
        # Save the uncompressed file locally
        with open(local_path, 'wb') as f:
            f.write(uncompressed_file)
        
        return uncompressed_file
    
    # Check if file exists locally
    if os.path.exists(local_path):
        print(f"File {local_path} already exists. Loading it.")
        with open(local_path, 'rb') as f:
            file_content = f.read()
    else:
        print(f"File {local_path} does not exist. Downloading it from Common Crawl.")
        file_content = download_and_uncompress_file_noaws()
    
    return file_content

def download_s3_file(bucket, dataset, key, data_dir='data'):
    # Ensure the data directory exists
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    
    # Define the local path for the file
    local_path = os.path.join(data_dir, key.replace('/', '_'))
    
    # Function to download and uncompress the file
    def download_and_uncompress_file():
        # Initialize S3 client
        s3 = boto3.client('s3')
        
        # Get the compressed file from S3
        compressed_file = BytesIO(s3.get_object(Bucket=bucket, Key=key)['Body'].read())
        
        # Uncompress the file
        uncompressed_file = gzip.GzipFile(None, 'rb', fileobj=compressed_file).read()
        
        # Save the uncompressed file locally
        with open(local_path, 'wb') as f:
            f.write(uncompressed_file)
        
        return uncompressed_file
    
        # Function to download and uncompress the file
    def download_and_uncompress_file_noaws():
        # Construct the URL
        url = f"https://data.commoncrawl.org/{key}"
        
        # Download the file
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes
        
        # Get the compressed file from the response
        compressed_file = BytesIO(response.content)

        if ".gz" in key:
            # Uncompress the file
            uncompressed_file = gzip.GzipFile(None, 'rb', fileobj=compressed_file).read()
        else:
            uncompressed_file = compressed_file.read()
        
        # Save the uncompressed file locally
        with open(local_path, 'wb') as f:
            f.write(uncompressed_file)
        
        return uncompressed_file
    
    # Check if file exists locally
    if os.path.exists(local_path) or os.path.exists(local_path[:-3]):
        print(f"File {local_path} already exists. Loading it.")
    else:
        print(f"File {local_path} does not exist. Downloading it from Common Crawl.")
        download_and_uncompress_file_noaws()
    
    return local_path

In [6]:
bucket = "commoncrawl"
dataset = "CC-MAIN-2024-30"
key = f"crawl-data/{dataset}/cc-index.paths.gz"
content = get_s3_file_content(bucket, dataset, key)

File data/crawl-data_CC-MAIN-2024-30_cc-index.paths.gz already exists. Loading it.


In [7]:
files = content.decode("ascii").split("\n")
cluster_idx = [file for file in files if "cluster.idx" in file][0]

In [8]:
# get index file
key = cluster_idx
index_file = get_s3_file_content(bucket, dataset, key).decode("ascii").split("\n")

File data/cc-index_collections_CC-MAIN-2024-30_indexes_cluster.idx already exists. Loading it.


In [9]:
key = f"crawl-data/{dataset}/cc-index-table.paths.gz"
files = get_s3_file_content(bucket, dataset, key).decode("ascii").split("\n")
cluster_paths = [file for file in files]

File data/crawl-data_CC-MAIN-2024-30_cc-index-table.paths.gz already exists. Loading it.


In [12]:
#find relevant index files

# get at domains
at_files = [x for x in index_file if x.startswith("com")]

# get index paths

index_files =  set([x.split("\t")[1] for x in at_files ])

index_keys = [f"cc-index/collections/{dataset}/indexes/{x}" for x in index_files]

#domains = set([".".join(x.split(")")[0].split(",")[::-1]) for x in at_files])
print(index_keys)

['cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00104.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00145.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00070.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00144.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00041.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00124.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00043.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00154.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00096.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00113.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00099.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00123.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00109.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00089.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00060.gz', 'cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00093.gz', 'cc-index/collections/C

In [21]:
index_filenames = []
import time
import requests
for key in index_keys:
    print("Downloading: " + key)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".gz.parquet") as temp:
        parquet_file_path = download_s3_file(bucket, dataset, key)
        #parquet_file = requests.get("https://ds5q9oxwqwsfj.cloudfront.net/"+key).content
        index_filenames.append(parquet_file_path)


Downloading: cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00104.gz
File data/cc-index_collections_CC-MAIN-2024-30_indexes_cdx-00104.gz already exists. Loading it.
Downloading: cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00145.gz
File data/cc-index_collections_CC-MAIN-2024-30_indexes_cdx-00145.gz already exists. Loading it.
Downloading: cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00070.gz
File data/cc-index_collections_CC-MAIN-2024-30_indexes_cdx-00070.gz already exists. Loading it.
Downloading: cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00144.gz
File data/cc-index_collections_CC-MAIN-2024-30_indexes_cdx-00144.gz already exists. Loading it.
Downloading: cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00041.gz
File data/cc-index_collections_CC-MAIN-2024-30_indexes_cdx-00041.gz already exists. Loading it.
Downloading: cc-index/collections/CC-MAIN-2024-30/indexes/cdx-00124.gz
File data/cc-index_collections_CC-MAIN-2024-30_indexes_cdx-00124.gz already exists. Loading it

In [16]:
"""
import shutil
import gzip
import json
import re
pattern = r'[^/]+(?=\.gz)'
for i,x in enumerate(index_filenames):
    print(i)
    match = re.search(pattern, x)
    ex = match.group()
    with gzip.open(index_filenames[i], 'rb') as s_file, open(f"data/{ex}.index", 'wb') as d_file:
        print(s_file,d_file)
        shutil.copyfileobj(s_file, d_file, 65536)
        os.unlink(index_filenames[i])
"""

0
<gzip _io.BufferedReader name='data/cc-index_collections_CC-MAIN-2024-30_indexes_cdx-00104.gz' 0x786d0afdfca0> <_io.BufferedWriter name='data/cc-index_collections_CC-MAIN-2024-30_indexes_cdx-00104.index'>


BadGzipFile: Not a gzipped file (b'co')

In [23]:
for filename in index_filenames:
    # Define the new filename without the .gz extension
    if os.path.exists(filename):
        new_filename = filename[:-3]
        
        # Rename the file
        os.rename(filename, new_filename)