In [4]:
import boto3
from io import BytesIO
import gzip
import tempfile

In [5]:
#init spark
# Import the findspark module 
import findspark
import os
# Initialize via the full spark path
findspark.init("/opt/apache-spark")
# Import the SparkSession and SQLContext modules
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

# Build the SparkSession
spark = SparkSession.builder \
   .master("local[12]") \
   .appName("CommonCrawl") \
   .config("spark.executor.memory", "1gb") \
   .getOrCreate()
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)
# Main entry point for Spark functionality. A SparkContext represents the
# connection to a Spark cluster, and can be used to create :class:`RDD` and
# broadcast variables on that cluster.      
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/30 15:06:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Path to the folder containing the CSV files
csv_folder_path = "combined_domains_cc"

# Read all CSV files in the folder
df = spark.read.option("header", "true").csv(os.path.join(csv_folder_path, "*.csv"))

In [7]:
from pyspark.sql.types import *

bhSchema = StructType([
    StructField("_id", StringType(), True),
    StructField("_index", StringType(), True),
    StructField("author", StringType(), True),
    StructField("category", ArrayType(StringType()), True),
    StructField("checked", BooleanType(), True),
    StructField("confirmed", BooleanType(), True),
    StructField("content", StringType(), True),
    StructField("content_origin", StringType(), True),
    StructField("content_raw", StringType(), True),
    StructField("datasource", StringType(), True),
    StructField("date", TimestampType(), True),
    StructField("date_entry", TimestampType(), True),
    StructField("domain", StringType(), True),
    StructField("keywords", ArrayType(StringType()), True),
    StructField("language", StringType(), True),
    StructField("language_origin", StringType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("location", StringType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("source", StringType(), True),
    StructField("submitted_by", StringType(), True),
    StructField("subtitle", StringType(), True),
    StructField("subtitle_origin", StringType(), True),
    StructField("summary", StringType(), True),
    StructField("title", StringType(), True),
    StructField("title_origin", StringType(), True),
    StructField("translated", BooleanType(), True),
    StructField("updated_by", StringType(), True)
])

In [8]:
# Load the JSON file into a DataFrame using the defined schema
dfBH = spark.read.schema(bhSchema).json("../export_osdp_2024-04-25_07-58-58_1.json", multiLine=True)
# Show the DataFrame
dfBH.show(5)

# Print the schema
df.printSchema()
dfBH.printSchema()

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+------+--------------------+---------+-------+---------+--------------------+--------------------+-----------+----------+-------------------+-------------------+-----------------+--------------------+--------+---------------+--------+--------+---------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------+
|                 _id|_index|              author| category|checked|confirmed|             content|      content_origin|content_raw|datasource|               date|         date_entry|           domain|            keywords|language|language_origin|latitude|location|longitude|              source|submitted_by|            subtitle|     subtitle_origin|             summary|               title|        title_origin|translated|updated_by|
+--------------------+------+--------------------+---------+-------+---------+--------------------+--------------------+------

                                                                                

In [9]:
from urllib.parse import urlparse, urlunparse, quote, unquote, parse_qsl, urlencode
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
def normalize_url(url):
    # Parse the URL into components
    parsed_url = urlparse(url)
    
    # Convert scheme and host to lowercase
    scheme = parsed_url.scheme.lower()
    host = parsed_url.hostname.lower() if parsed_url.hostname else ''
    
    # Remove default port
    port = parsed_url.port
    if (scheme == 'http' and port == 80) or (scheme == 'https' and port == 443):
        port = None
    
    # Normalize path
    path = quote(unquote(parsed_url.path))
    segments = path.split('/')
    resolved_segments = []
    for segment in segments:
        if segment == '..':
            if resolved_segments:
                resolved_segments.pop()
        elif segment and segment != '.':
            resolved_segments.append(segment)
    normalized_path = '/' + '/'.join(resolved_segments)
    
    # Ensure trailing slash for directory paths
    if normalized_path[-1] != '/' and not normalized_path.split('/')[-1].count('.'):
        normalized_path += '/'
    
    # Sort query parameters
    query = urlencode(sorted(parse_qsl(parsed_url.query)))
    
    # Reconstruct the URL without fragment
    normalized_url = urlunparse((scheme, f"{host}:{port}" if port else host, normalized_path, '', query, ''))
    
    return normalized_url
normalize_url_udf = udf(normalize_url, StringType())

In [10]:
df_normalized_cc = df.withColumn("normalized_url", normalize_url_udf(df["url"]))
df_normalized_bh = dfBH.withColumn("normalized_url", normalize_url_udf(dfBH["source"]))
for col_name in df_normalized_cc.columns:
    df_normalized_cc = df_normalized_cc.withColumnRenamed(col_name, f"CC_{col_name}")

# Rename all columns in df2
for col_name in df_normalized_bh.columns:
    df_normalized_bh = df_normalized_bh.withColumnRenamed(col_name, f"BH_{col_name}")

In [11]:
# Perform the join operation to find overlaps
overlap_df = df_normalized_cc.join(df_normalized_bh, df_normalized_cc["CC_url"]==df_normalized_bh["BH_source"] ,"inner")

# Show the overlapping URLs
overlap_df.show(1)

# Count the number of overlapping URLs
overlap_count = overlap_df.count()
print(f"Number of overlapping URLs: {overlap_count}")

                                                                                

+--------------------+--------------------+--------------------+---------+----------------+---------+--------------------+---------+---------+--------------------+----------+------------+------+----------+--------------------+--------------------+--------------------+---------+---------------+-----------+----------+------------+--------------------+--------------------+--------------+-------------+-------------------+-------------------+----------+--------------------+-----------+------------------+-----------+-----------+------------+--------------------+---------------+-----------+------------------+--------------------+--------------------+--------------------+-------------+-------------+--------------------+
|         CC_MetaData|             CC_Date|              CC_url|  CC_mime|CC_mime-detected|CC_status|           CC_digest|CC_length|CC_offset|         CC_filename|CC_charset|CC_languages|CC_TLD| CC_Domain|             CC_Path|   CC_normalized_url|              BH__id|BH__index|

[Stage 8:>                                                          (0 + 1) / 1]

Number of overlapping URLs: 393


                                                                                

In [12]:
# get all entries in the scoped common crawl that have the same url in the bh dataset
cc_wo_overlap = df_normalized_cc.join(df_normalized_bh, df_normalized_cc["CC_url"]==df_normalized_bh["BH_source"], how="left_anti")

In [13]:
overlap_df.printSchema()

root
 |-- CC_MetaData: string (nullable = true)
 |-- CC_Date: string (nullable = true)
 |-- CC_url: string (nullable = true)
 |-- CC_mime: string (nullable = true)
 |-- CC_mime-detected: string (nullable = true)
 |-- CC_status: string (nullable = true)
 |-- CC_digest: string (nullable = true)
 |-- CC_length: string (nullable = true)
 |-- CC_offset: string (nullable = true)
 |-- CC_filename: string (nullable = true)
 |-- CC_charset: string (nullable = true)
 |-- CC_languages: string (nullable = true)
 |-- CC_TLD: string (nullable = true)
 |-- CC_Domain: string (nullable = true)
 |-- CC_Path: string (nullable = true)
 |-- CC_normalized_url: string (nullable = true)
 |-- BH__id: string (nullable = true)
 |-- BH__index: string (nullable = true)
 |-- BH_author: string (nullable = true)
 |-- BH_category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- BH_checked: boolean (nullable = true)
 |-- BH_confirmed: boolean (nullable = true)
 |-- BH_content: string (nulla

In [38]:
# Count unique domains of BH dataset that overlaps
unique_domains_count = overlap_df.select("CC_Domain").distinct().count()
print(f"Number of unique domains: {unique_domains_count}")

# Show the domains of BH dataset that overlaps
overlap_df.select("CC_Domain").distinct().show(truncate=False)

#overlap_df.show()

                                                                                

Number of unique domains: 23
+------------------------+
|CC_Domain               |
+------------------------+
|thehackernews.com       |
|de.euronews.com         |
|alghad.com              |
|americanmilitarynews.com|
|newatlas.com            |
|sofrep.com              |
|securityaffairs.com     |
|defence-blog.com        |
|securelist.com          |
|scitechdaily.com        |
|techxplore.com          |
|kyivindependent.com     |
|i-hls.com               |
|addiyar.com             |
|thecyberwire.com        |
|bulgarianmilitary.com   |
|thebarentsobserver.com  |
|gbhackers.com           |
|tass.com                |
|cybernews.com           |
+------------------------+
only showing top 20 rows



In [40]:
# get all entries which url isn't in the bh dataaset but the domain is

# Get the distinct values from overlap_df
distinct_cc_domains = overlap_df.select("CC_Domain").distinct()
distinct_domains_list = [row["CC_Domain"] for row in distinct_cc_domains.collect()]

cc_irrelevant_same_domain = cc_wo_overlap.filter(cc_wo_overlap["CC_domain"].isin(distinct_domains_list))

In [41]:
# Get a random sample with a fixed seed
total_relevant_rows = overlap_df.count()
total_irrelvant_rows = cc_irrelevant_same_domain.count()
fraction = 2*(total_relevant_rows/(total_irrelvant_rows+total_relevant_rows))  # Fraction of rows to sample
# total number of irrelevant rows should be twice as big as relevant rows
seed = 1337  # Fixed seed for reproducibility

In [42]:
from pyspark.sql.functions import lit
selection_cols_BH = ["CC_normalized_url","CC_filename","BH_language_origin","BH_language","BH_keywords","BH_category"]
selection_cols_CC = ["CC_normalized_url","CC_filename"]
relevant_data = overlap_df.select(selection_cols_BH).withColumn("is_relevant", lit(True))
irrelevant_data = cc_irrelevant_same_domain.select(selection_cols_CC).sample(withReplacement=False, fraction=fraction, seed=seed).withColumn("is_relevant", lit(False))

cc_wo_overlap.select("CC_Domain").distinct().count()

141

In [43]:
all_data = relevant_data.unionByName(irrelevant_data,allowMissingColumns=True)
all_data.count()

                                                                                

1212

In [44]:
all_data.show(2000)

                                                                                

+--------------------+--------------------+------------------+-----------+--------------------+-------------------+-----------+
|   CC_normalized_url|         CC_filename|BH_language_origin|BH_language|         BH_keywords|        BH_category|is_relevant|
+--------------------+--------------------+------------------+-----------+--------------------+-------------------+-----------+
|https://alghad.co...|crawl-data/CC-MAI...|                ar|         en|[Struktur, Strukt...|          [krimisi]|       true|
|https://cybernews...|crawl-data/CC-MAI...|                en|         en|[Struktur, Strukt...|            [cyber]|       true|
|https://cybernews...|crawl-data/CC-MAI...|                en|         en|[Struktur, Strukt...|            [cyber]|       true|
|https://cybernews...|crawl-data/CC-MAI...|                en|         en|[Struktur, Strukt...|            [cyber]|       true|
|https://cybernews...|crawl-data/CC-MAI...|                en|         en|[Struktur, Strukt...|         

In [45]:
#count distinct number of warcs to download
distinct_count = all_data.select("CC_filename").distinct().count()
distinct_count

                                                                                

1203

In [46]:
all_data.write.mode('overwrite').option("header", "true").json("target_data")

                                                                                