In [17]:
import ast
from datetime import datetime
from glob import glob

import pandas as pd
import tld
import tldextract
from cryptography import x509
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes

import pyspark.sql.functions as psf
import pyspark.sql.types as pst

In [26]:
# Execute the companion notebook inside this kernel; per its recorded output it
# creates a SparkConf and starts a SparkSession.
# NOTE(review): presumably this is what defines the `spark` name used by the
# load_* helpers below - confirm against spark-instance-gustavo.ipynb.
%run ./spark-instance-gustavo.ipynb

SparkConf created
Started SparkSession
Spark version 3.5.0


In [1]:
# NOTE(review): `clean_spark` is not defined anywhere in this notebook; presumably
# it is provided by spark-instance-gustavo.ipynb. The recorded NameError shows this
# cell was executed before that notebook had been %run - confirm, and only run
# this cell afterwards.
clean_spark()

NameError: name 'clean_spark' is not defined

In [5]:
def eval_list_list_str(my_list):
    """Parse the textual form of a list of string lists, e.g. "[['a', 'b'], ['c']]".

    Args:
        my_list: The value to parse. Only ``str`` input is parsed.

    Returns:
        The parsed list (or tuple). ``[[]]`` is returned when the input is not
        a string, cannot be parsed as a Python literal, or parses to something
        that is not a list/tuple.
    """
    if not isinstance(my_list, str):
        return [[]]
    try:
        # SECURITY: this used to call eval(), which executes arbitrary code
        # embedded in the string. literal_eval only accepts Python literals.
        parsed = ast.literal_eval(my_list)
    except Exception:
        # Malformed text (ValueError/SyntaxError/...) falls back to the empty
        # value; unlike a bare `except:`, KeyboardInterrupt still propagates.
        return [[]]
    # A scalar literal such as "5" does not match the array type declared for the UDF.
    return parsed if isinstance(parsed, (list, tuple)) else [[]]


# Spark UDF wrapper around eval_list_list_str; declared return type is array<array<string>>.
eval_list_list_str_udf = psf.udf(eval_list_list_str, pst.ArrayType(pst.ArrayType(pst.StringType())))


def eval_list(my_list):
    """Parse the textual form of a flat list, e.g. "[1, 2, 3]".

    Args:
        my_list: The value to parse. Only ``str`` input is parsed.

    Returns:
        The parsed list (or tuple). ``[]`` is returned when the input is not a
        string, cannot be parsed as a Python literal, or parses to something
        that is not a list/tuple.
    """
    if not isinstance(my_list, str):
        return []
    try:
        # SECURITY: this used to call eval(), which executes arbitrary code
        # embedded in the string. literal_eval only accepts Python literals.
        parsed = ast.literal_eval(my_list)
    except Exception:
        # Malformed text (ValueError/SyntaxError/...) falls back to the empty
        # value; unlike a bare `except:`, KeyboardInterrupt still propagates.
        return []
    # A scalar literal such as "5" does not match the array type declared for the UDF.
    return parsed if isinstance(parsed, (list, tuple)) else []


# Spark UDF wrapper around eval_list; declared return type is array<int>.
eval_udf = psf.udf(eval_list, pst.ArrayType(pst.IntegerType()))


# Reader options shared by the hosts / ldap / starttls_ldap exports.
_QUOTED_CSV_OPTIONS = {
    "lineSep": "\n",
    "quote": "\"",
    "escape": "\"",
}

# Reader options for the certs export.
# NOTE(review): "wholeFile" looks like a legacy alias of "multiLine" and is
# presumably ignored by the Spark version in use - kept unchanged; confirm.
_MULTILINE_CSV_OPTIONS = {
    "multiline": "true",
    "wholeFile": "true",
}


def _read_scan_csv(path_fmt, port, ts, reader_options):
    """Read the CSV export of one scan (port + day) under ../dataset/.

    Args:
        path_fmt: Path template with {port}, {year}, {month} and {day} fields.
        port: Scanned port, substituted into the template.
        ts: Scan date; its ``year``, ``month`` and ``day`` attributes are used.
        reader_options: Extra Spark CSV reader options (option name -> value).

    Returns:
        A Spark DataFrame read with a header row and an inferred schema.
    """
    relative_path = path_fmt.format(port=port, year=ts.year, month=ts.month, day=ts.day)
    reader = spark.read.option("header", "true").option("inferSchema", "true")
    for option_name, option_value in reader_options.items():
        reader = reader.option(option_name, option_value)
    return reader.csv(f"../dataset/{relative_path}")


def load_hosts_data(port, ts):
    """Load the hosts result of the scan of `port` on date `ts` (HOSTS_PATH_FMT)."""
    return _read_scan_csv(HOSTS_PATH_FMT, port, ts, _QUOTED_CSV_OPTIONS)


def load_certs_data(port, ts):
    """Load the certs result of the scan of `port` on date `ts` (CERTS_PATH_FMT)."""
    return _read_scan_csv(CERTS_PATH_FMT, port, ts, _MULTILINE_CSV_OPTIONS)


def load_ldap_data(port, ts):
    """Load the ldap result of the scan of `port` on date `ts` (LDAP_PATH_FMT)."""
    return _read_scan_csv(LDAP_PATH_FMT, port, ts, _QUOTED_CSV_OPTIONS)


def load_ldapstarttls_data(port, ts):
    """Load the starttls_ldap result of the scan of `port` on date `ts` (STARTTLS_PATH_FMT)."""
    return _read_scan_csv(STARTTLS_PATH_FMT, port, ts, _QUOTED_CSV_OPTIONS)


# Scan dates per port. Port 636 scans occur one day earlier than port 389 scans.
PORT_SCANDATE_MAP = {
    636: [
        datetime(2024, 2, 20),
        datetime(2024, 2, 27),
        datetime(2024, 3, 5),
        datetime(2024, 3, 12),
        datetime(2024, 3, 19)
    ],
    389: [
        datetime(2024, 2, 21),
        datetime(2024, 2, 28),
        datetime(2024, 3, 6),
        datetime(2024, 3, 13),
        datetime(2024, 3, 20)
    ]
}

# Path templates of the goscanner result sets. The load_* helpers fill in
# {port}/{year}/{month}/{day} and prefix the result with "../dataset/".
HOSTS_PATH_FMT = "catrin/measurements/tool=goscanner/format=raw/port={port}/scan=tcp/result=hosts/year={year}/month={month:02d}/day={day:02d}"
CERTS_PATH_FMT = "catrin/measurements/tool=goscanner/format=raw/port={port}/scan=tls/result=certs/year={year}/month={month:02d}/day={day:02d}"
LDAP_PATH_FMT = "catrin/measurements/tool=goscanner/format=raw/port={port}/scan=ldap/result=ldap/year={year}/month={month:02d}/day={day:02d}"
STARTTLS_PATH_FMT = "catrin/measurements/tool=goscanner/format=raw/port={port}/scan=starttls_ldap/result=starttls_ldap/year={year}/month={month:02d}/day={day:02d}"

# Flat list of every scan date, port by port, in PORT_SCANDATE_MAP order.
goscanner_timestamps = [
    scan_date
    for port_dates in PORT_SCANDATE_MAP.values()
    for scan_date in port_dates
]

In [18]:
# Decoding X.509 certificates
def get_x509(pem: str):
    """Parse a PEM-encoded certificate string into a `cryptography` x509 certificate.

    Exceptions raised while encoding or parsing are not handled here; they
    propagate to the caller.
    """
    pem_bytes = str.encode(pem)
    backend = default_backend()
    return x509.load_pem_x509_certificate(pem_bytes, backend)


def get_extensions(cert):
    """Collect TLS-feature data and DNS subject alternative names from a certificate.

    Args:
        cert: Certificate object exposing an ``extensions`` iterable whose items
            carry the decoded extension in ``.value``.

    Returns:
        A ``(tls_key_exchange, san_list)`` tuple of lists. Both lists are empty
        when the extensions cannot be read at all.
    """
    tls_key_exchange = []
    san_list = []

    try:
        extensions = cert.extensions
    except Exception:
        # Best effort: a certificate whose extensions cannot be decoded simply
        # contributes nothing.
        return tls_key_exchange, san_list

    for extension in extensions:
        # Each extension is handled on its own, so one failing extension can no
        # longer discard the SAN entries of the extensions that follow it.
        try:
            value = extension.value
            if isinstance(value, x509.TLSFeature):
                # x509.TLSFeature does not define `key_exchange`. The original
                # attribute access raised AttributeError, which the surrounding
                # bare `except:` swallowed, aborting the whole loop and silently
                # dropping any SubjectAlternativeName that came later.
                key_exchange = getattr(value, "key_exchange", None)
                if key_exchange is not None:
                    tls_key_exchange.append(key_exchange)
            elif isinstance(value, x509.SubjectAlternativeName):
                san_list.extend(value.get_values_for_type(x509.DNSName))
        except Exception:
            continue

    return tls_key_exchange, san_list


def get_x509_fields(pem: str):
    """Decode a PEM certificate into the fields stored by `pem_decoded_schema`.

    Args:
        pem: PEM-encoded certificate text. ``None`` or any other non-string
            value (e.g. a null cell of the Spark column) is treated as
            "no certificate".

    Returns:
        Four values, in order: signature algorithm name, public key size
        (``None`` when the key object has no ``key_size``), list of SAN DNS
        names, and the upper-case hex SHA-256 fingerprint. Four ``None`` values
        are returned when the input is not a string or cannot be parsed.
    """
    empty_result = 4 * [None]  # CHANGE HERE IN CASE ADDITIONAL RETURN PARAMETER

    # str.encode() raises TypeError on None; that error was not caught before
    # and would fail the whole UDF call.
    if not isinstance(pem, str):
        return empty_result

    try:
        cert = get_x509(pem)
    except ValueError:
        # the certificate contains bytes that cannot be interpreted. Probably invalid cert
        # https://github.com/pyca/cryptography/issues/6804
        print(pem)
        return empty_result

    # Only the SAN list is used; the TLS-feature list is ignored here.
    _, san_list = get_extensions(cert)

    public_key_size = None
    try:
        public_key_size = cert.public_key().key_size
    except AttributeError:
        #'cryptography.hazmat.bindings._rust.openssl.ed25519' object has no attribute 'key_size'
        pass

    cert_fp = cert.fingerprint(hashes.SHA256()).hex().upper()

    return (cert.signature_algorithm_oid._name,
            public_key_size,
            san_list,
            cert_fp,
           )


# Struct returned by decode_cert_udf: one nullable field per value produced by
# get_x509_fields, in the same order.
pem_decoded_schema = pst.StructType([
    pst.StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("tls_signature_algorithm", pst.StringType()),
        ("pubkey_bit_size", pst.IntegerType()),
        ("leaf_data_names", pst.ArrayType(pst.StringType())),  # SAN
        ("fingerprint", pst.StringType()),
    )
])


# Spark UDF decoding a PEM certificate string into the struct above.
decode_cert_udf = psf.udf(get_x509_fields, pem_decoded_schema)

In [27]:
# Build one DataFrame per (port, scan date): hosts flagged as LDAP servers joined
# with the decoded fields of the certificate they presented.
dfs = []
for port, ts_list in PORT_SCANDATE_MAP.items():
    print(port)
    # Port 636 uses the ldap result set; the other port (389) uses starttls_ldap.
    load_ldap_results = load_ldap_data if port == 636 else load_ldapstarttls_data

    for ts in ts_list:
        print(ts)

        hosts_df = (
            load_hosts_data(port, ts)
            .withColumnRenamed("id", "host_id")
            .withColumnRenamed("cert_id", "hosts_cert_id")
            .select("host_id", "hosts_cert_id")
        )

        certs_df = (
            load_certs_data(port, ts)
            .withColumnRenamed("id", "cert_id")
            .withColumn("decoded_cert", decode_cert_udf(psf.col("cert")))
            .drop("cert", "system_cert_store")
            .select("cert_id", "decoded_cert.*")
        )

        ldap_df = (
            load_ldap_results(port, ts)
            .withColumnRenamed("id", "ldap_id")
            .select("ldap_id", "ldap_server")
        )

        # Ids of the rows whose ldap_server flag equals 1.
        ldap_id_df = ldap_df.filter(psf.col("ldap_server") == 1).select("ldap_id")

        # Keep only hosts whose id appears among those ldap ids.
        ldap_hosts_df = (
            hosts_df
            .join(ldap_id_df, ldap_id_df.ldap_id == hosts_df.host_id, "inner")
            .filter(psf.col("ldap_id").isNotNull())
            .drop("ldap_id")
        )

        # Attach the decoded certificate fields and tag the rows with the scan date.
        ldap_hosts_cert_df = (
            ldap_hosts_df
            .join(certs_df, ldap_hosts_df.hosts_cert_id == certs_df.cert_id, "inner")
            .drop("hosts_cert_id", "cert_id")
            .withColumn("date", psf.lit(ts.strftime("%Y%m%d")))
        )

        dfs.append(ldap_hosts_cert_df)
    print("------------------")

636
2024-02-20 00:00:00
2024-02-27 00:00:00
2024-03-05 00:00:00
2024-03-12 00:00:00
2024-03-19 00:00:00
------------------
389
2024-02-21 00:00:00
2024-02-28 00:00:00
2024-03-06 00:00:00
2024-03-13 00:00:00
2024-03-20 00:00:00
------------------


In [29]:
# Stack all per-scan DataFrames by column name. Starting from dfs[0] and folding
# in the rest also works when only a single scan was loaded (the previous
# dfs[0].unionByName(dfs[1]) raised IndexError in that case).
goscanner_df = dfs[0]
for scan_df in dfs[1:]:
    goscanner_df = goscanner_df.unionByName(scan_df)

# Write the combined result as Parquet, replacing any previous output.
output = "../dataset/processing/2024-goscanner-ldap-san"
goscanner_df.coalesce(1).pandas_api().to_parquet(output, mode="overwrite")

In [41]:
# Input Parquet data set; switch the active line to process the other period.
parquet_file = "../dataset/catrin/ldap-dependency-2023Nov-goscanner.parquet"  # November 2023
#parquet_file = "../dataset/catrin/2024-goscanner-ldap-san.parquet"  # February-March 2024
goscanner_df = spark.read.parquet(parquet_file)

# `output` is read again by the following cell to locate the CSV written here.
output = "../dataset/processing/2023-Nov-leaf_data_names"

# One row per distinct entry of the leaf_data_names arrays, exported as column "san".
distinct_san_df = (
    goscanner_df
    .select(psf.explode_outer("leaf_data_names").alias("san"))
    .distinct()
)
distinct_san_df.coalesce(1).pandas_api().to_csv(output, index=False)

In [42]:
def fld(domain):
    """Return the first-level domain of `domain` according to the `tld` package.

    Returns None when `domain` is None or when `tld` rejects it (TldBadUrl,
    TldDomainNotFound or ValueError).
    """
    if domain is None:
        return None

    name = str(domain)
    try:
        # Called only for its validation side effect: it raises on names that
        # have no known TLD.
        tld.get_tld(name, fail_silently=False, fix_protocol=True)
        first_level_domain = tld.get_fld(name, fail_silently=False, fix_protocol=True)
    except (tld.exceptions.TldBadUrl, tld.exceptions.TldDomainNotFound, ValueError):
        return None
    return first_level_domain


def get_sld(domain):
    """Return the registered domain of `domain` as computed by `tldextract`.

    Args:
        domain: Host name; converted with ``str()`` before extraction.

    Returns:
        ``tldextract.extract(...).registered_domain``, or None when `domain`
        is None or the extraction raises.
    """
    if domain is None:
        return None
    domain = str(domain)
    try:
        return tldextract.extract(domain).registered_domain
    except Exception:
        # Best effort, but no longer a bare `except:` - KeyboardInterrupt and
        # SystemExit are allowed to propagate.
        return None


# Load the single CSV file written to `output` by the previous cell.
csv_paths = glob(f"{output}/*.csv")
san_pdf = pd.read_csv(csv_paths[0])

# Annotate every SAN with its first-level domain (tld) and registered domain (tldextract).
san_pdf = san_pdf.assign(
    fld=san_pdf.san.apply(fld),
    sld=san_pdf.san.apply(get_sld),
)

# Keep only names for which `fld` found a first-level domain.
valid_names_pdf = san_pdf.loc[san_pdf["fld"].notnull()]

# The registered domains of the valid names are added as extra names.
additional_names_pdf = pd.DataFrame({"san": valid_names_pdf["sld"].unique()})

names_pdf = (
    pd.concat([valid_names_pdf[["san"]], additional_names_pdf], ignore_index=True)
    .drop_duplicates(subset=["san"])
    .loc[lambda frame: frame["san"] != ""]
    .sort_values("san", ascending=True)
)
names_pdf.to_csv("../dataset/processing/san.csv", index=False)