In [3]:
# Executes the companion notebook in this kernel. Per its captured output it creates a
# SparkConf and starts a SparkSession; the `spark` and `clean_spark` names used further
# down are not defined in this notebook and presumably come from it (verify there).
%run ./spark-instance.ipynb

SparkConf created
Started SparkSession
Spark version 3.3.1


In [17]:
# Teardown cell: prints "CLEANING SPARK INSTANCE...". `clean_spark` is not defined in this
# notebook (presumably provided by the %run of spark-instance.ipynb above).
# NOTE(review): the execution count (In [17]) shows this ran after the cells below; when
# running top to bottom, execute it last. "EOB" presumably means end of business - confirm.
clean_spark()  # run by EOB

CLEANING SPARK INSTANCE...


In [4]:
%matplotlib inline

from collections import defaultdict
from datetime import datetime
import math
import os

import matplotlib.pyplot as plt
import pandas as pd

from cryptography import x509
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes


from pyspark.sql.types import TimestampType, StringType, ArrayType, IntegerType, BooleanType, StructField, StructType
import pyspark.sql.functions as psf
from pyspark.storagelevel import StorageLevel

### Extracting Goscanner data

In [5]:
def eval_list_list_str(my_list):
    """Parse the string form of a list of string lists, e.g. "[['a', 'b']]".

    Args:
        my_list: Value to parse. Only ``str`` inputs are evaluated.

    Returns:
        The evaluated object for string input, or ``[[]]`` when the input is
        not a string or cannot be evaluated.
    """
    if not isinstance(my_list, str):
        return [[]]
    try:
        # SECURITY(review): eval() executes arbitrary Python found in the data.
        # If the column only ever holds literals, ast.literal_eval is the safer choice.
        return eval(my_list)
    except Exception:
        # Exception rather than a bare except, so SystemExit/KeyboardInterrupt
        # are not silently converted into an empty result.
        return [[]]


# Spark UDF around eval_list_list_str, declared to return array<array<string>>.
eval_list_list_str_udf = psf.udf(
    f=eval_list_list_str,
    returnType=ArrayType(ArrayType(StringType())),
)


def eval_list(my_list):
    """Parse the string form of a list, e.g. "[1, 2, 3]".

    Args:
        my_list: Value to parse. Only ``str`` inputs are evaluated.

    Returns:
        The evaluated object for string input, or ``[]`` when the input is not
        a string or cannot be evaluated.
    """
    if not isinstance(my_list, str):
        return []
    try:
        # SECURITY(review): eval() executes arbitrary Python found in the data.
        # If the column only ever holds literals, ast.literal_eval is the safer choice.
        return eval(my_list)
    except Exception:
        # Exception rather than a bare except, so SystemExit/KeyboardInterrupt
        # are not silently converted into an empty result.
        return []


# Spark UDF around eval_list, declared to return array<int>.
eval_udf = psf.udf(
    f=eval_list,
    returnType=ArrayType(IntegerType()),
)


def load_hosts_data(port, ts):
    """Read the hosts CSV export for one port and scan date.

    Args:
        port: Port number substituted into HOSTS_PATH_FMT.
        ts: Scan date; its year, month and day select the path partition.

    Returns:
        The DataFrame read from the matching ``s3a://`` location, with a
        header row and an inferred schema.
    """
    location = HOSTS_PATH_FMT.format(port=port, year=ts.year, month=ts.month, day=ts.day)
    reader = (
        spark.read
        .option("header", "true")
        .option("lineSep", "\n")
        .option("quote", "\"")
        .option("escape", "\"")
        .option("inferSchema", "true")
    )
    return reader.csv(f"s3a://{location}")


def load_certs_data(port, ts):
    """Read the certificates CSV export for one port and scan date.

    Args:
        port: Port number substituted into CERTS_PATH_FMT.
        ts: Scan date; its year, month and day select the path partition.

    Returns:
        The DataFrame read from the matching ``s3a://`` location. Records may
        span several lines (multiline mode); the schema is inferred.
    """
    location = CERTS_PATH_FMT.format(port=port, year=ts.year, month=ts.month, day=ts.day)
    reader = (
        spark.read
        .option("header", "true")
        .option("multiline", "true")
        .option("wholeFile", "true")
        .option("inferSchema", "true")
    )
    return reader.csv(f"s3a://{location}")


def load_tls_data(port, ts):
    """Read the verbose TLS CSV export for one port and scan date.

    Args:
        port: Port number substituted into TLS_PATH_FMT.
        ts: Scan date; its year, month and day select the path partition.

    Returns:
        The DataFrame read from the matching ``s3a://`` location. Records may
        span several lines (multiline mode); the schema is inferred.
    """
    location = TLS_PATH_FMT.format(port=port, year=ts.year, month=ts.month, day=ts.day)
    reader = (
        spark.read
        .option("header", "true")
        .option("multiline", "true")
        .option("wholeFile", "true")
        .option("inferSchema", "true")
    )
    return reader.csv(f"s3a://{location}")


def load_ldap_data(port, ts):
    """Read the LDAP scan CSV export for one port and scan date.

    Args:
        port: Port number substituted into LDAP_PATH_FMT.
        ts: Scan date; its year, month and day select the path partition.

    Returns:
        The DataFrame read from the matching ``s3a://`` location, with a
        header row and an inferred schema.
    """
    location = LDAP_PATH_FMT.format(port=port, year=ts.year, month=ts.month, day=ts.day)
    reader = (
        spark.read
        .option("header", "true")
        .option("lineSep", "\n")
        .option("quote", "\"")
        .option("escape", "\"")
        .option("inferSchema", "true")
    )
    return reader.csv(f"s3a://{location}")


def load_ldapstarttls_data(port, ts):
    """Read the LDAP STARTTLS scan CSV export for one port and scan date.

    Args:
        port: Port number substituted into STARTTLS_PATH_FMT.
        ts: Scan date; its year, month and day select the path partition.

    Returns:
        The DataFrame read from the matching ``s3a://`` location, with a
        header row and an inferred schema.
    """
    location = STARTTLS_PATH_FMT.format(port=port, year=ts.year, month=ts.month, day=ts.day)
    reader = (
        spark.read
        .option("header", "true")
        .option("lineSep", "\n")
        .option("quote", "\"")
        .option("escape", "\"")
        .option("inferSchema", "true")
    )
    return reader.csv(f"s3a://{location}")


def load_cert_validator(port, ts):
    """Read the cert-validator parquet output for one port and scan date.

    Args:
        port: Port number substituted into CERTVAL_PATH_FMT.
        ts: Scan date; its year, month and day select the path partition.

    Returns:
        The DataFrame read from the matching ``s3a://`` parquet location.
    """
    location = CERTVAL_PATH_FMT.format(port=port, year=ts.year, month=ts.month, day=ts.day)
    return spark.read.parquet(f"s3a://{location}")


def convert_output_df(df):
    """Flatten cert-validator output to one row per (id, root store).

    ``root_stores`` is exploded into ``store_name`` / ``result`` (two aliases,
    so presumably a map column - confirm against the parquet schema), the
    ``result`` struct is expanded into top-level columns, and the
    ``valid_chains`` string is parsed into an array of string arrays.

    Args:
        df: DataFrame with ``id``, ``generic_error`` and ``root_stores``.

    Returns:
        The flattened DataFrame; the parsed ``valid_chains`` is its last column.
    """
    per_store = df.select(
        "id",
        "generic_error",
        psf.explode("root_stores").alias("store_name", "result"),
    )
    flattened = per_store.select("id", "generic_error", "store_name", "result.*")
    # Add the parsed column under a temporary name, drop the raw string, then rename.
    return (
        flattened
        .withColumn("vc", eval_list_list_str_udf(psf.col("valid_chains")))
        .drop("valid_chains")
        .withColumnRenamed("vc", "valid_chains")
    )


# Spark UDF mapping a list of chains to the list of their lengths (array<int>).
len_udf = psf.udf(
    lambda chain: list(map(len, chain)),
    ArrayType(IntegerType()),
)


def chain_len(df):
    """Add a ``chain_len`` column and print how often each value occurs.

    Args:
        df: DataFrame with a ``valid_chains`` column.

    Returns:
        ``df`` with ``chain_len`` (the length of every chain in
        ``valid_chains``) added. The per-value counts are only shown, not returned.
    """
    with_lengths = df.withColumn("chain_len", len_udf(psf.col("valid_chains")))
    length_counts = with_lengths.select("chain_len").groupBy(psf.col("chain_len")).count()
    length_counts.show()
    return with_lengths


# Short display labels for certificate validation outcomes.
SHORT1 = "Unknown authority"
SHORT2 = "Expired/Not yet valid"
SHORT3 = "Not authorized"
SHORT4 = "Too many intermediates"
SHORT5 = "Bad extension"
SHORT6 = "Other errors"
SHORT7 = "Self-signed"


# Parsed x509 error message (as produced by parse_error) -> short display label.
short_error_name_map = {
    "x509: certificate signed by unknown authority": SHORT1,
    "x509: certificate signed by unknown authority - With possible explanation": SHORT1,
    "x509: certificate has expired or is not yet valid": SHORT2,
    "x509: certificate is not authorized to sign other certificates": SHORT3,
    "x509: too many intermediates for path length constraint": SHORT4,
    "x509: unhandled critical extension": SHORT5,
}


def peer_certs_len(peer_certs_str):
    """Return the number of entries in a stringified peer-certificate list.

    Args:
        peer_certs_str: String form of a list; anything eval_list cannot parse
            counts as an empty list.

    Returns:
        Length of the parsed list.
    """
    return len(eval_list(peer_certs_str))


# Spark UDF around peer_certs_len, declared to return int.
peer_certs_len_udf = psf.udf(
    f=peer_certs_len,
    returnType=IntegerType(),
)


def error_str(error_list, peer_certs_len_list):
    """Reduce a list of parsed validation errors to one short label.

    Args:
        error_list: Parsed error strings (``""`` means no error).
        peer_certs_len_list: Peer-certificate counts, indexed in parallel with
            ``error_list``.

    Returns:
        ``SHORT7`` if any "unknown authority" error belongs to an entry with
        exactly one peer certificate; otherwise the label mapped to the first
        non-empty error, ``SHORT6`` when that error has no mapping, or ``None``
        when there is no non-empty error at all.
    """
    # An "unknown authority" failure with exactly one peer certificate is
    # reported as self-signed.
    for i, error in enumerate(error_list):
        if "certificate signed by unknown authority" in error:
            if peer_certs_len_list[i] == 1:
                return SHORT7

    first_error = next((error for error in error_list if error != ""), None)
    if first_error is None:
        # Empty list or only empty strings: previously an UnboundLocalError / KeyError.
        return None
    # Unmapped messages fall back to the generic label instead of raising KeyError.
    return short_error_name_map.get(first_error, SHORT6)


# Spark UDF around error_str, declared to return string.
error_str_udf = psf.udf(
    f=error_str,
    returnType=StringType(),
)


def parse_error(error_data):
    """Normalize a raw validation error message to its stable prefix.

    Keeps only the first two colon-separated segments (e.g. ``"x509: <reason>"``)
    so that variable detail after the second colon is dropped. Messages that
    contain "possibly because" are cut at the first "(" and tagged with
    "- With possible explanation".

    Args:
        error_data: Raw error message; ``None`` or ``""`` means no error.

    Returns:
        The normalized message, or ``""`` when there is no error.
    """
    if not error_data:
        # Covers both "" and None (a NULL column value reaches a UDF as None).
        return ""
    parsed_error_data = ':'.join(error_data.split(":")[:2])
    if "possibly because" in error_data:
        parsed_error_data = parsed_error_data.split("(")[0] + "- With possible explanation"
    return parsed_error_data


# Spark UDF around parse_error, declared to return string.
parse_error_udf = psf.udf(
    f=parse_error,
    returnType=StringType(),
)


# Scan dates per port. Port 636 scans occur one day earlier than the port 389 scans.
PORT_SCANDATE_MAP = {
    636: [datetime(2023, 11, day) for day in (7, 14, 21, 28)],
    389: [datetime(2023, 11, day) for day in (1, 8, 15, 22, 29)],
}

# Path templates (relative to s3a://); filled with str.format(port=, year=, month=, day=).
_GOSCANNER_RAW_PREFIX = "catrin/measurements/tool=goscanner/format=raw/port={port}"
_DATE_PARTITION = "year={year}/month={month:02d}/day={day:02d}"

HOSTS_PATH_FMT = "/".join((_GOSCANNER_RAW_PREFIX, "scan=tcp", "result=hosts", _DATE_PARTITION))
CERTS_PATH_FMT = "/".join((_GOSCANNER_RAW_PREFIX, "scan=tls", "result=certs", _DATE_PARTITION))
TLS_PATH_FMT = "/".join((_GOSCANNER_RAW_PREFIX, "scan=tls", "result=tls_verbose", _DATE_PARTITION))
LDAP_PATH_FMT = "/".join((_GOSCANNER_RAW_PREFIX, "scan=ldap", "result=ldap", _DATE_PARTITION))
STARTTLS_PATH_FMT = "/".join((_GOSCANNER_RAW_PREFIX, "scan=starttls_ldap", "result=starttls_ldap", _DATE_PARTITION))
CERTVAL_PATH_FMT = "/".join(("catrin/data_processing/tool=cert-validator/format=parquet/port={port}", _DATE_PARTITION))


# All scan dates of every port, flattened in PORT_SCANDATE_MAP order.
goscanner_timestamps = [
    scan_date
    for port_dates in PORT_SCANDATE_MAP.values()
    for scan_date in port_dates
]


# Numeric protocol version code -> display name.
tls_version_str_dict = {
    0x0301: "TLSv1.0",
    0x0302: "TLSv1.1",
    0x0303: "TLSv1.2",
    0x0304: "TLSv1.3",
    0x0300: "SSLv3",
}


def tls_version_to_string(version_number: int):
    """Translate a numeric protocol version into its display name.

    Args:
        version_number: Protocol version code, e.g. ``0x0303``.

    Returns:
        The name from ``tls_version_str_dict``, or ``str(version_number)`` for
        codes that are not in the table.
    """
    if version_number in tls_version_str_dict:
        return tls_version_str_dict[version_number]
    return str(version_number)


# Spark UDF around tls_version_to_string, declared to return string.
tls_version_udf = psf.udf(
    f=tls_version_to_string,
    returnType=StringType(),
)


def convert_cipher(x):
    """Convert a two-byte value such as "0xC0,0x2F" to a bare hex string ("c02f").

    Args:
        x: Comma-separated hexadecimal byte values.

    Returns:
        Lower-case hex of ``(first << 8) | second`` without the "0x" prefix
        (leading zeros are not kept), or ``None`` when a part is not a plain
        hex number, e.g. reserved ranges like "0x00,0x1C-1D".
    """
    octets = []
    for token in x.split(","):
        try:
            octets.append(int(token, 16))
        except ValueError:
            # to cover reserved values like 0x00,0x1C-1D
            return None
    high, low = octets[0], octets[1]
    return hex((high << 8) | low)[2:]


# Cipher suite registry exported from
# https://www.iana.org/assignments/tls-parameters/tls-parameters.xhtml#tls-parameters-4
tls_parameters_pdf = pd.read_csv("tls-parameters-4.csv")
tls_parameters_pdf["Cipher"] = tls_parameters_pdf["Value"].apply(convert_cipher)
# Force the free-form columns to plain strings.
for _text_column in ("DTLS-OK", "Recommended", "Reference"):
    tls_parameters_pdf[_text_column] = tls_parameters_pdf[_text_column].apply(str)
# Lookup table: bare hex cipher code -> cipher suite description.
_cipher_descriptions = tls_parameters_pdf[["Cipher", "Description"]].set_index("Cipher")
tls_parameter_dict = _cipher_descriptions.to_dict()["Description"]


def cipher_to_description(cipher):
    """Look up the description of a cipher code.

    Args:
        cipher: Key into ``tls_parameter_dict``.

    Returns:
        The matching description, or ``"Unknown"`` when the key is absent.
    """
    try:
        return tls_parameter_dict[cipher]
    except KeyError:
        return "Unknown"


# Spark UDF around cipher_to_description, declared to return string.
cipher_to_description_udf = psf.udf(
    f=cipher_to_description,
    returnType=StringType(),
)

In [6]:
# Decoding X.509 certificates
def get_x509(pem: str):
    """Parse a PEM-encoded certificate.

    Args:
        pem: Certificate in PEM text form.

    Returns:
        The object returned by ``x509.load_pem_x509_certificate``.
    """
    pem_bytes = str.encode(pem)
    return x509.load_pem_x509_certificate(pem_bytes, default_backend())


def get_extensions(cert):
    """Collect TLS-feature values and DNS subject alternative names from a certificate.

    Best-effort: extensions that cannot be read are skipped instead of raising.
    A failure in one extension no longer discards the extensions after it.

    Args:
        cert: Parsed certificate exposing ``extensions``.

    Returns:
        Tuple ``(tls_key_exchange, san_list)``: the values collected from
        TLSFeature extensions, and the DNS names found in
        SubjectAlternativeName extensions.
    """
    tls_key_exchange = []
    san_list = []
    try:
        extensions = list(cert.extensions)
    except Exception:
        # The extension block itself could not be read; report nothing.
        return tls_key_exchange, san_list

    for extension in extensions:
        try:
            value = extension.value
            if isinstance(value, x509.TLSFeature):
                # NOTE(review): confirm that TLSFeature exposes `key_exchange`; if it
                # does not, the AttributeError is contained to this one extension.
                tls_key_exchange.append(value.key_exchange)
            elif isinstance(value, x509.SubjectAlternativeName):
                san_list.extend(value.get_values_for_type(x509.DNSName))
        except Exception:
            # Skip only the offending extension and keep processing the rest.
            continue

    return tls_key_exchange, san_list


def get_x509_fields(pem: str):
    """Decode a PEM certificate into the fields of ``pem_decoded_schema``.

    Args:
        pem: Certificate in PEM text form. ``None`` or any non-string value is
            treated like an undecodable certificate.

    Returns:
        ``(signature algorithm name, public key size, DNS SAN list,
        upper-case SHA-256 fingerprint)``, or a list of four ``None`` values
        when the certificate cannot be decoded. The key size is ``None`` for
        key types that have no ``key_size`` attribute.
    """
    empty_row = 4 * [None]  # CHANGE HERE IN CASE ADDITIONAL RETURN PARAMETER

    if not isinstance(pem, str):
        # A NULL / non-text cell would make str.encode raise an uncaught TypeError.
        return empty_row

    try:
        cert = get_x509(pem)
    except ValueError:
        # the certificate contains bytes that cannot be interpreted. Probably invalid cert
        # https://github.com/pyca/cryptography/issues/6804
        print(pem)
        return empty_row

    # Only the SAN list is used; the TLS-feature list is discarded.
    _, san_list = get_extensions(cert)

    try:
        public_key_size = cert.public_key().key_size
    except AttributeError:
        # 'cryptography.hazmat.bindings._rust.openssl.ed25519' object has no attribute 'key_size'
        public_key_size = None

    cert_fp = cert.fingerprint(hashes.SHA256()).hex().upper()

    return (cert.signature_algorithm_oid._name,
            public_key_size,
            san_list,
            cert_fp,
           )


# Struct returned by get_x509_fields; field order must match its return tuple.
_pem_decoded_fields = [
    StructField("tls_signature_algorithm", StringType(), nullable=True),
    StructField("pubkey_bit_size", IntegerType(), nullable=True),
    StructField("leaf_data_names", ArrayType(StringType()), nullable=True),  # SAN
    StructField("fingerprint", StringType(), nullable=True),
]
pem_decoded_schema = StructType(_pem_decoded_fields)


# Spark UDF around get_x509_fields, returning the struct above.
decode_cert_udf = psf.udf(
    f=get_x509_fields,
    returnType=pem_decoded_schema,
)

In [8]:
# Build one DataFrame per (port, scan date): hosts that answered as LDAP servers, joined
# with their decoded certificate fields and the aggregated cert-validator results.
dfs = []
for port, ts_list in PORT_SCANDATE_MAP.items():
    print(port)
    for ts in ts_list:
        print(ts)

        # Hosts: rename the join keys, keep the connection columns, and replace the raw
        # peer-certificate list by its length.
        hosts_df = load_hosts_data(port, ts)
        hosts_df = hosts_df.withColumnRenamed("id", "host_id")
        hosts_df = hosts_df.withColumnRenamed("ip", "ipv4")
        hosts_df = hosts_df.withColumnRenamed("cert_id", "hosts_cert_id")
        hosts_df = hosts_df.select("host_id", "port", "ipv4", "protocol", "cipher", "hosts_cert_id", "peer_certificates")
        hosts_df = hosts_df.withColumn("peer_certs_len", peer_certs_len_udf(psf.col("peer_certificates"))).drop("peer_certificates")

        # Cert-validator output: one row per (id, root store), aggregated back to one row
        # per id. The raw parquet is loaded once; the earlier second load was never used.
        certval_df = convert_output_df(load_cert_validator(port, ts))
        certval_df = certval_df.withColumn("parsed_root_store_error", parse_error_udf(psf.col("root_store_error")))
        # NOTE(review): store names use collect_set while the other two columns use
        # collect_list, so the three lists are not guaranteed to line up index by index.
        g_certval_df = (
            certval_df.groupby("id")
            .agg(
                psf.collect_set("store_name").alias("store_name_list"),
                psf.collect_list("is_valid").alias("valid_list"),
                psf.collect_list("parsed_root_store_error").alias("root_store_error_list"),
            )
            .withColumnRenamed("id", "certval_id")
        )

        # Certificates: decode the PEM and keep only the decoded fields.
        certs_df = load_certs_data(port, ts)
        certs_df = certs_df.withColumnRenamed("id", "cert_id")
        certs_df = certs_df.withColumn("decoded_cert", decode_cert_udf(psf.col("cert"))).drop("cert", "system_cert_store").select("cert_id", "decoded_cert.*")

        # LDAP results come from a different scan depending on the port.
        if port == 636:
            ldap_df = load_ldap_data(port, ts)
        else:  # port 389
            ldap_df = load_ldapstarttls_data(port, ts)

        ldap_df = ldap_df.withColumnRenamed("id", "ldap_id").select("ldap_id", "ldap_server")

        ldap_id_df = ldap_df.filter(psf.col("ldap_server") == 1).select("ldap_id")

        # Inner join on equality: unmatched and NULL ids are already excluded, so no
        # additional isNotNull filter is needed.
        ldap_hosts_df = hosts_df.join(ldap_id_df, ldap_id_df.ldap_id == hosts_df.host_id, "inner").drop("ldap_id")

        ldap_hosts_cert_df = ldap_hosts_df.join(certs_df, ldap_hosts_df.hosts_cert_id == certs_df.cert_id, "inner").drop("hosts_cert_id", "cert_id")

        # NOTE(review): validator rows are matched on host_id, not on the certificate id -
        # confirm that the validator's `id` is the host id.
        ldap_hosts_cert_val_df = ldap_hosts_cert_df.join(g_certval_df, ldap_hosts_cert_df.host_id == g_certval_df.certval_id, "inner").drop("certval_id")

        # Replace the numeric protocol / cipher codes by readable names.
        ldap_hosts_cert_val_df = ldap_hosts_cert_val_df.withColumn("tls_version", tls_version_udf(psf.col("protocol"))).drop("protocol")
        ldap_hosts_cert_val_df = ldap_hosts_cert_val_df.withColumn("tls_cipher", cipher_to_description_udf(psf.col("cipher"))).drop("cipher")

        # Tag every row with its scan date (YYYYMMDD).
        when = ts.strftime("%Y%m%d")
        ldap_hosts_cert_val_df = ldap_hosts_cert_val_df.withColumn("date", psf.lit(when))

        dfs.append(ldap_hosts_cert_val_df)
    print("------------------")

636
2023-11-07 00:00:00
2023-11-14 00:00:00
2023-11-21 00:00:00
2023-11-28 00:00:00
------------------
389
2023-11-01 00:00:00
2023-11-08 00:00:00
2023-11-15 00:00:00
2023-11-22 00:00:00
2023-11-29 00:00:00
------------------


In [9]:
# Stack the per-scan DataFrames, matching columns by name. Starting from dfs[0] also
# works when only a single DataFrame was collected.
goscanner_df = dfs[0]
for scan_df in dfs[1:]:
    goscanner_df = goscanner_df.unionByName(scan_df)

output = "luvizottocesarg-tmp/ldap-dependency-goscanner"
# coalesce(1) reduces the result to one partition before writing.
# NOTE(review): no save mode is set, so re-running this cell presumably fails once the
# target path exists - confirm whether that guard is intended.
goscanner_df.coalesce(1).write.parquet(f"s3a://{output}")