In [1]:
%run ./spark-instance-gustavo.ipynb

ModuleNotFoundError: No module named 'pyspark'

ModuleNotFoundError: No module named 'pyspark'

In [1]:
print("hi")

hi


In [6]:
clean_spark()

NameError: name 'clean_spark' is not defined

In [2]:
import os
from datetime import datetime

import pandas as pd

from pyspark.errors import *
from pyspark.sql.types import TimestampType, StringType
import pyspark.sql.functions as psf
from pyspark.storagelevel import StorageLevel

ModuleNotFoundError: No module named 'pyspark'

In [None]:
def rfc9325_recommendation(cipher, tls_version, public_key_size) -> str:
    # https://datatracker.ietf.org/doc/html/rfc9325#name-general-guidelines
    # about cipher suites:
    # https://utcc.utoronto.ca/~cks/space/blog/tech/SSLCipherNames

    # "SHALL" and "MUST"
    cipher_params = cipher.split("_")
    veredict = "Y"
    must_not = ["NULL", "RC4"]
    if any(item in must_not for item in cipher_params):
        veredict = "N"

    for param in cipher_params:
        try:
            key_size = int(param.rstrip("L"))
            if key_size < 128:  # should not; 112 is must not
                veredict = "N"
        except:
            pass

    if "_DES40_" in cipher:
        veredict = "N"

    # the "SHOULD NOT" also taken into account
    should_not = ["TLS_RSA_WITH_",  # e.g. TLS_RSA_WITH_AES_128_CBC_SHA
                  "TLS_DH_", 
                  "TLS_ECDH_"  # https://doi.org/10.1007/978-3-319-24174-6_21
                 ]
    if any(i in cipher for i in should_not):
        veredict = "N"

    if tls_version == "TLSv1.2": 
        recommended_v2 = ["TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256",
                          "TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384",
                          "TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256",
                          "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384"
                         ]
        if cipher in recommended_v2:
            veredict = "Y"

        if "TLS_DHE_" in cipher:
            veredict = "N"

    # _CBC_ should not be used unless encrypt-then-mac is successfully negotiated
    # encrypt_then_mac extension in client hello with extension_type=0x16 and extension_data=<empty>
    # https://datatracker.ietf.org/doc/html/rfc7366

    # use client hello to detect ECDSA that use NIST curve P-256 and X25519 rfc4492#section-5.1
    # secp256r1 and x25519

    # https://www.rfc-editor.org/rfc/rfc8446#section-9.1
    recommended_v3 = ["TLS_AES_128_GCM_SHA256",
                      "TLS_AES_256_GCM_SHA384",
                      "TLS_CHACHA20_POLY1305_SHA256"
                     ]
    if tls_version == "TLSv1.3":
        if cipher in recommended_v3:
            veredict = "Y"

    # TODO key_exchange_size has to be retrieved from the ConnectionState in the Goscanner
    #if "DHE" in cipher and (key_exchange_size is None or key_exchange_size < 2048):
    #    veredict = "N"
    #if "ECDH" in cipher and (key_exchange_size is None or key_exchange_size < 224):
    #    veredict = "N"

    if "TLS_RSA_WITH_" not in cipher and "RSA" in cipher:
        hash_algo = cipher.split("_")[-1]
        recommended_hash = ["SHA256", "SHA384", "SHA512"]
        not_recommended_hash = ["SHA1", "SHA", "MD5"]
        if hash_algo in recommended_hash:
            veredict = "Y"
        elif hash_algo in not_recommended_hash:
            veredict = "N"

        if public_key_size is None or public_key_size < 2048:
            veredict = "N"

    # if "truncated_hmac": veredict = "N"  # in the extended client hello
    return veredict


is_rfc9325_recommended_udf = psf.udf(rfc9325_recommendation, StringType())

### Extracting Censys data

In [3]:
DATASET = "universal-internet-dataset"
CENSYS_BASE_PATH_FMT = "censys/dataset={dataset}/format=parquet"
CENSYS_PATH_FMT = os.path.join(CENSYS_BASE_PATH_FMT, "year={year}/month={month:02d}/day={day:02d}")

# Censys snapshot of 2022-Nov
timestamps = [
    datetime(2022, 11, 1),
    datetime(2022, 11, 8),
    datetime(2022, 11, 15),
    datetime(2022, 11, 22),
    datetime(2022, 11, 29),
]


def load_censys_data(ts):
    censys_base_path = CENSYS_PATH_FMT.format(dataset=DATASET, year=ts.year, month=ts.month, day=ts.day)
    try:
        censys_df = spark.read.option("basePath", f"../dataset/{censys_base_path}").parquet(f"../dataset/{censys_base_path}")
    except AnalysisException as e:
        print(e)
    return censys_df


def filter_df_by_label(df, ts, label: str):
    llabel = label.lower()
    dns_filtered_df = df.select('*').filter(
        (psf.expr(f"exists(dns_names, x -> lower(x) like '%.{llabel}.%') or exists(dns_names, x -> lower(x) like '{llabel}.%')")) | 
        (psf.expr(f"exists(r_dns_names, x -> lower(x) like '%.{llabel}.%') or exists(r_dns_names, x -> lower(x) like '{llabel}.%')"))
    )
    dns_filtered_df = dns_filtered_df.withColumn("filter_source", psf.lit("dns_rdns"))

    ser_filtered_df = df.select('*').where((psf.array_contains(psf.col("service_names_list"), label.upper())))
    ser_filtered_df = ser_filtered_df.withColumn("filter_source", psf.lit("service_name"))

    filtered_df = ser_filtered_df.unionByName(dns_filtered_df)
    uniq_filtered_df = filtered_df.dropDuplicates(["ipv4"])
    uniq_filtered_df = uniq_filtered_df.withColumn("date", psf.lit(ts).cast(TimestampType()))

    return uniq_filtered_df

In [4]:
from ip_as_org import IPASnPrefix, ASOrg

def ip_to_country(ip: str) -> str:
    asn = ip_asn.get_asn_from_ip(ip)
    return as_org.get_country_from_asn(asn)

def ip_to_org(ip: str) -> str:
    asn = ip_asn.get_asn_from_ip(ip)
    return as_org.get_org_name_from_asn(asn)

dataset_dir = "../dataset/"

ModuleNotFoundError: No module named 'pyasn'

In [5]:
ip_to_country_udf = psf.udf(ip_to_country, StringType())
ip_to_org_udf = psf.udf(ip_to_org, StringType())

CENSYS_TS_DICT = {}
for ts in timestamps:
    print(ts)
    censys_df = load_censys_data(ts)

    select_df = censys_df.select("host_identifier.ipv4",
                                 censys_df.dns.names.alias("dns_names"),
                                 censys_df.dns.reverse_dns.names.alias("r_dns_names"),
                                 "service_names_list",
                                 "services.port",
                                 censys_df.services.tls.version_selected.alias("tls_version"),
                                 censys_df.services.tls.cipher_selected.alias("tls_cipher"),
                                 "services.tls.certificates.leaf_data.pubkey_bit_size",
                                 censys_df.services.tls.certificates.leaf_data.signature.self_signed.alias("tls_signature_self_signed"),
                                 censys_df.services.tls.certificates.leaf_data.signature.signature_algorithm.alias("tls_signature_algorithm"),
                                 censys_df.services.tls.certificates.leaf_data.subject.common_name.alias("cert_cn"),
                                 censys_df.services.tls.certificates.leaf_data.issuer.common_name.alias("issuer_cn")
                                )

    uniq_filtered_df = filter_df_by_label(select_df, "ldap")
    when = ts.strftime("%Y%m%d")
    ip_asn = IPASnPrefix(when, dataset_dir)
    as_org = ASOrg(when, dataset_dir)
    uniq_filtered_df = uniq_filtered_df.withColumn("country", ip_to_country_udf(psf.col("ipv4")))
    uniq_filtered_df = uniq_filtered_df.withColumn("org", ip_to_org_udf(psf.col("ipv4")))

    CENSYS_TS_DICT[ts] = uniq_filtered_df
print("-------")

NameError: name 'psf' is not defined

In [37]:
# Create data
data = [
    (["apple", "banana", "cherry"], [1, 2, 3]),
    (["mango", "guava", "pineapple"], [4, 5, 6])
]

# Create columns list
columns = ["Fruits", "Numbers"]

# Create dataframe
df = spark.createDataFrame(data, columns)

# Display dataframe
df.show()
exploded_df = df.select(psf.explode_outer(psf.arrays_zip(df.Fruits, df.Numbers)).alias("exploded"))
exploded_df = exploded_df.select(psf.col("exploded.Fruits").alias("Fruit"), psf.col("exploded.Numbers").alias("Number"))

exploded_df = exploded_df.filter(
   (psf.col("Fruit").like("%apple%")) | (psf.col("Number").like("%3%"))
)
exploded_df.show()

+--------------------+---------+
|              Fruits|  Numbers|
+--------------------+---------+
|[apple, banana, c...|[1, 2, 3]|
|[mango, guava, pi...|[4, 5, 6]|
+--------------------+---------+

+---------+------+
|    Fruit|Number|
+---------+------+
|    apple|     1|
|   cherry|     3|
|pineapple|     6|
+---------+------+



In [None]:
for df in CENSYS_TS_DICT.values():
    