In [None]:
%matplotlib inline

from collections import defaultdict
from datetime import datetime
from fnmatch import fnmatch
import math
from glob import glob
import os
import statistics

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tldextract
from cryptography import x509
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes

from pyspark.errors import *
import pyspark.sql.types as pst
import pyspark.sql.functions as psf
from pyspark.storagelevel import StorageLevel
from pyspark.sql.window import Window

In [None]:
%run ./spark-instance-gustavo.ipynb

In [None]:
clean_spark()

In [None]:
def eval_list_list_list_str(my_list):
    try:
        if isinstance(my_list, str):
            return eval(my_list)
        else:
            return [[[]]]
    except:
        return [[[]]]


eval_list_list_list_str_udf = psf.udf(eval_list_list_list_str, pst.ArrayType(pst.ArrayType(pst.ArrayType(pst.StringType()))))


def eval_list_list_str(my_list):
    try:
        if isinstance(my_list, str):
            return eval(my_list)
        else:
            return [[]]
    except:
        return [[]]


eval_list_list_str_udf = psf.udf(eval_list_list_str, pst.ArrayType(pst.ArrayType(pst.StringType())))


def convert_ldap_attributes(df):
    df = df.withColumn("asd", eval_list_list_str_udf(psf.col("attribute_names")))
    df = df.drop("attribute_names")
    df = df.withColumnRenamed("asd", "attribute_names")
    df = df.withColumn("qwe", eval_list_list_list_str_udf(psf.col("attribute_values_list")))
    df = df.drop("attribute_values_list")
    df = df.withColumnRenamed("qwe", "attribute_values_list")
    return df


def flatten_ldap_metadata(df):
    df = df.select("id", "ip", "port", "result_code", "error_data", "matched_dns",
                                    psf.explode_outer(psf.arrays_zip(
                                                      df.attribute_names,
                                                      df.attribute_values_list)))
    df = df.select("id", "ip", "port", "result_code", "error_data", "matched_dns", "col.attribute_names", "col.attribute_values_list")
    df = df.select("id", "ip", "port", "result_code", "error_data", "matched_dns",
                                    psf.explode_outer(psf.arrays_zip(
                                                      df.attribute_names,
                                                      df.attribute_values_list)))
    df = df.select("id", "ip", "port", "result_code", "error_data", "matched_dns", "col.attribute_names", "col.attribute_values_list")
    df = df.withColumnRenamed("attribute_names", "attribute_name")
    return df


def flat_ldap_attr_values(df):
    # good for vendorName filters for example
    df = df.select("id", "ip", "port", "result_code", "error_data", "matched_dns", "attribute_name", psf.explode_outer(df.attribute_values_list))
    df = df.withColumnRenamed("col", "attribute_value")
    return df


def load_ldap_root_dse(port, ts):
    root_dse_base_path = ROOT_DSE_PATH_FMT.format(port=port, year=ts.year, month=ts.month, day=ts.day)
    df = spark.read.option("header", "true") \
                   .option("multiline", "true") \
                   .option("wholeFile", "true") \
                   .option("inferSchema", "true") \
                   .csv(f"../dataset/{root_dse_base_path}")

    df = convert_ldap_attributes(df)
    return df


def load_ldap_schema(port, ts):
    schema_base_path = SCHEMA_PATH_FMT.format(port=port, year=ts.year, month=ts.month, day=ts.day)
    df = spark.read.option("header", "true") \
                   .option("multiline", "true") \
                   .option("wholeFile", "true") \
                   .option("inferSchema", "true") \
                   .csv(f"../dataset/{schema_base_path}")

    df = convert_ldap_attributes(df)
    return df

CRAWL_PORT_SCANDATE_MAP = {
    636: datetime(2024, 4, 23),
    389: datetime(2024, 4, 24),
}

SCHEMA_PATH_FMT = "catrin/measurements/tool=goscanner/format=raw/port={port}/scan=ldap_crawl/result=ldap_schema/year={year}/month={month:02d}/day={day:02d}"
ROOT_DSE_PATH_FMT = "catrin/measurements/tool=goscanner/format=raw/port={port}/scan=ldap_crawl/result=ldap_root_dse/year={year}/month={month:02d}/day={day:02d}"

In [None]:
for port, ts in CRAWL_PORT_SCANDATE_MAP.items():
    print(port, ts)
    #schema_df = load_ldap_schema(port, ts)
    #schema_df = flatten_ldap_metadata(schema_df)

    root_dse_df = load_ldap_root_dse(port, ts)
    #root_dse_df = flatten_ldap_metadata(root_dse_df)

    break

What are the LDAPv2 only servers we see around?

In [None]:
ldapv2_df = root_dse_df.filter(
    psf.col("attribute_name") == "supportedLDAPVersion"
    ).filter(
    psf.col("attribute_values_list").isin(psf.lit([2]))
    ).select(
        "id"
    )

root_dse_df.join(ldapv2_df, "id", "inner").filter(
    (psf.col("attribute_name") == "supportedControl")
    | (psf.col("attribute_name") == "namingContexts")
    | (psf.col("attribute_name") == "vendorName")
).show()

print(ldapv2_df.count())

+-------+---------------+----+-----------+----------+-----------+----------------+---------------------+----------+
|     id|             ip|port|result_code|error_data|matched_dns|  attribute_name|attribute_values_list|serverName|
+-------+---------------+----+-----------+----------+-----------+----------------+---------------------+----------+
|  83836| 93.108.171.180| 389|          0|      NULL|      ['',]|  namingContexts|       [dc=pbx.local]|          |
|1126513|    148.69.66.4| 389|          0|      NULL|      ['',]|  namingContexts|       [dc=pbx.local]|          |
|1204577| 24.134.197.249| 389|          0|      NULL|      ['',]|  namingContexts|       [dc=pbx.local]|          |
|1420243| 51.155.241.131| 389|          0|      NULL|      ['',]|supportedControl| [1.3.6.1.4.1.4203...|          |
|2246843|  178.13.12.114| 389|          0|      NULL|      ['',]|  namingContexts|       [dc=pbx.local]|          |
|2381239|  89.155.23.161| 389|          0|      NULL|      ['',]|  namin

A private branch exchange (PBX) is a telephone system within an enterprise that switches calls between users on local lines, while enabling all users to share a certain number of external phone lines. - https://www.techtarget.com/searchunifiedcommunications/definition/private-branch-exchange

IP addresses with o=Interpol  
https://www.entrust.com/blog/2015/04/interpol-world-entrust-datacards-joynes-talks-epassports-and-border-security/  

In [580]:
# TODO extract from namingContexts O, OU and CN; e.g. o=Interpol (216.117.52.155)

for port, ts in CRAWL_PORT_SCANDATE_MAP.items():
    print(port, ts)

    root_dse_df = load_ldap_root_dse(port, ts)

    flat_root_dse_df = flatten_ldap_metadata(root_dse_df)
    flat_root_dse_df.filter(
        psf.array_contains(psf.col("attribute_values_list"), "o=Interpol")
    ).select("ip", "attribute_name", "attribute_values_list").show(truncate=False)

636 2024-04-23 00:00:00
+--------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ip            |attribute_name|attribute_values_list                                                                                                                                                          |
+--------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|148.66.197.88 |namingContexts|[o=Interpol, o=entrust, dc=local, o=World, dc=com, dc=org, dc=ssh, c=AE, c=AU, c=BB, c=BE, c=CA, c=CL, c=CO, c=ES, c=GB, c=JP, c=NZ, c=OM, c=SV, c=UN, c=US, c=XP, c=ZZ]       |
|216.117.52.153|namingContexts|[o=Interpol, o=entrust, dc=local, o=World, dc=com, dc=org, dc=ssh, dc=SE, c=AE, c=AU, c=BB, c=BE, c=CA, c=CL, c=C

User attributes that RootDSE replied

In [264]:
for port, ts in CRAWL_PORT_SCANDATE_MAP.items():
    print(port, ts)

    root_dse_df = load_ldap_root_dse(port, ts)

    flat_root_dse_df = flatten_ldap_metadata(root_dse_df)
    _df = flat_root_dse_df.filter(
        (psf.col("attribute_name") == "telephoneNumber")
        | (psf.col("attribute_name") == "mobile")
        | (psf.col("attribute_name") == "mail")
        | (psf.col("attribute_name") == "customerNumber")
    ).select("ip", "attribute_name", "attribute_values_list")
    print(
        _df.groupBy("ip").agg(
            psf.collect_set("attribute_name").alias("attribute_names"),
            psf.collect_set("attribute_values_list").alias("attribute_values")
        ).count()
    )
    _df.groupBy("attribute_values_list").count().sort("count", ascending=False).show(truncate=False)
    _df.filter(psf.col("attribute_name") == "customerNumber").show(truncate=False)


636 2024-04-23 00:00:00
0
+---------------------+-----+
|attribute_values_list|count|
+---------------------+-----+
+---------------------+-----+

+---+--------------+---------------------+
|ip |attribute_name|attribute_values_list|
+---+--------------+---------------------+
+---+--------------+---------------------+

389 2024-04-24 00:00:00
200
+----------------------------+-----+
|attribute_values_list       |count|
+----------------------------+-----+
|[123456789]                 |392  |
|[interact@s.h, interact@s.h]|196  |
|[119537]                    |1    |
|[111113]                    |1    |
|[222223]                    |1    |
|[111112]                    |1    |
+----------------------------+-----+

+--------------+--------------+---------------------+
|ip            |attribute_name|attribute_values_list|
+--------------+--------------+---------------------+
|111.93.179.155|customerNumber|[119537]             |
|210.92.54.110 |customerNumber|[111112]             |
|203.240.69

What are the searchable PGP keys servers?

In [575]:
flat_root_dse_df = flatten_ldap_metadata(root_dse_df)
pgp_keys_df = flat_root_dse_df.filter(
    psf.col("attribute_values_list").isin(psf.lit(['o=Searchable PGP keys', 'o=Uploaded PGP keys', 'o=Users', 'o=Prefs', 'o=CRL', 'cn=PGPServerInfo', 'o=PGP keys']))
).select("id")

root_dse_df.join(pgp_keys_df, "id", "inner").select("ip").show(10)
print(pgp_keys_df.count())

+--------------+
|            ip|
+--------------+
|193.36.240.186|
|85.232.246.170|
| 23.174.16.232|
|  91.217.173.7|
|62.190.107.142|
|62.251.195.243|
|  212.5.123.11|
| 194.15.212.86|
| 69.58.105.244|
|194.187.222.24|
+--------------+
only showing top 10 rows

109


What does not help:  
- root dse serviceName  

Other results/insights:  
- namingContext - find PGP servers  
- no iPlanet servers...  
- Zentyal is a linux domain and directory server with AD compatibility  


Extracting visible attributes

In [578]:
dfs = []
for port, ts in CRAWL_PORT_SCANDATE_MAP.items():
    print(port, ts)

    root_dse_df = load_ldap_root_dse(port, ts)
    root_dse_df = flatten_ldap_metadata(root_dse_df)
    dfs.append(root_dse_df.select("attribute_name").distinct())

pdf = dfs[0].unionByName(dfs[1]).distinct().toPandas()
pdf.to_csv("root_dse_attributes.csv", index=False, header=True)

636 2024-04-23 00:00:00
389 2024-04-24 00:00:00


In [579]:
dfs = []
for port, ts in CRAWL_PORT_SCANDATE_MAP.items():
    print(port, ts)
    schema_df = load_ldap_schema(port, ts)
    schema_df = flatten_ldap_metadata(schema_df)
    dfs.append(schema_df.select("attribute_name").distinct())

pdf = dfs[0].unionByName(dfs[1]).distinct().toPandas()
pdf.to_csv("schema_attributes.csv", index=False, header=True)

636 2024-04-23 00:00:00
389 2024-04-24 00:00:00


In [None]:
_root_dse_df = root_dse_df.select("id", "ip", "port", "result_code", "error_data", "matched_dns",
                                psf.explode_outer(psf.arrays_zip(
                                                    root_dse_df.attribute_names,
                                                    root_dse_df.attribute_values_list)))
_root_dse_df = _root_dse_df.select("id", "ip", "port", "result_code", "error_data", "matched_dns", "col.attribute_names", "col.attribute_values_list")
_root_dse_df.filter(
    (psf.array_contains(psf.col("attribute_names"), "vendorVersion"))
    | (psf.array_contains(psf.col("attribute_names"), "vendorversion"))
).groupBy(psf.col("attribute_values_list")
).count().sort("count", ascending=False).show(20, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------