# SSL Interception

## Set Up

In [1]:
import pyspark
from pyspark.sql import SparkSession
import pandas as pd
import os
from tldextract import extract
from pandas.core.frame import DataFrame
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
from pyspark.sql.functions import StringType

In [2]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
SQLContext = SparkSession.builder.master("local[1]") \
                .appName("session-0") \
                .getOrCreate()

22/11/15 15:20:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
#Path to parquet file directory
data = "/mnt/chaseproject/uva/kd5eyn/ssl/2022-02-01/anon.ssl_20220201_0000-0030-0500.log.gz"
#anon.ssl_20220201_0000-0030-0500.log.gz
df = SQLContext.read.parquet(data)
#df = df.filter(df["anon_orig"] == "uva")

## Create Whitelist

* It's unlikely that SSL Interception proxies will use a public trust CA
* To get a whitelist of public trust CAs, we filter out validation status == OK
* Then we save the issuer organization

In [7]:
df.columns

['_lpp_ver',
 'anon_orig',
 'anon_resp',
 'cert_chain_fuids',
 'cipher',
 'client_cert_chain_fuids',
 'client_issuer',
 'client_subject',
 'curve',
 'established',
 'id_orig_h',
 'id_orig_p',
 'id_resp_h',
 'id_resp_p',
 'issuer',
 'ja3',
 'ja3s',
 'last_alert',
 'next_protocol',
 'resumed',
 'server_name',
 'subject',
 'ts',
 'uid',
 'validation_status',
 'version']

In [8]:
def get_issuer_org(issuer_str: str):
    if type(issuer_str) == str:
        o_index = issuer_str.find("O=")
        comma_index = issuer_str.find(",", o_index)
        #Get characters starting from after O= and before ,
        org_str = issuer_str[o_index + 2 :comma_index]
        return org_str.replace("\\", "")

def get_domain(server_name_str: str):
    if type(server_name_str) == str:
        url_parts = extract(server_name_str)
        subdomain = url_parts.subdomain
        domain = url_parts.domain
        #tld - top level domain
        tld = url_parts.suffix
        
        return f"{subdomain}.{domain}.{tld}"

In [9]:
issuer_org_udf = udf(lambda issuer_str: get_issuer_org(issuer_str), StringType())
domain_udf = udf(lambda server_name: get_domain(server_name), StringType())
df = df.withColumn("issuer_O", issuer_org_udf(col("issuer")))
df = df.withColumn("domain", domain_udf(col("server_name")))

In [10]:
df_valid = df.filter(df["validation_status"] == "ok")
df_valid = df_valid.dropDuplicates(["issuer_O"])

In [11]:
df_ok = df_valid.toPandas()

22/11/15 15:23:04 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [12]:
df_ok.iloc[1]

_lpp_ver                                                              1.11-2
anon_orig                                                                uva
anon_resp                                                               none
cert_chain_fuids           [FLiwae1ukhOKF8iZi1, Figp3Q3kuoAcFw1PA8, Fuiy7...
cipher                                 TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256
client_cert_chain_fuids                                                   []
client_issuer                                                           None
client_subject                                                          None
curve                                                              secp256r1
established                                                             True
id_orig_h                                                     100.157.29.206
id_orig_p                                                              11032
id_resp_h                                                     133.223.72.215

In [10]:
df_ok.head(20)

Unnamed: 0,_lpp_ver,anon_orig,anon_resp,cert_chain_fuids,cipher,client_cert_chain_fuids,client_issuer,client_subject,curve,established,...,next_protocol,resumed,server_name,subject,ts,uid,validation_status,version,issuer_O,domain
0,1.11-1,uva,none,"[FLhnlK3XKo65ln0Kjf, FqmQAa2RbXnmriGtP1]",TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,[],,,x25519,True,...,h2,False,us.puma.com,"CN=us.puma.com,O=PUMA SE,OU=Global E-Commerce,...",2021-12-28T14:54:54.970484Z,CHYoYS1Xj6HfVIDGOh,ok,TLSv12,D-Trust GmbH,us.puma.com
1,1.11-1,uva,none,"[F3HpUD4wzLgJ5g8HCg, FXFaXv4Il6I3xVhTB6]",TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,[],,,secp256r1,True,...,,False,www.enable-javascript.com,CN=enable-javascript.com,2021-12-28T14:46:28.778130Z,CvrlIv2Xvwv8YAIp89,ok,TLSv12,DOMENY.PL sp. z o.o,www.enable-javascript.com
2,1.11-1,uva,none,"[F6Lksk3RQuLPisw302, FMGHyT3QdM8f06ig3f]",TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,[],,,secp256r1,True,...,,False,digital-eum-appdynamics.wellsfargo.com,"CN=prod-eum-appdynamics.wellsfargo.com,O=Wells...",2021-12-28T14:44:23.780893Z,CFVCco2d1Xm8qQX611,ok,TLSv12,Wells Fargo & Company,digital-eum-appdynamics.wellsfargo.com
3,1.11-1,uva,none,"[Fj6f2W2GHFq98drIMa, FcHjabPniKr6qhvQ7, FYcvh4...",TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,[],,,x25519,True,...,,False,mattermost.mpifr-bonn.mpg.de,"CN=mattermost.mpifr-bonn.mpg.de,OU=Max-Planck-...",2021-12-28T14:46:00.085830Z,C6ccee2sDJA3N7cq5j,ok,TLSv12,Max-Planck-Gesellschaft,mattermost.mpifr-bonn.mpg.de
4,1.11-1,uva,none,"[FZ7e159zKvBUDPoK3, FWN4FG4TQTeQodUjP7, FsYdly...",TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,[],,,secp256r1,True,...,,False,sync6.omnigroup.com,CN=sync6.omnigroup.com,2021-12-28T14:44:25.751412Z,CjgZDq2yrUF4eUy59l,ok,TLSv12,ZeroSSL,sync6.omnigroup.com
5,1.11-1,uva,none,"[FTZETr1ZtvG2PRPxU4, F7BQKU3bhDqJjSiVl8]",TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,[],,,secp256r1,True,...,,False,perfsonar.nci.org.au,"CN=perfsonar.nci.org.au,O=Australian National ...",2021-12-28T14:49:38.746086Z,CGL0manmvAE9HZ92j,ok,TLSv12,QuoVadis Limited,perfsonar.nci.org.au
6,1.11-1,uva,none,"[FAshHBkeMi9RHFU81, FfTOqs1KXgjy4IL7W, F2paxeQ...",TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,[],,,x25519,True,...,h2,False,zworker7.me,CN=zworker7.me,2021-12-28T14:44:22.113212Z,CgyQHl1IJuhcOglzDi,ok,TLSv12,Let's Encrypt,.zworker7.me
7,1.11-1,uva,none,"[FtuTCf3xJuBy6fwESl, FEBx9E4jSwvpXp66g7]",TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,[],,,secp256r1,True,...,http/1.1,False,takedown.api.playstation.com,CN=*.api.playstation.com,2021-12-28T14:45:10.928183Z,ClWrlFv9XxSzi8zxa,ok,TLSv12,Comodo Japan,takedown.api.playstation.com
8,1.11-1,none,uva,"[Fzr2jpo32tyeffEI8, FKM8hK2BOLz1AAPTck, FXiiVQ...",TLS_DHE_RSA_WITH_AES_256_GCM_SHA384,[],,,,True,...,http/1.1,False,47eGcmVz.virginia.edu,CN=47eGcmVz.virginia.edu,2021-12-28T14:45:03.342702Z,CCzrit3xWJ40m4P9C5,ok,TLSv12,cPanel,47eGcmVz.virginia.edu
9,1.11-1,uva,none,"[F7HX6s2fDTKBBDnc7, FNGjfb2KlKpYF7O7G5, FylTAx...",TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,[],,,x25519,True,...,,False,pixel.onaudience.com,CN=*.onaudience.com,2021-12-28T14:46:12.777290Z,C77HwU2F7Y7ydTGZQl,ok,TLSv12,home.pl S.A.,pixel.onaudience.com


In [11]:
#df_ok = df[df["validation_status"] == "ok"]
df_orgs = df_ok.groupby(["issuer_O"])["server_name"].agg("count").reset_index(name="count")
df_orgs.head(10)

Unnamed: 0,issuer_O,count
0,Aetna Inc,1
1,AffirmTrust,1
2,Amazon,1
3,Apple Inc.,1
4,COMODO CA Limited,1
5,CentralNic Luxembourg SC3A0rl,1
6,China Financial Certification Authority,1
7,Cloudflare,1
8,Comodo Japan,1
9,Corporation Service Company,1


In [12]:
def gen_whitelist(whitelist_path: str, df_orgs: pd.core.frame.DataFrame):
    whitelist_file = open(whitelist_path, "r+")
    whitelist_orgs = set(whitelist_file.read().splitlines())
    for org in df_orgs["issuer_O"]:
        if org not in whitelist_orgs:
            whitelist_orgs.add(org)
            whitelist_file.write(org + "\n")
    whitelist_file.close()
    return whitelist_orgs

In [13]:
def get_list_set(file_path: str) -> set:
    file = open(file_path, "r")
    return set(file.read().splitlines())

In [14]:
whitelist = gen_whitelist("whitelist.txt", df_orgs)

## Parse for Unable to Get Local Certificate

In [15]:
df_unable = df.filter(df.validation_status == "unable to get local issuer certificate")
df_unable = df_unable.groupBy(["domain", "issuer_O"]).count()
df_unable.count()

                                                                                

440

In [16]:
df_unable = df_unable.toPandas()
df_unable

                                                                                

Unnamed: 0,domain,issuer_O,count
0,p57-fmipmobile.icloud.com,Apple Inc.,1
1,,eYou,1
2,sfd.et1240.epichosted.com,EMR Direct,1
3,instantnews.huub.sliide.cloud,Sliide,3
4,wdcpalt.microsoft.com,Microsoft Corporation,5
...,...,...,...
435,array510.prod.do.dsp.mp.microsoft.com,Microsoft Corporation,1192
436,bmecmg.cloudapp.net,"N=283Lr0 4zOfx 6Nwi 1V,DC=BartonMalow,DC=co",15
437,cehcahealthcare.stdavids.com,EMR Direct,2
438,devops.shottracker.com,N=3x1M07 5EbvZ8tQOb 1V 4S1O61pWj 1w afGABq.48F...,1


In [17]:
blacklist_url = "blacklist.txt"
blacklist = get_list_set(blacklist_url)
blacklist

{'Alleghany County Public Schools',
 'BAYER',
 'Barracuda Networks',
 'Boston Scientific Corporation',
 'Charlotte County Public Schools',
 'Cisco',
 'Cisco Systems',
 'Cisco Systems\\',
 'FIREEYE',
 'Forcepoint LLC',
 'Fortinet',
 'Freddie Mac',
 'General Electric Company',
 'Hanover County Public Schools',
 'Harrisonburg City Public Schools',
 'Henrico County Public Schools',
 'Johnson Controls International PLC',
 'Kaspersky',
 'Kaspersky Lab',
 'LOCKSS box',
 'McKesson Corporation',
 'Menlo Security Inc.',
 'Private Internet Access',
 'SaferNet',
 'Technicolor',
 'Toyota North America',
 'Zscaler Inc.',
 'Zscaler\\',
 'cisco',
 'iboss Network Security'}

In [18]:
df_filtered = df_unable[~(df_unable["issuer_O"].isin(whitelist) | df_unable["issuer_O"].isin(blacklist))]

In [19]:
df_filtered = df_filtered[df_filtered["domain"].str.contains("None") == False]
df_filtered = df_filtered[df_filtered["issuer_O"].str.contains("None") == False]
df_filtered

Unnamed: 0,domain,issuer_O,count
2,sfd.et1240.epichosted.com,EMR Direct,1
3,instantnews.huub.sliide.cloud,Sliide,3
7,sfd.et1146.epichosted.com,EMR Direct,1
16,ceprd.sjhsyr.org,EMR Direct,2
21,sfprod.acpny.com,EMR Direct,1
...,...,...,...
430,carequality.crisphealth.org,EMR Direct,1
431,US.info.lgsmartad.com,LG Electronics Inc.,1
436,bmecmg.cloudapp.net,"N=283Lr0 4zOfx 6Nwi 1V,DC=BartonMalow,DC=co",15
437,cehcahealthcare.stdavids.com,EMR Direct,2


In [20]:
def is_selfsigned(domain: str, issuer_O: str) -> bool:
    domain_lower = extract(domain.lower()).domain
    issuer_lower = issuer_O.lower().replace(" ", "")
    return domain_lower in issuer_lower

In [21]:
df_not_selfsigned = df_filtered[~df_filtered.apply(lambda x: is_selfsigned(x["domain"], x["issuer_O"]), axis=1)]
df_not_selfsigned.reset_index(inplace=True)
df_not_selfsigned = df_not_selfsigned.drop("index", axis=1)
df_not_selfsigned

Unnamed: 0,domain,issuer_O,count
0,sfd.et1240.epichosted.com,EMR Direct,1
1,sfd.et1146.epichosted.com,EMR Direct,1
2,ceprd.sjhsyr.org,EMR Direct,2
3,sfprod.acpny.com,EMR Direct,1
4,ce.chppoc.org,EMR Direct,1
...,...,...,...
104,carequality.crisphealth.org,EMR Direct,1
105,US.info.lgsmartad.com,LG Electronics Inc.,1
106,bmecmg.cloudapp.net,"N=283Lr0 4zOfx 6Nwi 1V,DC=BartonMalow,DC=co",15
107,cehcahealthcare.stdavids.com,EMR Direct,2


# Virus Total API Data Exploration

* Virus Total has information about HTTPS certificates
* Check if Logged Certificate Organization matches one that Virus Total
* If they don't match, may be SSL interception because Virus Total information may be outdated

In [2]:
import vt
api_key_file = open("/home/ubuntu/GitLab/ssl_interception/virus_total_api_key", "r")
api_key = api_key_file.readline()
api_key_file.close()

In [6]:
client = vt.Client(api_key)
url_id = vt.url_id("http://dbankcloud.asia")

In [7]:
import nest_asyncio
nest_asyncio.apply()
#url = client.scan_url("http://dbankcloud.asia")
#url = client.get_object("/domains/{}/historical_whois", "dbankcloud.asia")
try:
    data = client.get_data("/domains/dbankcloud.asia/historical_whois")
except as E
#url

APIError: ('QuotaExceededError', 'Quota exceeded')

In [25]:
data[0]["attributes"]["whois_map"]["Name Server"]

'GNS1.HUAWEICLOUD-DNS.CN | GNS1.HUAWEICLOUD-DNS.COM | GNS1.HUAWEICLOUD-DNS.ORG | NS3.DNSV5.COM | NS4.DNSV5.COM'

In [26]:
data_cas = client.get_data("/domains/hhchealth.org/historical_ssl_certificates")

In [27]:
data_cas

[{'attributes': {'public_key': {'rsa': {'key_size': 2048,
     'modulus': '00acaf87ccb6c52e194ca344023d3683c4fe67f1f2bd794da5e3d96d8c9fdc0df9aa4991c3ff0afde5d27c1c1be4f467b35979af00c2d995fe60070f6bc9e8e44bedd52d9c0b89d394441a011e55c2620594f768b8b8500b53683375866d0543369413e81089d2cf360fbea6d202b08663dde052cb617a281116208cf18e0ad692a06d3317a297865011a0de8fffdc2c5c4a4b10b6c8b5faffb7710926542ae200532d8d36bd050a357a70860c279e37c8ac8b05e188873a0ef75cee68bd02ad013b823e669e4564119ce2446614c93139c7f8e4dd21604ccdd9562d32eb59543c9da45fb3b043300802875670b048e84fa194f30a3d5e277d95345fde3ff69d61',
     'exponent': '010001'},
    'algorithm': 'RSA'},
   'first_seen_date': 1652282289,
   'issuer': {'C': 'US',
    'CN': 'DigiCert TLS RSA SHA256 2020 CA1',
    'O': 'DigiCert Inc'},
   'cert_signature': {'signature': 'bd150ccfadadc2c85b9385936b1284970b4a66bb3ad39b00bcc6e5a5f87cbf100a5dceb39c468c2fe6be2ce211e0d5809be0a060b47c5a3ef3bb18e7ec9b149a722f48211eb6747c9048b8c57a8ba3d7c3cd903c65b9c11352bf47402c9f

In [28]:
data_cas[0]["attributes"]["issuer"]["O"]

'DigiCert Inc'

In [29]:
data_cas_2 = client.get_data("/domains/dbankcloud.asia/historical_ssl_certificates")

In [30]:
data_cas_2

[]

# Virus Total API Filtering

In [31]:
df_not_selfsigned.reset_index(inplace=True, drop=True)
df_not_selfsigned

Unnamed: 0,domain,issuer_O,count
0,sfd.et1240.epichosted.com,EMR Direct,1
1,sfd.et1146.epichosted.com,EMR Direct,1
2,ceprd.sjhsyr.org,EMR Direct,2
3,sfprod.acpny.com,EMR Direct,1
4,ce.chppoc.org,EMR Direct,1
...,...,...,...
104,carequality.crisphealth.org,EMR Direct,1
105,US.info.lgsmartad.com,LG Electronics Inc.,1
106,bmecmg.cloudapp.net,"N=283Lr0 4zOfx 6Nwi 1V,DC=BartonMalow,DC=co",15
107,cehcahealthcare.stdavids.com,EMR Direct,2


In [38]:
import time
import vt
def get_historical_ca(df: pd.DataFrame, client: vt.Client):
    historical_df = df.copy()
    historical_df.reset_index(drop=True, inplace=True)
    historical_cas = []
    for index, row in historical_df.iterrows():
        try:
            domain = row["domain"]
            ssl_ca_data = client.get_data(f"/domains/{domain}/historical_ssl_certificates")
            if len(ssl_ca_data) > 0:
                historical_cas.append(ssl_ca_data[0]["attributes"]["issuer"]["O"])
            else:
                historical_cas.append("None")
        except:
            historical_cas.append("Unable to find")
    historical_df["historical_ca"] = historical_cas
    return historical_df

In [39]:
import nest_asyncio
nest_asyncio.apply()
historical_df = get_historical_ca(df_not_selfsigned, client)
historical_df

Unnamed: 0,domain,issuer_O,count,historical_ca
0,sfd.et1240.epichosted.com,EMR Direct,1,DigiCert Inc
1,sfd.et1146.epichosted.com,EMR Direct,1,DigiCert Inc
2,ceprd.sjhsyr.org,EMR Direct,2,
3,sfprod.acpny.com,EMR Direct,1,
4,ce.chppoc.org,EMR Direct,1,DigiCert Inc
...,...,...,...,...
104,carequality.crisphealth.org,EMR Direct,1,
105,US.info.lgsmartad.com,LG Electronics Inc.,1,
106,bmecmg.cloudapp.net,"N=283Lr0 4zOfx 6Nwi 1V,DC=BartonMalow,DC=co",15,
107,cehcahealthcare.stdavids.com,EMR Direct,2,"Trustwave Holdings, Inc."


In [34]:
historical_df

Unnamed: 0,domain,issuer_O,count,historical_ca
0,sfd.et1240.epichosted.com,EMR Direct,1,DigiCert Inc
1,sfd.et1146.epichosted.com,EMR Direct,1,DigiCert Inc
2,ceprd.sjhsyr.org,EMR Direct,2,
3,sfprod.acpny.com,EMR Direct,1,
4,ce.chppoc.org,EMR Direct,1,DigiCert Inc
...,...,...,...,...
104,carequality.crisphealth.org,EMR Direct,1,
105,US.info.lgsmartad.com,LG Electronics Inc.,1,
106,bmecmg.cloudapp.net,"N=283Lr0 4zOfx 6Nwi 1V,DC=BartonMalow,DC=co",15,
107,cehcahealthcare.stdavids.com,EMR Direct,2,"Trustwave Holdings, Inc."


In [35]:
def save_historical_df(df: pd.DataFrame, source_file: str):
    file_ext_index = source_file.rfind(".")
    src_file_name = source_file[:file_ext_index]
    parquet_file_name = f"{src_file_name}_pos_intercept.parquet"
    df.to_parquet(parquet_file_name)
    return parquet_file_name

save_historical_df(historical_df, "2021-12-28.parquet")

'2021-12-28_pos_intercept.parquet'

In [36]:
test_df = pd.read_parquet("2021-12-28_pos_intercept.parquet")
test_df

Unnamed: 0,domain,issuer_O,count,historical_ca
0,sfd.et1240.epichosted.com,EMR Direct,1,DigiCert Inc
1,sfd.et1146.epichosted.com,EMR Direct,1,DigiCert Inc
2,ceprd.sjhsyr.org,EMR Direct,2,
3,sfprod.acpny.com,EMR Direct,1,
4,ce.chppoc.org,EMR Direct,1,DigiCert Inc
...,...,...,...,...
104,carequality.crisphealth.org,EMR Direct,1,
105,US.info.lgsmartad.com,LG Electronics Inc.,1,
106,bmecmg.cloudapp.net,"N=283Lr0 4zOfx 6Nwi 1V,DC=BartonMalow,DC=co",15,
107,cehcahealthcare.stdavids.com,EMR Direct,2,"Trustwave Holdings, Inc."
