# SSL Interception

## Set Up

In [1]:
import pyspark
from pyspark.sql import SparkSession
import pandas as pd
import os
from tldextract import extract
from pandas.core.frame import DataFrame
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
from pyspark.sql.functions import StringType

In [2]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
SQLContext = SparkSession.builder.master("local[1]") \
                .appName("session-0") \
                .getOrCreate()

22/10/10 17:50:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
#Path to parquet file directory
data = "/home/ubuntu/parquet_data/2021-12-28.parquet"
df = SQLContext.read.parquet(data)
df = df.filter(df["anon_orig"] == "uva")

                                                                                

## Create Whitelist

* It's unlikely that SSL Interception proxies will use a public trust CA
* To get a whitelist of public trust CAs, we filter out validation status == OK
* Then we save the issuer organization

In [5]:
df.columns

['_lpp_ver',
 'anon_orig',
 'anon_resp',
 'cert_chain_fuids',
 'cipher',
 'client_cert_chain_fuids',
 'client_issuer',
 'client_subject',
 'curve',
 'established',
 'id_orig_h',
 'id_orig_p',
 'id_resp_h',
 'id_resp_p',
 'issuer',
 'ja3',
 'ja3s',
 'last_alert',
 'next_protocol',
 'resumed',
 'server_name',
 'subject',
 'ts',
 'uid',
 'validation_status',
 'version']

In [6]:
def get_issuer_org(issuer_str: str):
    if type(issuer_str) == str:
        o_index = issuer_str.find("O=")
        comma_index = issuer_str.find(",", o_index)
        #Get characters starting from after O= and before ,
        org_str = issuer_str[o_index + 2 :comma_index]
        return org_str.replace("\\", "")

def get_domain(server_name_str: str):
    if type(server_name_str) == str:
        domain = extract(server_name_str).domain
        #tld - top level domain
        tld = extract(server_name_str).suffix
        return f"{domain}.{tld}"

In [7]:
issuer_org_udf = udf(lambda issuer_str: get_issuer_org(issuer_str), StringType())
domain_udf = udf(lambda server_name: get_domain(server_name), StringType())
df = df.withColumn("issuer_O", issuer_org_udf(col("issuer")))
df = df.withColumn("domain", domain_udf(col("server_name")))

In [8]:
df_valid = df.filter(df["validation_status"] == "ok")
df_valid = df_valid.dropDuplicates(["issuer_O"])

In [9]:
df_ok = df_valid.toPandas()

22/10/10 17:51:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [10]:
df_ok.head(20)

Unnamed: 0,_lpp_ver,anon_orig,anon_resp,cert_chain_fuids,cipher,client_cert_chain_fuids,client_issuer,client_subject,curve,established,...,next_protocol,resumed,server_name,subject,ts,uid,validation_status,version,issuer_O,domain
0,1.11-1,uva,none,"[FLhnlK3XKo65ln0Kjf, FqmQAa2RbXnmriGtP1]",TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,[],,,x25519,True,...,h2,False,us.puma.com,"CN=us.puma.com,O=PUMA SE,OU=Global E-Commerce,...",2021-12-28T14:54:54.970484Z,CHYoYS1Xj6HfVIDGOh,ok,TLSv12,D-Trust GmbH,puma.com
1,1.11-1,uva,none,"[FFga513JxS1gabms71, FkQv5y2JnjlynWkxNf, FEIuB...",TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,[],,,secp256r1,True,...,h2,False,static-a.pushpushgo.com,CN=*.pushpushgo.com,2021-12-28T14:49:30.156720Z,C5mKrcOTga44l8VTd,ok,TLSv12,DOMENY.PL sp. z o.o,pushpushgo.com
2,1.11-1,uva,none,"[Ff5uVP2pcOGZNfTkod, Fx2KGg25WhPpKgf3El, F3ICw...",TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,[],,,secp256r1,True,...,,False,rubicon.wellsfargo.com,"CN=rubicon.wellsfargo.com,O=Wells Fargo & Comp...",2021-12-28T14:47:34.280157Z,CTEPipSLA2R1riUMj,ok,TLSv12,Wells Fargo & Company,wellsfargo.com
3,1.11-1,uva,none,"[FXZQr43MIwIB4C3QWc, F48gmM3yqMRXx5L4Xf, Fc9t3...",TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,[],,,x25519,True,...,,False,mattermost.mpifr-bonn.mpg.de,"CN=mattermost.mpifr-bonn.mpg.de,OU=Max-Planck-...",2021-12-28T14:51:13.828416Z,CgM0lf2fI6tvNF2hr1,ok,TLSv12,Max-Planck-Gesellschaft,mpg.de
4,1.11-1,uva,none,"[FFzPop1DnMo8QGyWwb, F3ko9p2O6gpPmIdudh, Fyvd9...",TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,[],,,x25519,True,...,h2,False,richmond.com,CN=richmond.com,2021-12-28T14:47:36.400324Z,Cz22Dl4jCQl6yvTNbc,ok,TLSv12,ZeroSSL,richmond.com
5,1.11-1,uva,none,"[FTZETr1ZtvG2PRPxU4, F7BQKU3bhDqJjSiVl8]",TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,[],,,secp256r1,True,...,,False,perfsonar.nci.org.au,"CN=perfsonar.nci.org.au,O=Australian National ...",2021-12-28T14:49:38.746086Z,CGL0manmvAE9HZ92j,ok,TLSv12,QuoVadis Limited,nci.org.au
6,1.11-1,uva,none,"[FPtK0mAzmQ6vDHZ6e, FiXSAH1O9l3ius3gUl, FAZxSz...",TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,[],,,secp256r1,True,...,,False,api.fillr.com,CN=*.fillr.com,2021-12-28T14:47:28.115120Z,CfJTYC12JwY0LFkar9,ok,TLSv12,Let's Encrypt,fillr.com
7,1.11-1,uva,none,"[FuFDKM3txWwYAmOh53, FjY3te35XEr8q0Huu4]",TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,[],,,secp256r1,True,...,http/1.1,False,takedown.api.playstation.com,CN=*.api.playstation.com,2021-12-28T14:50:10.952953Z,CtwblvHs7iVK9vaTe,ok,TLSv12,Comodo Japan,playstation.com
8,1.11-1,uva,none,"[FHEyJTFW7i4DrEqQa, F7quCI23KF2UEHACA1, FoLiFs...",TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,[],,,secp521r1,True,...,,False,,CN=vps68142.servconfig.com,2021-12-28T14:48:07.233142Z,CJOVtu1GKKDBEhQoM7,ok,TLSv12,cPanel,
9,1.11-1,uva,none,"[F7HX6s2fDTKBBDnc7, FNGjfb2KlKpYF7O7G5, FylTAx...",TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,[],,,x25519,True,...,,False,pixel.onaudience.com,CN=*.onaudience.com,2021-12-28T14:46:12.777290Z,C77HwU2F7Y7ydTGZQl,ok,TLSv12,home.pl S.A.,onaudience.com


In [11]:
#df_ok = df[df["validation_status"] == "ok"]
df_orgs = df_ok.groupby(["issuer_O"])["server_name"].agg("count").reset_index(name="count").sort_values(by="count", ascending=False)
df_orgs.head(10)

Unnamed: 0,issuer_O,count
0,Aetna Inc,1
1,AffirmTrust,1
31,Network Solutions L.L.C.,1
32,QuoVadis Limited,1
33,Root Networks,1
34,SECOM Trust Systems CO.,1
35,SSL Corp,1
36,SSL Corporation,1
37,Sectigo Limited,1
38,Starfield Technologies,1


In [12]:
def gen_whitelist(whitelist_path: str, df_orgs: pd.core.frame.DataFrame):
    whitelist_file = open(whitelist_path, "r+")
    whitelist_orgs = set(whitelist_file.read().splitlines())
    for org in df_orgs["issuer_O"]:
        if org not in whitelist_orgs:
            whitelist_orgs.add(org)
            whitelist_file.write(org + "\n")
    whitelist_file.close()
    return whitelist_orgs

In [13]:
def get_list_set(file_path: str) -> set:
    file = open(file_path, "r")
    return set(file.read().splitlines())

In [14]:
whitelist = gen_whitelist("whitelist.txt", df_orgs)

## Parse for Unable to Get Local Certificate

In [15]:
df_unable = df.filter(df.validation_status == "unable to get local issuer certificate")
df_unable = df_unable.groupBy(["domain", "issuer_O"]).count()
df_unable.count()

                                                                                

144

In [16]:
df_unable = df_unable.toPandas()
df_unable

                                                                                

Unnamed: 0,domain,issuer_O,count
0,,eYou,1
1,samsungotn.net,Samsung Electronics,24
2,riversidehealthcare.net,EMR Direct,2
3,,Infoblox,918
4,sascms.net,,3
...,...,...,...
139,aramark.com,"N=71aK19qLkXRtn,DC=fss,DC=aramark,DC=co",6
140,microsoft.com,Zscaler Inc.,1
141,,"N=4cDULlMrUA,DC=SRELAY,DC=CO",7
142,medi1tv.com,,1


In [17]:
blacklist_url = "blacklist.txt"
blacklist = get_list_set(blacklist_url)
blacklist

{'Alleghany County Public Schools',
 'BAYER',
 'Barracuda Networks',
 'Boston Scientific Corporation',
 'Charlotte County Public Schools',
 'Cisco',
 'Cisco Systems',
 'Cisco Systems\\',
 'FIREEYE',
 'Forcepoint LLC',
 'Fortinet',
 'Freddie Mac',
 'General Electric Company',
 'Hanover County Public Schools',
 'Harrisonburg City Public Schools',
 'Henrico County Public Schools',
 'Johnson Controls International PLC',
 'Kaspersky',
 'Kaspersky Lab',
 'LOCKSS box',
 'McKesson Corporation',
 'Menlo Security Inc.',
 'Private Internet Access',
 'SaferNet',
 'Technicolor',
 'Toyota North America',
 'Zscaler Inc.',
 'Zscaler\\',
 'cisco',
 'iboss Network Security'}

In [18]:
df_filtered = df_unable[~(df_unable["issuer_O"].isin(whitelist) | df_unable["issuer_O"].isin(blacklist))]

In [19]:
df_filtered = df_filtered[df_filtered["domain"].str.contains("None") == False]
df_filtered = df_filtered[df_filtered["issuer_O"].str.contains("None") == False]
df_filtered

Unnamed: 0,domain,issuer_O,count
1,samsungotn.net,Samsung Electronics,24
2,riversidehealthcare.net,EMR Direct,2
5,jumpcloud.com,JumpCloud,182
6,bayhealth.org,EMR Direct,2
9,sjhsyr.org,EMR Direct,2
...,...,...,...
135,rivhs.com,EMR Direct,1
136,lgsmartad.com,LG Electronics Inc.,1
137,dbankcloud.asia,Huawei,1
138,hhchealth.org,MaxMD,1


In [20]:
def is_selfsigned(domain: str, issuer_O: str) -> bool:
    domain_lower = extract(domain.lower()).domain
    issuer_lower = issuer_O.lower().replace(" ", "")
    return domain_lower in issuer_lower

In [21]:
df_not_selfsigned = df_filtered[~df_filtered.apply(lambda x: is_selfsigned(x["domain"], x["issuer_O"]), axis=1)]
df_not_selfsigned.reset_index(inplace=True)
df_not_selfsigned = df_not_selfsigned.drop("index", axis=1)
df_not_selfsigned

Unnamed: 0,domain,issuer_O,count
0,samsungotn.net,Samsung Electronics,24
1,riversidehealthcare.net,EMR Direct,2
2,bayhealth.org,EMR Direct,2
3,sjhsyr.org,EMR Direct,2
4,seattlechildrens.org,EMR Direct,2
...,...,...,...
64,ntstplatform.com,EMR Direct,4
65,rivhs.com,EMR Direct,1
66,lgsmartad.com,LG Electronics Inc.,1
67,dbankcloud.asia,Huawei,1


# Virus Total API Data Exploration

* Virus Total has information about HTTPS certificates
* Check if Logged Certificate Organization matches one that Virus Total
* If they don't match, may be SSL interception because Virus Total information may be outdated

In [22]:
import vt
api_key_file = open("virus_total_api_key", "r")
api_key = api_key_file.readline()
api_key_file.close()

In [23]:
client = vt.Client(api_key)
url_id = vt.url_id("http://dbankcloud.asia")

In [24]:
import nest_asyncio
nest_asyncio.apply()
#url = client.scan_url("http://dbankcloud.asia")
#url = client.get_object("/domains/{}/historical_whois", "dbankcloud.asia")
data = client.get_data("/domains/dbankcloud.asia/historical_whois")
#url

In [25]:
data[0]["attributes"]["whois_map"]["Name Server"]

'GNS1.HUAWEICLOUD-DNS.CN | GNS1.HUAWEICLOUD-DNS.COM | GNS1.HUAWEICLOUD-DNS.ORG | NS3.DNSV5.COM | NS4.DNSV5.COM'

In [26]:
data_cas = client.get_data("/domains/hhchealth.org/historical_ssl_certificates")

In [27]:
data_cas

[{'attributes': {'public_key': {'rsa': {'key_size': 2048,
     'modulus': '00acaf87ccb6c52e194ca344023d3683c4fe67f1f2bd794da5e3d96d8c9fdc0df9aa4991c3ff0afde5d27c1c1be4f467b35979af00c2d995fe60070f6bc9e8e44bedd52d9c0b89d394441a011e55c2620594f768b8b8500b53683375866d0543369413e81089d2cf360fbea6d202b08663dde052cb617a281116208cf18e0ad692a06d3317a297865011a0de8fffdc2c5c4a4b10b6c8b5faffb7710926542ae200532d8d36bd050a357a70860c279e37c8ac8b05e188873a0ef75cee68bd02ad013b823e669e4564119ce2446614c93139c7f8e4dd21604ccdd9562d32eb59543c9da45fb3b043300802875670b048e84fa194f30a3d5e277d95345fde3ff69d61',
     'exponent': '010001'},
    'algorithm': 'RSA'},
   'first_seen_date': 1652282289,
   'issuer': {'C': 'US',
    'CN': 'DigiCert TLS RSA SHA256 2020 CA1',
    'O': 'DigiCert Inc'},
   'cert_signature': {'signature': 'bd150ccfadadc2c85b9385936b1284970b4a66bb3ad39b00bcc6e5a5f87cbf100a5dceb39c468c2fe6be2ce211e0d5809be0a060b47c5a3ef3bb18e7ec9b149a722f48211eb6747c9048b8c57a8ba3d7c3cd903c65b9c11352bf47402c9f

In [28]:
data_cas[0]["attributes"]["issuer"]["O"]

'DigiCert Inc'

In [29]:
data_cas_2 = client.get_data("/domains/dbankcloud.asia/historical_ssl_certificates")

In [30]:
data_cas_2

[]

# Virus Total API Filtering

In [31]:
df_not_selfsigned.reset_index(inplace=True, drop=True)
df_not_selfsigned

Unnamed: 0,domain,issuer_O,count
0,samsungotn.net,Samsung Electronics,24
1,riversidehealthcare.net,EMR Direct,2
2,bayhealth.org,EMR Direct,2
3,sjhsyr.org,EMR Direct,2
4,seattlechildrens.org,EMR Direct,2
...,...,...,...
64,ntstplatform.com,EMR Direct,4
65,rivhs.com,EMR Direct,1
66,lgsmartad.com,LG Electronics Inc.,1
67,dbankcloud.asia,Huawei,1


In [32]:
import time
import vt
def get_historical_ca(df: pd.DataFrame, client: vt.Client):
    historical_df = df.copy()
    historical_df.reset_index(drop=True, inplace=True)
    historical_cas = []
    for index, row in historical_df.iterrows():
        try:
            domain = row["domain"]
            ssl_ca_data = client.get_data(f"/domains/{domain}/historical_ssl_certificates")
            if len(ssl_ca_data) > 0:
                historical_cas.append(ssl_ca_data[0]["attributes"]["issuer"]["O"])
            else:
                historical_cas.append("None")
        except:
            historical_cas.append("None")
    historical_df["historical_ca"] = historical_cas
    return historical_df

In [33]:
import nest_asyncio
nest_asyncio.apply()
historical_df = get_historical_ca(df_not_selfsigned, client)
historical_df

Unnamed: 0,domain,issuer_O,count,historical_ca
0,samsungotn.net,Samsung Electronics,24,DigiCert Inc
1,riversidehealthcare.net,EMR Direct,2,
2,bayhealth.org,EMR Direct,2,DigiCert Inc
3,sjhsyr.org,EMR Direct,2,Let's Encrypt
4,seattlechildrens.org,EMR Direct,2,
...,...,...,...,...
64,ntstplatform.com,EMR Direct,4,"GoDaddy.com, Inc."
65,rivhs.com,EMR Direct,1,
66,lgsmartad.com,LG Electronics Inc.,1,
67,dbankcloud.asia,Huawei,1,


In [34]:
historical_df

Unnamed: 0,domain,issuer_O,count,historical_ca
0,samsungotn.net,Samsung Electronics,24,DigiCert Inc
1,riversidehealthcare.net,EMR Direct,2,
2,bayhealth.org,EMR Direct,2,DigiCert Inc
3,sjhsyr.org,EMR Direct,2,Let's Encrypt
4,seattlechildrens.org,EMR Direct,2,
...,...,...,...,...
64,ntstplatform.com,EMR Direct,4,"GoDaddy.com, Inc."
65,rivhs.com,EMR Direct,1,
66,lgsmartad.com,LG Electronics Inc.,1,
67,dbankcloud.asia,Huawei,1,


In [35]:
import time

#Use df_not_selfsigned
def get_suspected(df: pd.DataFrame):
    suspected_issuers = set()
    for index, row in df.iterrows():
        if row["issuer_O"] in suspected_issuers:
            continue
        
        #API only allows 4 requests per minute
        if (index + 1)% 4 == 0:
            time.sleep(60)
        
        #Call the API to get Historical SSL Certificates
        try:
            domain = row["domain"]
            ssl_ca_data = client.get_data(f"/domains/{domain}/historical_ssl_certificates")

            #If we can't find Historical SSL Certificates
            if len(ssl_ca_data) == 0:
                suspected_issuers.add(row["issuer_O"])
                print(row["issuer_O"])
            elif row["issuer_O"].lower() not in ssl_ca_data[0]["attributes"]["issuer"]["O"]:
                suspected_issuers.add(row["issuer_O"])
                print(row["issuer_O"])
        except:
            continue

        
    return suspected_issuers