In [3]:
import json
import os
import subprocess
import shlex
import requests
import copy
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [4]:
import multiprocessing
import importlib
import requests
import sys

## Download the Latest Debian Sec Tracker Info

In [5]:
# Disabled refresh step: if uncommented, these lines fetch the tracker JSON from
# DEB_SEC_URL and save it as ./debian_sec_tracker_feed.json.
# NOTE(review): the load cell further down reads
# ./data/debian_sec_tracker_06232024.json, not the file written here — confirm
# how the downloaded feed was moved/renamed into that snapshot.
# DEB_SEC_URL = "https://security-tracker.debian.org/tracker/data/json/"
# deb_sec_json = requests.get(DEB_SEC_URL).json()

# with open("./debian_sec_tracker_feed.json", "w") as out:
#     json.dump(deb_sec_json, out, indent=4)

### Debian Security Tracker Dataset
We use the following Debian Security Feed data

In [6]:
# Load the pinned Debian Security Tracker snapshot. The `with` block closes the
# file handle as soon as parsing is done instead of leaving it to the garbage
# collector.
with open("./data/debian_sec_tracker_06232024.json", "r") as tracker_file:
    deb_sec_json = json.load(tracker_file)

In [7]:
# Number of top-level keys in the loaded tracker JSON.
len(deb_sec_json)

3581

In [8]:
# This is complete list of CVEs present in Debian Security Tracker
# De-duplicated list of every CVE id that appears under any package of the
# Debian Security Tracker data. dict.fromkeys keeps the first occurrence of each
# id in encounter order (same order as appending to a list on first sight), but
# does it with hash lookups instead of a linear `in` scan per CVE.
total_cve_list = list(
    dict.fromkeys(cve for dets in deb_sec_json.values() for cve in dets)
)

In [9]:
# Report how many distinct CVE ids were collected from the tracker.
total_cve_count = len(total_cve_list)
print(f"A total of {total_cve_count} CVEs recorded in Debian Security Tracker")

A total of 36775 CVEs recorded in Debian Security Tracker


# Importing CVEs reported 2022 onwards
Here we import a dataset containing only the CVEs reported in or after 2022, created using the `get_apt_src_cves.py` script

In [10]:
# Load the 2022-and-later CVE dataset; the `with` block guarantees the file
# handle is closed once the JSON has been parsed.
with open("./data/deb_sec_tracker_merged_2022.json") as recent_tracker_file:
    deb_sec_json_recent = json.load(recent_tracker_file)

In [11]:
# De-duplicated list of CVE ids across all apt sources of the 2022+ dataset.
# dict.fromkeys preserves first-seen order while avoiding a linear list scan
# for every CVE.
recent_cve_list = list(
    dict.fromkeys(cve for dets in deb_sec_json_recent.values() for cve in dets)
)

In [12]:
# Report the number of distinct CVE ids in the 2022+ dataset.
recent_cve_count = len(recent_cve_list)
print(f"We have a total of {recent_cve_count} CVEs reported in or after 2022")

We have a total of 8646 CVEs reported in or after 2022


In [13]:
# Apt sources from the 2022+ dataset whose CVE mapping is not an empty dict.
# Dict keys are already unique, so no membership test against the result list
# is needed.
vulnerable_apt_sources = [
    src for src, dets in deb_sec_json_recent.items() if dets != {}
]

In [14]:
# Report how many apt sources have a non-empty CVE mapping.
vulnerable_source_count = len(vulnerable_apt_sources)
print(f"We have {vulnerable_source_count} APT sources that are vulnerable and have atleast one CVE reported in 2022 and later")

We have 1190 APT sources that are vulnerable and have atleast one CVE reported in 2022 and later


# Vulnerable APT sources that ship at least one binary package which is under the "libs" section of Debian

In [17]:
# Load the apt source -> binary package info map; the `with` block closes the
# file handle once the JSON has been parsed.
with open("./data/apt_src_deb_maps_info.json", "r") as deb_maps_file:
    vuln_apt_src_deb_tags_map = json.load(deb_maps_file)

In [18]:
# Map each vulnerable apt source to the binary packages it ships that are either
# tagged with role "shared-lib" or filed under section "libs". Sources that ship
# no such package get no entry at all.
vuln_apt_src_libdebs_map = {}
for src, debs in vuln_apt_src_deb_tags_map.items():
    lib_debs = [
        deb
        for deb, dets in debs.items()
        if ("role" in dets and dets["role"] == "shared-lib")
        or ("section" in dets and dets["section"] == "libs")
    ]
    if lib_debs:
        vuln_apt_src_libdebs_map[src] = lib_debs

In [19]:
# Report how many apt sources ended up with at least one library package.
lib_source_count = len(vuln_apt_src_libdebs_map)
print(f"There are {lib_source_count} apt sources that have atleast one CVE in or after 2022 and ship atleast one library package")

There are 339 apt sources that have atleast one CVE in or after 2022 and ship atleast one library package


In [20]:
# Number of entries recorded under the 'linux' apt source in the 2022+ dataset.
len(deb_sec_json_recent['linux'])

2160

In [21]:
# We skip CVEs from the Linux apt source package
# because that has just one lib binary package and almost all CVEs
# are in the Linux kernel.
# A default is passed to pop() so that re-running this cell after "linux" has
# already been removed is a no-op instead of raising KeyError.
vuln_apt_src_libdebs_map.pop("linux", None)

['libcpupower1']

In [22]:
# Reverse lookup: library binary package -> apt source that ships it. If a
# package name shows up under several sources, the last source iterated wins.
debs_to_apt_src_map = {
    deb: apt_src
    for apt_src, debs in vuln_apt_src_libdebs_map.items()
    for deb in debs
}

## Scaling the experiment down to 20 vulnerable apt sources

In [28]:
# Keep only the first 20 apt sources (in dict iteration order). Each deb list is
# deep-copied so the reduced map never aliases the lists of the full map.
vuln_apt_src_libdebs_map_reduced20 = {}
count = 0
for count, (apt_src, deb_info) in enumerate(vuln_apt_src_libdebs_map.items(), start=1):
    if count > 20:
        break
    vuln_apt_src_libdebs_map_reduced20[apt_src] = copy.deepcopy(deb_info)
len(vuln_apt_src_libdebs_map_reduced20)

20

In [29]:
# Distinct CVE ids recorded (in the 2022+ dataset) for the reduced set of apt
# sources. A set does the de-duplication directly; sorting gives the list the
# same order on every kernel session, which a bare list(set(...)) does not.
INIT_LIB_CVES_POOL = sorted(
    {
        cve
        for apt_src in vuln_apt_src_libdebs_map_reduced20
        for cve in deb_sec_json_recent[apt_src]
    }
)

In [30]:
# Report the size of the starting CVE pool.
init_pool_size = len(INIT_LIB_CVES_POOL)
print(f"The starting CVE pool for our experiments is {init_pool_size}")

The starting CVE pool for our experiments is 48


In [31]:
# Number of apt sources kept in the reduced map.
len(vuln_apt_src_libdebs_map_reduced20.keys())

20

In [32]:
# Persist the reduced apt source names, one per line.
with open("./data/vuln_apt_sources.txt", "w") as f:
    f.writelines(f"{apt_src}\n" for apt_src in vuln_apt_src_libdebs_map_reduced20)

# Run the CVE data collection script
Run the script `../run_large_scale.py`

# Checking which valid git commit links resulted in an empty function list
We collected CVEs for the 338 apt sources using `run_large_scale.py` and are now analyzing the data

In [33]:
cve_jsons_dir = "../cve_json_feed"

# Maps: path of a *.funcs.json file -> git link -> list of per-file dicts, for
# every git link whose per-file dicts all carry an empty 'vulnerable_functions'
# list (a git link with no per-file dicts at all also qualifies).
empty_func_git_commits = defaultdict(lambda: defaultdict(list))
for top_dir, sub_dirs, files in os.walk(cve_jsons_dir, topdown=False):
    for file_name in files:
        full_file_path = os.path.join(top_dir, file_name)
        if not full_file_path.endswith(".funcs.json"):
            continue
        # Close each JSON file right after parsing; the walk may visit many files.
        with open(full_file_path, "r") as funcs_file:
            extracted_dict = json.load(funcs_file)
        for cve_dict in extracted_dict.values():
            for git_links in cve_dict.values():
                for git_link, file_dict_list in git_links.items():
                    if all(
                        file_dict['vulnerable_functions'] == []
                        for file_dict in file_dict_list
                    ):
                        empty_func_git_commits[full_file_path][git_link].extend(file_dict_list)
                        

In [34]:
# Number of *.funcs.json files that contain at least one git link whose
# per-file dicts all have an empty 'vulnerable_functions' list.
len(empty_func_git_commits)

7

In [35]:
# File suffixes treated as C/C++ sources or headers.
C_CPP_SUFFIXES = ('.c', '.h', '.cpp', '.hpp', '.cc', '.cxx', '.hxx', '.hh')

# For every git link that produced no functions, print one line per touched
# file whose name has a C/C++ suffix. The outer key is the path of the
# *.funcs.json file the link was read from.
for funcs_json_path, git_links in empty_func_git_commits.items():
    for git_link, file_dict_list in git_links.items():
        for file_dict in file_dict_list:
            if file_dict['file'].endswith(C_CPP_SUFFIXES):
                print(f"Files in {git_link}")

# cve_func_list_map: CVE id -> list of the vulnerable-function entries found for it.
# cve_func_git_src_tuples: one "cve,function name,git source,apt source" string
# per extracted function.
cve_func_list_map = defaultdict(list)
cve_func_git_src_tuples = []
for top_dir, sub_dirs, files in os.walk(cve_jsons_dir, topdown=False):
    for file_name in files:
        full_file_path = os.path.join(top_dir, file_name)
        if not full_file_path.endswith(".funcs.json"):
            continue
        # Close each JSON file right after parsing instead of leaking the handle.
        with open(full_file_path, "r") as funcs_file:
            extracted_dict = json.load(funcs_file)
        # The apt source is taken from the name of the directory that directly
        # contains the file; os.path is used so this does not depend on "/"
        # being the path separator.
        apt_src = os.path.basename(os.path.dirname(full_file_path))
        for git_src, cve_dict in extracted_dict.items():
            for cve, git_links in cve_dict.items():
                for git_link, file_dict_list in git_links.items():
                    for file_dict in file_dict_list:
                        for func in file_dict['vulnerable_functions']:
                            cve_func_list_map[cve].append(func)
                            cve_func_git_src_tuples.append(f"{cve},{func['name']},{git_src},{apt_src}")

In [36]:
# Write the collected comma-joined rows, one per line.
# The loop variable is deliberately not called `tuple`: loop variables in a
# notebook cell are kernel globals, so that name would shadow the built-in
# `tuple` for every cell executed afterwards.
with open("data/cve_funcs_git_src_tuples.csv", "w") as f:
    for csv_row in cve_func_git_src_tuples:
        f.write(f"{csv_row}\n")

In [37]:
# Number of (CVE, function, git source, apt source) rows collected.
len(cve_func_git_src_tuples)

167

In [38]:
# Number of distinct CVE ids for which at least one vulnerable function was collected.
len(cve_func_list_map)

37

In [39]:
# Apt sources from the reduced map that have at least one CVE for which a
# vulnerable function was collected. `in` on the defaultdict is a pure lookup
# and does not create missing keys.
APT_SRCS_OF_FINAL_CVES = [
    apt_src
    for apt_src in vuln_apt_src_libdebs_map_reduced20
    if any(cve in cve_func_list_map for cve in deb_sec_json_recent[apt_src])
]

In [40]:
# Number of reduced apt sources that have at least one CVE with collected functions.
len(APT_SRCS_OF_FINAL_CVES)

14

In [41]:
# List the reduced apt sources that did not make it into APT_SRCS_OF_FINAL_CVES.
for src in vuln_apt_src_libdebs_map_reduced20:
    if src in APT_SRCS_OF_FINAL_CVES:
        continue
    print(src)

cfengine3
chicken
apr
faust
accountsservice
erlang-jose


In [42]:
# Persist the final apt source names, one per line.
with open("./data/FINAL_VULN_APT_SOURCES.txt", "w") as f:
    f.writelines(f"{src}\n" for src in APT_SRCS_OF_FINAL_CVES)

In [43]:
# Start from an empty list; the following cell fills it with the library debs
# shipped by the apt sources in APT_SRCS_OF_FINAL_CVES.
DEBS_OF_APT_SRCS_OF_FINAL_CVES = []

In [44]:
# Distinct library debs shipped by the final apt sources. The list is rebuilt
# from scratch here, so the cell gives the same result no matter how often it is
# re-run, and it is sorted so that its order (and the file written from it) is
# identical across kernel sessions, unlike a bare list(set(...)).
DEBS_OF_APT_SRCS_OF_FINAL_CVES = sorted(
    {
        deb
        for apt_src in APT_SRCS_OF_FINAL_CVES
        for deb in vuln_apt_src_libdebs_map_reduced20[apt_src]
    }
)

In [45]:
# Persist the final deb names, one per line. Any name containing 'libc6' is
# left out, since all other packages will depend on libc6 anyway.
with open("./data/FINAL_DEB_LIST_OF_APT_SRCS.txt", "w") as f:
    f.writelines(
        f"{deb}\n"
        for deb in DEBS_OF_APT_SRCS_OF_FINAL_CVES
        if 'libc6' not in deb
    )

# Getting list of all debs that reverse depend on the vuln debs

In [46]:
# Reverse-dependency map of the vulnerable debs (created after apt update on
# July4). The `with` block closes the file handle once the JSON is parsed.
with open("./data/vuln_debs_rdepends.json", "r") as rdepends_file:
    vuln_final_debs_rdepends = json.load(rdepends_file)

In [47]:
# Start from an empty list; the following cell fills it with the vulnerable
# debs and their reverse dependencies.
ALL_DEB_LIST = []

In [48]:
# Every vulnerable deb plus every package listed as one of its reverse
# dependencies, de-duplicated. The list is rebuilt from scratch so the cell is
# safe to re-run, and it is sorted so that its order (and the file written from
# it) is reproducible across kernel sessions, unlike a bare list(set(...)).
all_deb_names = set(vuln_final_debs_rdepends)
for rdeps in vuln_final_debs_rdepends.values():
    all_deb_names.update(rdeps)
ALL_DEB_LIST = sorted(all_deb_names)

In [49]:
# Number of distinct debs (vulnerable debs plus their reverse dependencies).
len(ALL_DEB_LIST)

38942

In [50]:
# Persist the complete deb list for the analysis, one name per line.
with open("./data/FINAL_ANALYSIS_DEB_LIST.txt", "w") as f:
    f.writelines(f"{deb}\n" for deb in ALL_DEB_LIST)