In [None]:
from typing import *
import pandas as pd
from pathlib import Path
import os
import random
import itertools
import sys
import re
from dotenv import load_dotenv, dotenv_values
from pydriller import Repository
import json
from multiprocessing import Pool
from itertools import chain
import numpy as np
from pymongo import MongoClient
import collections

In [None]:
# Runtime configuration: directory locations are read from the local .env
# file; everything the notebook downloads lives under JARS_DIR.
import dotenv

dotenv.load_dotenv()
ENV = dotenv.dotenv_values(".env")

DATA_DIR, MVN_ECO_DIR = (Path(ENV[name]) for name in ("DATA_DIR", "MVN_ECO_DIR"))
JARS_DIR = DATA_DIR / "interim" / "jars"

# Database handle that later cells read `mergedVuln` / `libioExport` from.
client = MongoClient(host="localhost", port=42692)
db = client["s5_snyk_libio"]

In [None]:
# Pull both collections fully into memory as lists of documents.
vuln_data = [doc for doc in db["mergedVuln"].find()]
libio_export_data = [doc for doc in db["libioExport"].find()]

In [None]:
# Sanity check: number of documents loaded from each collection.
len(vuln_data), len(libio_export_data)

In [None]:
# Map each dependency G:A:V to the set of client G:A:Vs that depend on it,
# seeded from the libio export records.
dep_to_client_gavs = collections.defaultdict(set)
for record in libio_export_data:
    upstream = ':'.join((record['DependencyName'], record['DependencyVersion']))
    downstream = ':'.join((record['DependentName'], record['DependentVersion']))

    # Only keep edges that cross group ids: we don't want dependencies
    # in the same project.
    if upstream.split(':', 1)[0] != downstream.split(':', 1)[0]:
        dep_to_client_gavs[upstream].add(downstream)

len(dep_to_client_gavs)

In [None]:
# Load the dependency edge list from DEP.csv as one dict per row, then show
# the row count and the available column names.
dep_csv_path = MVN_ECO_DIR / 'csv' / 'DEP.csv'
mvn_eco_deps = pd.read_csv(dep_csv_path).to_dict(orient='records')
len(mvn_eco_deps), mvn_eco_deps[0].keys()

In [None]:
# Merge the DEP.csv edges into the same dependency -> clients map.
for row in mvn_eco_deps:
    upstream, downstream = row['Upstream G:A:V'], row['Downstream G:A:V']

    # Only keep edges that cross group ids: we don't want dependencies
    # in the same project.
    if upstream.partition(':')[0] != downstream.partition(':')[0]:
        dep_to_client_gavs[upstream].add(downstream)

len(dep_to_client_gavs)

In [None]:
# Every client G:A:V that appears under any dependency.
# `set().union(*sets)` is used instead of the unbound `set.union(*sets)`:
# the unbound form raises TypeError when the mapping is empty because there
# is no first argument to act as the receiver.
client_gav_set = set().union(*dep_to_client_gavs.values())
len(client_gav_set)

In [None]:
def gav_to_jar_url(package_gav: str) -> str:
    """Return the repository-relative jar path for a ``group:artifact:version`` string.

    Dots in the group id become path separators, producing
    ``<group/path>/<artifact>/<version>/<artifact>-<version>.jar``.
    Only the first three colon-separated fields are used; fewer than three
    fields raises ``IndexError``.
    """
    fields = package_gav.split(":")
    group_id, artifact_id, version = fields[0], fields[1], fields[2]
    group_path = group_id.replace('.', '/')
    return f"{group_path}/{artifact_id}/{version}/{artifact_id}-{version}.jar"

def gen_aria2c_dl_txt(repo_url: str) -> str:
    """Build the aria2c input-file text for downloading every client jar.

    For each G:A:V in the notebook-level ``client_gav_set`` this emits a URI
    line followed by a tab-indented ``out=<relative jar path>`` option line.
    As a side effect, the jar's parent directory is created under
    ``JARS_DIR`` so the download target exists.

    Args:
        repo_url: Base URL of the Maven repository. A trailing ``/`` is
            tolerated and does not produce a double slash in the URLs.

    Returns:
        The entries joined by newlines, in sorted G:A:V order so the
        generated file is identical from run to run.
    """
    base_url = repo_url.rstrip('/')
    entries = []
    # Sort: iterating the set directly yields a different order per kernel.
    for gav in sorted(client_gav_set):
        jar_path = gav_to_jar_url(gav)
        (JARS_DIR / jar_path).parent.mkdir(parents=True, exist_ok=True)
        entries.append(f"{base_url}/{jar_path}\n\tout={jar_path}")

    return '\n'.join(entries)

# Render the download list and store it alongside the jars.
aria2c_dl_txt = gen_aria2c_dl_txt(repo_url="https://repo1.maven.org/maven2")
dl_list_path = JARS_DIR / 'dl.txt'
dl_list_path.write_text(aria2c_dl_txt)

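Manual step: download the jars listed in `dl.txt` (e.g. with `aria2c -i dl.txt`, run inside `JARS_DIR`) before continuing. The next cell only keeps clients whose jar is present on disk.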

In [None]:
# Keep only the clients whose jar file actually exists under JARS_DIR.
client_gav_set_with_jar = {
    candidate
    for candidate in client_gav_set
    if (JARS_DIR / gav_to_jar_url(candidate)).is_file()
}

len(client_gav_set_with_jar)

In [None]:
# Client G:A:Vs that are always skipped when pairing clients with
# vulnerabilities in the following cell.
# NOTE(review): the reason these entries are excluded is not recorded here — TODO document.
blacklist_gavs = {
    "com.github.rockylomo:rxlib:2.13.3",
    "org.dihedron.strutlets:strutlets:1.0.6",
    "org.dihedron.zephyr:zephyr:1.0.0.RC2",
    "com.github.rockylomo:rxlib:2.13.16"
}

In [None]:
# Collect the unique (vulnerability id, vulnerable dependency G:A:V, client G:A:V)
# triplets for which the client's jar is available and not blacklisted.
uniq_cve_dep_client_set = set()
for vd_it in vuln_data:
    # Entries with an empty CVE reference are identified by their Snyk URL.
    cve_ref = vd_it['cve_ref']
    if cve_ref == '':
        cve_ref = vd_it['snyk_url']

    vuln_gav = vd_it['vuln_gav']
    dep_g = vuln_gav.split(':')[0]  # invariant across this entry's clients

    # .get() instead of indexing: indexing the defaultdict would insert an
    # empty entry for every vulnerable G:A:V that has no known clients and
    # silently grow dep_to_client_gavs.
    for client_gav in dep_to_client_gavs.get(vuln_gav, ()):
        if client_gav not in client_gav_set_with_jar or client_gav in blacklist_gavs:
            continue

        # we don't want dependencies in the same project
        if client_gav.split(':')[0] == dep_g:
            continue

        uniq_cve_dep_client_set.add((cve_ref, vuln_gav, client_gav))

len(uniq_cve_dep_client_set)

In [None]:
# Bucket the triplets by (cve, dependency G:A, client G:A), i.e. ignoring the
# version of both the dependency and the client. Each bucket holds the full
# versioned triplets that collapse onto that key.
groupby_cve__dep_ga__client_ga = collections.defaultdict(set)
for cve, dep_gav, client_gav in uniq_cve_dep_client_set:
    # Drop the last colon-separated field (the version).
    dep_ga = dep_gav.rsplit(':', 1)[0]
    client_ga = client_gav.rsplit(':', 1)[0]

    groupby_cve__dep_ga__client_ga[(cve, dep_ga, client_ga)].add((cve, dep_gav, client_gav))

len(groupby_cve__dep_ga__client_ga)

In [None]:
# Pick one versioned triplet at random from every (cve, dep G:A, client G:A)
# bucket.
#
# Seeding alone does not make the sample reproducible: both the bucket keys
# and the candidates originate from sets of strings, whose iteration order
# changes between kernels (string hashing is randomized per process). Sorting
# the keys and the candidates fixes the order that the seeded generator sees.
# Assumes every triplet component is a string, so the tuples are orderable.
random_cve_triplets = set()
random.seed(42)
for key in sorted(groupby_cve__dep_ga__client_ga):
    candidates = sorted(groupby_cve__dep_ga__client_ga[key])
    random_cve_triplets.add(random.choice(candidates))

len(random_cve_triplets)

In [None]:
# Spot check: display one arbitrary sampled (cve, dep_gav, client_gav) triplet.
random.choice(list(random_cve_triplets))

In [None]:
# Replace the contents of the `mergedVulnClients` collection with the sampled
# triplets, one document per triplet.
col = db.mergedVulnClients
col.drop()
docs = [
    {'cve': cve, 'dep_gav': dep_gav, 'client_gav': client_gav}
    for cve, dep_gav, client_gav in random_cve_triplets
]
# One bulk write instead of a round trip per document. insert_many rejects an
# empty list, so skip the call when there is nothing to store.
if docs:
    col.insert_many(docs)

In [None]:
# Distribution of the number of sampled triplets per dependency G:A:V.
# A Counter is used because itertools.groupby only merges *adjacent* equal
# keys; applied to an unsorted set it splits one dependency into several
# runs and skews every statistic below.
triplets_per_dep = collections.Counter(
    dep_gav for _cve, dep_gav, _client_gav in random_cve_triplets
)
a = np.array(list(triplets_per_dep.values()))
np.min(a), np.max(a), np.median(a), np.average(a)

In [None]:
# Headline numbers for the sample: distinct vulnerability ids, dependencies
# and clients, plus clients-per-dependency and triplets-per-dependency ratios.
cve_count = len({cve for cve, _dep, _client in random_cve_triplets})
dep_count = len({dep for _cve, dep, _client in random_cve_triplets})
client_count = len({client_ for _cve, _dep, client_ in random_cve_triplets})
cve_count, dep_count, client_count, client_count / dep_count, len(random_cve_triplets) / dep_count