In [1]:
from typing import *
import pandas as pd
from pathlib import Path
import os
import random
import sys
import re
from dotenv import load_dotenv, dotenv_values
from pydriller import Repository
import json
from multiprocessing import Pool
from itertools import chain
from pymongo import MongoClient
import collections

In [2]:
# Configuration: read settings from the local ".env" file and open the MongoDB
# connection used by every later cell.
import dotenv

dotenv.load_dotenv()
ENV = dotenv.dotenv_values(".env")
# ".env" must define DATA_DIR; a missing key raises KeyError on this line.
DATA_DIR = Path(ENV["DATA_DIR"])

# MongoDB instance on localhost; `db` is the database all collections below live in.
client = MongoClient(host="localhost", port=42692)
db = client["s5_snyk_libio"]

In [3]:
# Remove any existing "mergedVuln" collection before it is repopulated.
db["mergedVuln"].drop()

In [4]:
# Snapshot both source collections into memory once; later cells work on these lists.
libio_vuln_data = list(db.libioVuln.find())
mvn_eco_vuln_data = list(db.mvnEcoVuln.find())
# Populate mergedVuln from the snapshots rather than querying each collection a
# second time, so the inserted documents are exactly the ones held in memory.
db.mergedVuln.insert_many(libio_vuln_data)
db.mergedVuln.insert_many(mvn_eco_vuln_data)

InsertManyResult([ObjectId('67fc3c9a6baaaadb2f3e0852'), ObjectId('67fc3c9a6baaaadb2f3e0853'), ObjectId('67fc3c9a6baaaadb2f3e0854'), ObjectId('67fc3c9a6baaaadb2f3e0855'), ObjectId('67fc3c9a6baaaadb2f3e0856'), ObjectId('67fc3c9a6baaaadb2f3e0857'), ObjectId('67fc3c9a6baaaadb2f3e0858'), ObjectId('67fc3c9a6baaaadb2f3e0859'), ObjectId('67fc3c9a6baaaadb2f3e085a'), ObjectId('67fc3c9a6baaaadb2f3e085c'), ObjectId('67fc3c9a6baaaadb2f3e085d'), ObjectId('67fc3c9a6baaaadb2f3e085e'), ObjectId('67fc3c9a6baaaadb2f3e085f'), ObjectId('67fc3c9a6baaaadb2f3e0860'), ObjectId('67fc3c9a6baaaadb2f3e0861'), ObjectId('67fc3c9a6baaaadb2f3e0862'), ObjectId('67fc3c9a6baaaadb2f3e0863'), ObjectId('67fc3c9a6baaaadb2f3e0864'), ObjectId('67fc3c9a6baaaadb2f3e0865'), ObjectId('67fc3c9a6baaaadb2f3e0866'), ObjectId('67fc3c9a6baaaadb2f3e0867'), ObjectId('67fc3c9a6baaaadb2f3e0868'), ObjectId('67fc3c9a6baaaadb2f3e0869'), ObjectId('67fc3c9a6baaaadb2f3e086a'), ObjectId('67fc3c9a6baaaadb2f3e086b'), ObjectId('67fc3c9a6baaaadb2f3e08

In [5]:
# Number of records in each in-memory snapshot: (libio, mvn_eco).
(len(libio_vuln_data), len(mvn_eco_vuln_data))

(6864, 829)

In [6]:
def _count_unique_ga_cve(records):
    """Count distinct (vuln_gav without its last ':'-separated part, cve_ref) pairs.

    For a GAV such as 'io.netty:netty-handler:4.0.21.Final' the first element of
    the pair is 'io.netty:netty-handler', i.e. the version is dropped.
    """
    return len({(rec['vuln_gav'].rpartition(':')[0], rec['cve_ref']) for rec in records})


# Version-agnostic pair counts: libio only, mvn_eco only, then both sources combined.
print(_count_unique_ga_cve(libio_vuln_data))
print(_count_unique_ga_cve(mvn_eco_vuln_data))
print(_count_unique_ga_cve(libio_vuln_data + mvn_eco_vuln_data))

287
829
991


In [7]:
def _count_unique_gav_cve(records):
    """Count distinct (vuln_gav, cve_ref) pairs in ``records``."""
    return len({(rec['vuln_gav'], rec['cve_ref']) for rec in records})


# Version-specific pair counts: libio only, mvn_eco only, then both sources combined.
# Note: some libio records do not have a CVE ID (their cve_ref is empty).
print(_count_unique_gav_cve(libio_vuln_data))
print(_count_unique_gav_cve(mvn_eco_vuln_data))
print(_count_unique_gav_cve(libio_vuln_data + mvn_eco_vuln_data))

6856
829
7610


In [8]:
def _group_by_gav_cve(records):
    """Bucket ``records`` by their (vuln_gav, cve_ref) pair.

    Returns a ``defaultdict(list)`` mapping each pair to the records carrying it,
    in the order they appear in ``records``.
    """
    buckets = collections.defaultdict(list)
    for rec in records:
        buckets[rec['vuln_gav'], rec['cve_ref']].append(rec)
    return buckets


libio_group_by_gav_cve = _group_by_gav_cve(libio_vuln_data)
mvn_eco_group_by_gav_cve = _group_by_gav_cve(mvn_eco_vuln_data)

In [9]:
# Rank the libio (vuln_gav, cve_ref) groups by number of records, largest first,
# and display the biggest group.
l = sorted(
    libio_group_by_gav_cve.items(),
    key=lambda group: len(group[1]),
    reverse=True,
)
l[0]

(('io.netty:netty-handler:4.0.21.Final', ''),
 [{'_id': ObjectId('67fc3a33251ac280f228a41c'),
   'data_src': 'snyk',
   'cve_ref': '',
   'snyk_url': 'https://security.snyk.io/vuln/SNYK-JAVA-IONETTY-2342128',
   'vuln_gav': 'io.netty:netty-handler:4.0.21.Final',
   'vuln_classes': ['io.netty.handler.ssl.OpenSslEngine'],
   'patches': ['https://github.com/netty/netty/commit/4581c441c99e1d5524870abca38d0ad2d553932a']},
  {'_id': ObjectId('67fc3a33251ac280f228a41e'),
   'data_src': 'snyk',
   'cve_ref': '',
   'snyk_url': 'https://security.snyk.io/vuln/SNYK-JAVA-IONETTY-31518',
   'vuln_gav': 'io.netty:netty-handler:4.0.21.Final',
   'vuln_classes': ['io.netty.handler.ssl.OpenSslServerContext'],
   'patches': ['https://github.com/netty/netty/commit/a1af35313cc8414109e7a411bb2401ae31046289']},
  {'_id': ObjectId('67fc3a33251ac280f228a41f'),
   'data_src': 'snyk',
   'cve_ref': '',
   'snyk_url': 'https://security.snyk.io/vuln/SNYK-JAVA-IONETTY-31515',
   'vuln_gav': 'io.netty:netty-handler

In [10]:
# (vuln_gav, cve_ref) keys present in both sources.
# The keys are sorted so the list order is reproducible: iteration order of a set
# of strings/tuples can change from one kernel run to the next, and later cells
# sample from and iterate over this list.
shared_gav_cve_items = sorted(libio_group_by_gav_cve.keys() & mvn_eco_group_by_gav_cve.keys())
len(shared_gav_cve_items)

75

In [11]:
# Spot check: pick one shared key at random and show the libio records next to the
# mvn_eco records for the same (vuln_gav, cve_ref).
_rk = random.choice(shared_gav_cve_items)
(libio_group_by_gav_cve[_rk], mvn_eco_group_by_gav_cve[_rk])
# Observations from manual inspection:
# - 'CVE-2021-37137', 'io.netty:netty-codec:4.1.5.Final': mvn_eco data has the wrong cve_ref
# - 'CVE-2013-7285': mvn_eco data includes 'ReflectionConverter', which is wrong

([{'_id': ObjectId('67fc3a33251ac280f228aab5'),
   'data_src': 'snyk',
   'cve_ref': 'CVE-2018-12537',
   'snyk_url': 'https://security.snyk.io/vuln/SNYK-JAVA-IOVERTX-32470',
   'vuln_gav': 'io.vertx:vertx-core:3.3.3',
   'vuln_classes': ['io.vertx.core.http.impl.HttpClientRequestImpl',
    'io.vertx.core.http.HttpHeaders',
    'io.vertx.core.http.impl.HttpUtils',
    'io.vertx.core.http.impl.Http2HeadersAdaptor'],
   'patches': ['https://github.com/eclipse/vert.x/commit/1bb6445226c39a95e7d07ce3caaf56828e8aab72']}],
 [{'_id': ObjectId('67fc3c9a6baaaadb2f3e0a41'),
   'data_src': 'mvn_eco',
   'cve_ref': 'CVE-2018-12537',
   'vuln_gav': 'io.vertx:vertx-core:3.3.3',
   'vuln_classes': ['io.vertx.core.http.impl.Http2HeadersAdaptor',
    'io.vertx.core.http.impl.HttpClientRequestImpl',
    'io.vertx.core.http.HttpHeaders'],
   'patches': ['https://github.com/eclipse/vert.x/commit/1bb6445226c39a95e7d07ce3caaf56828e8aab72',
    'https://github.com/eclipse-vertx/vert.x/commit/3a6512695a9d61068

In [12]:
import pprint

# Number of shared keys whose mvn_eco record lists at least one class that the
# libio record does not.
diff_class_counter = 0

for shared_key in shared_gav_cve_items:
    libio_matches = libio_group_by_gav_cve[shared_key]
    mvn_eco_matches = mvn_eco_group_by_gav_cve[shared_key]

    # Only compare keys that map to a single record on each side.
    if len(libio_matches) > 1 or len(mvn_eco_matches) > 1:
        continue
    libio_record, mvn_eco_record = libio_matches[0], mvn_eco_matches[0]

    libio_classes = set(libio_record['vuln_classes'])
    mvn_eco_classes = set(mvn_eco_record['vuln_classes'])

    # Skip when every mvn_eco class is also listed by libio (identical sets included).
    # The opposite direction (libio classes being a subset of the mvn_eco classes)
    # is deliberately left unfiltered here, so those pairs are still printed.
    if mvn_eco_classes <= libio_classes:
        continue

    diff_class_counter += 1
    pprint.pprint(libio_record)
    print("\n------------\n")
    pprint.pprint(mvn_eco_record)
    print("\n@@@@@@@@@@@@@@@\n")

diff_class_counter

{'_id': ObjectId('67fc3a33251ac280f228a2e6'),
 'cve_ref': 'CVE-2016-2402',
 'data_src': 'snyk',
 'patches': ['https://github.com/square/okhttp/commit/62b42bf27589fd8128f6d1a202455c5731c890c2',
             'https://github.com/square/okhttp/commit/a7d3396047a75183398f339310774ee415c1c4e8',
             'https://github.com/square/okhttp/commit/e121ed1ad291cbe04de989683d23f549d8e305db',
             'https://github.com/square/okhttp/commit/18b2660873ed8f028ee72981c882ed676aa08030',
             'https://github.com/square/okhttp/commit/a2bf207c980e4d2a0ecf457eadae9048e63d09aa',
             'https://github.com/square/okhttp/commit/ddbabf77803112fe804d1f44e5071244b00b9cfb'],
 'snyk_url': 'https://security.snyk.io/vuln/SNYK-JAVA-COMSQUAREUPOKHTTP3-73572',
 'vuln_classes': ['okhttp3.OkHttpClient',
                  'okhttp3.CertificatePinner',
                  'okhttp3.internal.Platform'],
 'vuln_gav': 'com.squareup.okhttp3:okhttp:3.0.1'}

------------

{'_id': ObjectId('67fc3c9a6baaaadb2f3e

9

Note: we have to remove modified classes that are not in the jar (libio data).

In [13]:
# Keys covered by mvn_eco, followed by the number of libio records whose
# (vuln_gav, cve_ref) key does not occur in mvn_eco.
mvn_eco_vuln_gav_cve_ref = {(rec['vuln_gav'], rec['cve_ref']) for rec in mvn_eco_vuln_data}
(
    len(mvn_eco_vuln_gav_cve_ref),
    sum(
        1
        for rec in libio_vuln_data
        if (rec['vuln_gav'], rec['cve_ref']) not in mvn_eco_vuln_gav_cve_ref
    ),
)

(829, 6789)

In [14]:
# Keys covered by libio, followed by the number of mvn_eco records whose
# (vuln_gav, cve_ref) key does not occur in libio.
libio_vuln_gav_cve_ref = {(rec['vuln_gav'], rec['cve_ref']) for rec in libio_vuln_data}
(
    len(libio_vuln_gav_cve_ref),
    sum(
        1
        for rec in mvn_eco_vuln_data
        if (rec['vuln_gav'], rec['cve_ref']) not in libio_vuln_gav_cve_ref
    ),
)

(6856, 754)

In [15]:
# Rebuild mergedVuln from scratch: every libio record, plus only those mvn_eco
# records whose (vuln_gav, cve_ref) key libio does not already cover.
merged_collection = db["mergedVuln"]
merged_collection.drop()
merged_collection.insert_many(libio_vuln_data)

mvn_eco_only_records = (
    rec
    for rec in mvn_eco_vuln_data
    if (rec['vuln_gav'], rec['cve_ref']) not in libio_vuln_gav_cve_ref
)
# Bind the result to `_` so the InsertManyResult is not echoed as cell output.
_ = merged_collection.insert_many(mvn_eco_only_records)

In [16]:
# Summary statistics over the merged collection. A vulnerability is identified by
# its cve_ref, or by its snyk_url when cve_ref is empty.
uniq_cve = set()          # distinct vulnerability identifiers
uniq_cve_ga = set()       # (identifier, vuln_gav without its last ':'-separated part)
uniq_cve_gav_set = set()  # (identifier, full vuln_gav)

for doc in db.mergedVuln.find():
    vuln_id = doc['cve_ref'] if doc['cve_ref'] != '' else doc['snyk_url']

    # Every record must end up with a usable identifier.
    assert not (vuln_id == '' or vuln_id is None)

    gav = doc['vuln_gav']
    uniq_cve.add(vuln_id)
    uniq_cve_ga.add((vuln_id, gav.rpartition(':')[0]))
    uniq_cve_gav_set.add((vuln_id, gav))

for bucket in (uniq_cve, uniq_cve_ga, uniq_cve_gav_set):
    print(len(bucket))

954
993
7618
