In [1]:
from typing import *
import pandas as pd
from pathlib import Path
import os
import sys
import re
from dotenv import load_dotenv, dotenv_values
from pydriller import Repository
import json
from multiprocessing import Pool
from itertools import chain
from pymongo import MongoClient
import collections

In [2]:
import dotenv
dotenv.load_dotenv()
ENV = dotenv.dotenv_values(".env")
DATA_DIR = Path(ENV["DATA_DIR"])

client = MongoClient("localhost", 42692)
db = client.s5_snyk_libio

In [3]:
vuln_urls_data = list(db.vulnUrls.find())
patch_urls_data = list(db.patchUrls.find())
patch_commits_data = list(db.patchCommitsLibio.find())
libio_data = list(db.libioExport.find())
vuln_details_data = list(db.vulnUrlDetails.find())

(len(vuln_urls_data), len(patch_urls_data), len(patch_commits_data), len(libio_data), len(vuln_details_data))

(25891, 678, 1153, 684299, 678)

In [4]:
all_vuln_urls = set()
for vuln_url_it in vuln_urls_data:
    dep_vuln_info = dict()
    for vuln_url in vuln_url_it['VulnUrls']:
        all_vuln_urls.add(vuln_url)

len(all_vuln_urls)

678

In [5]:
package_vuln_data = list()  # (dep_name:dep_ver) -> vuln_info

for vuln_url_it in vuln_urls_data:
    dep_vuln_info = dict()
    for vuln_url in vuln_url_it['VulnUrls']:
        patch_info: Dict[str, Dict] = dict()  # commit_url -> [commit_info] 
        _patch_urls_data = [pu for pu in patch_urls_data if pu['VulnUrl'] == vuln_url]

        for patch_urls_it in _patch_urls_data:
            for patch_url in patch_urls_it['PatchUrls']:
                _patch_commits_data = [pc for pc in patch_commits_data if pc['snykPatchUrl'] == patch_url]
                commit_info = list()

                for patch_commit_it in _patch_commits_data:
                    if patch_commit_it:
                        commit_info.append(patch_commit_it)

                if commit_info:
                    patch_info[patch_url] = commit_info

        if patch_info:
            dep_vuln_info[vuln_url] = patch_info

    if dep_vuln_info:
        pvd = dict()
        pvd["_id"] = vuln_url_it['DependencyName'] + ":" + vuln_url_it['DependencyVersion']
        pvd['vulnInfo'] = dep_vuln_info
        package_vuln_data.append(pvd)

In [6]:
db.libioPackageVuln.drop()
db.libioPackageVuln.insert_many(package_vuln_data)

InsertManyResult(['ch.qos.logback:logback-classic:0.9.18', 'ch.qos.logback:logback-classic:0.9.19', 'ch.qos.logback:logback-classic:0.9.24', 'ch.qos.logback:logback-classic:0.9.26', 'ch.qos.logback:logback-classic:0.9.28', 'ch.qos.logback:logback-classic:0.9.29', 'ch.qos.logback:logback-classic:0.9.6', 'ch.qos.logback:logback-classic:0.9.30', 'ch.qos.logback:logback-classic:0.9.9', 'ch.qos.logback:logback-classic:1.0.0', 'ch.qos.logback:logback-classic:1.0.1', 'ch.qos.logback:logback-classic:1.0.10', 'ch.qos.logback:logback-classic:1.0.11', 'ch.qos.logback:logback-classic:1.0.3', 'ch.qos.logback:logback-classic:1.0.2', 'ch.qos.logback:logback-classic:1.0.12', 'ch.qos.logback:logback-classic:1.0.13', 'ch.qos.logback:logback-classic:1.0.6', 'ch.qos.logback:logback-classic:1.0.4', 'ch.qos.logback:logback-classic:1.0.5', 'ch.qos.logback:logback-classic:1.0.7', 'ch.qos.logback:logback-classic:1.0.9', 'ch.qos.logback:logback-classic:1.1.1', 'ch.qos.logback:logback-classic:1.1.0', 'ch.qos.log

In [7]:
vuln_details_dict = dict()
for it in vuln_details_data:
    vuln_details_dict[it['VulnUrl']] = it

In [8]:
flattened_data = list()

for package_vuln_data_it in package_vuln_data:
    gav = package_vuln_data_it['_id']
    for snyk_url, vuln_info_v in package_vuln_data_it['vulnInfo'].items():
        modified_classes = set()
        modified_methods = set()
        patches = set()
        
        for commit_url, commit_v in vuln_info_v.items():  # all commits of a single vuln will be merged
            patches.add(commit_url)
            for commit_change in commit_v:
                modified_classes.add(commit_change['modifiedClass'])
                modified_methods.update(commit_change['changedMethods'])
        
        r = dict()
        r['data_src'] = 'snyk'
        r['cve_ref'] = vuln_details_dict[snyk_url]['CveId']
        r['snyk_url'] = snyk_url
        r['vuln_gav'] = gav
        r['vuln_classes'] = list(modified_classes)
        # r['vuln_method'] = list(modified_methods  omitting for now because we have to check if the method is present in the 'before' version
        r['patches'] = list(patches)
        
        flattened_data.append(r)


In [9]:
len(flattened_data)

8414

In [10]:
uniq_cve_ga = set()
uniq_snyk_ga = set()

for it in flattened_data:
    uniq_cve_ga.add((it['cve_ref'], ':'.join(it['vuln_gav'].split(':')[:-1])))
    uniq_snyk_ga.add((it['snyk_url'], ':'.join(it['vuln_gav'].split(':')[:-1])))

uniq_cve_ga.__len__(), len(uniq_snyk_ga)

(334, 337)

In [11]:
db.libioVuln.drop()
db.libioVuln.insert_many(flattened_data)

InsertManyResult([ObjectId('67fd163c92aface2d37dce9e'), ObjectId('67fd163c92aface2d37dce9f'), ObjectId('67fd163c92aface2d37dcea0'), ObjectId('67fd163c92aface2d37dcea1'), ObjectId('67fd163c92aface2d37dcea2'), ObjectId('67fd163c92aface2d37dcea3'), ObjectId('67fd163c92aface2d37dcea4'), ObjectId('67fd163c92aface2d37dcea5'), ObjectId('67fd163c92aface2d37dcea6'), ObjectId('67fd163c92aface2d37dcea7'), ObjectId('67fd163c92aface2d37dcea8'), ObjectId('67fd163c92aface2d37dcea9'), ObjectId('67fd163c92aface2d37dceaa'), ObjectId('67fd163c92aface2d37dceab'), ObjectId('67fd163c92aface2d37dceac'), ObjectId('67fd163c92aface2d37dcead'), ObjectId('67fd163c92aface2d37dceae'), ObjectId('67fd163c92aface2d37dceaf'), ObjectId('67fd163c92aface2d37dceb0'), ObjectId('67fd163c92aface2d37dceb1'), ObjectId('67fd163c92aface2d37dceb2'), ObjectId('67fd163c92aface2d37dceb3'), ObjectId('67fd163c92aface2d37dceb4'), ObjectId('67fd163c92aface2d37dceb5'), ObjectId('67fd163c92aface2d37dceb6'), ObjectId('67fd163c92aface2d37dce