In [2]:
import json
import logging
import os
import pandas as pd
import re
import sqlalchemy
import sys

In [3]:
# Logging init
# Route every record (DEBUG and up) from the root logger into a fresh log file.
LOG_FILE = "./build-db-from-osv.log"

logger = logging.getLogger()

# Re-running this cell must not stack a second handler on the root logger, nor
# delete a file that a previous handler still holds open: detach and close any
# handler already pointing at the same log file first.
for stale_handler in list(logger.handlers):
    if (isinstance(stale_handler, logging.FileHandler)
            and stale_handler.baseFilename == os.path.abspath(LOG_FILE)):
        logger.removeHandler(stale_handler)
        stale_handler.close()

# Start every run with an empty log.
if os.path.exists(LOG_FILE):
    os.remove(LOG_FILE)

fhandler = logging.FileHandler(filename=LOG_FILE, mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

- Read the OSV JSON files for each ecosystem.
- Parse them into a single DataFrame.
- Write the DataFrame to the database.

In [4]:
OSV_COLUMNS = ['vul_id', 'system_name', 'package_name', 'vul_introduced', 'vul_fixed']


def osv_record_rows(data):
    """Flatten one parsed OSV advisory into a list of row dicts.

    One row is produced for every ``fixed`` event, paired with the most recent
    ``introduced`` event seen earlier in the same range. Entries of
    ``affected`` that lack either a ``package`` or a ``ranges`` key are ignored.

    Args:
        data: The advisory as returned by ``json.load`` (a dict). It must
            contain ``"id"``; ``"affected"`` is optional.

    Returns:
        A list of dicts keyed by ``OSV_COLUMNS``. The ecosystem name is
        upper-cased; all other values are copied unchanged.

    Raises:
        KeyError: If ``id``, ``package.name``, ``package.ecosystem`` or a
            range's ``events`` key is missing.
    """
    rows = []
    vul_id = data["id"]
    for affected in data.get('affected') or []:
        if 'package' not in affected or 'ranges' not in affected:
            continue
        package_name = affected['package']['name']
        system_name = affected['package']['ecosystem'].upper()
        for version_range in affected['ranges']:
            # Reset per range so a "fixed" event can never be paired with an
            # "introduced" value left over from another range or advisory.
            vul_introduced = None
            for event in version_range['events']:
                if 'introduced' in event:
                    vul_introduced = event['introduced']
                elif 'fixed' in event:
                    if vul_introduced is None:
                        logging.warning(
                            "%s: 'fixed' event %r for %s has no preceding "
                            "'introduced' event; skipped",
                            vul_id, event['fixed'], package_name)
                        continue
                    rows.append({
                        'vul_id': vul_id,
                        'system_name': system_name,
                        'package_name': package_name,
                        'vul_introduced': vul_introduced,
                        'vul_fixed': event['fixed'],
                    })
    return rows


# The OSV dump is expected two directory levels above the working directory.
data_dir = os.path.join(os.getcwd(), os.pardir, os.pardir, "data", "osv-data")

# Collect plain dicts and build the frame once; concatenating a one-row frame
# per event is quadratic in the number of rows.
records = []
for root, _dirs, files in os.walk(data_dir, topdown=True):
    for file in files:
        if not file.endswith(".json"):
            continue
        # `root` already starts with data_dir, so it is not joined in again.
        file_path = os.path.join(root, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        records.extend(osv_record_rows(data))

df = pd.DataFrame(records, columns=OSV_COLUMNS)
df.head()

Unnamed: 0,vul_id,system_name,package_name,vul_introduced,vul_fixed
0,GHSA-69fv-gw6g-8ccg,CRATES.IO,arrayfire,0,3.6.0
1,GHSA-69fv-gw6g-8ccg,PYPI,arrayfire,0,3.6.0
2,RUSTSEC-2024-0008,CRATES.IO,trillium-client,0.0.0-0,0.5.4
3,GHSA-mjv9-vp6w-3rc9,CRATES.IO,aws-sigv4,0.55.0,0.55.1
4,GHSA-mjv9-vp6w-3rc9,CRATES.IO,aws-sigv4,0.54.1,0.54.2


In [5]:
# List the distinct ecosystem labels present in the frame.
print(df['system_name'].unique())

['CRATES.IO' 'PYPI' 'GO' 'NPM' 'MAVEN' 'NUGET' 'SWIFTURL' 'RUBYGEMS'
 'PACKAGIST' 'PUB' 'HEX']


In [6]:
# Display the frame's (rows, columns) shape.
df.shape


(19095, 5)

In [7]:
# Discard every row that has a missing value in any column, then show the
# resulting (rows, columns) shape.
df = df.dropna()
df.shape

(19095, 5)

In [8]:
def transformation_semver(x):
    """Pad a version string so that it has MAJOR.MINOR.PATCH components.

    Only a leading numeric ``MAJOR`` or ``MAJOR.MINOR`` is padded, and the
    zeros are inserted *before* any trailing suffix, so ``"1.2-beta"`` becomes
    ``"1.2.0-beta"`` rather than ``"1.2-beta.0"``.

    Args:
        x: The version value to normalise.

    Returns:
        The padded version string. The input is returned unchanged when it is
        not a string, does not start with a digit, already has three numeric
        components, or has a ``.`` directly after the numeric prefix that is
        not followed by a digit (e.g. ``"1.x"``).
    """
    if not isinstance(x, str):
        # Missing values (None / NaN) pass through instead of raising.
        return x

    head = re.match(r'(\d+)(?:\.(\d+))?(?:\.(\d+))?', x)
    if head is None:
        # Not a numeric version at all; appending ".0.0" would only corrupt it.
        return x

    major, minor, patch = head.groups()
    rest = x[head.end():]
    if patch is not None or rest.startswith('.'):
        return x
    return f"{major}.{minor or '0'}.0{rest}"

In [9]:
def transformation_system_name(x):
    """Map the ecosystem label ``'CRATES.IO'`` to ``'CARGO'``.

    Any other value is returned unchanged; the comparison is case-sensitive.
    """
    return 'CARGO' if x == 'CRATES.IO' else x

In [10]:
# Pass every ecosystem label through transformation_system_name.
df = df.assign(system_name=df['system_name'].apply(transformation_system_name))

In [11]:
# Pad the "introduced" versions via transformation_semver.
# NOTE(review): vul_fixed is not passed through the same normalisation here;
# confirm whether that is intentional.
df = df.assign(vul_introduced=df['vul_introduced'].apply(transformation_semver))

In [12]:
def filter_rows_by_values(df, col, values):
    """Return the rows of ``df`` whose ``col`` entry is not in ``values``.

    The input frame is not modified and the surviving rows keep their
    original index labels.
    """
    is_excluded = df[col].isin(values)
    keep_mask = ~is_excluded
    return df[keep_mask]

In [13]:
# Ecosystems to discard: every row whose system_name appears in this list is
# removed from the frame.
excluded_systems = [
    'MAVEN', 'NUGET', 'PACKAGIST', 'GO', 'RUBYGEMS',
    'SWIFTURL', 'PUB', 'HEX',
]
df = filter_rows_by_values(df, 'system_name', excluded_systems)

In [14]:
# Preview the first rows of df.
df.head()

Unnamed: 0,vul_id,system_name,package_name,vul_introduced,vul_fixed
0,GHSA-69fv-gw6g-8ccg,CARGO,arrayfire,0.0.0,3.6.0
1,GHSA-69fv-gw6g-8ccg,PYPI,arrayfire,0.0.0,3.6.0
2,RUSTSEC-2024-0008,CARGO,trillium-client,0.0.0-0,0.5.4
3,GHSA-mjv9-vp6w-3rc9,CARGO,aws-sigv4,0.55.0,0.55.1
4,GHSA-mjv9-vp6w-3rc9,CARGO,aws-sigv4,0.54.1,0.54.2


In [15]:
# Display df itself; the rendered table is elided in the middle for long frames.
df


Unnamed: 0,vul_id,system_name,package_name,vul_introduced,vul_fixed
0,GHSA-69fv-gw6g-8ccg,CARGO,arrayfire,0.0.0,3.6.0
1,GHSA-69fv-gw6g-8ccg,PYPI,arrayfire,0.0.0,3.6.0
2,RUSTSEC-2024-0008,CARGO,trillium-client,0.0.0-0,0.5.4
3,GHSA-mjv9-vp6w-3rc9,CARGO,aws-sigv4,0.55.0,0.55.1
4,GHSA-mjv9-vp6w-3rc9,CARGO,aws-sigv4,0.54.1,0.54.2
...,...,...,...,...,...
19090,GHSA-6p5r-g9mq-ggh2,PYPI,tensorflow-gpu,2.4.0,2.4.3
19091,GHSA-6p5r-g9mq-ggh2,PYPI,tensorflow-gpu,2.5.0,2.5.1
19092,GHSA-9v8h-57gv-qch6,PYPI,django,0.96.0,0.96.1
19093,GHSA-9v8h-57gv-qch6,PYPI,django,0.95.0,0.95.2


In [16]:
# List the distinct ecosystem labels that remain in the frame.
print(df['system_name'].unique())

['CARGO' 'PYPI' 'NPM']


# Write the DataFrame to PostgreSQL

In [17]:
# Connection settings are read from the environment so that real credentials
# never have to be committed with the notebook. Each fallback is the value that
# used to be hard-coded here, so the cell behaves as before when nothing is set.
# METRICS_DB_DBAPI selects an explicit driver suffix such as "+pg8000" or
# "+psycopg"; empty means SQLAlchemy's default PostgreSQL driver.
connection_url = sqlalchemy.engine.URL.create(
    drivername="postgresql" + os.environ.get("METRICS_DB_DBAPI", ""),
    username=os.environ.get("METRICS_DB_USER", "metricsuser"),
    password=os.environ.get("METRICS_DB_PASSWORD", "metricspassword"),
    host=os.environ.get("METRICS_DB_HOST", "localhost"),
    port=int(os.environ.get("METRICS_DB_PORT", "5432")),
    database=os.environ.get("METRICS_DB_NAME", "metrics"),
)
# Plain-string form kept under the original variable name. It contains the
# password, so do not print or log it.
connection_str = connection_url.render_as_string(hide_password=False)

# Passing the URL object (rather than a hand-formatted string) lets SQLAlchemy
# escape special characters in the user name or password.
engine = sqlalchemy.create_engine(connection_url)
try:
    # engine.begin() commits on success and rolls back if to_sql raises.
    with engine.begin() as connection:
        # NOTE: if_exists='append' adds to an existing "osv" table, so running
        # this cell again inserts the same rows a second time.
        df.to_sql(
            con=connection,
            name='osv',
            if_exists='append',
            index=False,
            dtype={
                'vul_id': sqlalchemy.types.VARCHAR,
                'system_name': sqlalchemy.types.VARCHAR,
                'package_name': sqlalchemy.types.VARCHAR,
                'vul_introduced': sqlalchemy.types.VARCHAR,
                'vul_fixed': sqlalchemy.types.VARCHAR,
            },
        )
finally:
    # Release pooled connections once the write is done.
    engine.dispose()