In [1]:
import os
import csv
import datetime
from distutils.version import StrictVersion
import logging
import mysql.connector
from natsort import natsorted
import numpy as np
import pymysql
import pandas as pd
import psycopg
import semantic_version
import semver
import semver.version
import sqlalchemy
import sqlalchemy.orm
import psycopg_binary

In [2]:
# Logging init
os.remove("./ttr-cdo.log") if os.path.exists("./ttr-cdo.log") else None
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='ttr-cdo.log', mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

In [3]:
engine = sqlalchemy.create_engine("postgresql+psycopg://{user}:{pw}@localhost:{port}/{db}"
                       .format(user="postgres",
                               pw="shimul",
                               port="5432",
                               db="postgres"))
# we need to begin engine here and end at the end

In [4]:
# version range: [version_range_start, version_range_end)
def get_all_relations(engine, system_name, to_package_name):
    meta_data = sqlalchemy.MetaData()
    meta_data.reflect(engine)

    RELATIONS = meta_data.tables['relations']

    # View the columns present in the users table
    # print(RELATIONS.columns)

    query = sqlalchemy.sql.select(RELATIONS)\
            .where(
                sqlalchemy.and_(
                    RELATIONS.c.system_name == system_name,
                    RELATIONS.c.to_package_name == to_package_name,
                    RELATIONS.c.is_regular == True
                )
            )
    
    # print(str(query))

    # result = engine.execute(query).fetchall()
    with engine.connect() as conn:
        df = pd.read_sql(query, conn)
    # print(df)

    return df

In [11]:
def get_all_package_info(engine, system_name, package_name):
    meta_data = sqlalchemy.MetaData()
    meta_data.reflect(engine)

    VERSIONINFO = meta_data.tables['versioninfo']

    # View the columns present in the users table
    # print(VERSIONINFO.columns)

    query = sqlalchemy.sql.select(VERSIONINFO)\
            .where(sqlalchemy.and_(VERSIONINFO.c.system_name == system_name,
                                VERSIONINFO.c.package_name == package_name
            ))\
            .order_by(VERSIONINFO.c.version_name)
    
    # print(str(query))

    # result = engine.execute(query).fetchall()
    with engine.connect() as conn:
        df = pd.read_sql(query, conn)
    # print(df)

    return df

In [5]:
def get_package_info_for_version(engine, system_name, package_name, version_name):
    meta_data = sqlalchemy.MetaData()
    meta_data.reflect(engine)

    VERSIONINFO = meta_data.tables['versioninfo']

    # View the columns present in the users table
    # print(VERSIONINFO.columns)

    query = sqlalchemy.sql.select(VERSIONINFO)\
            .where(sqlalchemy.and_(VERSIONINFO.c.system_name == system_name,
                                VERSIONINFO.c.package_name == package_name,
                                VERSIONINFO.c.version_name == str(version_name)
            ))
    
    # print(str(query))

    # result = engine.execute(query).fetchall()
    with engine.connect() as conn:
        df = pd.read_sql(query, conn)
    # print(df)

    return df

In [9]:
def get_vul_data(ecosystem):    
    file_name = os.path.join(os.path.join(os.path.join(os.path.join(os.path.join(os.getcwd(), os.pardir), os.pardir), 'data'), 'extracted-info-from-osv-data'), ecosystem + '.csv')
    print (file_name)
    df = pd.read_csv(file_name,
                    header=0,
                    sep=',',
                    on_bad_lines='skip',
                    usecols=['vul_id', 'pkg_name', 'vul_introduced', 'vul_fixed', 'repo_url'],
                    )
    df.columns = ['vul_id', 'package_name', 'vul_introduced', 'vul_fixed', 'repo_url']

    # replace values in the 'vul_introduced' column based on condition
    df.loc[df['vul_introduced'] == '0', 'vul_introduced'] = '0.0.0'
    
    df
    return df

In [7]:
def do_calc(ecosystem):
    df_vul_data = get_vul_data(ecosystem.lower())
    # print(df_vul_data)
    df_vul_data_grouped = df_vul_data.groupby('vul_id')
    # df_vul_data_grouped.reset_index()
    # print(df_vul_data_grouped)

    cdo_data_points = {}
    ttr_data_points = {}

    count = 0
    for df_vul_group_name, df_vul_group in df_vul_data_grouped:
        logging.debug ('\n\nnew vul')
        # df_group_name is what used on groupby which is in this case 'vul_id'
        logging.debug (df_vul_group_name)
        # And df_group has the slice of dataframe for this 'vul_id'
        logging.debug (df_vul_group)

        for index, row in df_vul_group.iterrows():
            to_package_name = row['package_name']

        # Get the versioninfo and relations for this 'to_package_name'
        df_all_version_info = get_all_package_info(engine=engine,system_name=ecosystem, package_name=to_package_name)
        logging.debug(f"{to_package_name} version_info: {df_all_version_info}")
        df_all_relations = get_all_relations(engine=engine, system_name=ecosystem, to_package_name=to_package_name)
        # print(df_all_relations)

        # Convert the df to adopt SEMVER
        df_all_version_info['version_name'] = df_all_version_info['version_name'].apply(semantic_version.Version)
        # print (df_all_version_info.dtypes)
        df_all_relations['from_version'] = df_all_relations['from_version'].apply(semantic_version.Version)
        df_all_relations['to_version'] = df_all_relations['to_version'].apply(semantic_version.Version)


        # Get the vul_range for this specific vul
        vul_ranges = []
        for index, row in df_vul_group.iterrows():
            to_package_name = row['package_name']
            df_temp = df_all_version_info.loc[ 
                (df_all_version_info['system_name'] == ecosystem) &
                (df_all_version_info['package_name'] == to_package_name) &
                (df_all_version_info['version_name'] == semantic_version.Version(row['vul_fixed']))
            ]
            logging.debug(f"df_temp: {df_temp}")
            # TODO: 
            vul_ranges.append([row['vul_introduced'], row['vul_fixed'], ])
        
        

        # Rank duplicates
        df_all_relations['rank'] = df_all_relations\
                .sort_values(['system_name', 'from_package_name', 'from_version'])\
                .groupby(by=['system_name', 'from_package_name', 'from_version'])\
                ['to_version'].rank(method='first', ascending=False)
        # Not including 'to_package_name' because they are the same for this dataframe
        df_all_relations = df_all_relations\
                .sort_values(['system_name', 'from_package_name', 'to_version', 'rank', 'from_version'])
        

        # Remove duplicates
        df_all_relations = df_all_relations.loc[ df_all_relations['rank'] == 1.0 ]

        # For debugging check the output csv manually
        df_all_relations.to_csv('df_all_relations' + str(count) + '.csv', sep='\t', encoding='utf-8', index=False)

        

        # analysis
        df_relations_grouped = df_all_relations.groupby(by=['system_name', 'from_package_name', 'to_package_name'])
                #.sort_values(['system_name', 'from_package_name', 'to_package_name', 'from_version'])
        
        
        for df_relations_group_name, df_relations_group in df_relations_grouped:
            # logging.debug(df_relations_group.dtypes)
            # df_relations_group['from_version'] = df_relations_group['from_version'].apply(semver.version.Version.parse)
            
            # df_relations_group['from_version'] = df_relations_group['from_version'].apply(semantic_version.Version)
            # df_relations_group['to_version'] = df_relations_group['to_version'].apply(semantic_version.Version)
            df_relations_group.sort_values(by=['to_version'])
            logging.debug(f"relations_group: {df_relations_group}")

            # Get the relations of this group which has at least one of the vul versions of the 'to_package_name'
            df_relations_vul = pd.DataFrame()
            for vul_range in vul_ranges:
                df_temp = df_relations_group.loc[(df_relations_group['to_version'] >= semantic_version.Version(vul_range[0]))
                                                 & (df_relations_group['to_version'] < semantic_version.Version(vul_range[1]))]
                logging.debug(f"vul_relations: {df_temp}")
                df_relations_vul = pd.concat([df_relations_vul, df_temp])
            logging.debug(f"concatenated vul: {df_relations_vul}")
            logging.debug(f"concatenated vul size: {df_relations_vul.shape[0]}")
            if df_relations_vul.shape[0] == 0:
                continue
            # (Optional:) Remove duplicates from df_relation_vul

            # Find the highest 'from_version' from this dataframe (P1)
            from_package_name_highest_vul_version = df_relations_vul['from_version'].max()
            logging.debug(f"highest vul version: {from_package_name_highest_vul_version}")
            logging.debug(f"highest vul version worth checking: {from_package_name_highest_vul_version}") if df_relations_vul.shape[0] > 0 else None

            # Get the relations where they have adopted a fixed ver of this 'to_package_name'
            # and greater than P1
            df_relations_fixed = pd.DataFrame()
            for vul_range in vul_ranges:
                df_temp = df_relations_group.loc[(df_relations_group['to_version'] >= semantic_version.Version(vul_range[1]))
                                                 & (df_relations_group['to_version'] >= from_package_name_highest_vul_version)]
                logging.debug(f"fixed_relations: {df_temp}")
                df_relations_fixed = pd.concat([df_relations_fixed, df_temp])
            logging.debug(f"concatenated fixed: {df_relations_fixed}")
            logging.debug(f"concatenated fixed size: {df_relations_fixed.shape[0]}")

            from_package_name_lowest_fixed_version = df_relations_fixed['from_version'].min()
            logging.debug(f"lowest fixed version: {from_package_name_lowest_fixed_version}")
            logging.debug(f"lowest fixed version worth checking: {from_package_name_lowest_fixed_version}") if df_relations_fixed.shape[0] > 0 else None

            # Find out TTR / CDO
            # if df_relation_fixed Nan and df_relations_vul > 0 -> CDO
            if df_relations_vul.shape[0] > 0 and df_relations_fixed.shape[0] == 0:
                logging.debug(f"Might be a CDO data point.")
                logging.debug(f"-------------------CDO----------------------")
                # Get the fixed version release data
                earliest_to_fixed_version = semantic_version.Version('9999999.99999999.99999999')
                for vul_range in vul_ranges:
                    logging.debug(f"target_fixed_package_version: {to_package_name}@{vul_range[1]}")
                    to_fixed_version_info = df_all_version_info.loc[ 
                        (df_all_version_info['system_name'] == ecosystem.upper()) &
                        (df_all_version_info['package_name'] == to_package_name) &
                        (df_all_version_info['version_name'] == semantic_version.Version(vul_range[1]))
                    ]
                    to_fixed_version_info = to_fixed_version_info.reset_index()
                    logging.debug(f"to_fixed_version_info: {to_fixed_version_info}")
                    if (to_fixed_version_info.empty == False):
                        logging.debug(f"to_fixed_version_info version_name: {to_fixed_version_info['version_name'].values[:1]}")
                        logging.debug(f"to_fixed_version_info version_name dtype: {to_fixed_version_info['version_name'].values[:1].dtype}")
                        logging.debug(f"to_fixed_version_info version_name: Version?? {isinstance(to_fixed_version_info['version_name'].values[:1], semantic_version.Version)}")
                        logging.debug(f"to_fixed_version_info version_name: {to_fixed_version_info['version_name'].values[:1][0]}")
                        logging.debug(f"index: {to_fixed_version_info['index'].values[:1]}")
                        logging.debug(f"index 0 : {to_fixed_version_info['version_name'][0]}")
                        logging.debug(f"version: {semantic_version.Version(str(to_fixed_version_info['version_name'][0]))} ")
                        if (semantic_version.Version(str(to_fixed_version_info['version_name'][0])) < semantic_version.Version('9999999.99999999.99999999')):
                            earliest_to_fixed_version = to_fixed_version_info
                            earliest_fixed_version_release_date = to_fixed_version_info['release_date'].values[:1]
                logging.debug(f"earliest_to_fixed_version: {earliest_to_fixed_version}")

                if isinstance(to_fixed_version_info, pd.DataFrame):
                    # Found a CDO data point
                    logging.debug(f"earliest_to_fixed_version_release_date: {earliest_fixed_version_release_date}")
                    logging.debug(f"earliest_to_fixed_version_release_date dType: {earliest_fixed_version_release_date.dtype}")
                    cdo_data_point = np.datetime64('today') - earliest_fixed_version_release_date 
                    logging.debug(f"cdo_data_point: {cdo_data_point}")
                    logging.debug(f"cdo_data_point: dtype {cdo_data_point.dtype}")
                    if to_package_name not in cdo_data_points:
                        cdo_data_points[to_package_name] = []
                    cdo_data_points[to_package_name].append(cdo_data_point)

                logging.debug(f"-------------------CDO----------------------")
                
            # -> TTR
            elif df_relations_vul.shape[0] > 0 and df_relations_fixed.shape[0] > 0:
                logging.debug(f"Might be a TTR data point.")
                logging.debug(f"-------------------TTR----------------------")
                # Get the fixed version release data
                earliest_to_fixed_version = semantic_version.Version('9999999.99999999.99999999')
                for vul_range in vul_ranges:
                    logging.debug(f"target_fixed_package_version: {to_package_name}@{vul_range[1]}")
                    to_fixed_version_info = df_all_version_info.loc[ 
                        (df_all_version_info['system_name'] == ecosystem.upper()) &
                        (df_all_version_info['package_name'] == to_package_name) &
                        (df_all_version_info['version_name'] == semantic_version.Version(vul_range[1]))
                    ]
                    to_fixed_version_info = to_fixed_version_info.reset_index()
                    logging.debug(f"to_fixed_version_info: {to_fixed_version_info}")
                    if (to_fixed_version_info.empty == False):
                        logging.debug(f"to_fixed_version_info version_name: {to_fixed_version_info['version_name'].values[:1]}")
                        logging.debug(f"to_fixed_version_info version_name dtype: {to_fixed_version_info['version_name'].values[:1].dtype}")
                        logging.debug(f"to_fixed_version_info version_name: Version?? {isinstance(to_fixed_version_info['version_name'].values[:1], semantic_version.Version)}")
                        logging.debug(f"to_fixed_version_info version_name: {to_fixed_version_info['version_name'].values[:1][0]}")
                        logging.debug(f"index: {to_fixed_version_info['index'].values[:1]}")
                        logging.debug(f"index 0 : {to_fixed_version_info['version_name'][0]}")
                        logging.debug(f"version: {semantic_version.Version(str(to_fixed_version_info['version_name'][0]))} ")
                        if (semantic_version.Version(str(to_fixed_version_info['version_name'][0])) < semantic_version.Version('9999999.99999999.99999999')):
                            earliest_to_fixed_version = to_fixed_version_info
                            earliest_fixed_version_release_date = to_fixed_version_info['release_date'].values[:1]
                logging.debug(f"earliest_to_fixed_version: {earliest_to_fixed_version}")

                # Get the earliest fixed version adoption time by the parent package
                df_relations_fixed = df_relations_fixed.sort_values(by=['from_version']).reset_index()
                from_fixed_version_relations_info = df_relations_fixed.head(1)
                logging.debug(f"from_fixed_version_relations_info: {from_fixed_version_relations_info}")
                from_fixed_version = df_relations_fixed['from_version'][0]
                logging.debug(f"from_fixed_version: {from_fixed_version}")
                # logging.debug(f"from_fixed_version dtype: {from_fixed_version.dtype}")
                from_package_name = df_relations_fixed['from_package_name'][0]
                logging.debug(f"from_package_name: {from_package_name}")
                
                # Get that version release time
                df_from_version_info = get_all_package_info(engine=engine,
                                                            system_name=ecosystem,
                                                            package_name=from_package_name)
                df_from_version_info['version_name'] = df_from_version_info['version_name'].apply(
                    semantic_version.Version
                )
                
                logging.debug(f"df_from_version_info (all): {df_from_version_info}")
                from_fixed_version_info = df_from_version_info.loc[
                    (df_from_version_info['system_name'] == ecosystem.upper()) &
                    (df_from_version_info['package_name'] == from_package_name) &
                    (df_from_version_info['version_name'] == from_fixed_version)
                ]
                from_fixed_version_info = from_fixed_version_info.reset_index()
                logging.debug(f"from_fixed_version_info: {from_fixed_version_info}")

                if from_fixed_version_info.empty == False and isinstance(to_fixed_version_info, pd.DataFrame):
                    # Found an actual TTR data point
                    logging.debug(f"earliest_to_fixed_version_release_date: {earliest_fixed_version_release_date}")
                    logging.debug(f"earliest_to_fixed_version_release_date dType: {earliest_fixed_version_release_date.dtype}")
                    from_fixed_version_release_date = from_fixed_version_info['release_date'].values[:1]
                    logging.debug(f"from_fixed_version: {from_fixed_version_info['version_name'][0]}")
                    logging.debug(f"from_fixed_version_release_date: {from_fixed_version_release_date}")
                    
                    ttr_data_point = from_fixed_version_release_date - earliest_fixed_version_release_date 
                    logging.debug(f"ttr_data_point: {ttr_data_point}")
                    logging.debug(f"ttr_data_point: dtype {ttr_data_point.dtype}")
                    if from_package_name not in ttr_data_points:
                        ttr_data_points[from_package_name] = []
                    ttr_data_points[from_package_name].append(ttr_data_point)
                logging.debug(f"-------------------TTR----------------------")
        count += 1
        if (count > 3):
            break

    logging.debug (cdo_data_points)
    logging.debug (ttr_data_points)
    # (key, value) = (package_name, [] of cdo/ttr data points)
    # elements of [] cdo/ttr data points -> timedelta
    

In [12]:
do_calc('NPM')

/Users/imranur/Research/security-metrics/code/calculate-ttr-cdo/../../data/extracted-info-from-osv-data/npm.csv


  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(engine)
  meta_data.reflect(