In [2]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import tikzplotlib
import seaborn as sns

import json
from datetime import datetime

In [12]:
# Root directory of the exported analysis CSVs. Every input below is resolved
# relative to it, so pointing the notebook at another data location is a
# one-line change.
DATA_DIR = '/root/data'

# Project metadata; the three timestamp columns are parsed into datetimes on load.
projects_df = pd.read_csv(DATA_DIR + '/projects.csv',
                          parse_dates=['project_created_at', 'project_last_pushed_at', 'project_updated_at'])
packages_df = pd.read_csv(DATA_DIR + '/packages_0_499.csv')
# go-geiger findings: the reference set the linter findings are compared against below.
geiger_df = pd.read_csv(DATA_DIR + '/geiger/geiger_findings_0_499.csv')

# Linter findings (go vet and gosec) from the files with the same `_0_499` suffix.
vet_df = pd.read_csv(DATA_DIR + '/linters/vet_findings_0_499.csv')
gosec_df = pd.read_csv(DATA_DIR + '/linters/gosec_findings_0_499.csv')

# Comparison of go-geiger performance with go vet and gosec

## Vet findings over geiger findings: any Vet message

In [13]:
# Columns that together identify one source line in both tools' outputs.
line_key_cols = ['package_import_path', 'module_path', 'module_version',
                 'file_name', 'line_number']

# At most one row per line for each tool; rows with any missing value are discarded.
# NOTE(review): de-duplication runs before dropna, so a line whose first-seen row
# has a missing value is dropped even if a later duplicate row is complete --
# confirm this is intended.
df1 = geiger_df.drop_duplicates(subset=line_key_cols).dropna()
df2 = vet_df.drop_duplicates(subset=line_key_cols).dropna()

# The outer join keeps lines present in only one of the two frames; the columns
# contributed by the other frame are NaN on those rows.
vet_merged_df = df1.merge(df2, how='outer', on=line_key_cols)

In [14]:
# Rows with no missing value in any column: counted as flagged by both tools.
vet_and_unsafe_df = vet_merged_df.dropna(how='any')

# A missing vet 'message' means no vet row matched the line.
vet_message_missing = vet_merged_df['message'].isnull()
only_unsafe_df = vet_merged_df[vet_message_missing]

# A missing 'text' (presumably a geiger-only column -- verify) means no geiger row matched.
geiger_text_missing = vet_merged_df['text'].isnull()
only_vet_df = vet_merged_df[geiger_text_missing]

In [15]:
# Report the size of each partition of the geiger/vet outer join.
for label, subset in [
    ("[tp] lines that were flagged by geiger and vet (any message): {}", vet_and_unsafe_df),
    ("[fn] lines that were not flagged by vet: {}", only_unsafe_df),
    ("[fp] lines that were flagged by vet (any message) but not geigered: {}", only_vet_df),
]:
    # count() tallies the non-null 'line_number' entries of the partition.
    print(label.format(subset['line_number'].count()))

[tp] lines that were flagged by geiger and vet (any message): 219
[fn] lines that were not flagged by vet: 76738
[fp] lines that were flagged by vet (any message) but not geigered: 31224


## Vet findings over geiger findings: unsafeptr Vet message

In [16]:
# Columns that together identify one source line in both tools' outputs.
line_key_cols = ['package_import_path', 'module_path', 'module_version',
                 'file_name', 'line_number']

# Geiger side: at most one row per line, rows with missing values discarded.
df1 = (
    geiger_df
    .drop_duplicates(subset=line_key_cols)
    .dropna()
)

# Vet side: keep only the unsafe.Pointer misuse message, then reduce to one row per line.
is_unsafeptr_message = vet_df['message'] == 'possible misuse of unsafe.Pointer'
df2 = (
    vet_df[is_unsafeptr_message]
    .drop_duplicates(subset=line_key_cols)
    .dropna()
)

# The outer join keeps lines that appear in only one of the two frames.
vet_merged_df = df1.merge(df2, how='outer', on=line_key_cols)

In [17]:
# Rows with no missing value in any column: counted as flagged by both tools.
vet_and_unsafe_df = vet_merged_df.dropna(how='any')

# A missing vet 'message' means no (unsafeptr) vet row matched the line.
vet_message_missing = vet_merged_df['message'].isnull()
only_unsafe_df = vet_merged_df[vet_message_missing]

# A missing 'text' (presumably a geiger-only column -- verify) means no geiger row matched.
geiger_text_missing = vet_merged_df['text'].isnull()
only_vet_df = vet_merged_df[geiger_text_missing]

In [18]:
# Report the size of each partition of the geiger/vet (unsafeptr) outer join.
for label, subset in [
    ("[tp] lines that were flagged by geiger and vet (unsafeptr): {}", vet_and_unsafe_df),
    ("[fn] lines that were not flagged by vet: {}", only_unsafe_df),
    ("[fp] lines that were flagged by vet (unsafeptr) but not geigered: {}", only_vet_df),
]:
    # count() tallies the non-null 'line_number' entries of the partition.
    print(label.format(subset['line_number'].count()))

[tp] lines that were flagged by geiger and vet (unsafeptr): 213
[fn] lines that were not flagged by vet: 76744
[fp] lines that were flagged by vet (unsafeptr) but not geigered: 0


## Gosec findings over geiger findings: any gosec message

In [19]:
# Columns that together identify one source line in both tools' outputs.
line_key_cols = ['package_import_path', 'module_path', 'module_version',
                 'file_name', 'line_number']

# Geiger side: at most one row per line, rows with missing values discarded.
df1 = (
    geiger_df
    .drop_duplicates(subset=line_key_cols)
    .dropna()
)

# Gosec side: every message type is kept, reduced to one row per line.
df2 = (
    gosec_df
    .drop_duplicates(subset=line_key_cols)
    .dropna()
)

# The outer join keeps lines that appear in only one of the two frames.
gosec_merged_df = df1.merge(df2, how='outer', on=line_key_cols)

In [20]:
# Rows with no missing value in any column: counted as flagged by both tools.
gosec_and_unsafe_df = gosec_merged_df.dropna(how='any')

# A missing gosec 'message' means no gosec row matched the line.
gosec_message_missing = gosec_merged_df['message'].isnull()
only_unsafe_df = gosec_merged_df[gosec_message_missing]

# A missing 'text' (presumably a geiger-only column -- verify) means no geiger row matched.
geiger_text_missing = gosec_merged_df['text'].isnull()
only_gosec_df = gosec_merged_df[geiger_text_missing]

In [21]:
# Report the size of each partition of the geiger/gosec outer join.
for label, subset in [
    ("[tp] lines that were flagged by geiger and gosec (any message): {}", gosec_and_unsafe_df),
    ("[fn] lines that were not flagged by gosec: {}", only_unsafe_df),
    ("[fp] lines that were flagged by gosec (any message) but not geigered: {}", only_gosec_df),
]:
    # count() tallies the non-null 'line_number' entries of the partition.
    print(label.format(subset['line_number'].count()))

[tp] lines that were flagged by geiger and gosec (any message): 36279
[fn] lines that were not flagged by gosec: 40678
[fp] lines that were flagged by gosec (any message) but not geigered: 114306


## Gosec findings over geiger findings: only the unsafe-related gosec messages

In [33]:
# Columns that together identify one source line in both tools' outputs.
line_key_cols = ['package_import_path', 'module_path', 'module_version',
                 'file_name', 'line_number']

# Geiger side: at most one row per line, rows with missing values discarded.
df1 = (
    geiger_df
    .drop_duplicates(subset=line_key_cols)
    .dropna()
)

# Gosec side: keep only the unsafe-audit message, then reduce to one row per line.
is_unsafe_audit_message = gosec_df['message'] == 'Use of unsafe calls should be audited'
df2 = (
    gosec_df[is_unsafe_audit_message]
    .drop_duplicates(subset=line_key_cols)
    .dropna()
)

# The outer join keeps lines that appear in only one of the two frames.
gosec_merged_df = df1.merge(df2, how='outer', on=line_key_cols)

In [34]:
# Rows with no missing value in any column: counted as flagged by both tools.
gosec_and_unsafe_df = gosec_merged_df.dropna(how='any')

# A missing gosec 'message' means no (unsafe-related) gosec row matched the line.
gosec_message_missing = gosec_merged_df['message'].isnull()
only_unsafe_df = gosec_merged_df[gosec_message_missing]

# A missing 'text' (presumably a geiger-only column -- verify) means no geiger row matched.
geiger_text_missing = gosec_merged_df['text'].isnull()
only_gosec_df = gosec_merged_df[geiger_text_missing]

In [35]:
# Report the size of each partition of the geiger/gosec (unsafe-related) outer join.
for label, subset in [
    ("[tp] lines that were flagged by geiger and gosec (only unsafe-related): {}", gosec_and_unsafe_df),
    ("[fn] lines that were not flagged by gosec: {}", only_unsafe_df),
    ("[fp] lines that were flagged by gosec (only unsafe-related) but not geigered: {}", only_gosec_df),
]:
    # count() tallies the non-null 'line_number' entries of the partition.
    print(label.format(subset['line_number'].count()))

[tp] lines that were flagged by geiger and gosec (only unsafe-related): 36267
[fn] lines that were not flagged by gosec: 40690
[fp] lines that were flagged by gosec (only unsafe-related) but not geigered: 0


## Gosec findings over geiger findings: only the unsafe-related gosec messages and only `unsafe` package matches (Pointer, Offsetof, Sizeof, Alignof) in geiger results

In [39]:
# Columns that together identify one source line in both tools' outputs.
line_key_cols = ['package_import_path', 'module_path', 'module_version',
                 'file_name', 'line_number']

# Geiger side: restrict to these four `unsafe` match types before reducing to
# one row per line, so other match types on the same line cannot shadow them.
unsafe_pkg_match_types = ['unsafe.Pointer', 'unsafe.Offsetof', 'unsafe.Sizeof', 'unsafe.Alignof']
is_unsafe_pkg_match = geiger_df['match_type'].isin(unsafe_pkg_match_types)
df1 = (
    geiger_df[is_unsafe_pkg_match]
    .drop_duplicates(subset=line_key_cols)
    .dropna()
)

# Gosec side: keep only the unsafe-audit message, then reduce to one row per line.
is_unsafe_audit_message = gosec_df['message'] == 'Use of unsafe calls should be audited'
df2 = (
    gosec_df[is_unsafe_audit_message]
    .drop_duplicates(subset=line_key_cols)
    .dropna()
)

# The outer join keeps lines that appear in only one of the two frames.
gosec_merged_df = df1.merge(df2, how='outer', on=line_key_cols)

In [40]:
# Rows with no missing value in any column: counted as flagged by both tools.
gosec_and_unsafe_df = gosec_merged_df.dropna(how='any')

# A missing gosec 'message' means no (unsafe-related) gosec row matched the line.
gosec_message_missing = gosec_merged_df['message'].isnull()
only_unsafe_df = gosec_merged_df[gosec_message_missing]

# A missing 'text' (presumably a geiger-only column -- verify) means no geiger row matched.
geiger_text_missing = gosec_merged_df['text'].isnull()
only_gosec_df = gosec_merged_df[geiger_text_missing]

In [42]:
# Report the size of each partition of the outer join between geiger's
# unsafe-package matches and gosec's unsafe-related findings.
for label, subset in [
    ("[tp] lines that were flagged by geiger (only unsafe pkg matches) and gosec (only unsafe-related): {}", gosec_and_unsafe_df),
    ("[fn] lines that were not flagged by gosec: {}", only_unsafe_df),
    ("[fp] lines that were flagged by gosec (only unsafe-related) but not geigered: {}", only_gosec_df),
]:
    # count() tallies the non-null 'line_number' entries of the partition.
    print(label.format(subset['line_number'].count()))

[tp] lines that were flagged by geiger (only unsafe pkg matches) and gosec (only unsafe-related): 36267
[fn] lines that were not flagged by gosec: 18019
[fp] lines that were flagged by gosec (only unsafe-related) but not geigered: 0
