In [1]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import tikzplotlib
import seaborn as sns

import json
from datetime import datetime

# Load my data

In [136]:
# Project metadata; the three timestamp columns are parsed into datetimes on load.
projects_df = pd.read_csv(
    '/root/data/projects.csv',
    parse_dates=['project_created_at', 'project_last_pushed_at', 'project_updated_at'],
)
# Keep only the first 10 characters of every revision string.
projects_df['project_revision'] = projects_df['project_revision'].map(lambda revision: revision[:10])

# Package-level data; rows containing any missing value are discarded.
packages_df = pd.read_csv('/root/data/packages_0_499.csv').dropna()
# geiger findings for the same 0..499 file range.
geiger_df = pd.read_csv('/root/data/geiger/geiger_findings_0_499.csv')
# Sampled usages from the classification directory (one "app" file, one "std" file).
sampled_usages_app = pd.read_csv('/root/data/classification/sampled_usages_app.csv')
sampled_usages_std = pd.read_csv('/root/data/classification/sampled_usages_std.csv')

# vet findings for the same 0..499 file range.
vet_df = pd.read_csv('/root/data/lexical/vet_findings_0_499.csv')

# Load Costa data

In [94]:
# Per-project unsafe usage counts from the Costa replication package.
costa_data_unsafe_usage_per_project_df = pd.read_csv(
    '/root/costa-replica/replication-package/data/unsafe-usage/unsafe_usage_per_project.csv'
)
# The curated project selection from the same replication package.
costa_data_3k_curated_projects_df = pd.read_csv(
    '/root/costa-replica/replication-package/data/data-selection/3k-curated-projects.csv'
)
# Labeled unsafe usages from the manual-analysis directory.
costa_data_unsafe_usage_labels_df = pd.read_csv(
    '/root/costa-replica/replication-package/data/manual-analysis/Unsafe Usage  - Labeled Set (Diego).csv'
)

# Comparison

## Fraction of projects using unsafe

In [24]:
# Project name at position 13 (0-based) once the Costa table is sorted by descending '#'.
costa_data_unsafe_usage_per_project_df.sort_values(by='#', ascending=False)['Project'].iloc[13]

'kubernetes/kubernetes'

In [43]:
# Rows of the Costa per-project usage table for pachyderm/pachyderm (empty frame if absent).
costa_data_unsafe_usage_per_project_df.loc[lambda frame: frame['Project'] == 'pachyderm/pachyderm']

Unnamed: 0.1,Unnamed: 0,Project,Description,#


In [46]:
# Rows of the Costa curated project set for pachyderm/pachyderm (empty frame if absent).
costa_data_3k_curated_projects_df.loc[lambda frame: frame['full_name'] == 'pachyderm/pachyderm']

Unnamed: 0.1,Unnamed: 0,id,node_id,name,full_name,private,html_url,description,fork,url,...,license,id_y,LOC,id_x,n_commits,id_y.1,n_committers,n_authors,age,age_last_commit


In [36]:
# Attach project metadata to every package row; validate that each package row
# matches at most one project row.
mdf = packages_df.merge(projects_df, how='left', on='project_name', validate='many_to_one')

# Non-standard packages that live in the project's own root module:
# per project, count the packages whose package_unsafe_sum is non-zero.
df1 = (
    mdf
    .loc[lambda frame: frame['module_path'].eq(frame['project_root_module'])
                       & frame['is_standard'].eq(False)]
    .groupby('project_name')['package_unsafe_sum']
    .agg(project_unsafe_pkg_count=lambda sums: np.count_nonzero(sums))
    .reset_index()
    .sort_values(by='project_unsafe_pkg_count', ascending=False)
)

# Same count for non-standard packages that come from any other module (dependencies).
df2 = (
    mdf
    .loc[lambda frame: frame['module_path'].ne(frame['project_root_module'])
                       & frame['is_standard'].eq(False)]
    .groupby('project_name')['package_unsafe_sum']
    .agg(dep_unsafe_pkg_count=lambda sums: np.count_nonzero(sums))
    .reset_index()
)

# Left join keeps every project present in df1; projects without a df2 row get 0.
df = df1.merge(df2, how='left', on='project_name', validate='one_to_one').fillna(0)
df['total_unsafe_pkg_count'] = df['project_unsafe_pkg_count'] + df['dep_unsafe_pkg_count']

In [160]:
# Show the package counts computed for kubernetes/kubernetes.
df.loc[lambda frame: frame['project_name'] == 'kubernetes/kubernetes']

Unnamed: 0,project_name,project_unsafe_pkg_count,dep_unsafe_pkg_count,total_unsafe_pkg_count
0,kubernetes/kubernetes,58,105.0,163.0


In [144]:
# Per-project comparison of unsafe usage counts: the sum of my geiger count
# columns over the project's root-module packages versus the '#' value from
# the Costa per-project table.
my_projects = df['project_name'].unique()

# geiger count columns that are added up into a single per-project total.
geiger_count_columns = [
    'package_geiger_unsafe_pointer_sum',
    'package_geiger_unsafe_sizeof_sum',
    'package_geiger_unsafe_offsetof_sum',
    'package_geiger_unsafe_alignof_sum',
]

# Records are collected in a list and converted to a frame once at the end:
# DataFrame.append is deprecated (removed in pandas 2.0) and copies the whole
# frame on every call.
comparison_records = []

# Iterate over `my_projects` computed at the top of this cell so the cell runs
# on a fresh kernel without relying on a variable from a deleted cell.
for project in my_projects:
    # Only projects that also appear in the Costa curated set can be compared.
    if not (costa_data_3k_curated_projects_df['full_name'] == project).any():
        continue

    costa_unsafe_df = costa_data_unsafe_usage_per_project_df[
        costa_data_unsafe_usage_per_project_df['Project'] == project]
    # A curated project that is missing from the usage table is counted as 0.
    costa_count = costa_unsafe_df.iloc[0]['#'] if len(costa_unsafe_df) > 0 else 0

    project_root_module = projects_df[projects_df['project_name'] == project].iloc[0]['project_root_module']
    # Restrict to packages of the project's own root module (dependencies excluded).
    my_count_df = packages_df[
        (packages_df['project_name'] == project) & (packages_df['module_path'] == project_root_module)]
    my_count = my_count_df[geiger_count_columns].sum().sum()

    comparison_records.append({
        'project_name': project,
        'my_count': my_count,
        'costa_count': costa_count,
        'difference': my_count - costa_count,
        'difference_abs': abs(my_count - costa_count),
    })

comparison_df = pd.DataFrame(
    comparison_records,
    columns=['project_name', 'my_count', 'costa_count', 'difference', 'difference_abs'])

comparison_df

Unnamed: 0,project_name,my_count,costa_count,difference,difference_abs
0,kubernetes/kubernetes,1885.0,2058.0,-173.0,173.0
0,cilium/cilium,345.0,290.0,55.0,55.0
0,pingcap/tidb,143.0,134.0,9.0,9.0
0,go-delve/delve,20.0,72.0,-52.0,52.0
0,elastic/beats,70.0,164.0,-94.0,94.0
...,...,...,...,...,...
0,gomods/athens,0.0,0.0,0.0,0.0
0,gomodule/redigo,0.0,0.0,0.0,0.0
0,google/go-cloud,0.0,0.0,0.0,0.0
0,google/go-github,0.0,0.0,0.0,0.0


In [145]:
# Projects where I count unsafe usages but the Costa count is zero.
comparison_df.loc[lambda frame: (frame['my_count'] > 0) & (frame['costa_count'] == 0)]

Unnamed: 0,project_name,my_count,costa_count,difference,difference_abs
0,chrislusf/seaweedfs,4.0,0.0,4.0,4.0
0,thanos-io/thanos,6.0,0.0,6.0,6.0
0,gin-gonic/gin,3.0,0.0,3.0,3.0
0,istio/istio,11.0,0.0,11.0,11.0
0,heroiclabs/nakama,8.0,0.0,8.0,8.0
0,golang-migrate/migrate,5.0,0.0,5.0,5.0
0,360EntSecGroup-Skylar/excelize,1.0,0.0,1.0,1.0


In [146]:
# Projects where my count is zero but the Costa count is positive.
comparison_df.loc[lambda frame: (frame['my_count'] == 0) & (frame['costa_count'] > 0)]

Unnamed: 0,project_name,my_count,costa_count,difference,difference_abs
0,rclone/rclone,0.0,12.0,-12.0,12.0
0,git-lfs/git-lfs,0.0,5.0,-5.0,5.0
0,perkeep/perkeep,0.0,6.0,-6.0,6.0
0,microsoft/ethr,0.0,2.0,-2.0,2.0
0,golangci/golangci-lint,0.0,4.0,-4.0,4.0
0,syncthing/syncthing,0.0,4.0,-4.0,4.0
0,baidu/bfe,0.0,4.0,-4.0,4.0
0,kubernetes-sigs/kind,0.0,10.0,-10.0,10.0
0,labstack/echo,0.0,1.0,-1.0,1.0
0,zserge/lorca,0.0,2.0,-2.0,2.0


In [147]:
# The 20 projects with the largest absolute disagreement between both counts.
comparison_df\
    .loc[lambda frame: frame['difference_abs'] > 0]\
    .sort_values(by='difference_abs', ascending=False)\
    .head(20)

Unnamed: 0,project_name,my_count,costa_count,difference,difference_abs
0,jetstack/cert-manager,374.0,140.0,234.0,234.0
0,kubernetes/kubernetes,1885.0,2058.0,-173.0,173.0
0,golang/mobile,93.0,211.0,-118.0,118.0
0,TykTechnologies/tyk,187.0,79.0,108.0,108.0
0,elastic/beats,70.0,164.0,-94.0,94.0
0,golang/tools,44.0,113.0,-69.0,69.0
0,peterq/pan-light,0.0,64.0,-64.0,64.0
0,cilium/cilium,345.0,290.0,55.0,55.0
0,go-delve/delve,20.0,72.0,-52.0,52.0
0,ethereum/go-ethereum,91.0,42.0,49.0,49.0


## Labeled data sets intersection

In [95]:
# List the columns available in the Costa labeled unsafe-usage set.
costa_data_unsafe_usage_labels_df.columns

Index(['Random', 'Project', 'File', '# Unsafe Usage', 'Project.1', 'Github',
       'Updated Github Link', '# of Functional Chunks', 'Chunk 1', 'Context 1',
       'Comment', 'API Misuse?'],
      dtype='object')

In [97]:
# List the columns of the sampled_usages_app frame, to see which ones can be
# matched against the Costa label columns.
sampled_usages_app.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1',
       'Unnamed: 0.1.1.1.1', 'Unnamed: 0.1.1.1.1.1', 'Unnamed: 0.1.1.1.1.1.1',
       'text', 'context', 'line_number', 'column', 'absolute_offset',
       'match_type', 'context_type', 'file_name', 'file_loc', 'file_byte_size',
       'package_import_path', 'module_path', 'module_version', 'project_name',
       'label', 'label2'],
      dtype='object')

In [101]:
# Only the columns that locate a sampled usage (line, file, package, module, project).
sampled_usages_app[['line_number', 'file_name', 'package_import_path', 'module_path', 'project_name']]

Unnamed: 0,line_number,file_name,package_import_path,module_path,project_name
0,116.0,fastwalk_unix.go,golang.org/x/tools/internal/fastwalk,golang.org/x/tools,mattermost/mattermost-server
1,4080.0,zz_generated.conversion.go,k8s.io/kubernetes/pkg/apis/core/v1,k8s.io/kubernetes,rancher/rancher
2,18.0,aliasing.go,golang.org/x/crypto/internal/subtle,golang.org/x/crypto,elastic/beats
3,2946.0,zz_generated.conversion.go,k8s.io/kubernetes/pkg/apis/core/v1,k8s.io/kubernetes,rook/rook
4,741.0,nl_linux.go,github.com/vishvananda/netlink/nl,github.com/vishvananda/netlink,kubernetes/kubernetes
...,...,...,...,...,...
995,166.0,/root/.cache/go-build/48/48c5a43960c863b51382d...,github.com/containers/libpod/libpod/lock/shm,github.com/containers/libpod,containers/libpod
996,1075.0,zz_generated.conversion.go,k8s.io/kubernetes/pkg/apis/extensions/v1beta1,k8s.io/kubernetes,kubernetes/kubernetes
997,7968.0,zz_generated.conversion.go,k8s.io/kubernetes/pkg/apis/core/v1,k8s.io/kubernetes,rook/rook
998,4128.0,/root/.cache/go-build/3e/3eb68e40305fe81ca0dee...,github.com/godror/godror,github.com/godror/godror,xo/usql


In [104]:
# Work on a copy: a plain assignment would only alias the loaded frame, so the
# helper columns added below would silently appear on
# costa_data_unsafe_usage_labels_df as well.
costa_tomerge_df = costa_data_unsafe_usage_labels_df.copy()

# Split every 'File' path on '/' once and derive both helper columns from it.
costa_file_parts = costa_tomerge_df['File'].str.split('/')
# Last path component, i.e. the file name.
costa_tomerge_df['jl_file_name'] = costa_file_parts.str[-1]
# Everything between the first two components and the file name, re-joined
# with '/'; this is the empty string when the file sits directly below the
# first two components.
costa_tomerge_df['jl_package_import_path'] = costa_file_parts.str[2:-1].str.join('/')

costa_tomerge_df

Unnamed: 0,Random,Project,File,# Unsafe Usage,Project.1,Github,Updated Github Link,# of Functional Chunks,Chunk 1,Context 1,Comment,API Misuse?,jl_file_name,jl_package_import_path
0,1,33cn/chain33,33cn/chain33/system/store/mavl/db/tree_test.go,4,33cn/chain33,https://github.com/33cn/chain33/tree/master/sy...,,1,Get architecture info,Testing,,,tree_test.go,system/store/mavl/db
1,2,360EntSecGroup-Skylar/goreporter,360EntSecGroup-Skylar/goreporter/linters/align...,1,360EntSecGroup-Skylar/goreporter,https://github.com/360EntSecGroup-Skylar/gorep...,,,Get architecture info,Program,,,aligncheck.go,linters/aligncheck
2,1,AcalephStorage/consul-alerts,AcalephStorage/consul-alerts/Godeps/_workspace...,1,AcalephStorage/consul-alerts,https://github.com/AcalephStorage/consul-alert...,,,System Call,Program,,,terminal_notwindows.go,Godeps/_workspace/src/github.com/Sirupsen/logrus
3,1,AlecAivazis/survey,AlecAivazis/survey/terminal/output_windows.go,3,AlecAivazis/survey,https://github.com/AlecAivazis/survey/tree/mas...,,,System Call,Program,Used to get access to the terminal,,output_windows.go,terminal
4,2,Antonito/gfile,Antonito/gfile/_client/web/emitter.go,2,Antonito/gfile,https://github.com/Antonito/gfile/tree/master/...,,,System Call,Program,Converto to Slice Pointer -> Slice Header -> Ptr,,emitter.go,_client/web
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
587,3,zeromq/goczmq,zeromq/goczmq/auth.go,23,zeromq/goczmq,https://github.com/zeromq/goczmq/tree/master/a...,,,,,,,auth.go,
588,3,zmap/zgrab,zmap/zgrab/ztools/xssh/terminal/util_windows.go,5,zmap/zgrab,https://github.com/zmap/zgrab/tree/master/ztoo...,,,,,,,util_windows.go,ztools/xssh/terminal
589,1,zond/god,zond/god/radix/radix.go,1,zond/god,https://github.com/zond/god/tree/master/radix/...,,,Convert between Types,Program,,,radix.go,radix
590,1,zserge/lorca,zserge/lorca/messagebox_windows.go,2,zserge/lorca,https://github.com/zserge/lorca/tree/master/me...,,,System Call,Program,,,messagebox_windows.go,


In [107]:
# Work on a copy: a plain assignment would only alias sampled_usages_app, so
# the helper column added below would also appear on the loaded frame.
sua_tomerge_df = sampled_usages_app.copy()

# Path of the file relative to its module: the package import path with the
# module path (and the following '/') cut off, then '/', then the file name.
# When the package import path equals the module path the relative part is
# empty and the result starts with '/'.
sua_tomerge_df['concatenated_file_path'] = sua_tomerge_df.apply(
    lambda usage: "{}/{}".format(
        usage['package_import_path'][len(usage['module_path'])+1:], usage['file_name']),
    axis=1)

sua_tomerge_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Unnamed: 0.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1,text,context,line_number,...,file_name,file_loc,file_byte_size,package_import_path,module_path,module_version,project_name,label,label2,concatenated_file_path
0,0,0,0,0,0,0,0,nameBuf := (*[unsafe.Sizeof(dirent.Name)]byte)...,\t\t// of goimports. goimports only cares abou...,116.0,...,fastwalk_unix.go,128.0,3466.0,golang.org/x/tools/internal/fastwalk,golang.org/x/tools,v0.0.0-20200428021058-7ae4988eb4d9,mattermost/mattermost-server,cast-bytes,efficiency,internal/fastwalk/fastwalk_unix.go
1,1,1,1,1,1,1,1,out.PostStart = (*core.Handler)(unsafe.Pointer...,func Convert_core_KeyToPath_To_v1_KeyToPath(in...,4080.0,...,zz_generated.conversion.go,8109.0,431398.0,k8s.io/kubernetes/pkg/apis/core/v1,k8s.io/kubernetes,v1.18.0,rancher/rancher,cast-struct,efficiency,pkg/apis/core/v1/zz_generated.conversion.go
2,2,2,2,2,2,2,16,uintptr(unsafe.Pointer(&y[0])) <= uintptr(unsa...,// AnyOverlap reports whether x and y share me...,18.0,...,aliasing.go,32.0,1262.0,golang.org/x/crypto/internal/subtle,golang.org/x/crypto,v0.0.0-20200510223506-06a226fb4e37,elastic/beats,pointer-arithmetic,layout,internal/subtle/aliasing.go
3,3,3,3,3,3,3,32,out.Waiting = (*core.ContainerStateWaiting)(un...,func Convert_core_ContainerPort_To_v1_Containe...,2946.0,...,zz_generated.conversion.go,8119.0,431912.0,k8s.io/kubernetes/pkg/apis/core/v1,k8s.io/kubernetes,v1.17.2,rook/rook,cast-struct,efficiency,pkg/apis/core/v1/zz_generated.conversion.go
4,4,4,4,4,4,4,37,a := (*unix.RtAttr)(unsafe.Pointer(&b[0])),"\t}\n\treturn attrs, nil\n}\n\nfunc netlinkRou...",741.0,...,nl_linux.go,760.0,18574.0,github.com/vishvananda/netlink/nl,github.com/vishvananda/netlink,v1.1.0,kubernetes/kubernetes,cast-bytes,serialization,nl/nl_linux.go
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,995,995,995,995,995,8338,var _cgo_9599666909ac_Cfunc_deallocate_semapho...,\treturn\n}\n//go:cgo_import_static _cgo_95996...,166.0,...,/root/.cache/go-build/48/48c5a43960c863b51382d...,0.0,0.0,github.com/containers/libpod/libpod/lock/shm,github.com/containers/libpod,project,containers/libpod,cast-pointer,ffi,libpod/lock/shm//root/.cache/go-build/48/48c5a...
996,996,996,996,996,996,996,1502,out.Conditions = *(*[]apps.DeploymentCondition...,\tout.Replicas = in.Replicas\n\tout.UpdatedRep...,1075.0,...,zz_generated.conversion.go,2299.0,123915.0,k8s.io/kubernetes/pkg/apis/extensions/v1beta1,k8s.io/kubernetes,project,kubernetes/kubernetes,cast-struct,efficiency,pkg/apis/extensions/v1beta1/zz_generated.conve...
997,997,997,997,997,997,997,6591,out.Cinder = (*core.CinderVolumeSource)(unsafe...,\tout.ISCSI = (*core.ISCSIVolumeSource)(unsafe...,7968.0,...,zz_generated.conversion.go,8119.0,431912.0,k8s.io/kubernetes/pkg/apis/core/v1,k8s.io/kubernetes,v1.17.2,rook/rook,cast-struct,efficiency,pkg/apis/core/v1/zz_generated.conversion.go
998,998,998,998,998,998,998,7529,var _cgo_6cd21b101e7b_Cfunc_dpiStmt_addRef = u...,\treturn\n}\n//go:cgo_import_static _cgo_6cd21...,4128.0,...,/root/.cache/go-build/3e/3eb68e40305fe81ca0dee...,0.0,0.0,github.com/godror/godror,github.com/godror/godror,v0.16.0,xo/usql,cast-pointer,ffi,//root/.cache/go-build/3e/3eb68e40305fe81ca0de...


In [134]:
# For every sampled usage, look for entries in the Costa labeled set that refer
# to the same file of the same project, and print both labelings side by side.
github_prefix = "github.com/"

for idx, row in sua_tomerge_df.iterrows():
    module_path = row['module_path']
    # The Costa 'Project' values have the form '<owner>/<repo>', so only module
    # paths below github.com/ can be mapped onto them. Blindly cutting off the
    # first 11 characters would turn e.g. 'k8s.io/kubernetes' into 'netes'.
    if not isinstance(module_path, str) or not module_path.startswith(github_prefix):
        continue
    costa_project = module_path[len(github_prefix):]

    # regex=False: the file path is a literal substring, not a pattern ('.' and
    # other metacharacters must not be interpreted). na=False: rows without a
    # 'File' value never match.
    file_matches = costa_tomerge_df['File'].str.contains(
        row['concatenated_file_path'], regex=False, na=False)
    costa_result_df = costa_tomerge_df[file_matches & (costa_tomerge_df['Project'] == costa_project)]

    if costa_result_df.empty:
        continue

    print(row['concatenated_file_path'])
    print(row['module_path'])
    print(row['line_number'])
    print(costa_result_df['File'].to_string())
    print(costa_result_df['Project'].to_string())
    print()
    print("{} / {}".format(row['label'], row['label2']))
    print(costa_result_df['Chunk 1'].to_string())
    print(costa_result_df['Context 1'].to_string())
    print(costa_result_df['Comment'].to_string())
    print()
    print(row['text'])
    print("\n--------\n")

/mask.go
github.com/gorilla/websocket
43.0
262    gorilla/websocket/mask.go
262    gorilla/websocket

pointer-arithmetic / efficiency
262    Performance Optimization
262    Program
262    NaN

*(*uintptr)(unsafe.Pointer(uintptr(unsafe.Pointer(&b[0])) + uintptr(i))) ^= kw

--------

proto/pointer_unsafe.go
github.com/golang/protobuf
83.0
231    golang/protobuf/proto/pointer_unsafe.go
231    golang/protobuf

cast-pointer / efficiency
231    NaN
231    NaN
231    NaN

return pointer{p: (*[2]unsafe.Pointer)(unsafe.Pointer(i))[1]}

--------

codec/helper_unsafe.go
github.com/ugorji/go
199.0
549    ugorji/go/codec/helper_unsafe.go
549    ugorji/go

cast-struct / serialization
549    NaN
549    NaN
549    NaN

urv := (*unsafeReflectValue)(unsafe.Pointer(&rv))

--------

/alloc.go
github.com/yuin/gopher-lua
62.0
585    yuin/gopher-lua/alloc.go
585    yuin/gopher-lua

cast-header / efficiency
585    NaN
585    NaN
585    NaN

al.fheader = (*reflect.SliceHeader)(unsafe.Pointer(&al.fptrs))

-----

## Vet comparison

In [139]:
# Columns that together identify one source location of one module version;
# findings repeated for the same location are collapsed into one.
vet_location_columns = ['module_version', 'module_path', 'package_import_path',
                        'file_name', 'line_number', 'column']

# De-duplicate first, then drop every row that still contains a missing value.
dedup_vet_df = vet_df.drop_duplicates(subset=vet_location_columns).dropna()

# Only the findings carrying the unsafe.Pointer misuse message.
dedup_vet_df.loc[lambda frame: frame['message'] == 'possible misuse of unsafe.Pointer']

Unnamed: 0,message,context,line_number,column,raw_output,file_name,file_loc,file_byte_size,package_import_path,module_path,module_version,project_name,file_copy_path
0,possible misuse of unsafe.Pointer,"\t\t\tif !CompareAndSwapPointer(&vp.typ, nil, ...",59,44,/usr/local/go/src/sync/atomic/value.go:59:44: ...,value.go,86,2446,sync/atomic,std,std,kubernetes/kubernetes,sync/atomic/value.go
1,possible misuse of unsafe.Pointer,\treturn unsafe.Pointer(x ^ 0),287,9,/root/go/pkg/mod/github.com/modern-go/reflect2...,reflect2.go,298,7272,github.com/modern-go/reflect2,github.com/modern-go/reflect2,v1.0.1,kubernetes/kubernetes,github.com/modern-go/reflect2/reflect2.go
3,possible misuse of unsafe.Pointer,\tsize := *(*uintptr)(unsafe.Pointer(ptr + uns...,56,22,/usr/local/go/src/runtime/alg.go:56:22: possib...,alg.go,389,10428,runtime,std,std,kubernetes/kubernetes,runtime/alg.go
4,possible misuse of unsafe.Pointer,\told := unsafe.Pointer(sync_atomic_SwapUintpt...,63,9,/usr/local/go/src/runtime/atomic_pointer.go:63...,atomic_pointer.go,77,2683,runtime,std,std,kubernetes/kubernetes,runtime/atomic_pointer.go
5,possible misuse of unsafe.Pointer,"\t\treturn unsafe.Pointer(ret), 0",42,10,/usr/local/go/src/runtime/cgo_mmap.go:42:10: p...,cgo_mmap.go,67,2434,runtime,std,std,kubernetes/kubernetes,runtime/cgo_mmap.go
...,...,...,...,...,...,...,...,...,...,...,...,...,...
113554,possible misuse of unsafe.Pointer,"\t\t\tval := reflect.NewAt(t.t.Type, unsafe.Po...",491,35,/root/go/pkg/mod/gorgonia.org/tensor@v0.9.6/ar...,array_getset.go,763,15284,gorgonia.org/tensor,gorgonia.org/tensor,v0.9.6,gorgonia/gorgonia,gorgonia.org/tensor/array_getset.go
113555,possible misuse of unsafe.Pointer,"\t\t\tval := reflect.NewAt(t.t.Type, unsafe.Po...",756,35,/root/go/pkg/mod/gorgonia.org/tensor@v0.9.6/ar...,array_getset.go,763,15284,gorgonia.org/tensor,gorgonia.org/tensor,v0.9.6,gorgonia/gorgonia,gorgonia.org/tensor/array_getset.go
113556,possible misuse of unsafe.Pointer,\t\t\ttt.array.Ptr = unsafe.Pointer(ptr),150,19,/root/go/pkg/mod/gorgonia.org/tensor@v0.9.6/co...,consopt.go,263,6384,gorgonia.org/tensor,gorgonia.org/tensor,v0.9.6,gorgonia/gorgonia,gorgonia.org/tensor/consopt.go
113557,possible misuse of unsafe.Pointer,\t\treturn (*Dense)(unsafe.Pointer(t.viewOf)),371,19,/root/go/pkg/mod/gorgonia.org/tensor@v0.9.6/de...,dense.go,625,13429,gorgonia.org/tensor,gorgonia.org/tensor,v0.9.6,gorgonia/gorgonia,gorgonia.org/tensor/dense.go


## Running my data acquisition tool against the Costa project versions for 10 projects

Projects and versions:

 - jetstack/cert-manager: `0.11.0-alpha.0`
 - kubernetes/kubernetes: `v1.16.1`
 - golang/mobile: `6d0d39b`
 - TykTechnologies/tyk: `v2.8.5`
 - elastic/beats: `v7.4.0`
 - golang/tools: `c337991`
 - peterq/pan-light: `482eb093f`
 - cilium/cilium: `v1.6.2`
 - go-delve/delve: `v1.3.0`
 - ethereum/go-ethereum: `v1.9.5`

In [165]:
# geiger findings from the separate data-costa-comparison directory (file range 0..9).
geiger_costa_reproduction_df = pd.read_csv(
    '/root/data-costa-comparison/geiger/geiger_findings_0_9.csv'
)
# Package-level data from the same directory.
packages_costa_reproduction_df = pd.read_csv(
    '/root/data-costa-comparison/packages_0_9.csv'
)

In [166]:
# Number of distinct projects in the reproduction data. The count is taken from
# the packages frame loaded in the previous cell, because no visible cell
# defines a `projects_costa_reproduction_df`.
# NOTE(review): this equals the number of analysed projects only if every
# project contributed at least one package row — confirm.
packages_costa_reproduction_df['project_name'].nunique()

10

In [187]:
# Attach project metadata to every reproduction package row.
# NOTE(review): the metadata (including project_root_module) comes from
# projects_df, i.e. the main data set, not from a reproduction-specific
# project table — confirm that the root modules are the same for the older
# project versions.
costa_reproduction_mdf = packages_costa_reproduction_df.merge(
    projects_df, how='left', on='project_name', validate='many_to_one')

# Non-standard packages in the project's own root module: per project, count
# the packages whose package_unsafe_sum is non-zero.
costa_reproduction_df1 = (
    costa_reproduction_mdf
    .loc[lambda frame: frame['module_path'].eq(frame['project_root_module'])
                       & frame['is_standard'].eq(False)]
    .groupby('project_name')['package_unsafe_sum']
    .agg(project_unsafe_pkg_count=lambda sums: np.count_nonzero(sums))
    .reset_index()
    .sort_values(by='project_unsafe_pkg_count', ascending=False)
)

# Same count for non-standard packages from any other module (dependencies).
costa_reproduction_df2 = (
    costa_reproduction_mdf
    .loc[lambda frame: frame['module_path'].ne(frame['project_root_module'])
                       & frame['is_standard'].eq(False)]
    .groupby('project_name')['package_unsafe_sum']
    .agg(dep_unsafe_pkg_count=lambda sums: np.count_nonzero(sums))
    .reset_index()
)

# Left join keeps every project of the first frame; a missing dependency count becomes 0.
costa_reproduction_df = costa_reproduction_df1.merge(
    costa_reproduction_df2, how='left', on='project_name', validate='one_to_one').fillna(0)
costa_reproduction_df['total_unsafe_pkg_count'] = (
    costa_reproduction_df['project_unsafe_pkg_count'] + costa_reproduction_df['dep_unsafe_pkg_count'])


# Per-project comparison for the reproduction run: the sum of my geiger count
# columns over the project's root-module packages versus the '#' value from
# the Costa per-project table.
comparison_projects = costa_reproduction_df['project_name'].unique()

# geiger count columns that are added up into a single per-project total.
reproduction_geiger_count_columns = [
    'package_geiger_unsafe_pointer_sum',
    'package_geiger_unsafe_sizeof_sum',
    'package_geiger_unsafe_offsetof_sum',
    'package_geiger_unsafe_alignof_sum',
]

# Records are collected in a list and converted to a frame once at the end:
# DataFrame.append is deprecated (removed in pandas 2.0) and copies the whole
# frame on every call.
reproduction_comparison_records = []

for project in comparison_projects:
    # Only projects that also appear in the Costa curated set can be compared.
    if not (costa_data_3k_curated_projects_df['full_name'] == project).any():
        continue

    costa_unsafe_df = costa_data_unsafe_usage_per_project_df[
        costa_data_unsafe_usage_per_project_df['Project'] == project]
    # A curated project that is missing from the usage table is counted as 0.
    costa_count = costa_unsafe_df.iloc[0]['#'] if len(costa_unsafe_df) > 0 else 0

    project_root_module = projects_df[projects_df['project_name'] == project].iloc[0]['project_root_module']
    # Restrict to packages of the project's own root module (dependencies excluded).
    my_count_df = packages_costa_reproduction_df[
        (packages_costa_reproduction_df['project_name'] == project)
        & (packages_costa_reproduction_df['module_path'] == project_root_module)]
    my_count = my_count_df[reproduction_geiger_count_columns].sum().sum()

    reproduction_comparison_records.append({
        'project_name': project,
        'my_count': my_count,
        'costa_count': costa_count,
        'difference': my_count - costa_count,
        'difference_abs': abs(my_count - costa_count),
    })

comparison_df = pd.DataFrame(
    reproduction_comparison_records,
    columns=['project_name', 'my_count', 'costa_count', 'difference', 'difference_abs'])

comparison_df

Unnamed: 0,project_name,my_count,costa_count,difference,difference_abs
0,kubernetes/kubernetes,1688.0,2058.0,-370.0,370.0
0,go-delve/delve,40.0,72.0,-32.0,32.0
0,golang/tools,36.0,113.0,-77.0,77.0
0,golang/mobile,92.0,211.0,-119.0,119.0
0,jetstack/cert-manager,136.0,140.0,-4.0,4.0
0,peterq/pan-light,0.0,64.0,-64.0,64.0


In [183]:
costa_reproduction_mdf\
    [(costa_reproduction_mdf['module_path']==costa_reproduction_mdf['project_root_module'])&(costa_reproduction_mdf['is_standard']==False)]\
    ['project_name'].unique()

array(['kubernetes/kubernetes', 'go-delve/delve', 'peterq/pan-light',
       'golang/tools', 'golang/mobile', 'jetstack/cert-manager'],
      dtype=object)