# Analyse AV Classification (Duplicates)

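This notebook tries a different approach to analysing the FP & FN rates. For every sample that was submitted more than once, it keeps only the first and the last submission; then, for each vendor, it checks whether the classification changed between the two. If it did, the first classification counts as a FP (detected first, clean later) or a FN (clean first, detected later).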

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display

# Load the AV results; every column is read as a string so the vendor columns
# are compared as text later on.
pe32_av = pd.read_csv('data/pe32_static_av.csv', dtype=str)
pe32_av['date'] = pd.to_datetime(pe32_av['date'], format='%Y/%m/%d')
# Set date as index and order the rows chronologically. keep='first' and
# keep='last' below are purely positional, so without this sort the "first" and
# "last" submission would depend on the row order of the CSV. The sort is stable
# (mergesort), so rows sharing a date keep their original file order.
pe32_av = pe32_av.set_index('date').sort_index(kind='mergesort')

# Keep every row whose md5 occurs more than once
pe32_av_dups = pe32_av[pe32_av.duplicated(subset='md5', keep=False)]

# Earliest submission of each duplicated md5
pe32_av_first = pe32_av_dups.drop_duplicates(subset='md5', keep='first')
# Latest submission of each duplicated md5
pe32_av_last = pe32_av_dups.drop_duplicates(subset='md5', keep='last')
# Stack them: all first submissions, followed by all last submissions
pe32_av_dups = pd.concat([pe32_av_first, pe32_av_last])
# Release the intermediates; only pe32_av_dups is used from here on
del pe32_av
del pe32_av_first
del pe32_av_last

In [2]:
# Number of rows (first + last submissions) the presence ratio is measured against
data_size = len(pe32_av_dups)
# Every column after the first two is treated as a vendor column
vendors = pe32_av_dups.columns.values[2:]

# Non-null entries per vendor column
vendors_presence = pe32_av_dups[vendors].count()
# Fraction of rows in which each vendor has an entry
presence_ratio = vendors_presence.divide(data_size)

# Vendors that appear in over 1% of submissions
vendors_filtered = vendors_presence[presence_ratio > 0.01]
# Vendors at or below the threshold are removed from the frame
rare_vendors = vendors_presence[presence_ratio <= 0.01].keys()
pe32_av_dups = pe32_av_dups.drop(columns=rare_vendors)

# Drop samples that no remaining vendor classified
pe32_av_dups = pe32_av_dups.dropna(how='all', subset=vendors_filtered.keys())
# Dropping rows may leave an md5 with a single submission; keep only md5s that
# still have more than one row
still_paired = pe32_av_dups.duplicated(subset='md5', keep=False)
pe32_av_dups = pe32_av_dups[still_paired]

In [3]:
# Vendor columns that survived the presence filter. They are selected by name
# rather than by column position, so a missing value in a metadata column can
# never shift the selection and silently skip a vendor.
vendor_cols = list(vendors_filtered.keys())

# The comparison below pairs one "first" row with one "last" row per md5.
assert pe32_av_dups['md5'].value_counts().eq(2).all(), \
    'expected exactly two submissions (first and last) per md5'

# One row per md5 and one column per vendor, for the first and the last
# submission; the two frames are aligned on md5.
first_verdict = (
    pe32_av_dups
    .drop_duplicates(subset='md5', keep='first')
    .set_index('md5')[vendor_cols]
)
last_verdict = (
    pe32_av_dups
    .drop_duplicates(subset='md5', keep='last')
    .set_index('md5')[vendor_cols]
    .reindex(first_verdict.index)
)

# A vendor is only scored for a sample it classified in both submissions
scored = first_verdict.notna() & last_verdict.notna()
# Compare against 'clean' only, so that a change of family name between the
# two submissions is not treated as a change of classification
first_clean = first_verdict.eq('clean')
last_clean = last_verdict.eq('clean')

# True negative: clean in both submissions
tn_count = (scored & first_clean & last_clean).sum()
# True positive: not clean in both submissions
tp_count = (scored & ~first_clean & ~last_clean).sum()
# False negative: clean at first, not clean in the last submission
fn_count = (scored & first_clean & ~last_clean).sum()
# False positive: not clean at first, clean in the last submission
fp_count = (scored & ~first_clean & last_clean).sum()

# fp_rate = FP / (TP + FP): share of a vendor's initial detections that it later
# reported as clean. fn_rate = FN / (TN + FN): share of a vendor's initial
# 'clean' verdicts that it later reported as not clean.
# NOTE: the denominators are the totals of the *first* verdict, so these are not
# the textbook FP / (FP + TN) and FN / (FN + TP) rates.
# A vendor with an empty denominator gets NaN and is excluded by describe().
fp_rate = fp_count.divide(tp_count.add(fp_count))
fn_rate = fn_count.divide(tn_count.add(fn_count))
display(fp_rate.describe())
display(fn_rate.describe())
display(fp_rate.sort_values())
display(fn_rate.sort_values())

count    65.000000
mean      0.027799
std       0.023955
min       0.000000
25%       0.012897
50%       0.022012
75%       0.038776
max       0.119064
dtype: float64

count    66.000000
mean      0.133312
std       0.073302
min       0.000000
25%       0.075604
50%       0.144509
75%       0.192150
max       0.252991
dtype: float64

esafe                   0.000000
crowdstrike             0.000000
invincea                0.000000
pctools                 0.001996
thehacker               0.002084
antivir                 0.003908
norman                  0.004983
eset-nod32              0.005726
vipre                   0.006173
comodo                  0.006384
microsoft               0.008216
avg                     0.008248
f-prot                  0.008346
fortinet                0.009723
agnitum                 0.009964
avware                  0.010732
malwarebytes            0.012897
baidu                   0.013353
vba32                   0.013484
nano-antivirus          0.013878
yandex                  0.014035
ahnlab-v3               0.014410
totaldefense            0.015371
sophos                  0.017241
drweb                   0.017395
mcafee                  0.017564
k7gw                    0.017899
commtouch               0.018231
avira                   0.018832
k7antivirus             0.019117
          

invincea                0.000000
alibaba                 0.000000
bytehero                0.003464
cmc                     0.003898
baidu                   0.005734
zoner                   0.006954
esafe                   0.009768
crowdstrike             0.024096
thehacker               0.027625
superantispyware        0.040714
clamav                  0.041922
kingsoft                0.047360
pctools                 0.057735
bkav                    0.058516
totaldefense            0.059256
aegislab                0.073466
f-prot                  0.074864
jiangmin                0.077824
vba32                   0.078572
yandex                  0.083391
rising                  0.092004
commtouch               0.098646
zillya                  0.106533
virobot                 0.108888
cat-quickheal           0.111530
agnitum                 0.125721
nprotect                0.126374
tencent                 0.131981
comodo                  0.132579
qihoo-360               0.136033
          