In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import glob
from parse_log import parse
import pprint
%pprint

In [3]:
# Get all log files in directory
EXPERIMENTS_DIR = 'experiments'
logfile = open(f'{EXPERIMENTS_DIR}/experiments.log', 'r')
parsed_logs = parse(logfile)
print(len(parsed_logs))

1188


In [31]:
df = pd.DataFrame(parsed_logs)


def n_files_flagged_ge_1(df):
    return df[df["flagged"] >= 1].groupby(["algorithm"])["conversations"].count()

def print_results(section):

    print("Number of conversations flagged:")
    n_flagged = section.groupby(["algorithm"])["flagged"].sum()
    print(n_flagged)

    print("Number of total conversations:")
    print("_"*50)
    n_conversations = section.groupby(["algorithm"])["conversations"].sum()
    print(n_conversations)
    print("_"*50)

    print("% of conversations flagged:")
    percentage = ((n_flagged / n_conversations) * 100).round(2)
    print(percentage)
    print("_"*50)

    print("Number of files with conversations flagged >= 1:")
    n_files_ge_1 = section[section["flagged"] >= 1].groupby(["algorithm"])["conversations"].count()
    print(n_files_ge_1)
    print("_"*50)

    print("Total number of files:")
    n_files = section.groupby(["algorithm"])["conversations"].count()
    print(n_files)
    print("_"*50)

    print("% of files flagged (>= 1):")
    percentage_files = ((n_files_ge_1 / n_files) * 100).round(0)
    print(f"{percentage_files}%")
    

# ISCX

In [32]:
# ISCX: VPN Connections 
section = df[df["file"].apply(lambda x: "ISCX" in x and "NonVPN" not in x)]
#print_results(section)
## ISCX: NonVPN Connections 
section = df[df["file"].apply(lambda x: "ISCX" in x and "NonVPN" in x)]
#print_results(section)

In [38]:
def confusion_matrix(df, vpnfilter, nonvpnfilter):
    nonvpn = df[df["file"].apply(nonvpnfilter)]
    vpn = df[df["file"].apply(vpnfilter)]

    print(f"TOTAL conversations (vpn): {vpn['conversations'].sum()}")
    print(f"TOTAL conversations (nonvpn): {nonvpn['conversations'].sum()}")

    print("Total number of files (vpn):")
    n_files = vpn.groupby(["algorithm"])["conversations"].count()
    print(n_files)
    print("_"*50)

    print("Total number of files (nonvpn):")
    n_files = nonvpn.groupby(["algorithm"])["conversations"].count()
    print(n_files)
    print("_"*50)

    non_vpn_total = nonvpn.groupby(["algorithm"])["conversations"].count()["VPN Detection ACK"]
    vpn_total = vpn.groupby(["algorithm"])["conversations"].count()["VPN Detection ACK"]
    total = non_vpn_total + vpn_total

    non_vpn_flagged_ge_1 = n_files_flagged_ge_1(nonvpn)
    non_vpn_flagged_ge_1_inv = non_vpn_total - non_vpn_flagged_ge_1
    vpn_flagged_ge_1 = n_files_flagged_ge_1(vpn)
    vpn_flagged_ge_1_inv = vpn_total - vpn_flagged_ge_1

    percentage_non_vpn = ((non_vpn_flagged_ge_1 / total) * 100).round(2)
    percentage_non_vpn_inv = ((non_vpn_flagged_ge_1_inv / total) * 100).round(2)
    percentage_vpn = ((vpn_flagged_ge_1 / total) * 100).round(2)
    percentage_vpn_inv = ((vpn_flagged_ge_1_inv / total) * 100).round(2)

    print(f"vpn_flagged_ge_1:\n{vpn_flagged_ge_1}")
    print("-"*50)
    print(f"vpn_flagged_ge_1_inv:\n{vpn_flagged_ge_1_inv}")
    print("-"*50)

    print(f"percentage_vpn: \n{percentage_vpn}")
    print("-"*50)
    print(f"percentage_vpn_inv \n{percentage_vpn_inv}")
    print("-"*50)

    print(f"non_vpn_flagged_ge_1:\n{non_vpn_flagged_ge_1}")
    print("-"*50)
    print(f"non_vpn_flagged_ge_1_inv:\n{non_vpn_flagged_ge_1_inv}")
    print("-"*50)
    
    print(f"percentage_non_vpn: \n{percentage_non_vpn}")
    print("-"*50)
    print(f"percentage_non_vpn_inv \n{percentage_non_vpn_inv}")
    print("-"*50)

    data = {
        "VPN": [({"flagged": vpn_flagged_ge_1, "percentage": percentage_vpn}, {"flagged": vpn_flagged_ge_1_inv, "percentage": percentage_vpn_inv})],
        "NonVPN": [({"flagged": non_vpn_flagged_ge_1, "percentage": percentage_non_vpn}, {"flagged": non_vpn_flagged_ge_1_inv, "percentage": percentage_non_vpn_inv})]
    }
    return data

In [40]:
#data = confusion_matrix(df, vpnfilter=lambda x: "ISCX" in x and "NonVPN" not in x, nonvpnfilter=lambda x: "ISCX" in x and "NonVPN" in x)
data = confusion_matrix(df, vpnfilter=lambda x: "VNAT_VPN" in x and "nonvpn" not in x, nonvpnfilter=lambda x: "VNAT_VPN" in x and "nonvpn" in x)


TOTAL conversations (vpn): 1516
TOTAL conversations (nonvpn): 84676
Total number of files (vpn):
algorithm
VPN Detection ACK                             77
VPN Detection ACK with MOD improvement        77
VPN Detection Opcode                          77
VPN Detection Opcode with XOR optimization    77
Name: conversations, dtype: int64
__________________________________________________
Total number of files (nonvpn):
algorithm
VPN Detection ACK                             80
VPN Detection ACK with MOD improvement        80
VPN Detection Opcode                          80
VPN Detection Opcode with XOR optimization    80
Name: conversations, dtype: int64
__________________________________________________
vpn_flagged_ge_1:
Series([], Name: conversations, dtype: int64)
--------------------------------------------------
vpn_flagged_ge_1_inv:
Series([], Name: conversations, dtype: int64)
--------------------------------------------------
percentage_vpn: 
Series([], Name: conversations, dtype:

In [30]:
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(data)


{   'NonVPN': [   (   {   'flagged': algorithm
VPN Detection ACK                         23
VPN Detection ACK with MOD improvement    14
VPN Detection Opcode                      26
Name: conversations, dtype: int64,
                          'percentage': algorithm
VPN Detection ACK                         17.42
VPN Detection ACK with MOD improvement    10.61
VPN Detection Opcode                      19.70
Name: conversations, dtype: float64},
                      {   'flagged': algorithm
VPN Detection ACK                         79
VPN Detection ACK with MOD improvement    88
VPN Detection Opcode                      76
Name: conversations, dtype: int64,
                          'percentage': algorithm
VPN Detection ACK                         59.85
VPN Detection ACK with MOD improvement    66.67
VPN Detection Opcode                      57.58
Name: conversations, dtype: float64})],
    'VPN': [   (   {   'flagged': algorithm
VPN Detection ACK                              9
VPN Det