In [1]:
import ipaddress
import os

import pandas as pd, numpy as np

In [2]:
# Column names for the flow CSV. They are applied positionally when the file is
# read (the file's own header row is skipped), so the order of this list must
# match the column order of the CSV.
names = ['SrcIP', 'SrcPort', 'DstIP', 'DstPort',
       'Proto', 'Proto_l7', 'Bytes_in', 'Pkts_in', 'Bytes_out', 'Pkts_out',
       'TCP_Flag', 'Client_TCP_Flag', 'Server_TCP_Flag',
       'Duration(ms)', 'Duration_in', 'Duration_out', 'TTL_min',
       'TTL_max', 'MaxPkts', 'MinPkts', 'MinLen',
       'MaxLen', 'BytSec_SrcDst', 'BytSec_DstSrc',
       'RetrBytes_in', 'RetrPkts_in',
       'RetrBytes_out', 'RetrPkts_out',
       'Throughput_SrcDst', 'Throughput_DstSrc',
       'Pkt <128 Byt', 'Pkt 128<Byt<256',
       'Pkt 256<Byt<512', 'Pkt 512<Byt<1024',
       'Pkt 1024<Byt<1514', 'MaxTCPWin_in', 'MaxTCPWin_out',
       'ICMP', 'ICMP_IPV4', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE',
       'DNS_TTL_ANSWER', 'FTP_COMMAND_RET_CODE', 'Nature', 'Label']

In [3]:
source_folder = "source_folder" # folder that contains the CSV file(s) obtained after downloading and extracting the dataset
output_folder = "../data/UF-NB15/flows/" # the output of this notebook will be saved in this folder

file_names = [
    "UFNB15.csv"
]

# Read every input file first and concatenate once at the end: growing a
# DataFrame with pd.concat inside the loop re-copies all previously read rows
# on every iteration.
frames = []
total_rows = 0

for f in file_names:
    # os.path.join inserts the path separator when source_folder does not end
    # with one (plain string concatenation produced "source_folderUFNB15.csv").
    input_file = os.path.join(source_folder, f)
    # names= supplies the column names; skiprows=1 drops the file's own header row.
    temp_df = pd.read_csv(input_file, names=names, skiprows=1, encoding = 'cp1252')
    print("Read {} lines...".format(len(temp_df)))
    frames.append(temp_df)
    total_rows += len(temp_df)
    print("\t...total length: {}".format(total_rows))

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print("...done!")

Read 2390275 lines...
	...total length: 2390275
...done!


In [4]:
# Display the column index of the loaded frame.
df.columns

Index(['SrcIP', 'SrcPort', 'DstIP', 'DstPort', 'Proto', 'Proto_l7', 'Bytes_in',
       'Pkts_in', 'Bytes_out', 'Pkts_out', 'TCP_Flag', 'Client_TCP_Flag',
       'Server_TCP_Flag', 'Duration(ms)', 'Duration_in', 'Duration_out',
       'TTL_min', 'TTL_max', 'MaxPkts', 'MinPkts', 'MinLen', 'MaxLen',
       'BytSec_SrcDst', 'BytSec_DstSrc', 'RetrBytes_in', 'RetrPkts_in',
       'RetrBytes_out', 'RetrPkts_out', 'Throughput_SrcDst',
       'Throughput_DstSrc', 'Pkt <128 Byt', 'Pkt 128<Byt<256',
       'Pkt 256<Byt<512', 'Pkt 512<Byt<1024', 'Pkt 1024<Byt<1514',
       'MaxTCPWin_in', 'MaxTCPWin_out', 'ICMP', 'ICMP_IPV4', 'DNS_QUERY_ID',
       'DNS_QUERY_TYPE', 'DNS_TTL_ANSWER', 'FTP_COMMAND_RET_CODE', 'Nature',
       'Label'],
      dtype='object')

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,SrcIP,SrcPort,DstIP,DstPort,Proto,Proto_l7,Bytes_in,Pkts_in,Bytes_out,Pkts_out,...,MaxTCPWin_in,MaxTCPWin_out,ICMP,ICMP_IPV4,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Nature,Label
0,59.166.0.5,1305,149.171.126.8,21,6,1.0,9,1,193,3,...,0,7240,0,0,0,0,0,331.0,0,Benign
1,59.166.0.5,1305,149.171.126.8,21,6,1.0,261,5,469,7,...,8688,8688,18944,74,0,0,0,230.0,0,Benign
2,59.166.0.5,1305,149.171.126.8,21,6,1.0,481,9,750,11,...,10136,10136,33792,132,0,0,0,229.0,0,Benign
3,59.166.0.5,1305,149.171.126.8,21,6,1.0,701,13,1054,15,...,11584,11584,48640,190,0,0,0,125.0,0,Benign
4,59.166.0.5,1305,149.171.126.8,21,6,1.0,1031,19,1474,21,...,14480,13032,64256,251,0,0,0,230.0,0,Benign


In [6]:
# List the distinct label values present in the data before any renaming.
df['Label'].unique()

array(['Benign', 'Exploits', 'Generic', 'Fuzzers', 'Backdoor', 'DoS',
       'Reconnaissance', 'Shellcode', 'Worms', 'Analysis'], dtype=object)

In [7]:
# Rows whose 'Nature' flag is 0 are labelled BENIGN regardless of their textual label.
df['Label'] = np.where(df['Nature']==0, 'BENIGN', df['Label'])

# (substring to look for, replacement label), applied in this order.
label_renames = [
    ('Reconnaissance', 'recon'),
    ('Backdoor', 'bdoor'),
    ('Fuzzers', 'fuzz'),
    ('Shellcode', 'shell'),
    ('DoS', 'dos'),
    ('Generic', 'other'),
    ('Worms', 'worm'),
    ('Analysis', 'ana'),
    ('Exploits', 'expl'),
]

for pattern, short_label in label_renames:
    # na=False: str.contains yields NaN for a missing label and np.where treats
    # NaN as true, which would silently relabel such rows as the first class.
    # regex=False: the patterns are plain substrings, not regular expressions.
    matches = df['Label'].str.contains(pattern, regex=False, na=False)
    df['Label'] = np.where(matches, short_label, df['Label'])


print(df['Label'].unique())
print(df['SrcIP'].unique())

['BENIGN' 'expl' 'other' 'fuzz' 'bdoor' 'dos' 'recon' 'shell' 'worm' 'ana']
['59.166.0.5' '175.45.176.1' '59.166.0.0' '59.166.0.7' '59.166.0.3'
 '59.166.0.2' '59.166.0.1' '59.166.0.6' '59.166.0.9' '59.166.0.4'
 '59.166.0.8' '175.45.176.3' '149.171.126.6' '149.171.126.9'
 '149.171.126.2' '149.171.126.1' '149.171.126.5' '149.171.126.15'
 '149.171.126.10' '149.171.126.19' '149.171.126.3' '149.171.126.13'
 '10.40.85.1' '10.40.182.1' '175.45.176.0' '149.171.126.4' '175.45.176.2'
 '10.40.85.10' '149.171.126.11' '149.171.126.14' '149.171.126.17'
 '149.171.126.12' '149.171.126.7' '10.40.182.6' '149.171.126.18'
 '149.171.126.0' '149.171.126.16' '192.168.241.243' '149.171.126.8'
 '127.0.0.1']


In [8]:
def fixValues(df):
    """Replace NaN and infinite values in the numeric columns of ``df``.

    For every integer or floating-point column (any width):
      * +inf and -inf are replaced with the column's largest finite value;
      * NaN is replaced with the mean of the column's finite values.

    Columns that contain only finite values are left untouched, and so are
    columns without a single finite value (there is nothing to derive a
    replacement from).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place (no copy is made).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    x = df  # deliberately no copy: callers get the same frame back
    for c in x.columns:
        col = x[c]
        # Accept signed/unsigned integers and floats of any width; comparing the
        # dtype with 'int' / 'float' only matched the platform-default widths.
        if not (isinstance(col.dtype, np.dtype) and col.dtype.kind in "iuf"):
            continue

        values = np.asarray(col, dtype=np.float64)
        finite = values[np.isfinite(values)]
        if finite.size == values.size or finite.size == 0:
            continue

        mean_value = finite.mean()
        max_value = finite.max()
        # Assign the result back to the frame: an in-place replace on the
        # selected column is not guaranteed to write through to the frame.
        x[c] = col.replace([np.inf, -np.inf], max_value).fillna(mean_value)

    return x


def _port_category(port_numbers):
    """Classify numeric ports as 'none', 'well-known', 'registered' or 'dynamic'."""
    conditions = [
        port_numbers == -1,
        (port_numbers >= 0) & (port_numbers <= 1023),
        (port_numbers >= 1024) & (port_numbers <= 49151),
        port_numbers > 49151,
    ]
    choices = ['none', 'well-known', 'registered', 'dynamic']
    # np.select's implicit default is the integer 0; combined with string
    # choices that is rejected by newer NumPy versions. Values matching no
    # condition (e.g. negative numbers other than -1) are reported as 'none'.
    return np.select(conditions, choices, default='none')


def uniformPorts(df, srcPort_name, dstPort_name):
    """Add numeric port columns and port-range categories to ``df``.

    The source/destination port columns are renamed to 'SrcPort' / 'DstPort'.
    Four columns are then added:
      * 'SrcPort_num' / 'DstPort_num': the port converted to a number, with -1
        wherever the value could not be parsed;
      * 'SrcPort_type' / 'DstPort_type': 'none' (-1), 'well-known' (0-1023),
        'registered' (1024-49151) or 'dynamic' (above 49151).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the port columns. It is modified in place.
    srcPort_name, dstPort_name : str
        Current names of the source and destination port columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df.rename({srcPort_name: 'SrcPort', dstPort_name: 'DstPort'}, axis=1, inplace=True)

    # errors='coerce' turns unparseable values into NaN, which then becomes -1.
    # The result is assigned back rather than replaced in place on a column
    # selection, which is not guaranteed to write through to the frame.
    df['SrcPort_num'] = pd.to_numeric(df['SrcPort'], errors='coerce').fillna(-1)
    df['DstPort_num'] = pd.to_numeric(df['DstPort'], errors='coerce').fillna(-1)

    df['SrcPort_type'] = _port_category(df['SrcPort_num'])
    df['DstPort_type'] = _port_category(df['DstPort_num'])

    return df

def _ip_in_networks(ip_column, networks):
    """Return a boolean Series: True where the address lies in any of ``networks``."""
    # Parse each distinct address once and test it with the network's own
    # membership check, instead of parsing every row and expanding the network
    # into the full list of its addresses.
    lookup = {
        raw: any(ipaddress.ip_address(raw) in net for net in networks)
        for raw in ip_column.unique()
    }
    return ip_column.map(lookup).astype(bool)


def uniformIP(df, srcIP_name, dstIP_name, internal_network, *, internal_network2 = None):
    """Flag source and destination addresses as internal or external.

    The address columns are renamed to 'SrcIP' / 'DstIP' and two boolean
    columns, 'SrcIP_internal' and 'DstIP_internal', are added. An address is
    internal when it belongs to ``internal_network`` or, if given, to
    ``internal_network2``. The number of rows in each of the four
    internal/external combinations is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the address columns. It is modified in place.
    srcIP_name, dstIP_name : str
        Current names of the source and destination address columns.
    internal_network : str
        Network in a form accepted by ``ipaddress.ip_network``.
    internal_network2 : str, optional
        Second internal network (keyword-only).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    ValueError
        If a network or an address cannot be parsed by ``ipaddress``.
    """
    df.rename({srcIP_name: 'SrcIP', dstIP_name: 'DstIP'}, axis=1, inplace=True)

    networks = [ipaddress.ip_network(internal_network)]
    if internal_network2 is not None:
        networks.append(ipaddress.ip_network(internal_network2))

    src_internal = _ip_in_networks(df['SrcIP'], networks)
    dst_internal = _ip_in_networks(df['DstIP'], networks)

    # The column order differs between the two cases; it is kept as it always
    # was so that the layout of frames (and files written from them) is stable.
    if internal_network2 is None:
        df['SrcIP_internal'] = src_internal
        df['DstIP_internal'] = dst_internal
    else:
        df['DstIP_internal'] = dst_internal
        df['SrcIP_internal'] = src_internal

    # check internal/external: count rows per combination without copying them
    int_int = int((src_internal & dst_internal).sum())
    int_ext = int((src_internal & ~dst_internal).sum())
    ext_int = int((~src_internal & dst_internal).sum())
    ext_ext = int((~src_internal & ~dst_internal).sum())

    print("int_int = {}\n int_ext = {}\n ext_int = {}\n ext_ext = {}".format(int_int, int_ext, ext_int, ext_ext))

    return df

In [9]:
# Remove spaces and apostrophes from the column names, then repair NaN/inf values.
df.columns = df.columns.str.replace(' ', '').str.replace("'", "")
df = fixValues(df)

# Per-column flag: True where the column still holds at least one missing value.
columns_with_nan = df.isna().any()
print(columns_with_nan.any())
print(df.columns[columns_with_nan])

False
Index([], dtype='object')


In [10]:
# Names of the port and address columns as they appear in the frame.
srcPort_name, dstPort_name = 'SrcPort', 'DstPort'
srcIP_name, dstIP_name = 'SrcIP', 'DstIP'

# The two address ranges handed to uniformIP as "internal" networks.
internal_network1 = "59.166.0.0/16"
internal_network2 = "175.45.0.0/16"

df = uniformPorts(df, srcPort_name=srcPort_name, dstPort_name=dstPort_name)
df = uniformIP(
    df,
    srcIP_name,
    dstIP_name,
    internal_network=internal_network1,
    internal_network2=internal_network2,
)
df.head()

int_int = 0
 int_ext = 2371916
 ext_int = 17499
 ext_ext = 860


Unnamed: 0,SrcIP,SrcPort,DstIP,DstPort,Proto,Proto_l7,Bytes_in,Pkts_in,Bytes_out,Pkts_out,...,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Nature,Label,SrcPort_num,DstPort_num,SrcPort_type,DstPort_type,DstIP_internal,SrcIP_internal
0,59.166.0.5,1305,149.171.126.8,21,6,1.0,9,1,193,3,...,0,331.0,0,BENIGN,1305,21,registered,well-known,False,True
1,59.166.0.5,1305,149.171.126.8,21,6,1.0,261,5,469,7,...,0,230.0,0,BENIGN,1305,21,registered,well-known,False,True
2,59.166.0.5,1305,149.171.126.8,21,6,1.0,481,9,750,11,...,0,229.0,0,BENIGN,1305,21,registered,well-known,False,True
3,59.166.0.5,1305,149.171.126.8,21,6,1.0,701,13,1054,15,...,0,125.0,0,BENIGN,1305,21,registered,well-known,False,True
4,59.166.0.5,1305,149.171.126.8,21,6,1.0,1031,19,1474,21,...,0,230.0,0,BENIGN,1305,21,registered,well-known,False,True


In [11]:
# Swap: invert both flags, so every address previously marked internal becomes
# external and vice versa (anything that is not True turns into True).
df['DstIP_internal'] = df['DstIP_internal'].ne(True)
df['SrcIP_internal'] = df['SrcIP_internal'].ne(True)

# Re-check the internal/external split after the inversion.
src_is_internal = df['SrcIP_internal'] == True
dst_is_internal = df['DstIP_internal'] == True

int_int = df.loc[src_is_internal & dst_is_internal]
int_ext = df.loc[src_is_internal & ~dst_is_internal]
ext_int = df.loc[~src_is_internal & dst_is_internal]
ext_ext = df.loc[~src_is_internal & ~dst_is_internal]

group_sizes = [len(group) for group in (int_int, int_ext, ext_int, ext_ext)]
print("int_int = {}\n int_ext = {}\n ext_int = {}\n ext_ext = {}".format(*group_sizes))

int_int = 860
 int_ext = 17499
 ext_int = 2371916
 ext_ext = 0


In [12]:
# Display the column index again, now including the derived port and IP columns.
df.columns

Index(['SrcIP', 'SrcPort', 'DstIP', 'DstPort', 'Proto', 'Proto_l7', 'Bytes_in',
       'Pkts_in', 'Bytes_out', 'Pkts_out', 'TCP_Flag', 'Client_TCP_Flag',
       'Server_TCP_Flag', 'Duration(ms)', 'Duration_in', 'Duration_out',
       'TTL_min', 'TTL_max', 'MaxPkts', 'MinPkts', 'MinLen', 'MaxLen',
       'BytSec_SrcDst', 'BytSec_DstSrc', 'RetrBytes_in', 'RetrPkts_in',
       'RetrBytes_out', 'RetrPkts_out', 'Throughput_SrcDst',
       'Throughput_DstSrc', 'Pkt<128Byt', 'Pkt128<Byt<256', 'Pkt256<Byt<512',
       'Pkt512<Byt<1024', 'Pkt1024<Byt<1514', 'MaxTCPWin_in', 'MaxTCPWin_out',
       'ICMP', 'ICMP_IPV4', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE', 'DNS_TTL_ANSWER',
       'FTP_COMMAND_RET_CODE', 'Nature', 'Label', 'SrcPort_num', 'DstPort_num',
       'SrcPort_type', 'DstPort_type', 'DstIP_internal', 'SrcIP_internal'],
      dtype='object')

In [13]:
# Drop the raw port columns; the numeric versions take over their names.
numeric_port_names = {"SrcPort_num": "SrcPort", "DstPort_num": "DstPort"}
df = df.drop(columns=['SrcPort', 'DstPort']).rename(columns=numeric_port_names)

In [14]:
# List the distinct label values after renaming.
df['Label'].unique()

array(['BENIGN', 'expl', 'other', 'fuzz', 'bdoor', 'dos', 'recon',
       'shell', 'worm', 'ana'], dtype=object)

In [15]:
# Keep the distinct labels (in order of first appearance) for the per-label split below.
label_list = df['Label'].unique()
# Spot-check one entry of the array.
label_list[1]

'expl'

In [16]:
# Split the frame into one sub-frame per label and report the class sizes.
# Each sub-frame is kept in label_frames and also bound to a notebook-level
# variable named "<label>_df" (e.g. BENIGN_df, expl_df).
summa = 0
label_frames = {}
for l in label_list:
    label_frames[l] = df[df['Label'] == l]
    # Bind through the namespace dict rather than exec(): label values come
    # from the data file and must never be executed as source code.
    globals()[f"{l}_df"] = label_frames[l]
    summa = summa + len(label_frames[l])
    print(f'\t {l}: ', len(label_frames[l]))
print("Overall samples: ", len(df))
# Sanity check: the per-label sizes must add up to the overall number of samples.
print(summa)

	 BENIGN:  2295222
	 expl:  31551
	 other:  16560
	 fuzz:  22310
	 bdoor:  2169
	 dos:  5794
	 recon:  12779
	 shell:  1427
	 worm:  164
	 ana:  2299
Overall samples:  2390275
2390275


In [30]:
# Build the paths with os.path.join so they are valid on every platform (a
# hard-coded "\\" separator only works on Windows). The trailing "" keeps a
# separator at the end of the folder path, so it can still be used as a prefix.
malicious_output_folder = os.path.join(output_folder, "malicious", "")

benign_file = os.path.join(output_folder, "benign.csv")

# to_csv does not create missing directories; this creates the output folder
# and its "malicious" sub-folder when they do not exist yet.
os.makedirs(malicious_output_folder, exist_ok=True)

BENIGN_df.to_csv(benign_file, index=False)

# One CSV per malicious class, named after the class label.
malicious_frames = {
    "expl": expl_df,
    "recon": recon_df,
    "dos": dos_df,
    "other": other_df,
    "shell": shell_df,
    "fuzz": fuzz_df,
    "worm": worm_df,
    "bdoor": bdoor_df,
    "ana": ana_df,
}
for label, frame in malicious_frames.items():
    frame.to_csv(os.path.join(malicious_output_folder, label + ".csv"), index=False)