In [1]:
import ipaddress
import os

import numpy as np
import pandas as pd

In [2]:
# Column labels handed to `pd.read_csv(..., names=names)` when the CSV files
# are loaded; the order of the entries therefore has to match the order of the
# fields in those files.
names = [
    "SrcIP", "SrcPort", "DstIP", "DstPort", "Proto", "State", "Duration",
    "SrcBytes", "DstBytes", "SrcTTL", "DstTTL", "SrcLoss", "DstLoss",
    "Service", "SrcBpS", "DstBpS", "SrcPkts", "DstPkts", "SrcWin", "DstWin",
    "SrcTcpb", "DstTcpb", "SrcMean", "DstMean", "Trans_Depth", "res_bdy_len",
    "SrcJit", "DstJit", "FlowStart", "FlowEnd", "SrcAit", "DstAit",
    "TcpRrt", "SynAck", "AckDat",
    "is_sm_ips_ports", "ct_state_ttl", "ct_flw_http_mthd", "is_ftp_login",
    "ct_ftp_cmd", "ct_srv_src", "ct_srv_dst", "ct_dst_ltm", "ct_src_ltm",
    "ct_src_dport", "ct_dst_sport", "ct_dst_src_ltm",
    "Label", "Nature",
]

In [None]:
source_folder = "source_folder" # this is the folder that contains the four CSV files obtained after downloading and extracting the dataset
output_folder = "../data/NB15/flows/" # the output of this notebook will be saved in this folder

file_names = [
    "UNSW-NB15_1.csv",
    "UNSW-NB15_2.csv",
    "UNSW-NB15_3.csv",
    "UNSW-NB15_4.csv",
]

# Read every file first and concatenate once at the end: calling pd.concat
# inside the loop re-copies all previously loaded rows on each iteration.
# (pd.concat is used instead of DataFrame.append, which was removed in pandas 2.0.)
frames = []
total_rows = 0

for f in file_names:
    # os.path.join inserts the path separator that plain string concatenation
    # omitted (it also works if source_folder already ends with one).
    input_file = os.path.join(source_folder, f)
    temp_df = pd.read_csv(input_file, names=names, encoding = 'cp1252')
    print("Read {} lines...".format(len(temp_df)))
    frames.append(temp_df)
    total_rows += len(temp_df)
    print("\t...total length: {}".format(total_rows))

df = pd.concat(frames, ignore_index=True)

print("...done!")

In [5]:
# Display the column labels of the concatenated frame.
df.columns

Index(['SrcIP', 'SrcPort', 'DstIP', 'DstPort', 'Proto', 'State', 'Duration',
       'SrcBytes', 'DstBytes', 'SrcTTL', 'DstTTL', 'SrcLoss', 'DstLoss',
       'Service', 'SrcBpS', 'DstBpS', 'SrcPkts', 'DstPkts', 'SrcWin', 'DstWin',
       'SrcTcpb', 'DstTcpb', 'SrcMean', 'DstMean', 'Trans_Depth',
       'res_bdy_len', 'SrcJit', 'DstJit', 'FlowStart', 'FlowEnd', 'SrcAit',
       'DstAit', 'TcpRrt', 'SynAck', 'AckDat', 'is_sm_ips_ports',
       'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
       'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport',
       'ct_dst_sport', 'ct_dst_src_ltm', 'Label', 'Nature'],
      dtype='object')

In [6]:
# Preview the first rows of the concatenated frame.
df.head()

Unnamed: 0,SrcIP,SrcPort,DstIP,DstPort,Proto,State,Duration,SrcBytes,DstBytes,SrcTTL,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport,ct_dst_sport,ct_dst_src_ltm,Label,Nature
0,ï»¿59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0,3,7,1,3,1,1,1,,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0


In [7]:
# Small fixes

# Rows whose 'Nature' flag is 0 get the label 'BENIGN'; all other rows keep
# the label read from the file.
df['Label'] = np.where(df['Nature']==0, 'BENIGN', df['Label'])

# Collapse every label that contains one of the substrings below into a short
# tag. The pairs are applied in this order, one after the other.
label_tags = [
    ('Reconnaissance', 'recon'),
    ('Backdoor', 'bdoor'),
    ('Fuzzers', 'fuzz'),
    ('Shellcode', 'shell'),
    ('DoS', 'dos'),
    ('Generic', 'other'),
    ('Worms', 'worm'),
    ('Analysis', 'ana'),
    ('Exploits', 'expl'),
]
for raw_name, tag in label_tags:
    # regex=False: plain substring search.
    # na=False: a missing label never matches (a NaN mask entry would be
    # treated as True by np.where and silently relabel the row).
    matches = df['Label'].str.contains(raw_name, regex=False, na=False)
    df['Label'] = np.where(matches, tag, df['Label'])

# The first address read from the file is prefixed with the three characters
# 'ï»¿' (a UTF-8 byte-order mark decoded as cp1252). Strip that prefix wherever
# it occurs instead of matching one specific address with a regex.
bom_as_cp1252 = '\u00ef\u00bb\u00bf'
df['SrcIP'] = df['SrcIP'].str.replace(bom_as_cp1252, '', regex=False)

print(df['Label'].unique())
print(df['SrcIP'].unique())

['BENIGN' 'expl' 'recon' 'dos' 'other' 'shell' 'fuzz' 'worm' 'bdoor' 'ana']
['59.166.0.0' '59.166.0.6' '59.166.0.5' '59.166.0.3' '10.40.182.3'
 '59.166.0.7' '10.40.170.2' '59.166.0.1' '59.166.0.2' '59.166.0.4'
 '175.45.176.3' '175.45.176.2' '175.45.176.0' '59.166.0.8' '59.166.0.9'
 '175.45.176.1' '10.40.182.1' '10.40.85.1' '192.168.241.243' '10.40.85.30'
 '149.171.126.16' '149.171.126.2' '149.171.126.11' '149.171.126.4'
 '149.171.126.5' '149.171.126.17' '149.171.126.19' '149.171.126.9'
 '149.171.126.8' '149.171.126.7' '149.171.126.15' '149.171.126.6'
 '149.171.126.0' '149.171.126.1' '149.171.126.3' '149.171.126.13'
 '149.171.126.12' '149.171.126.10' '149.171.126.18' '127.0.0.1'
 '149.171.126.14' '10.40.85.10' '10.40.182.6']


In [8]:
def fixValues(df):
    """Replace NaN and infinite entries in the numeric columns of ``df``.

    For every integer or floating-point column:
      * NaN is replaced with the MEAN of the column's finite values;
      * +inf and -inf are both replaced with the MAX of the column's finite values.

    Columns of any other dtype are left untouched, as are numeric columns that
    contain no finite value at all (there is nothing to derive a replacement from).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call-site convenience.
    """
    for c in df.columns:
        column = df[c]
        # Accept every NumPy integer/unsigned/float width; comparing the dtype
        # with the strings 'int'/'float' only matches the platform defaults.
        if not (isinstance(column.dtype, np.dtype) and column.dtype.kind in 'iuf'):
            continue

        values = np.asarray(column, dtype=np.float64)
        finite = values[np.isfinite(values)]
        if finite.size == values.size or finite.size == 0:
            # Either nothing to repair, or no finite value to repair with.
            continue

        mean_value = float(finite.mean())
        max_value = float(finite.max())
        # Assign the result back to the frame: an in-place replace on the
        # temporary `df[c]` does not reach `df` under pandas Copy-on-Write.
        df[c] = column.replace([np.inf, -np.inf], max_value).fillna(mean_value)

    return df


def _port_category(port_numbers):
    """Map numeric ports to 'well-known' / 'registered' / 'dynamic' / 'none'."""
    conditions = [
        port_numbers.between(0, 1023),
        port_numbers.between(1024, 49151),
        port_numbers > 49151,
    ]
    choices = ['well-known', 'registered', 'dynamic']
    # An explicit string default is required: the implicit default of
    # np.select is the integer 0, which cannot be combined with string choices
    # on recent NumPy and used to yield the label '0' on older versions.
    return np.select(conditions, choices, default='none')


def uniformPorts(df, srcPort_name, dstPort_name):
    """Add numeric and categorical port columns to ``df``.

    The two port columns are renamed to 'SrcPort' / 'DstPort'. Four columns are
    then added:

      * 'SrcPort_num' / 'DstPort_num': the port converted with ``pd.to_numeric``;
        values that cannot be converted become -1.
      * 'SrcPort_type' / 'DstPort_type': 'well-known' (0-1023), 'registered'
        (1024-49151), 'dynamic' (> 49151) or 'none' (-1 and anything else
        outside those ranges).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the port columns. It is modified in place.
    srcPort_name, dstPort_name : str
        Current names of the source and destination port columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    df.rename({srcPort_name: 'SrcPort', dstPort_name: 'DstPort'}, axis=1, inplace=True)

    for side in ('Src', 'Dst'):
        # Non-numeric entries are coerced to NaN and then flagged with -1.
        # The result is assigned back to the frame rather than patched with a
        # chained in-place replace, which pandas Copy-on-Write would ignore.
        numeric = pd.to_numeric(df[side + 'Port'], errors='coerce').fillna(-1)
        df[side + 'Port_num'] = numeric

    df['SrcPort_type'] = _port_category(df['SrcPort_num'])
    df['DstPort_type'] = _port_category(df['DstPort_num'])

    return df

def _flag_internal(addresses, networks):
    """Return a boolean Series: True where the address lies in any of ``networks``."""
    membership = {}
    # Parse and test each distinct address once, then broadcast with map().
    for raw in addresses.unique():
        parsed = ipaddress.ip_address(raw)
        membership[raw] = any(parsed in net for net in networks)
    return addresses.map(membership).astype(bool)


def uniformIP(df, srcIP_name, dstIP_name, internal_network, *, internal_network2 = None):
    """Flag source and destination addresses as internal or external.

    The two address columns are renamed to 'SrcIP' / 'DstIP' and two boolean
    columns, 'SrcIP_internal' and 'DstIP_internal', are added. An address is
    internal when it belongs to ``internal_network`` or, if given, to
    ``internal_network2``. The number of rows in each of the four
    internal/external combinations is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the address columns. It is modified in place.
    srcIP_name, dstIP_name : str
        Current names of the source and destination address columns.
    internal_network : str
        Network in a form accepted by ``ipaddress.ip_network`` (e.g. CIDR).
    internal_network2 : str, optional
        A second network treated as internal as well.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    ValueError
        If an address or a network cannot be parsed by ``ipaddress``.
    """
    df.rename({srcIP_name: 'SrcIP', dstIP_name: 'DstIP'}, axis=1, inplace=True)

    networks = [ipaddress.ip_network(internal_network)]
    if internal_network2 is not None:
        networks.append(ipaddress.ip_network(internal_network2))

    # Membership is tested with `address in network`. Passing the network to
    # Series.isin() would first expand it into a list of every address it
    # contains, which does not scale to large prefixes.
    src_flags = _flag_internal(df['SrcIP'], networks)
    dst_flags = _flag_internal(df['DstIP'], networks)

    # The insertion order of the two new columns differs between the one- and
    # two-network cases; it is kept so the resulting column layout is unchanged.
    if internal_network2 is None:
        df['SrcIP_internal'] = src_flags
        df['DstIP_internal'] = dst_flags
    else:
        df['DstIP_internal'] = dst_flags
        df['SrcIP_internal'] = src_flags

    # check internal/external
    int_int = int((src_flags & dst_flags).sum())
    int_ext = int((src_flags & ~dst_flags).sum())
    ext_int = int((~src_flags & dst_flags).sum())
    ext_ext = int((~src_flags & ~dst_flags).sum())

    print("int_int = {}\n int_ext = {}\n ext_int = {}\n ext_ext = {}".format(int_int, int_ext, ext_int, ext_ext))

    return df

In [9]:
# Strip blanks and single quotes from the column labels.
df.columns = df.columns.str.replace(' ', '').str.replace("'", "")

# Repair NaN / infinite entries in the numeric columns.
df = fixValues(df)

# Sanity check: report whether any missing value is left, and in which columns.
columns_with_missing = df.isna().any()
print(columns_with_missing.any())
print(df.columns[columns_with_missing])

False
Index([], dtype='object')


In [10]:
# Names of the port and address columns as they currently appear in `df`;
# they are handed to the two helper functions below.
srcPort_name = 'SrcPort'
dstPort_name = 'DstPort'
srcIP_name = 'SrcIP'
dstIP_name = 'DstIP'



# The two networks passed to uniformIP as "internal".
# NOTE(review): the following cell inverts the SrcIP_internal / DstIP_internal
# flags computed here, so addresses in these ranges end up flagged as NOT
# internal — confirm this is intended.
internal_network1 = "59.166.0.0/16"
internal_network2 = "175.45.0.0/16"



# Add the numeric/categorical port columns, then the internal/external flags
# (uniformIP also prints the row count of each internal/external combination).
df = uniformPorts(df, srcPort_name, dstPort_name)
df = uniformIP(df, srcIP_name, dstIP_name, internal_network=internal_network1, internal_network2=internal_network2)
df.head()

int_int = 0
 int_ext = 2303998
 ext_int = 217339
 ext_ext = 18710


Unnamed: 0,SrcIP,SrcPort,DstIP,DstPort,Proto,State,Duration,SrcBytes,DstBytes,SrcTTL,...,ct_dst_sport,ct_dst_src_ltm,Label,Nature,SrcPort_num,DstPort_num,SrcPort_type,DstPort_type,DstIP_internal,SrcIP_internal
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,1,1,BENIGN,0,1390.0,53.0,registered,well-known,False,True
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,1,2,BENIGN,0,33661.0,1024.0,registered,registered,False,True
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,1,1,BENIGN,0,1464.0,53.0,registered,well-known,False,True
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,1,1,BENIGN,0,3593.0,53.0,registered,well-known,False,True
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,1,1,BENIGN,0,49664.0,53.0,dynamic,well-known,False,True


In [11]:
#swap
# Invert both internal/external flags.
# NOTE: this cell is not idempotent — every execution flips the flags again,
# so it must be run exactly once after the flags were computed.
for flag_column in ('DstIP_internal', 'SrcIP_internal'):
    df[flag_column] = ~df[flag_column]

# check internal/external
src_internal = df['SrcIP_internal']
dst_internal = df['DstIP_internal']
int_int = df.loc[src_internal & dst_internal]
int_ext = df.loc[src_internal & ~dst_internal]
ext_int = df.loc[~src_internal & dst_internal]
ext_ext = df.loc[~src_internal & ~dst_internal]

counts = [len(part) for part in (int_int, int_ext, ext_int, ext_ext)]
print("int_int = {}\n int_ext = {}\n ext_int = {}\n ext_ext = {}".format(*counts))

int_int = 18710
 int_ext = 217339
 ext_int = 2303998
 ext_ext = 0


In [12]:
# Display the column labels again, now including the port and IP helper columns.
df.columns

Index(['SrcIP', 'SrcPort', 'DstIP', 'DstPort', 'Proto', 'State', 'Duration',
       'SrcBytes', 'DstBytes', 'SrcTTL', 'DstTTL', 'SrcLoss', 'DstLoss',
       'Service', 'SrcBpS', 'DstBpS', 'SrcPkts', 'DstPkts', 'SrcWin', 'DstWin',
       'SrcTcpb', 'DstTcpb', 'SrcMean', 'DstMean', 'Trans_Depth',
       'res_bdy_len', 'SrcJit', 'DstJit', 'FlowStart', 'FlowEnd', 'SrcAit',
       'DstAit', 'TcpRrt', 'SynAck', 'AckDat', 'is_sm_ips_ports',
       'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
       'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport',
       'ct_dst_sport', 'ct_dst_src_ltm', 'Label', 'Nature', 'SrcPort_num',
       'DstPort_num', 'SrcPort_type', 'DstPort_type', 'DstIP_internal',
       'SrcIP_internal'],
      dtype='object')

In [13]:
# Discard the original port columns and let their numeric versions take over
# the 'SrcPort' / 'DstPort' names.
numeric_port_names = {"SrcPort_num": "SrcPort", "DstPort_num": "DstPort"}
df = (
    df.drop(columns=['SrcPort', 'DstPort'])
      .rename(columns=numeric_port_names)
)

In [14]:
# Display the distinct values of the 'Label' column.
df['Label'].unique()

array(['BENIGN', 'expl', 'recon', 'dos', 'other', 'shell', 'fuzz', 'worm',
       'bdoor', 'ana'], dtype=object)

In [15]:
# Array of the distinct labels; it drives the per-label split below.
label_list = df['Label'].unique()
# Display the second entry of that array.
label_list[1]

'expl'

In [17]:
# Split the frame into one sub-frame per label and report the size of each.
# The sub-frames are collected in `label_frames` (label -> DataFrame). For
# compatibility with later cells each one is also published under the name
# "<label>_df" (e.g. BENIGN_df, expl_df). This is done through globals()
# rather than exec(), so label text coming from the data is never run as code.
label_frames = {}
summa = 0
for label in label_list:
    label_frame = df[df['Label'] == label]
    label_frames[label] = label_frame
    globals()["{}_df".format(label)] = label_frame
    summa = summa + len(label_frame)
    print('\t {}: '.format(label), len(label_frame))
print("Overall samples: ", len(df))
# Must equal the overall sample count printed above: every row has exactly one label.
print(summa)

	 BENIGN:  2218764
	 expl:  44525
	 recon:  13987
	 dos:  16353
	 other:  215481
	 shell:  1511
	 fuzz:  24246
	 worm:  174
	 bdoor:  2329
	 ana:  2677
Overall samples:  2540047
2540047


In [22]:
# Write one CSV per label: the 'BENIGN' rows go to <output_folder>/benign.csv,
# every other label goes to <output_folder>/malicious/<label>.csv.

# Build the paths with os.path.join so they are valid on every platform (a
# hard-coded "\\" is only a separator on Windows). The trailing "" keeps a
# separator at the end, so code that appends a file name to
# `malicious_output_folder` keeps working.
malicious_output_folder = os.path.join(output_folder, "malicious", "")
# Create the output directories (including output_folder itself) if needed.
os.makedirs(malicious_output_folder, exist_ok=True)

benign_file = os.path.join(output_folder, "benign.csv")

# Filter `df` directly instead of relying on dynamically created
# "<label>_df" variables from an earlier cell.
for label in df['Label'].unique():
    label_frame = df[df['Label'] == label]
    if label == 'BENIGN':
        target_file = benign_file
    else:
        target_file = os.path.join(malicious_output_folder, "{}.csv".format(label))
    label_frame.to_csv(target_file, index=False)