In [2]:
import ipaddress
import os

import numpy as np
import pandas as pd

In [3]:
source_folder = "source_folder" # this is the folder that contains the CSV files obtained after downloading and extracting the dataset
output_folder = "../data/IDS17/flows/" # the output of this notebook will be saved in this folder

file_names = [
    "Friday-WorkingHours.pcap_REVI.csv",
    "Monday-WorkingHours.pcap_REVI.csv",
    "Tuesday-WorkingHours.pcap_REVI.csv",
    "Wednesday-WorkingHours.pcap_REVI.csv",
    "Thursday-WorkingHours.pcap_REVI.csv"
]

# Read every file first and concatenate once at the end: growing `df` with
# pd.concat inside the loop re-copies all previously loaded rows on every
# iteration and concatenates with an empty frame on the first one.
frames = []
total_rows = 0

for f in file_names:
    # os.path.join inserts the path separator when `source_folder` does not
    # already end with one (plain string concatenation silently glued the
    # folder name and the file name together).
    input_file = os.path.join(source_folder, f)
    temp_df = pd.read_csv(input_file)
    print("Read {} lines...".format(len(temp_df)))
    frames.append(temp_df)
    total_rows += len(temp_df)
    print("\t...total length: {}".format(total_rows))

# ignore_index=True gives the combined frame a fresh 0..N-1 index.
df = pd.concat(frames, ignore_index=True)

print("...done!")

Read 547915 lines...
	...total length: 547915
Read 371749 lines...
	...total length: 919664
Read 322003 lines...
	...total length: 1241667
Read 496779 lines...
	...total length: 1738446
Read 362368 lines...
	...total length: 2100814
...done!


In [4]:
# Display the distinct values of the 'Label' column of the combined frame.
df['Label'].unique()

array(['BENIGN', 'Bot - Attempted', 'Bot', 'PortScan', 'DDoS',
       'FTP-Patator', 'SSH-Patator', 'FTP-Patator - Attempted',
       'SSH-Patator - Attempted', 'DoS slowloris',
       'DoS slowloris - Attempted', 'DoS Slowhttptest',
       'DoS Slowhttptest - Attempted', 'DoS Hulk', 'DoS Hulk - Attempted',
       'DoS GoldenEye', 'Heartbleed', 'DoS GoldenEye - Attempted',
       'Web Attack - Brute Force', 'Web Attack - Brute Force - Attempted',
       'Infiltration - Attempted', 'Infiltration',
       'Web Attack - XSS - Attempted', 'Web Attack - XSS',
       'Web Attack - Sql Injection'], dtype=object)

In [5]:
def fixValues(df):
    """Replace NaN and infinite values in the floating-point columns of ``df``.

    For each float column the mean and the max are computed over the finite
    entries only; then
      * +inf and -inf are replaced with that MAX,
      * NaN is replaced with that MEAN.

    Only NumPy floating-point columns are processed: integer columns cannot
    hold NaN or infinity, and non-numeric columns are left alone. Float
    columns of any width are handled (not just the default float64).
    A column with no finite value at all is left unchanged because no
    replacement value can be derived from it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    x = df  # no copy on purpose: the caller's frame is updated in place
    for c in x.columns:
        column = x[c]
        dtype = column.dtype
        if not (isinstance(dtype, np.dtype) and dtype.kind == 'f'):
            continue

        values = column.to_numpy()
        finite_mask = np.isfinite(values)
        if finite_mask.all():
            # nothing to fix in this column
            continue

        # statistics are taken in float64, over finite entries only
        finite_values = values[finite_mask].astype(np.float64)
        if finite_values.size == 0:
            continue
        mean_value = finite_values.mean()
        max_value = finite_values.max()

        # Assign the result back to the frame instead of calling
        # ``x[c].replace(..., inplace=True)``: the latter acts on a temporary
        # Series and does not update the frame under pandas Copy-on-Write.
        x[c] = column.replace([np.inf, -np.inf], max_value).fillna(mean_value)

    return x


def _portType(port_num):
    """Return the port category for each entry of a numeric port Series."""
    conditions = [
        (port_num == -1),
        (port_num >= 0) & (port_num <= 1023),
        (port_num >= 1024) & (port_num <= 49151),
        (port_num > 49151),
    ]
    choices = ['none', 'well-known', 'registered', 'dynamic']
    # An explicit string default is required: the implicit default of
    # np.select is the integer 0, which cannot be combined with string
    # choices on recent NumPy versions (TypeError) and produced a spurious
    # '0' category on older ones. Values matching no range count as 'none'.
    return np.select(conditions, choices, default='none')


def uniformPorts(df, srcPort_name, dstPort_name):
    """Standardise the port columns and tag each port with its range category.

    The columns named ``srcPort_name`` / ``dstPort_name`` are renamed to
    'SrcPort' / 'DstPort' (labels that are not present are ignored by
    ``DataFrame.rename``). Four columns are then added:

      * 'SrcPort_num', 'DstPort_num'  - the port parsed as a number; values
        that cannot be parsed become -1;
      * 'SrcPort_type', 'DstPort_type' - 'none' (port is -1 or matches no
        range), 'well-known' (0-1023), 'registered' (1024-49151) or
        'dynamic' (above 49151).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place.
    srcPort_name, dstPort_name : str
        Current names of the source / destination port columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df.rename({srcPort_name: 'SrcPort', dstPort_name: 'DstPort'}, axis=1, inplace=True)

    # converting strings to numeric; unparsable entries (NaN after coercion)
    # are marked with -1. The result is assigned directly rather than fixed
    # up with a chained ``df[col].replace(..., inplace=True)``, which does
    # not reach the frame under pandas Copy-on-Write.
    df['SrcPort_num'] = pd.to_numeric(df['SrcPort'], errors='coerce').fillna(-1)
    df['DstPort_num'] = pd.to_numeric(df['DstPort'], errors='coerce').fillna(-1)

    # determining low & high ports
    df['SrcPort_type'] = _portType(df['SrcPort_num'])
    df['DstPort_type'] = _portType(df['DstPort_num'])

    return df

def _isInternalIP(column, networks):
    """Return a boolean Series: True where the address lies in any of ``networks``.

    Every distinct value of ``column`` is parsed once and tested with a
    range check (``address in network``), so the cost does not depend on the
    size of the networks.
    """
    lookup = {
        value: any(ipaddress.ip_address(value) in network for network in networks)
        for value in column.unique()
    }
    return column.map(lookup).astype(bool)


def uniformIP(df, srcIP_name, dstIP_name, internal_network, *, internal_network2 = None):
    """Standardise the IP columns and flag addresses of the internal network(s).

    The columns named ``srcIP_name`` / ``dstIP_name`` are renamed to
    'SrcIP' / 'DstIP' (labels that are not present are ignored by
    ``DataFrame.rename``). Two boolean columns, 'SrcIP_internal' and
    'DstIP_internal', are added: True when the address belongs to
    ``internal_network`` or, if given, to ``internal_network2``.
    A summary of the four internal/external combinations is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place.
    srcIP_name, dstIP_name : str
        Current names of the source / destination IP columns.
    internal_network : str
        Network in CIDR notation, e.g. "192.168.0.0/16".
    internal_network2 : str, optional (keyword-only)
        A second network that also counts as internal.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    ValueError
        From the ``ipaddress`` module, if a network or an address cannot be
        parsed.
    """
    df.rename({srcIP_name: 'SrcIP', dstIP_name: 'DstIP'}, axis=1, inplace=True)

    networks = [ipaddress.ip_network(internal_network)]
    if internal_network2 is not None:
        networks.append(ipaddress.ip_network(internal_network2))

    src_internal = _isInternalIP(df['SrcIP'], networks)
    dst_internal = _isInternalIP(df['DstIP'], networks)

    # The order in which the two columns are appended is kept as it has
    # always been (it differs between the one- and two-network case), so the
    # column layout of the resulting frame does not change.
    if internal_network2 is None:
        df['SrcIP_internal'] = src_internal
        df['DstIP_internal'] = dst_internal
    else:
        df['DstIP_internal'] = dst_internal
        df['SrcIP_internal'] = src_internal

    # check internal/external: count with boolean masks instead of building
    # four filtered copies of the frame
    int_int = (src_internal & dst_internal).sum()
    int_ext = (src_internal & ~dst_internal).sum()
    ext_int = (~src_internal & dst_internal).sum()
    ext_ext = (~src_internal & ~dst_internal).sum()

    print("int_int = {}\n int_ext = {}\n ext_int = {}\n ext_ext = {}".format(int_int, int_ext, ext_int, ext_ext))

    return df

In [6]:
# Remove every space character from the column names.
df.columns = df.columns.str.replace(' ', '')
# Replace infinite values and NaNs in the numeric columns (see fixValues).
df = fixValues(df)
# Display the distinct labels again after the clean-up.
df['Label'].unique()

array(['BENIGN', 'Bot - Attempted', 'Bot', 'PortScan', 'DDoS',
       'FTP-Patator', 'SSH-Patator', 'FTP-Patator - Attempted',
       'SSH-Patator - Attempted', 'DoS slowloris',
       'DoS slowloris - Attempted', 'DoS Slowhttptest',
       'DoS Slowhttptest - Attempted', 'DoS Hulk', 'DoS Hulk - Attempted',
       'DoS GoldenEye', 'Heartbleed', 'DoS GoldenEye - Attempted',
       'Web Attack - Brute Force', 'Web Attack - Brute Force - Attempted',
       'Infiltration - Attempted', 'Infiltration',
       'Web Attack - XSS - Attempted', 'Web Attack - XSS',
       'Web Attack - Sql Injection'], dtype=object)

In [7]:
# Display the column names after the spaces have been removed.
df.columns

Index(['FlowID', 'SrcIP', 'SrcPort', 'DstIP', 'DstPort', 'Protocol',
       'Timestamp', 'FlowDuration', 'TotalFwdPacket', 'TotalBwdpackets',
       'TotalLengthofFwdPacket', 'TotalLengthofBwdPacket',
       'FwdPacketLengthMax', 'FwdPacketLengthMin', 'FwdPacketLengthMean',
       'FwdPacketLengthStd', 'BwdPacketLengthMax', 'BwdPacketLengthMin',
       'BwdPacketLengthMean', 'BwdPacketLengthStd', 'FlowBytes/s',
       'FlowPackets/s', 'FlowIATMean', 'FlowIATStd', 'FlowIATMax',
       'FlowIATMin', 'FwdIATTotal', 'FwdIATMean', 'FwdIATStd', 'FwdIATMax',
       'FwdIATMin', 'BwdIATTotal', 'BwdIATMean', 'BwdIATStd', 'BwdIATMax',
       'BwdIATMin', 'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags', 'BwdURGFlags',
       'FwdHeaderLength', 'BwdHeaderLength', 'FwdPackets/s', 'BwdPackets/s',
       'PacketLengthMin', 'PacketLengthMax', 'PacketLengthMean',
       'PacketLengthStd', 'PacketLengthVariance', 'FINFlagCount',
       'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount', 'ACKFlagCount',
       '

In [8]:
# Sanity check: True if any cell of the frame is still NaN ...
print(df.isna().any().any())
# ... and the names of the columns that contain at least one NaN.
print(df.columns[df.isna().any()])

False
Index([], dtype='object')


In [9]:
# Names of the port / IP columns that uniformPorts and uniformIP rename to
# 'SrcPort', 'DstPort', 'SrcIP' and 'DstIP'.
# NOTE(review): the column listing displayed earlier in this notebook already
# shows 'SrcIP', 'SrcPort', 'DstIP' and 'DstPort', so these names presumably
# match no column and the renames are no-ops - confirm against the CSV header.
srcPort_name = 'SourcePort'
dstPort_name = 'DestinationPort'
srcIP_name = 'SourceIP'
dstIP_name = 'DestinationIP'

# Networks (CIDR notation) whose addresses are flagged as internal.
internal_network1 = "192.168.0.0/16"
internal_network2 = "8.6.0.0/16"


# Add the *_num / *_type port columns and the *_internal IP flags;
# uniformIP also prints the internal/external flow counts.
df = uniformPorts(df, srcPort_name, dstPort_name)
df = uniformIP(df, srcIP_name, dstIP_name, internal_network=internal_network1, internal_network2=internal_network2)
df.head()

int_int = 1069184
 int_ext = 589433
 ext_int = 442196
 ext_ext = 1


Unnamed: 0,FlowID,SrcIP,SrcPort,DstIP,DstPort,Protocol,Timestamp,FlowDuration,TotalFwdPacket,TotalBwdpackets,...,IdleStd,IdleMax,IdleMin,Label,SrcPort_num,DstPort_num,SrcPort_type,DstPort_type,DstIP_internal,SrcIP_internal
0,192.168.10.50-192.168.10.3-56108-3268-6,192.168.10.50,56108,192.168.10.3,3268,6,07/07/2017 01:59:50 PM,112740690,32,16,...,498804.8,16399772.0,15375229.0,BENIGN,56108,3268,dynamic,registered,True,True
1,192.168.10.50-192.168.10.3-42144-389-6,192.168.10.50,42144,192.168.10.3,389,6,07/07/2017 01:59:50 PM,112740560,32,16,...,498793.7,16399782.0,15375263.0,BENIGN,42144,389,registered,well-known,True,True
2,8.6.0.1-8.0.6.4-0-0-0,8.6.0.1,0,8.0.6.4,0,0,07/07/2017 02:00:31 PM,113757377,545,0,...,6935824.0,20757030.0,5504997.0,BENIGN,0,0,well-known,well-known,False,True
3,192.168.10.25-224.0.0.251-5353-5353-17,192.168.10.25,5353,224.0.0.251,5353,17,07/07/2017 02:00:42 PM,91997219,388,0,...,5826905.0,19776791.0,5817470.0,BENIGN,5353,5353,registered,registered,False,True
4,192.168.10.25-17.253.14.125-123-123-17,192.168.10.25,123,17.253.14.125,123,17,07/07/2017 02:00:42 PM,66966070,6,6,...,0.0,64974431.0,64974431.0,BENIGN,123,123,well-known,well-known,False,True


In [10]:
# Display the column names after the port and IP helper columns were added.
df.columns

Index(['FlowID', 'SrcIP', 'SrcPort', 'DstIP', 'DstPort', 'Protocol',
       'Timestamp', 'FlowDuration', 'TotalFwdPacket', 'TotalBwdpackets',
       'TotalLengthofFwdPacket', 'TotalLengthofBwdPacket',
       'FwdPacketLengthMax', 'FwdPacketLengthMin', 'FwdPacketLengthMean',
       'FwdPacketLengthStd', 'BwdPacketLengthMax', 'BwdPacketLengthMin',
       'BwdPacketLengthMean', 'BwdPacketLengthStd', 'FlowBytes/s',
       'FlowPackets/s', 'FlowIATMean', 'FlowIATStd', 'FlowIATMax',
       'FlowIATMin', 'FwdIATTotal', 'FwdIATMean', 'FwdIATStd', 'FwdIATMax',
       'FwdIATMin', 'BwdIATTotal', 'BwdIATMean', 'BwdIATStd', 'BwdIATMax',
       'BwdIATMin', 'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags', 'BwdURGFlags',
       'FwdHeaderLength', 'BwdHeaderLength', 'FwdPackets/s', 'BwdPackets/s',
       'PacketLengthMin', 'PacketLengthMax', 'PacketLengthMean',
       'PacketLengthStd', 'PacketLengthVariance', 'FINFlagCount',
       'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount', 'ACKFlagCount',
       '

In [11]:
# Keep the untouched labels in 'Label_original'. The guard makes this cell
# safe to re-run: without it a second execution would overwrite the original
# labels with the already-normalised ones.
if 'Label_original' not in df.columns:
    df['Label_original'] = df['Label']

# (substring to look for, normalised label); applied in this order.
label_renames = [
    ('DoS slowloris', 'DoS-Slowloris'),
    ('DoS Slowhttptest', 'DoS-Slowhttptest'),
    ('DoS Hulk', 'DoS-Hulk'),
    ('DoS GoldenEye', 'DoS-Goldeneye'),
    ('DDoS', 'DoS-Ddos'),
    ('Web Attack - Brute Force', 'WebAttack-Bruteforce'),
    ('Web Attack - Sql Injection', 'WebAttack-Sqlinjection'),
    ('Web Attack - XSS', 'WebAttack-Xss'),
]

# Every "... - Attempted" flow is relabelled as BENIGN first, so the renames
# below only ever see the remaining attack labels.
df['Label'] = np.where(df['Label_original'].str.contains('Attempted'), 'BENIGN', df['Label_original'])
for pattern, new_label in label_renames:
    df['Label'] = np.where(df['Label'].str.contains(pattern), new_label, df['Label'])

print(df['Label_original'].unique())
print(df['Label'].unique())


['BENIGN' 'Bot - Attempted' 'Bot' 'PortScan' 'DDoS' 'FTP-Patator'
 'SSH-Patator' 'FTP-Patator - Attempted' 'SSH-Patator - Attempted'
 'DoS slowloris' 'DoS slowloris - Attempted' 'DoS Slowhttptest'
 'DoS Slowhttptest - Attempted' 'DoS Hulk' 'DoS Hulk - Attempted'
 'DoS GoldenEye' 'Heartbleed' 'DoS GoldenEye - Attempted'
 'Web Attack - Brute Force' 'Web Attack - Brute Force - Attempted'
 'Infiltration - Attempted' 'Infiltration' 'Web Attack - XSS - Attempted'
 'Web Attack - XSS' 'Web Attack - Sql Injection']
['BENIGN' 'Bot' 'PortScan' 'DoS-Ddos' 'FTP-Patator' 'SSH-Patator'
 'DoS-Slowloris' 'DoS-Slowhttptest' 'DoS-Hulk' 'DoS-Goldeneye'
 'Heartbleed' 'WebAttack-Bruteforce' 'Infiltration' 'WebAttack-Xss'
 'WebAttack-Sqlinjection']


In [12]:
# Display the column names after the labels were normalised.
df.columns

Index(['FlowID', 'SrcIP', 'SrcPort', 'DstIP', 'DstPort', 'Protocol',
       'Timestamp', 'FlowDuration', 'TotalFwdPacket', 'TotalBwdpackets',
       'TotalLengthofFwdPacket', 'TotalLengthofBwdPacket',
       'FwdPacketLengthMax', 'FwdPacketLengthMin', 'FwdPacketLengthMean',
       'FwdPacketLengthStd', 'BwdPacketLengthMax', 'BwdPacketLengthMin',
       'BwdPacketLengthMean', 'BwdPacketLengthStd', 'FlowBytes/s',
       'FlowPackets/s', 'FlowIATMean', 'FlowIATStd', 'FlowIATMax',
       'FlowIATMin', 'FwdIATTotal', 'FwdIATMean', 'FwdIATStd', 'FwdIATMax',
       'FwdIATMin', 'BwdIATTotal', 'BwdIATMean', 'BwdIATStd', 'BwdIATMax',
       'BwdIATMin', 'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags', 'BwdURGFlags',
       'FwdHeaderLength', 'BwdHeaderLength', 'FwdPackets/s', 'BwdPackets/s',
       'PacketLengthMin', 'PacketLengthMax', 'PacketLengthMean',
       'PacketLengthStd', 'PacketLengthVariance', 'FINFlagCount',
       'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount', 'ACKFlagCount',
       '

In [13]:
# One entry per class: (value in df['Label'], name shown in the summary).
# The order matches the tuple of frames unpacked below.
class_table = [
    ('BENIGN', 'Benign'),
    ('Bot', 'Bot'),
    ('PortScan', 'PortScan'),
    ('DoS-Ddos', 'DDoS'),
    ('FTP-Patator', 'FTP-Patator'),
    ('SSH-Patator', 'SSH-Patator'),
    ('DoS-Slowloris', 'DoS-Slowloris'),
    ('DoS-Slowhttptest', 'DoS-Slowhttp'),
    ('DoS-Hulk', 'DoS-Hulk'),
    ('DoS-Goldeneye', 'DoS-Goldeneye'),
    ('Heartbleed', 'Heartbleed'),
    ('WebAttack-Bruteforce', 'WebAttack-Bruteforce'),
    ('Infiltration', 'Infiltration'),
    ('WebAttack-Xss', 'WebAttack-Xss'),
    ('WebAttack-Sqlinjection', 'WebAttack-Sqlinjection'),
]

# Split the frame into one sub-frame per class.
class_frames = [df[df['Label'] == label] for label, _ in class_table]
(benign_df, bot_df, portscan_df, ddos_df, ftp_df, ssh_df,
 slowloris_df, slowhttp_df, hulk_df, goldeneye_df, hb_df,
 brute_df, inf_df, xss_df, sql_df) = class_frames

# Print the size of every class and accumulate the total in `summa`.
summa = 0
print("Overall samples: ", len(df))
for (_, shown_as), class_df in zip(class_table, class_frames):
    summa = summa + len(class_df)
    print("\t {}: ".format(shown_as), len(class_df))

print(summa)

# The per-class counts must add up to the size of the frame; a mismatch
# means some label is not covered by `class_table`.
if summa != len(df):
    print("WARNING: {} samples carry a label that is not listed above".format(len(df) - summa))

Overall samples:  2100814
	 Benign:  1666837
	 Bot:  738
	 PortScan:  159151
	 DDoS:  95123
	 FTP-Patator:  3973
	 SSH-Patator:  2980
	 DoS-Slowloris:  4001
	 DoS-Slowhttp:  1742
	 DoS-Hulk:  158469
	 DoS-Goldeneye:  7567
	 Heartbleed:  11
	 WebAttack-Bruteforce:  151
	 Infiltration:  32
	 WebAttack-Xss:  27
	 WebAttack-Sqlinjection:  12
2100814


In [14]:
# Stack the six smallest attack classes into a single frame
# (the original row index of each sub-frame is kept).
other_df = pd.concat([bot_df, hb_df, inf_df, xss_df, sql_df, brute_df])
len(other_df)

971

In [44]:
# Build the output locations with os.path.join so that they are real
# sub-directories on every platform (a hard-coded "\\" is only a path
# separator on Windows). The trailing "" keeps a separator at the end of
# the folder strings, so "folder + file name" keeps working.
malicious_output_folder = os.path.join(output_folder, "malicious", "")
small_malicious_output_folder = os.path.join(malicious_output_folder, "small", "")

# to_csv does not create missing directories; creating the innermost one
# also creates its parents.
os.makedirs(small_malicious_output_folder, exist_ok=True)

# All small attack classes together
other_df.to_csv(malicious_output_folder + "other.csv")

# Benign traffic
benign_file = os.path.join(output_folder, "benign.csv")

benign_df.to_csv(benign_file)

# Small attack classes, one file each
bot_df.to_csv(small_malicious_output_folder + "bot.csv")
hb_df.to_csv(small_malicious_output_folder + "heartbleed.csv")
brute_df.to_csv(small_malicious_output_folder + "webattack-brute.csv")
inf_df.to_csv(small_malicious_output_folder + "infiltration.csv")
xss_df.to_csv(small_malicious_output_folder + "webattack-xss.csv")
sql_df.to_csv(small_malicious_output_folder + "webattack-sql.csv")

# Remaining attack classes, one file each
portscan_df.to_csv(malicious_output_folder + "portscan.csv")
ddos_df.to_csv(malicious_output_folder + "dos-ddos.csv")
ftp_df.to_csv(malicious_output_folder + "ftp-patator.csv")
ssh_df.to_csv(malicious_output_folder + "ssh-patator.csv")
slowloris_df.to_csv(malicious_output_folder + "dos-slowloris.csv")
slowhttp_df.to_csv(malicious_output_folder + "dos-slowhttp.csv")
hulk_df.to_csv(malicious_output_folder + "dos-hulk.csv")
goldeneye_df.to_csv(malicious_output_folder + "dos-goldeneye.csv")