In [1]:
import ipaddress
import os

import pandas as pd, numpy as np

In [2]:
# Folder that contains the CSV file(s) obtained after downloading and extracting the dataset.
source_folder = "source_folder"
# The output of this notebook will be saved in this folder.
output_folder = "../data/GTCS/flows/"

# Files to load from `source_folder`; they are stacked row-wise in this order.
file_names = [
    "GTCS.csv",
]

# Collect the individual frames and concatenate ONCE at the end: growing a
# DataFrame inside the loop re-copies all previously read rows on every pass.
frames = []
total_rows = 0

for f in file_names:
    # os.path.join inserts the path separator; plain string concatenation
    # silently produced "source_folderGTCS.csv" when the folder had no
    # trailing slash.
    input_file = os.path.join(source_folder, f)
    temp_df = pd.read_csv(input_file)
    print("Read {} lines...".format(len(temp_df)))
    frames.append(temp_df)
    total_rows += len(temp_df)
    print("\t...total length: {}".format(total_rows))

# pd.concat raises on an empty list, so fall back to an empty frame
# when no file names are configured.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print("...done!")

Read 517477 lines...
	...total length: 517477
...done!


In [3]:
# Show the distinct values of the 'Label' column as loaded from the CSV.
df['Label'].unique()

array(['Normal', 'Infilteration', 'Botnet', 'DDoS', 'BruteForce'],
      dtype=object)

In [4]:
def fixValues(df):
    """Replace NaN and infinite entries in the floating-point columns of ``df``.

    For every float column the mean and the maximum are computed over the
    finite entries only; then

    * ``+inf`` and ``-inf`` are both replaced with that maximum, and
    * ``NaN`` is replaced with that mean.

    Integer columns are skipped because they cannot hold NaN or infinity;
    non-numeric columns are left untouched. A float column without a single
    finite value is also left untouched, since no replacement value can be
    derived from it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    x = df  # no copy on purpose: callers rely on in-place modification
    for c in x.columns:
        if not pd.api.types.is_float_dtype(x[c]):
            continue

        values = x[c].to_numpy(dtype=np.float64, na_value=np.nan)
        finite = values[np.isfinite(values)]
        if finite.size == 0:
            # mean()/max() of an empty array is undefined (max() raises)
            continue

        mean_value = finite.mean()
        max_value = finite.max()

        # Assign the cleaned column back explicitly. Calling
        # x[c].replace(..., inplace=True) mutates a temporary Series and is a
        # silent no-op when pandas Copy-on-Write is active.
        x[c] = x[c].replace([np.inf, -np.inf], max_value).fillna(mean_value)

    return x


def uniformPorts(df, srcPort_name, dstPort_name):
    """Standardise the port columns and classify each port by range.

    The columns named ``srcPort_name`` / ``dstPort_name`` are renamed to
    ``SrcPort`` / ``DstPort`` (nothing happens if those names are absent, in
    which case ``SrcPort`` / ``DstPort`` must already exist). Four columns
    are then added, in this order:

    * ``SrcPort_num``, ``DstPort_num`` -- the port as a number; values that
      cannot be parsed become ``-1``.
    * ``SrcPort_type``, ``DstPort_type`` -- one of ``'none'`` (unparseable or
      otherwise negative), ``'well-known'`` (0-1023), ``'registered'``
      (1024-49151) or ``'dynamic'`` (above 49151).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    srcPort_name, dstPort_name : str
        Current names of the source / destination port columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df.rename(columns={srcPort_name: 'SrcPort', dstPort_name: 'DstPort'}, inplace=True)

    sides = ('Src', 'Dst')
    port_choices = ['none', 'well-known', 'registered', 'dynamic']

    # Converting strings to numeric. The result is assigned back as a whole
    # column; a chained df[col].replace(..., inplace=True) would not reach the
    # frame when pandas Copy-on-Write is active.
    for side in sides:
        df[side + 'Port_num'] = pd.to_numeric(df[side + 'Port'], errors='coerce').fillna(-1)

    # Determining the port class.
    for side in sides:
        ports = df[side + 'Port_num']
        conditions = [
            ports == -1,
            (ports >= 0) & (ports <= 1023),
            (ports >= 1024) & (ports <= 49151),
            ports > 49151,
        ]
        # An explicit string default is required: np.select's implicit
        # default is the integer 0, which recent NumPy refuses to combine
        # with string choices. It only applies to negative values other
        # than -1, which are not valid ports either.
        df[side + 'Port_type'] = np.select(conditions, port_choices, default='none')

    return df

def uniformIP(df, srcIP_name, dstIP_name, internal_network, *, internal_network2 = None):
    """Flag source and destination addresses as internal or external.

    The columns named ``srcIP_name`` / ``dstIP_name`` are renamed to
    ``SrcIP`` / ``DstIP`` (nothing happens if those names are absent, in
    which case ``SrcIP`` / ``DstIP`` must already exist). Two boolean columns,
    ``SrcIP_internal`` and ``DstIP_internal``, are added: an address is
    internal when it belongs to ``internal_network`` or, if given, to
    ``internal_network2``. Finally the number of flows for each of the four
    internal/external combinations is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    srcIP_name, dstIP_name : str
        Current names of the source / destination address columns.
    internal_network : str
        Network in a form accepted by ``ipaddress.ip_network``
        (e.g. ``"10.128.0.0/16"``).
    internal_network2 : str, optional
        A second network that also counts as internal.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    ValueError
        If a network or an address cannot be parsed by ``ipaddress``.
    """
    df.rename(columns={srcIP_name: 'SrcIP', dstIP_name: 'DstIP'}, inplace=True)

    networks = [ipaddress.ip_network(internal_network)]
    if internal_network2 is not None:
        networks.append(ipaddress.ip_network(internal_network2))

    def _is_internal(column):
        # Parse every DISTINCT address once and test membership with `in`.
        # Passing the network itself to Series.isin would enumerate all of
        # its addresses (65 536 for a /16, millions for a /8).
        verdict = {}
        for raw in column.unique():
            address = ipaddress.ip_address(raw)
            verdict[raw] = any(address in network for network in networks)
        return column.map(verdict).astype(bool)

    src_internal = _is_internal(df['SrcIP'])
    dst_internal = _is_internal(df['DstIP'])

    # Keep the historical column order: the two-network variant used to add
    # the destination flag first, the single-network variant the source flag.
    if internal_network2 is None:
        df['SrcIP_internal'] = src_internal
        df['DstIP_internal'] = dst_internal
    else:
        df['DstIP_internal'] = dst_internal
        df['SrcIP_internal'] = src_internal

    # check internal/external (count on the boolean masks; no need to
    # materialise four filtered copies of the frame)
    int_int = int((src_internal & dst_internal).sum())
    int_ext = int((src_internal & ~dst_internal).sum())
    ext_int = int((~src_internal & dst_internal).sum())
    ext_ext = int((~src_internal & ~dst_internal).sum())

    print("int_int = {}\n int_ext = {}\n ext_int = {}\n ext_ext = {}".format(int_int, int_ext, ext_int, ext_ext))

    return df

In [5]:
# Normalise the header: strip every space and every apostrophe from the column names.
df.columns = df.columns.str.replace(' ', '').str.replace("'", "")

# Clean NaN / infinite entries, then show the distinct labels again.
df = fixValues(df)
df['Label'].unique()

array(['Normal', 'Infilteration', 'Botnet', 'DDoS', 'BruteForce'],
      dtype=object)

In [6]:
# List the column names after spaces and apostrophes were stripped from the header.
df.columns

Index(['FlowID', 'SrcIP', 'SrcPort', 'DstIP', 'DstPort', 'Protocol',
       'Timestamp', 'FlowDuration', 'TotFwdPkts', 'TotBwdPkts',
       'TotLenFwdPkts', 'TotLenBwdPkts', 'FwdPktLenMax', 'FwdPktLenMin',
       'FwdPktLenMean', 'FwdPktLenStd', 'BwdPktLenMax', 'BwdPktLenMin',
       'BwdPktLenMean', 'BwdPktLenStd', 'FlowByts/s', 'FlowPkts/s',
       'FlowIATMean', 'FlowIATStd', 'FlowIATMax', 'FlowIATMin', 'FwdIATTot',
       'FwdIATMean', 'FwdIATStd', 'FwdIATMax', 'FwdIATMin', 'BwdIATTot',
       'BwdIATMean', 'BwdIATStd', 'BwdIATMax', 'BwdIATMin', 'FwdPSHFlags',
       'BwdPSHFlags', 'FwdURGFlags', 'BwdURGFlags', 'FwdHeaderLen',
       'BwdHeaderLen', 'FwdPkts/s', 'BwdPkts/s', 'PktLenMin', 'PktLenMax',
       'PktLenMean', 'PktLenStd', 'PktLenVar', 'FINFlagCnt', 'SYNFlagCnt',
       'RSTFlagCnt', 'PSHFlagCnt', 'ACKFlagCnt', 'URGFlagCnt', 'CWEFlagCount',
       'ECEFlagCnt', 'Down/UpRatio', 'PktSizeAvg', 'FwdSegSizeAvg',
       'BwdSegSizeAvg', 'FwdByts/bAvg', 'FwdPkts/bAvg', 'FwdBlkR

In [7]:
# Per-column flag: does the column still contain at least one missing value?
columns_with_nan = df.isna().any()

print(columns_with_nan.any())          # True if any column has a missing value
print(df.columns[columns_with_nan])    # names of the affected columns

False
Index([], dtype='object')


In [8]:
# Names the port / address columns may carry in the raw file; uniformPorts and
# uniformIP rename them to SrcPort / DstPort / SrcIP / DstIP.
# NOTE(review): the column listing printed earlier already shows the short
# names, so the renames look like no-ops for this file -- confirm.
srcPort_name, dstPort_name = 'SourcePort', 'DestinationPort'
srcIP_name, dstIP_name = 'SourceIP', 'DestinationIP'

# Both "internal" networks are the same prefix here.
internal_network1 = "10.128.0.0/16"
internal_network2 = "10.128.0.0/16"


df = uniformPorts(df, srcPort_name=srcPort_name, dstPort_name=dstPort_name)
df = uniformIP(
    df,
    srcIP_name=srcIP_name,
    dstIP_name=dstIP_name,
    internal_network=internal_network1,
    internal_network2=internal_network2,
)
df.head()

int_int = 104401
 int_ext = 0
 ext_int = 413076
 ext_ext = 0


Unnamed: 0,FlowID,SrcIP,SrcPort,DstIP,DstPort,Protocol,Timestamp,FlowDuration,TotFwdPkts,TotBwdPkts,...,IdleStd,IdleMax,IdleMin,Label,SrcPort_num,DstPort_num,SrcPort_type,DstPort_type,DstIP_internal,SrcIP_internal
0,10.128.0.89.80.10.128.0.156.37986.6,10.128.0.89,80,10.128.0.156,37986,6,11/14/2019 20:01,127,0.0,2.6e-05,...,0.0,0.0,0.0,Normal,80,37986,well-known,registered,True,True
1,76.107.124.18.60734.10.128.0.89.443.6,76.107.124.18,60734,10.128.0.89,443,6,11/21/2019 2:13,1101,0.000175,0.0,...,0.0,0.0,0.0,Infilteration,60734,443,dynamic,well-known,True,False
2,76.107.124.22.53913.10.128.0.156.8080.6,76.107.124.22,53913,10.128.0.156,8080,6,11/18/2019 16:51,11036,0.000175,0.000104,...,0.0,0.0,0.0,Botnet,53913,8080,dynamic,registered,True,False
3,76.107.124.18.59881.10.128.0.183.3389.6,76.107.124.18,59881,10.128.0.183,3389,6,11/21/2019 2:09,1257069,0.000613,0.000182,...,0.0,0.0,0.0,Infilteration,59881,3389,dynamic,registered,True,False
4,76.107.124.18.62747.10.128.0.183.443.6,76.107.124.18,62747,10.128.0.183,443,6,11/21/2019 9:59,243913,0.000789,0.000208,...,0.0,0.0,0.0,Infilteration,62747,443,dynamic,well-known,True,False


In [9]:
# List the column names after uniformPorts / uniformIP were applied.
df.columns

Index(['FlowID', 'SrcIP', 'SrcPort', 'DstIP', 'DstPort', 'Protocol',
       'Timestamp', 'FlowDuration', 'TotFwdPkts', 'TotBwdPkts',
       'TotLenFwdPkts', 'TotLenBwdPkts', 'FwdPktLenMax', 'FwdPktLenMin',
       'FwdPktLenMean', 'FwdPktLenStd', 'BwdPktLenMax', 'BwdPktLenMin',
       'BwdPktLenMean', 'BwdPktLenStd', 'FlowByts/s', 'FlowPkts/s',
       'FlowIATMean', 'FlowIATStd', 'FlowIATMax', 'FlowIATMin', 'FwdIATTot',
       'FwdIATMean', 'FwdIATStd', 'FwdIATMax', 'FwdIATMin', 'BwdIATTot',
       'BwdIATMean', 'BwdIATStd', 'BwdIATMax', 'BwdIATMin', 'FwdPSHFlags',
       'BwdPSHFlags', 'FwdURGFlags', 'BwdURGFlags', 'FwdHeaderLen',
       'BwdHeaderLen', 'FwdPkts/s', 'BwdPkts/s', 'PktLenMin', 'PktLenMax',
       'PktLenMean', 'PktLenStd', 'PktLenVar', 'FINFlagCnt', 'SYNFlagCnt',
       'RSTFlagCnt', 'PSHFlagCnt', 'ACKFlagCnt', 'URGFlagCnt', 'CWEFlagCount',
       'ECEFlagCnt', 'Down/UpRatio', 'PktSizeAvg', 'FwdSegSizeAvg',
       'BwdSegSizeAvg', 'FwdByts/bAvg', 'FwdPkts/bAvg', 'FwdBlkR

In [10]:
# Raw labels: 'Normal', 'Infilteration', 'Botnet', 'DDoS', 'BruteForce'

# Keep an untouched copy of the labels before rewriting the 'Label' column.
df['Label_original'] = df['Label']

# Any label containing the pattern is replaced by the new name, one rule at a time.
for pattern, new_name in (('Normal', 'BENIGN'), ('Infilteration', 'Infiltration')):
    df['Label'] = np.where(df['Label'].str.contains(pattern), new_name, df['Label'])

print(df['Label_original'].unique())
print(df['Label'].unique())


['Normal' 'Infilteration' 'Botnet' 'DDoS' 'BruteForce']
['BENIGN' 'Infiltration' 'Botnet' 'DDoS' 'BruteForce']


In [11]:
# List the column names after the 'Label_original' column was added.
df.columns

Index(['FlowID', 'SrcIP', 'SrcPort', 'DstIP', 'DstPort', 'Protocol',
       'Timestamp', 'FlowDuration', 'TotFwdPkts', 'TotBwdPkts',
       'TotLenFwdPkts', 'TotLenBwdPkts', 'FwdPktLenMax', 'FwdPktLenMin',
       'FwdPktLenMean', 'FwdPktLenStd', 'BwdPktLenMax', 'BwdPktLenMin',
       'BwdPktLenMean', 'BwdPktLenStd', 'FlowByts/s', 'FlowPkts/s',
       'FlowIATMean', 'FlowIATStd', 'FlowIATMax', 'FlowIATMin', 'FwdIATTot',
       'FwdIATMean', 'FwdIATStd', 'FwdIATMax', 'FwdIATMin', 'BwdIATTot',
       'BwdIATMean', 'BwdIATStd', 'BwdIATMax', 'BwdIATMin', 'FwdPSHFlags',
       'BwdPSHFlags', 'FwdURGFlags', 'BwdURGFlags', 'FwdHeaderLen',
       'BwdHeaderLen', 'FwdPkts/s', 'BwdPkts/s', 'PktLenMin', 'PktLenMax',
       'PktLenMean', 'PktLenStd', 'PktLenVar', 'FINFlagCnt', 'SYNFlagCnt',
       'RSTFlagCnt', 'PSHFlagCnt', 'ACKFlagCnt', 'URGFlagCnt', 'CWEFlagCount',
       'ECEFlagCnt', 'Down/UpRatio', 'PktSizeAvg', 'FwdSegSizeAvg',
       'BwdSegSizeAvg', 'FwdByts/bAvg', 'FwdPkts/bAvg', 'FwdBlkR

In [12]:
# Split the frame into one subset per class and report the size of each.
# `summa` accumulates the subset sizes so the final print can be compared
# with the overall number of samples.
summa = 0
print("Overall samples: ", len(df))

benign_df = df[df['Label']=='BENIGN']
summa = summa + len(benign_df)
print("\t Benign: ", len(benign_df))

bot_df = df[df['Label']=='Botnet']
summa = summa + len(bot_df)
print("\t Bot: ", len(bot_df))

brute_df = df[df['Label']=='BruteForce']
summa = summa + len(brute_df)
# This subset holds the 'BruteForce' flows; it was previously reported
# under the heading "PortScan", a class this dataset does not contain.
print("\t BruteForce: ", len(brute_df))

ddos_df = df[df['Label']=='DDoS']
summa = summa + len(ddos_df)
print("\t DDoS: ", len(ddos_df))

inf_df = df[df['Label']=='Infiltration']
summa = summa + len(inf_df)
print("\t Infiltration: ", len(inf_df))

print(summa)

Overall samples:  517477
	 Benign:  139186
	 Bot:  93021
	 PortScan:  83857
	 DDoS:  131211
	 Infiltration:  70202
517477


In [13]:
# Build the paths with os.path.join: the hard-coded "malicious\\" suffix only
# denotes a sub-folder on Windows; elsewhere it produced files literally
# named "malicious\botnet.csv" inside output_folder.
# The trailing "" keeps a trailing separator on the folder path.
malicious_output_folder = os.path.join(output_folder, "malicious", "")

# to_csv does not create missing directories; this also creates output_folder.
os.makedirs(malicious_output_folder, exist_ok=True)

benign_file = os.path.join(output_folder, "benign.csv")

benign_df.to_csv(benign_file)

bot_df.to_csv(os.path.join(malicious_output_folder, "botnet.csv"))
brute_df.to_csv(os.path.join(malicious_output_folder, "bruteforce.csv"))
inf_df.to_csv(os.path.join(malicious_output_folder, "infiltration.csv"))
ddos_df.to_csv(os.path.join(malicious_output_folder, "ddos.csv"))