In [1]:
import ipaddress
import os

import numpy as np
import pandas as pd

In [None]:
# Folder that contains the CSV files obtained after downloading and extracting
# the dataset (one file per entry of `file_names` below).
source_folder = "source_folder"
# The output of this notebook will be saved in this folder.
output_folder = "../data/CTU13/flows/"


# Input files, read in this order.
# NOTE(review): names look like "<scenario number>-<botnet family>.csv" - confirm.
file_names = [
    "2-neris.csv",
    "3-rbot.csv",
    "4-rbot.csv",
    "5-virut.csv",
    "6-donbot.csv",
    "7-sogou.csv",
    "8-murlo.csv",
    "9-neris.csv",
    "10-rbot.csv",
    "11-rbot.csv",
    "12-nsis.csv",
    "13-virut.csv"
]


# Read every file into a list and concatenate once at the end: growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on
# every iteration. A single pd.concat also works on every pandas version,
# so no DataFrame.append fallback is needed.
frames = []
total_rows = 0

for f in file_names:
    # os.path.join supplies the path separator that plain string
    # concatenation (source_folder + f) silently left out.
    input_file = os.path.join(source_folder, f)
    temp_df = pd.read_csv(input_file)
    print("Read {} lines...".format(len(temp_df)))
    frames.append(temp_df)
    total_rows += len(temp_df)
    print("\t...total length: {}".format(total_rows))

df = pd.concat(frames, ignore_index=True)
del frames  # release the per-file frames; `df` now owns the only copy of the data

print("...done!")

In [5]:
# Show the distinct values found in the 'Label2' column of the combined frame.
df['Label2'].unique()

array(['BENIGN', 'neris', 'rbot', 'virut', 'donbot', 'sogou', 'murlo',
       'nsis'], dtype=object)

In [6]:
# Show the column names of the combined frame as loaded from the CSV files.
df.columns

Index(['SrcAddr', 'DstAddr', 'Proto', 'Sport', 'Dport', 'State', 'sTos',
       'dTos', 'SrcWin', 'DstWin', 'sHops', 'dHops', 'StartTime', 'LastTime',
       'sTtl', 'dTtl', 'TcpRtt', 'SynAck', 'AckDat', 'SrcPkts', 'DstPkts',
       'SrcBytes', 'DstBytes', 'SAppBytes', 'DAppBytes', 'Dur', 'TotPkts',
       'TotBytes', 'TotAppByte', 'Rate', 'SrcRate', 'DstRate', 'Label',
       'Label2'],
      dtype='object')

In [7]:
# Build the helper columns 'SrcIP_dummy' / 'DstIP_dummy' from the raw address
# columns: every address containing ':' is swapped for the placeholder
# '0.0.0.0', all other addresses are copied through unchanged.
# NOTE(review): ':' presumably marks non-IPv4 addresses (e.g. IPv6) - confirm.
for raw_col, dummy_col in (('SrcAddr', 'SrcIP_dummy'), ('DstAddr', 'DstIP_dummy')):
    addresses = df[raw_col]
    df[dummy_col] = np.where(addresses.str.contains(':'), '0.0.0.0', addresses)

In [8]:
def fixValues(df):
    """Replace NaN and infinite values in the numeric columns of ``df``.

    For every NumPy-backed integer or float column (of any bit width):

    * NaN is replaced with the MEAN of the column's finite values;
    * +inf and -inf are both replaced with the MAX of the column's finite values.

    Columns of any other dtype are left untouched, as are numeric columns that
    are already clean or that contain no finite value at all (there is nothing
    to derive a replacement from).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to repair. It is modified IN PLACE (no copy is taken).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so ``df = fixValues(df)`` works.
    """
    x = df  # deliberately not copied
    for c in x.columns:
        col = x[c]
        # Accept every signed/unsigned integer and float width. Comparing the
        # dtype against 'int'/'float' only matches the platform default width
        # and silently skips e.g. float32 columns.
        if not (isinstance(col.dtype, np.dtype) and col.dtype.kind in 'iuf'):
            continue

        values = np.asarray(col, dtype=np.float64)
        finite_mask = np.isfinite(values)
        if finite_mask.all():
            continue  # nothing to repair (always the case for integer columns)

        finite_values = values[finite_mask]
        if finite_values.size == 0:
            continue  # no finite data: mean/max are undefined, leave the column as is

        mean_value = finite_values.mean()
        max_value = finite_values.max()
        # Assign the repaired column back explicitly: an in-place replace on
        # x[c] is a chained update that does not reach the frame when pandas
        # Copy-on-Write is active.
        x[c] = col.replace([np.inf, -np.inf], max_value).fillna(mean_value)

    return x


def _portCategory(port_num):
    # Map a numeric port Series to 'none' / 'well-known' / 'registered' / 'dynamic'.
    conditions = [
        (port_num == -1),
        (port_num >= 0) & (port_num <= 1023),
        (port_num >= 1024) & (port_num <= 49151),
        (port_num > 49151),
    ]
    choices = ['none', 'well-known', 'registered', 'dynamic']
    # An explicit string default is required: np.select's implicit default is
    # the integer 0, which does not share a dtype with the string choices
    # (it yields the label '0' or a TypeError, depending on the NumPy version).
    return np.select(conditions, choices, default='none')


def uniformPorts(df, srcPort_name, dstPort_name):
    """Normalise the port columns and classify each port by range.

    The columns ``srcPort_name`` / ``dstPort_name`` are renamed to ``SrcPort`` /
    ``DstPort`` and four columns are added:

    * ``SrcPort_num`` / ``DstPort_num``: the port as a number; values that
      cannot be parsed as a number (including missing ones) become ``-1``.
    * ``SrcPort_type`` / ``DstPort_type``: ``'none'`` (-1 or any value outside
      the ranges below), ``'well-known'`` (0-1023), ``'registered'``
      (1024-49151) or ``'dynamic'`` (above 49151).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the port columns. It is modified IN PLACE.
    srcPort_name, dstPort_name : str
        Current names of the source and destination port columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df.rename({srcPort_name: 'SrcPort', dstPort_name: 'DstPort'}, axis=1, inplace=True)

    # Convert strings to numbers. The result is assigned back as a whole
    # column: an in-place replace on df[...] is a chained update that does not
    # reach the frame when pandas Copy-on-Write is active, which would leave
    # NaN ports unclassified.
    for side in ('Src', 'Dst'):
        df[side + 'Port_num'] = pd.to_numeric(df[side + 'Port'], errors='coerce').fillna(-1)

    # Second pass so the *_num columns precede the *_type columns, as before.
    for side in ('Src', 'Dst'):
        df[side + 'Port_type'] = _portCategory(df[side + 'Port_num'])

    return df

def _inAnyNetwork(addresses, networks):
    # Boolean Series: True where the address lies in at least one of `networks`.
    # Each distinct address is parsed once and tested with `addr in network`,
    # instead of parsing every row and enumerating every host of the network.
    membership = {}
    for raw in addresses.unique():
        parsed = ipaddress.ip_address(raw)
        membership[raw] = any(parsed in net for net in networks)
    return addresses.map(membership).astype(bool)


def uniformIP(df, srcIP_name, dstIP_name, internal_network, *, internal_network2 = None):
    """Flag source/destination addresses as internal or external.

    The columns ``srcIP_name`` / ``dstIP_name`` are renamed to ``SrcIP`` /
    ``DstIP`` and two boolean columns, ``SrcIP_internal`` and
    ``DstIP_internal``, are added. An address is internal when it belongs to
    ``internal_network`` or, if given, to ``internal_network2``. A summary of
    the four internal/external combinations is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the address columns. It is modified IN PLACE.
    srcIP_name, dstIP_name : str
        Current names of the source and destination address columns.
    internal_network : str
        Network in a form accepted by ``ipaddress.ip_network``
        (e.g. ``"147.32.0.0/16"``).
    internal_network2 : str, optional (keyword-only)
        Second network that also counts as internal.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    ValueError
        If a network or an address cannot be parsed by ``ipaddress``.
    """
    df.rename({srcIP_name: 'SrcIP', dstIP_name: 'DstIP'}, axis=1, inplace=True)

    networks = [ipaddress.ip_network(internal_network)]
    if internal_network2 is not None:
        networks.append(ipaddress.ip_network(internal_network2))

    src_internal = _inAnyNetwork(df['SrcIP'], networks)
    dst_internal = _inAnyNetwork(df['DstIP'], networks)

    if internal_network2 is None:
        df['SrcIP_internal'] = src_internal
        df['DstIP_internal'] = dst_internal
    else:
        # The two-network path has always appended Dst before Src; keep that
        # column order so frames and exported CSVs keep the same layout.
        df['DstIP_internal'] = dst_internal
        df['SrcIP_internal'] = src_internal

    # check internal/external: count on the boolean masks rather than
    # materialising four filtered copies of the frame
    int_int = int((src_internal & dst_internal).sum())
    int_ext = int((src_internal & ~dst_internal).sum())
    ext_int = int((~src_internal & dst_internal).sum())
    ext_ext = int((~src_internal & ~dst_internal).sum())

    print("int_int = {}\n int_ext = {}\n ext_int = {}\n ext_ext = {}".format(int_int, int_ext, ext_int, ext_ext))

    return df

In [9]:
# Remove spaces and single quotes from the column names, repair NaN/infinite
# values in the numeric columns, then display the labels again as a check.
df.columns = df.columns.str.replace(' ', '').str.replace("'", "")
df = fixValues(df)
df['Label2'].unique()

array(['BENIGN', 'neris', 'rbot', 'virut', 'donbot', 'sogou', 'murlo',
       'nsis'], dtype=object)

In [10]:
# Show the column names after the name clean-up; the two '*_dummy' address columns are now included.
df.columns

Index(['SrcAddr', 'DstAddr', 'Proto', 'Sport', 'Dport', 'State', 'sTos',
       'dTos', 'SrcWin', 'DstWin', 'sHops', 'dHops', 'StartTime', 'LastTime',
       'sTtl', 'dTtl', 'TcpRtt', 'SynAck', 'AckDat', 'SrcPkts', 'DstPkts',
       'SrcBytes', 'DstBytes', 'SAppBytes', 'DAppBytes', 'Dur', 'TotPkts',
       'TotBytes', 'TotAppByte', 'Rate', 'SrcRate', 'DstRate', 'Label',
       'Label2', 'SrcIP_dummy', 'DstIP_dummy'],
      dtype='object')

In [11]:
# Report whether any missing value is left in the frame and which columns hold them.
columns_with_na = df.isna().any()
print(columns_with_na.any())
print(df.columns[columns_with_na])

True
Index(['Sport', 'Dport', 'State'], dtype='object')


In [12]:
# Column names handed to uniformPorts / uniformIP: the raw port columns and
# the '*_dummy' address columns created in an earlier cell.
srcPort_name = 'Sport'
dstPort_name = 'Dport'
srcIP_name = 'SrcIP_dummy'
dstIP_name = 'DstIP_dummy'



# Networks whose addresses are flagged as internal.
# NOTE(review): both values are identical, so the second network adds no
# addresses; passing it still selects uniformIP's two-network code path,
# which appends 'DstIP_internal' before 'SrcIP_internal'.
internal_network1 = "147.32.0.0/16"
internal_network2 = "147.32.0.0/16"


# Both helpers rename columns and add new ones on df itself, then return it.
df = uniformPorts(df, srcPort_name, dstPort_name)
df = uniformIP(df, srcIP_name, dstIP_name, internal_network=internal_network1, internal_network2=internal_network2)
df.head()

int_int = 8145092
 int_ext = 3677886
 ext_int = 5289411
 ext_ext = 39675


Unnamed: 0,SrcAddr,DstAddr,Proto,SrcPort,DstPort,State,sTos,dTos,SrcWin,DstWin,...,Label,Label2,SrcIP,DstIP,SrcPort_num,DstPort_num,SrcPort_type,DstPort_type,DstIP_internal,SrcIP_internal
0,31.96.153.11,147.32.84.229,tcp,60257,443,RST,0.0,0.0,2097152.0,65535.0,...,flow=Background-TCP-Established,BENIGN,31.96.153.11,147.32.84.229,60257.0,443.0,dynamic,well-known,True,False
1,83.228.37.92,147.32.84.229,tcp,2571,13363,RST,0.0,0.0,65535.0,65535.0,...,flow=Background-TCP-Established,BENIGN,83.228.37.92,147.32.84.229,2571.0,13363.0,registered,registered,True,False
2,83.228.37.92,147.32.84.229,tcp,2574,443,RST,0.0,0.0,65535.0,65535.0,...,flow=Background-TCP-Established,BENIGN,83.228.37.92,147.32.84.229,2574.0,443.0,registered,well-known,True,False
3,83.228.37.92,147.32.84.229,tcp,2576,80,RST,0.0,0.0,65535.0,65535.0,...,flow=Background-TCP-Established,BENIGN,83.228.37.92,147.32.84.229,2576.0,80.0,registered,well-known,True,False
4,83.228.37.92,147.32.84.229,tcp,2571,13363,RST,0.0,0.0,65535.0,65535.0,...,flow=Background-TCP-Established,BENIGN,83.228.37.92,147.32.84.229,2571.0,13363.0,registered,registered,True,False


In [13]:
# Show the column names after the port/IP normalisation added its derived columns.
df.columns

Index(['SrcAddr', 'DstAddr', 'Proto', 'SrcPort', 'DstPort', 'State', 'sTos',
       'dTos', 'SrcWin', 'DstWin', 'sHops', 'dHops', 'StartTime', 'LastTime',
       'sTtl', 'dTtl', 'TcpRtt', 'SynAck', 'AckDat', 'SrcPkts', 'DstPkts',
       'SrcBytes', 'DstBytes', 'SAppBytes', 'DAppBytes', 'Dur', 'TotPkts',
       'TotBytes', 'TotAppByte', 'Rate', 'SrcRate', 'DstRate', 'Label',
       'Label2', 'SrcIP', 'DstIP', 'SrcPort_num', 'DstPort_num',
       'SrcPort_type', 'DstPort_type', 'DstIP_internal', 'SrcIP_internal'],
      dtype='object')

In [14]:
# Drop the helper address columns and the raw port columns, then give the
# remaining columns their final names: the original addresses become
# SrcIP/DstIP, the numeric ports become SrcPort/DstPort, and 'Label2'
# takes over the name 'Label' while the old 'Label' is kept as 'Label_original'.
final_column_names = {
    "SrcAddr": "SrcIP",
    "DstAddr": "DstIP",
    "SrcPort_num": "SrcPort",
    "DstPort_num": "DstPort",
    "Label": "Label_original",
    "Label2": "Label",
}
df = (
    df
    .drop(columns=['SrcIP', 'DstIP', 'SrcPort', 'DstPort'])
    .rename(columns=final_column_names)
)

In [15]:
# Show the distinct values of the renamed 'Label' column.
df['Label'].unique()

array(['BENIGN', 'neris', 'rbot', 'virut', 'donbot', 'sogou', 'murlo',
       'nsis'], dtype=object)

In [16]:
# Distinct labels; the next cell iterates over this array to split the data per label.
label_list = df['Label'].unique()
# Peek at the second entry.
label_list[1]

'neris'

In [17]:
# Split the flows into one DataFrame per label and print the per-label counts.
# Each frame is stored in `frames_by_label` and also published as a
# module-level variable named `<label>_df` (e.g. `neris_df`, `BENIGN_df`),
# which the following cells rely on. The variables are assigned through
# globals() rather than exec(): exec would run source code assembled from
# label strings that come straight from the data files, and would break on
# any label containing a quote.
frames_by_label = {}
summa = 0
for l in label_list:
    label_df = df[df['Label'] == l]
    frames_by_label[l] = label_df
    globals()[f"{l}_df"] = label_df
    summa = summa + len(label_df)
    print(f'\t {l}: ', len(label_df))
# The two numbers below should match if every row carries one of the labels.
print("Overall samples: ", len(df))
print(summa)

	 BENIGN:  16748326
	 neris:  205928
	 rbot:  143918
	 virut:  40904
	 donbot:  4630
	 sogou:  63
	 murlo:  6127
	 nsis:  2168
Overall samples:  17152064
17152064


In [18]:
# Non-BENIGN labels present in the data: 'neris', 'rbot', 'virut', 'donbot', 'sogou', 'murlo', 'nsis'
# Spot-check the size of one of the per-label frames created in the previous cell.

len(neris_df)


205928

In [None]:
# Write the benign flows to <output_folder>/benign.csv and each malicious
# family to <output_folder>/malicious/<family>.csv.
# Paths are built with os.path.join: the previous hard-coded "malicious\\"
# suffix is only a directory separator on Windows, elsewhere it produced
# files literally named "malicious\<family>.csv" inside output_folder.
# The trailing "" keeps a separator at the end of the folder string.
malicious_output_folder = os.path.join(output_folder, "malicious", "")
# to_csv does not create missing directories; this also creates output_folder.
os.makedirs(malicious_output_folder, exist_ok=True)

benign_file = os.path.join(output_folder, "benign.csv")

BENIGN_df.to_csv(benign_file, index=False)

malicious_frames = {
    "neris": neris_df,
    "rbot": rbot_df,
    "virut": virut_df,
    "donbot": donbot_df,
    "sogou": sogou_df,
    "murlo": murlo_df,
    "nsis": nsis_df,
}
for family, family_df in malicious_frames.items():
    family_df.to_csv(os.path.join(malicious_output_folder, family + ".csv"), index=False)