In [10]:
import pandas as pd
import numpy as np
from scipy.stats import entropy

In [16]:
def adjust_data(df, window):
    """Aggregate packet-level records into per-time-window traffic features.

    Rows are bucketed by ``time // window`` and every non-empty bucket is
    summarised into one output row.

    Parameters
    ----------
    df : pandas.DataFrame
        Packet records with the columns ``time``, ``length``, ``ip_src``,
        ``ip_dst`` and ``protocol``. The frame is read only; it is not
        modified by this function.
    window : int or float
        Bucket width, expressed in the same units as ``df['time']``.
        Must be strictly positive.

    Returns
    -------
    pandas.DataFrame
        One row per bucket, with a fresh ``RangeIndex`` and the columns
        ``time_interval``, ``avg_length``, ``var_length``, ``min_length``,
        ``max_length``, ``count_rows``, ``unique_ip_src``, ``unique_ip_dst``,
        ``unique_ip_src_dst``, ``rows_per_unique_ip_src``,
        ``rows_per_unique_ip_dst``, ``rows_per_unique_ip_src_dst``,
        ``entropy_ip_src``, ``entropy_ip_dst`` and ``repeated_connections``.
        Infinite and missing values (for example the sample variance of a
        single-row bucket) are replaced by 0.

    Raises
    ------
    ValueError
        If ``window`` is not strictly positive.
    KeyError
        If one of the required columns is missing from ``df``.
    """
    # `not window > 0` also rejects NaN, which `window <= 0` would let through.
    if not window > 0:
        raise ValueError(f"window must be a positive number, got {window!r}")

    # Build a typed working copy so the caller's frame keeps its original
    # dtypes and does not silently gain a 'time_interval' column.
    packets = (
        df[['time', 'length', 'ip_src', 'ip_dst', 'protocol']]
        .astype({
            'time': float,
            'length': int,
            'ip_src': str,
            'ip_dst': str,
            'protocol': str,
        })
        .assign(time_interval=lambda p: p['time'] // window)
    )

    grouped = packets.groupby('time_interval')

    # Packet-length statistics and packet count per bucket.
    new_df = grouped['length'].agg(
        avg_length='mean',
        var_length='var',
        min_length='min',
        max_length='max',
        count_rows='size',
    )

    # Distinct sources, destinations and (source, destination) pairs.
    new_df['unique_ip_src'] = grouped['ip_src'].nunique()
    new_df['unique_ip_dst'] = grouped['ip_dst'].nunique()
    # One de-duplication pass over the whole frame replaces a per-group
    # Python lambda; the resulting counts are the same.
    new_df['unique_ip_src_dst'] = (
        packets
        .drop_duplicates(subset=['time_interval', 'ip_src', 'ip_dst'])
        .groupby('time_interval')
        .size()
    )

    # Average number of packets per distinct endpoint / pair.
    new_df['rows_per_unique_ip_src'] = new_df['count_rows'] / new_df['unique_ip_src']
    new_df['rows_per_unique_ip_dst'] = new_df['count_rows'] / new_df['unique_ip_dst']
    new_df['rows_per_unique_ip_src_dst'] = new_df['count_rows'] / new_df['unique_ip_src_dst']

    def calculate_entropy(series):
        """Return the Shannon entropy (natural log) of the value frequencies."""
        value_counts = series.value_counts()
        probabilities = value_counts / len(series)
        return entropy(probabilities)

    new_df['entropy_ip_src'] = grouped['ip_src'].apply(calculate_entropy)
    new_df['entropy_ip_dst'] = grouped['ip_dst'].apply(calculate_entropy)

    # Every packet beyond the first occurrence of its (src, dst) pair within
    # a bucket is a repeat, so the count is simply rows minus distinct pairs.
    new_df['repeated_connections'] = new_df['count_rows'] - new_df['unique_ip_src_dst']

    # Neutralise inf / NaN produced by the statistics above.
    new_df = new_df.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Per-protocol share of each bucket (disabled in the original analysis):
    # for protocol in packets['protocol'].unique():
    #     new_df[f'percent_{protocol}'] = grouped.apply(
    #         lambda x: (x['protocol'] == protocol).sum() / x.shape[0])

    return new_df.reset_index()

In [17]:
# Read the DDoS packet table and index it by the CSV's 'Unnamed: 0' column.
ddos = pd.read_csv('DDOS.csv').set_index('Unnamed: 0')
ddos

Unnamed: 0_level_0,time,ip_dst,ip_src,length,protocol
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.000000e+00,192.168.137.206,192.168.137.150,60,UDP
1,6.482500e-02,192.168.137.175,52.48.53.214,2482,TCP
2,6.482700e-02,192.168.137.175,52.48.53.214,100,TCP
3,7.956600e-02,192.168.137.2,35.185.101.66,66,TCP
4,8.622800e-02,192.168.137.206,192.168.137.150,60,UDP
...,...,...,...,...,...
2879829,4.687140e+06,192.168.137.20,192.168.137.150,74,TCP
2879830,4.687140e+06,192.168.137.17,192.168.137.150,74,TCP
2879831,4.687140e+06,192.168.137.186,192.168.137.150,74,TCP
2879832,4.687140e+06,192.168.137.17,192.168.137.150,74,TCP


In [18]:
# Read the benign packet table and index it by the CSV's 'Unnamed: 0' column.
benign = pd.read_csv("BenignTraffic.csv").set_index('Unnamed: 0')
benign

Unnamed: 0_level_0,time,ip_dst,ip_src,length,protocol
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.000000,99.81.244.93,192.168.137.175,2962,TCP
1,0.000164,99.81.244.93,192.168.137.175,2962,TCP
2,0.000269,99.81.244.93,192.168.137.175,1514,TCP
3,0.000414,99.81.244.93,192.168.137.175,1514,TCP
4,0.001800,99.81.244.93,192.168.137.175,1514,TCP
...,...,...,...,...,...
3644531,33327.964738,255.255.255.255,192.168.137.82,214,UDP
3644532,33327.967599,192.168.137.253,23.78.206.51,2962,TCP
3644533,33327.967997,192.168.137.253,23.78.206.51,4410,TCP
3644534,33327.968290,192.168.137.253,23.78.206.51,2962,TCP


In [19]:
# Summarise the DDoS packets into buckets 5 'time' units wide
# (presumably seconds; confirm against the capture's time base).
ddos_adjusted = adjust_data(ddos, window=5)
ddos_adjusted

Unnamed: 0,time_interval,avg_length,var_length,min_length,max_length,count_rows,unique_ip_src,unique_ip_dst,unique_ip_src_dst,rows_per_unique_ip_src,rows_per_unique_ip_dst,rows_per_unique_ip_src_dst,entropy_ip_src,entropy_ip_dst,repeated_connections
0,0.0,446.636583,493550.950502,60,7306,4565,57,34,72,80.087719,134.264706,63.402778,2.122806,0.988053,4493
1,1.0,441.576400,455055.961425,60,2962,6034,69,50,102,87.449275,120.680000,59.156863,1.840858,0.726670,5932
2,2.0,213.461435,170262.815176,60,2962,5225,61,50,91,85.655738,104.500000,57.417582,2.115520,0.955799,5134
3,3.0,320.276911,336099.076051,60,2962,6345,82,70,123,77.378049,90.642857,51.585366,2.099015,1.090799,6222
4,4.0,440.391181,430906.890101,60,5858,4241,70,55,106,60.585714,77.109091,40.009434,1.597893,1.343256,4135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1140,937424.0,136.052239,149067.979140,60,15774,2814,33,29,58,85.272727,97.034483,48.517241,0.322383,2.114859,2756
1141,937425.0,111.924249,11457.425867,60,1487,2165,38,36,62,56.973684,60.138889,34.919355,0.279560,2.051427,2103
1142,937426.0,104.437385,9786.030354,60,1474,2707,30,31,54,90.233333,87.322581,50.129630,0.234974,2.013531,2653
1143,937427.0,105.299748,9265.266686,60,790,2382,33,30,54,72.181818,79.400000,44.111111,0.201990,2.029118,2328


In [20]:
# Summarise the benign packets into buckets 5 'time' units wide
# (presumably seconds; confirm against the capture's time base).
benign_adjusted = adjust_data(benign, window=5)
benign_adjusted

Unnamed: 0,time_interval,avg_length,var_length,min_length,max_length,count_rows,unique_ip_src,unique_ip_dst,unique_ip_src_dst,rows_per_unique_ip_src,rows_per_unique_ip_dst,rows_per_unique_ip_src_dst,entropy_ip_src,entropy_ip_dst,repeated_connections
0,0.0,569.061516,9.761190e+05,60,8754,699,65,55,112,10.753846,12.709091,6.241071,2.690884,2.661448,587
1,1.0,534.213650,6.073021e+05,60,7791,674,58,44,81,11.620690,15.318182,8.320988,2.561900,2.474491,593
2,2.0,671.580316,5.720322e+05,60,4410,1077,58,48,85,18.568966,22.437500,12.670588,2.433629,2.378519,992
3,3.0,600.027066,1.089716e+06,60,10202,702,53,37,68,13.245283,18.972973,10.323529,2.357291,2.287758,634
4,4.0,582.752053,9.781522e+05,60,11650,609,57,50,84,10.684211,12.180000,7.250000,2.435547,2.374926,525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6661,6661.0,814.809961,1.373199e+06,60,13098,763,62,51,84,12.306452,14.960784,9.083333,2.030041,1.972895,679
6662,6662.0,492.996988,1.151472e+06,60,10202,332,54,40,65,6.148148,8.300000,5.107692,2.688147,2.550740,267
6663,6663.0,703.522541,1.154508e+06,60,11650,488,55,40,69,8.872727,12.200000,7.072464,2.335044,2.227144,419
6664,6664.0,878.558621,1.388055e+06,60,15031,870,53,36,64,16.415094,24.166667,13.593750,1.602473,1.532754,806
