In [3]:
from netml.pparser.parser import PCAP
import pandas as pd
from scipy.stats import entropy
import numpy as np

# Open the benign-traffic capture and convert it to tabular form.
# NOTE(review): pcap2pandas() presumably populates `pcap.df`, which the next
# cell reads — confirm against the netml parser documentation.
pcap = PCAP("BenignTraffic.pcap")
pcap.pcap2pandas()

In [4]:
# Columns used by the feature extraction further down; 'time' must stay
# listed because it is re-based below.
keep = ["time", "ip_dst", 'ip_src', 'length', 'protocol']

# Restrict the parsed capture to those columns and discard every packet that
# is missing at least one of them.
packets = pd.DataFrame(pcap.df)
df = packets[keep].dropna()

# Shift timestamps so that the first retained packet sits at t = 0.
first_timestamp = df['time'].iloc[0]
df['time'] = df['time'] - first_timestamp

# Persist the cleaned table (the row index is written as the first column).
df.to_csv('BenignTraffic.csv')

In [5]:
# Show the cleaned packet table (bare last expression -> notebook display).
df

Unnamed: 0,time,ip_dst,ip_src,length,protocol
0,0.000000,99.81.244.93,192.168.137.175,2962,TCP
1,0.000164,99.81.244.93,192.168.137.175,2962,TCP
2,0.000269,99.81.244.93,192.168.137.175,1514,TCP
3,0.000414,99.81.244.93,192.168.137.175,1514,TCP
4,0.001800,99.81.244.93,192.168.137.175,1514,TCP
...,...,...,...,...,...
3644531,33327.964738,255.255.255.255,192.168.137.82,214,UDP
3644532,33327.967599,192.168.137.253,23.78.206.51,2962,TCP
3644533,33327.967997,192.168.137.253,23.78.206.51,4410,TCP
3644534,33327.968290,192.168.137.253,23.78.206.51,2962,TCP


In [8]:
# Normalise dtypes once, up front, so the aggregations below work on plain
# numeric / string columns.
df = df.astype({
    'length': int,
    'time': float,
    'ip_dst': str,
    'ip_src': str,
    'protocol': str,
})

# Width of one aggregation window, in the units of df['time']
# (presumably seconds — confirm against the capture's timestamp unit).
WINDOW_LENGTH = 2

# Bucket every packet into a fixed-width window; the bucket id is the floor
# of time / WINDOW_LENGTH and therefore stays a float.
df['time_interval'] = df['time'] // WINDOW_LENGTH

grouped = df.groupby('time_interval')

# One row per window with packet-length statistics and the packet count.
# Windows that contain no packets do not appear in the result.
new_df = grouped['length'].agg(
    avg_length='mean',
    var_length='var',
    min_length='min',
    max_length='max',
    count_rows='size',
)

# Number of distinct sources, destinations and (source, destination) pairs
# seen in each window.
new_df['unique_ip_src'] = grouped['ip_src'].nunique()
new_df['unique_ip_dst'] = grouped['ip_dst'].nunique()
# Vectorised pair count: de-duplicate on (window, src, dst) and count what is
# left per window. Gives the same numbers as a per-group drop_duplicates()
# but avoids calling a Python lambda on every group frame.
new_df['unique_ip_src_dst'] = (
    df.drop_duplicates(subset=['time_interval', 'ip_src', 'ip_dst'])
    .groupby('time_interval')
    .size()
)

# Average number of packets per distinct source / destination / pair.
new_df['rows_per_unique_ip_src'] = new_df['count_rows'] / new_df['unique_ip_src']
new_df['rows_per_unique_ip_dst'] = new_df['count_rows'] / new_df['unique_ip_dst']
new_df['rows_per_unique_ip_src_dst'] = new_df['count_rows'] / new_df['unique_ip_src_dst']


def calculate_entropy(series):
    """Return the Shannon entropy of the value distribution in ``series``.

    The relative frequency of every distinct value is computed and handed to
    ``scipy.stats.entropy`` using its default logarithm base (natural log).

    Parameters
    ----------
    series : pandas.Series
        Observations whose empirical distribution is measured.

    Returns
    -------
    float
        Entropy of the empirical distribution.
    """
    n_obs = len(series)
    frequencies = series.value_counts().div(n_obs)
    return entropy(frequencies)

# Entropy of the source / destination address distribution in each window.
new_df['entropy_ip_src'] = grouped['ip_src'].apply(calculate_entropy)
new_df['entropy_ip_dst'] = grouped['ip_dst'].apply(calculate_entropy)

# Packets whose (ip_src, ip_dst) pair was already seen earlier in the same
# window. Every packet is either the first occurrence of its pair or a
# repeat, so this equals total packets minus distinct pairs; no per-group
# Python callback is needed.
new_df['repeated_connections'] = new_df['count_rows'] - new_df['unique_ip_src_dst']

# Disabled: share of each protocol per window (kept for reference).
# protocols = df['protocol'].unique()
# for protocol in protocols:
#     protocol_column = f'percent_{protocol}'
#     new_df[protocol_column] = grouped.apply(lambda x: (x['protocol'] == protocol).sum() / x.shape[0])

# Make the feature table fully numeric: map +/-inf to NaN, then NaN to 0.
# In practice this mainly zeroes var_length for windows holding a single
# packet (the sample variance of one observation is NaN); the ratio columns
# should not yield inf because every window present has at least one packet.
# Finally turn the time_interval index back into a regular column.
new_df = (
    new_df
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
    .reset_index()
)

In [9]:
# Show the per-interval feature table (bare last expression -> notebook display).
new_df

Unnamed: 0,time_interval,avg_length,var_length,min_length,max_length,count_rows,unique_ip_src,unique_ip_dst,unique_ip_src_dst,rows_per_unique_ip_src,rows_per_unique_ip_dst,rows_per_unique_ip_src_dst,entropy_ip_src,entropy_ip_dst,repeated_connections
0,0.0,624.908108,1.019762e+06,60,7306,185,30,29,46,6.166667,6.379310,4.021739,2.563669,2.561037,139
1,1.0,562.184000,9.819709e+05,60,8754,375,46,38,72,8.152174,9.868421,5.208333,2.540818,2.494277,303
2,2.0,514.052000,7.419119e+05,60,7306,250,29,24,41,8.620690,10.416667,6.097561,2.477444,2.446358,209
3,3.0,611.446602,7.684756e+05,60,7791,309,36,27,46,8.583333,11.444444,6.717391,2.386062,2.296546,263
4,4.0,448.649606,4.306519e+05,60,4410,254,28,22,36,9.071429,11.545455,7.055556,2.339265,2.254979,218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16659,16659.0,855.020619,1.630702e+06,60,11650,194,22,18,28,8.818182,10.777778,6.928571,1.946084,1.914562,166
16660,16660.0,792.511029,1.850283e+06,60,15031,272,31,23,35,8.774194,11.826087,7.771429,1.899648,1.824193,237
16661,16661.0,946.372587,1.158516e+06,66,10202,518,22,15,24,23.545455,34.533333,21.583333,1.211047,1.178932,494
16662,16662.0,549.484211,9.524260e+05,60,5858,190,26,19,32,7.307692,10.000000,5.937500,2.358966,2.278627,158
