In [5]:
from netml.pparser.parser import PCAP
import pandas as pd
from scipy.stats import entropy
import numpy as np

# Open the capture file with netml's PCAP parser and convert it to tabular form.
# pcap2pandas() is called purely for its side effect (its return value is discarded).
# NOTE(review): presumably it populates `pcap.df`, which the next cell reads — confirm against netml.
pcap = PCAP("BrowserHijacking.pcap")
pcap.pcap2pandas()

In [76]:
# Columns retained for the per-interval feature extraction.
keep = ["time", "ip_dst", 'ip_src', 'length', 'protocol']

# Rebuild the working frame from the parsed capture on every run so the cell
# is idempotent, keeping only the columns above and the fully populated rows.
df = pd.DataFrame(pcap.df).loc[:, keep].dropna()

# Re-base timestamps so the first remaining packet sits at t = 0.
first_timestamp = df['time'].iloc[0]
df['time'] = df['time'] - first_timestamp

In [77]:
# Width of one aggregation window, expressed in the units of df['time'].
# NOTE(review): presumably seconds (capture timestamps) — confirm.
INTERVAL_WIDTH = 5

# Pin the dtypes once, up front, so the arithmetic and grouping below are
# well defined regardless of how the parser typed the columns.
df = df.astype({
    'length': int,
    'time': float,
    'ip_dst': str,
    'ip_src': str,
    'protocol': str,
})

# Bucket every packet into a fixed-width window: 0.0, 1.0, 2.0, ...
df['time_interval'] = df['time'] // INTERVAL_WIDTH

grouped = df.groupby('time_interval')

# Per-interval packet-length statistics plus the packet count.
new_df = grouped['length'].agg(
    avg_length='mean',
    var_length='var',
    min_length='min',
    max_length='max',
    count_rows='size',
)

# Distinct sources, destinations and (source, destination) pairs per interval.
new_df['unique_ip_src'] = grouped['ip_src'].nunique()
new_df['unique_ip_dst'] = grouped['ip_dst'].nunique()
# Vectorised pair count: keep one row per (interval, src, dst) and count the
# survivors per interval. This avoids running a Python lambda over every
# group's full frame via DataFrameGroupBy.apply, which also touches the
# grouping column (deprecated behaviour in recent pandas).
new_df['unique_ip_src_dst'] = (
    df.drop_duplicates(subset=['time_interval', 'ip_src', 'ip_dst'])
    .groupby('time_interval')
    .size()
)

# Average number of packets per distinct source / destination / pair.
new_df['rows_per_unique_ip_src'] = new_df['count_rows'] / new_df['unique_ip_src']
new_df['rows_per_unique_ip_dst'] = new_df['count_rows'] / new_df['unique_ip_dst']
new_df['rows_per_unique_ip_src_dst'] = new_df['count_rows'] / new_df['unique_ip_src_dst']


# Function to calculate entropy
def calculate_entropy(series):
    """Return the entropy of the empirical value distribution of ``series``.

    Each distinct value's occurrence count is divided by the total number of
    entries in ``series`` to obtain a probability; the probabilities are then
    handed to ``scipy.stats.entropy`` (natural logarithm by default).

    Parameters
    ----------
    series : pandas.Series
        Values whose distribution is measured.

    Returns
    -------
    float
        Entropy of the observed distribution; 0.0 when a single distinct
        value is present.
    """
    total = len(series)
    relative_frequencies = series.value_counts().div(total)
    return entropy(relative_frequencies)

# Entropy of the source / destination address distribution in each interval.
new_df['entropy_ip_src'] = grouped['ip_src'].apply(calculate_entropy)
new_df['entropy_ip_dst'] = grouped['ip_dst'].apply(calculate_entropy)

# Packets whose (ip_src, ip_dst) pair already appeared earlier in the same
# interval, i.e. every packet beyond the first one of each distinct pair.
# Flagging duplicates on (interval, src, dst) over the whole frame and summing
# per interval gives the same counts as running `duplicated` inside each group,
# without a per-group Python lambda over the full frame.
new_df['repeated_connections'] = (
    df.duplicated(subset=['time_interval', 'ip_src', 'ip_dst'])
    .groupby(df['time_interval'])
    .sum()
)

# Replace non-finite values and gaps with 0.
# The ratio denominators are per-interval distinct counts, which are at least 1
# for any non-empty group, so a true division by zero is not expected here.
# The NaNs this mainly removes come from `var_length` on intervals holding a
# single packet (sample variance is undefined for one observation).
new_df = new_df.replace([np.inf, -np.inf], np.nan).fillna(0)

# Share of each protocol per interval — currently DISABLED.
# NOTE(review): the saved output of the display cell still lists percent_*
# columns, so it was produced while this block was active; either re-enable
# it or re-run the notebook so code and output agree.
# protocols = df['protocol'].unique()
# for protocol in protocols:
#     protocol_column = f'percent_{protocol}'
#     new_df[protocol_column] = grouped.apply(lambda x: (x['protocol'] == protocol).sum() / x.shape[0])

# Turn the `time_interval` index back into an ordinary column.
new_df = new_df.reset_index()

In [78]:
# Display the per-interval feature table (pandas elides the middle rows of long frames).
# NOTE(review): the saved output lists percent_* columns and no entropy_* /
# repeated_connections columns, so it predates the current feature cell —
# restart the kernel and run all cells to refresh it.
new_df

Unnamed: 0,time_interval,avg_length,var_length,min_length,max_length,count_rows,unique_ip_src,unique_ip_dst,unique_ip_src_dst,rows_per_unique_ip_src,rows_per_unique_ip_dst,rows_per_unique_ip_src_dst,percent_TCP,percent_UDP,percent_ICMP
0,0.0,196.589109,142447.686050,60,3492,202,65,49,85,3.107692,4.122449,2.376471,0.569307,0.425743,0.004950
1,1.0,193.663043,138782.924091,60,2962,184,63,51,90,2.920635,3.607843,2.044444,0.663043,0.304348,0.032609
2,2.0,150.115854,34708.115330,60,1726,164,52,41,70,3.153846,4.000000,2.342857,0.652439,0.317073,0.030488
3,3.0,190.634518,97662.753496,60,2922,197,60,48,86,3.283333,4.104167,2.290698,0.664975,0.319797,0.015228
4,4.0,767.980349,582603.542283,60,4410,458,60,44,79,7.633333,10.409091,5.797468,0.897380,0.100437,0.002183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,307.0,133.487047,26794.417800,60,1760,193,60,46,85,3.216667,4.195652,2.270588,0.616580,0.347150,0.036269
179,308.0,517.331104,674722.336311,60,2962,299,66,49,85,4.530303,6.102041,3.517647,0.842809,0.147157,0.010033
180,309.0,139.006897,18942.756897,60,761,145,58,46,74,2.500000,3.152174,1.959459,0.724138,0.262069,0.013793
181,310.0,224.600000,102183.696104,60,1747,155,52,38,72,2.980769,4.078947,2.152778,0.496774,0.458065,0.045161
