# Statistical Analysis of Network Capture File (PCAP)

This notebook performs a forensic analysis of a network capture (PCAP), working from a per-packet summary CSV that was parsed from the capture.
We extract packet-level insights such as the top source/destination IPs, protocol usage, DNS activity, and time-series traffic behavior.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Path to the parsed per-packet summary CSV. Edit this one constant to point
# the notebook at a different capture summary.
SUMMARY_CSV_PATH = '/content/unsw_sample_20000_summary.csv'

# Columns that the analysis cells below read from the summary.
EXPECTED_COLUMNS = ['protocol', 'src', 'dst', 'time', 'length']

# Load the parsed summary CSV
df = pd.read_csv(SUMMARY_CSV_PATH)

# Surface schema problems here instead of as a KeyError several cells later.
# This is only a notice, so cells that do not need the missing column still run.
missing_columns = [col for col in EXPECTED_COLUMNS if col not in df.columns]
if missing_columns:
    print(f"Warning: summary CSV is missing expected columns: {missing_columns}")

df.head()

## Top 10 Protocols

In [None]:
# Rank protocols by row count and keep the ten most frequent.
protocol_counts = df['protocol'].value_counts().head(10)
protocol_labels = protocol_counts.index
protocol_sizes = protocol_counts.values

# Each share is relative to the top-10 total, not to every row in the frame.
protocol_percent = protocol_sizes / protocol_sizes.sum() * 100
custom_labels = [
    f"{label} ({percent:.1f}%)"
    for label, percent in zip(protocol_labels, protocol_percent)
]

# Ring-shaped pie with unlabeled wedges; names and shares go in a side legend.
fig, ax = plt.subplots(figsize=(6, 5))
wedges, _ = ax.pie(
    protocol_sizes,
    labels=None,
    startangle=140,
    wedgeprops={'width': 0.4},
)
ax.legend(
    wedges,
    custom_labels,
    title="Protocol",
    fontsize=9,
    loc="center left",
    bbox_to_anchor=(0.95, 0.5),
)
ax.set_title("Top 10 Protocols", pad=10)
ax.axis('equal')
fig.tight_layout(pad=0.5)
plt.show()

## Top 20 Source IPs

In [None]:
# Rank source addresses by how many rows (packets) list them as the sender.
top_src = df['src'].value_counts().head(20)
print("Top 20 Source IPs by Packet Count:")
print(top_src)

# Horizontal bars, with the y-axis flipped so the highest count is the top bar.
src_ax = top_src.plot.barh(figsize=(8, 6), title='Top 20 Source IPs')
src_ax.set_xlabel("Number of Packets")
src_ax.set_ylabel("Source IP")
src_ax.invert_yaxis()
src_ax.grid(True)
src_ax.figure.tight_layout()
plt.show()

## Top 20 Destination IPs

In [None]:
# Rank destination addresses by how many rows (packets) list them as the receiver.
top_dst = df['dst'].value_counts().head(20)
print("Top 20 Destination IPs by Packet Count:")
print(top_dst)

# Horizontal bars, with the y-axis flipped so the highest count is the top bar.
dst_ax = top_dst.plot.barh(figsize=(8, 6), title='Top 20 Destination IPs', color='orange')
dst_ax.set_xlabel("Number of Packets")
dst_ax.set_ylabel("Destination IP")
dst_ax.invert_yaxis()
dst_ax.grid(True)
dst_ax.figure.tight_layout()
plt.show()

## DNS Source and Destination Analysis

In [None]:
# Restrict the analysis to rows whose protocol column is exactly 'DNS'.
dns_df = df[df['protocol'] == 'DNS']

# One entry per chart: (column, chart title, y-axis label, extra plot options).
dns_charts = [
    ('src', 'Top 10 DNS Source IPs', "Source IP", {}),
    ('dst', 'Top 10 DNS Destination IPs', "Destination IP", {'color': 'green'}),
]

for dns_column, dns_title, dns_ylabel, dns_plot_kwargs in dns_charts:
    dns_counts = dns_df[dns_column].value_counts().head(10)
    if dns_counts.empty:
        # No DNS rows (or no addresses in this column) leaves nothing to draw,
        # so report it rather than attempting a bar chart from empty counts.
        print(f"No DNS packets to plot for column '{dns_column}'.")
        continue

    dns_fig, dns_ax = plt.subplots()
    dns_counts.plot.barh(ax=dns_ax, title=dns_title, **dns_plot_kwargs)
    dns_ax.set_xlabel("Number of DNS Packets")
    dns_ax.set_ylabel(dns_ylabel)
    # Flip the axis so the busiest address is the top bar, matching the
    # orientation of the Top 20 Source/Destination IP charts.
    dns_ax.invert_yaxis()
    dns_fig.tight_layout()
    plt.show()

## Traffic Volume Over Time

In [None]:
# Parse the 'time' column as epoch seconds. Values that cannot be parsed become
# NaT (errors='coerce'); those rows are dropped and the parsed timestamps become
# the index. `df` is rebound to the time-indexed frame, so later cells see the
# same frame they did before, but no `inplace=True` mutation is needed.
df = (
    df.assign(time_dt=pd.to_datetime(df['time'], unit='s', errors='coerce'))
      .dropna(subset=['time_dt'])
      .set_index('time_dt')
)

# Sum the 'length' column into 10-second bins. The lowercase 's' alias is used
# because the uppercase 'S' spelling is deprecated in recent pandas releases.
traffic_per_interval = df['length'].resample('10s').sum()

traffic_ax = traffic_per_interval.plot(
    figsize=(12, 4),
    title="Traffic Volume Over Time (Bytes / 10s)",
)
traffic_ax.set_xlabel("Time")
traffic_ax.set_ylabel("Bytes")
# Enable the grid explicitly: a bare plt.grid() toggles the current state, so
# it would hide the grid under a style that already shows one.
traffic_ax.grid(True)
traffic_ax.figure.tight_layout()
plt.show()

## Source-Destination Heatmap

In [None]:
# Count rows for every (src, dst) pair, then spread the pairs into a matrix
# with one row per source and one column per destination. Pairs that never
# occur are filled with 0.
flow_counts = df.groupby(['src', 'dst']).size().rename('count').reset_index()
flow_matrix = flow_counts.pivot_table(
    index='src',
    columns='dst',
    values='count',
    fill_value=0,
)

# Draw the matrix on an explicitly created axes instead of the implicit current one.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    flow_matrix,
    cmap="Blues",
    cbar_kws={'label': 'Packet Count'},
    ax=heatmap_ax,
)
heatmap_ax.set_title("Source-Destination Flow Heatmap")
heatmap_ax.set_xlabel("Destination IP")
heatmap_ax.set_ylabel("Source IP")
heatmap_fig.tight_layout()
plt.show()