In [3]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import plotly.express as px
from ipwhois import IPWhois

# Given SFlow Format in Table 1
format = ['Type', 'sflow_agent_address', 'inputPort', 'outputPort',
               'src_MAC', 'dst_MAC', 'ethernet_type', 'in_vlan', 'out_vlan',
               'src_IP', 'dst_IP', 'IP_protocol', 'ip_tos', 'ip_ttl',
               'src_transport_port', 'dst_transport_port', 'tcp_flags',
               'packet_size', 'IP_size', 'sampling_rate', '???']

4a: top talkers and listeners
top talkers: hosts that send out large amount of data
top listeners: hosts that receive large amount of data

In [4]:
df = pd.read_csv('sampledata.csv',header=None,names=format)
df.drop('???',axis=1,inplace=True)
df.head(10)

Unnamed: 0,Type,sflow_agent_address,inputPort,outputPort,src_MAC,dst_MAC,ethernet_type,in_vlan,out_vlan,src_IP,dst_IP,IP_protocol,ip_tos,ip_ttl,src_transport_port,dst_transport_port,tcp_flags,packet_size,IP_size,sampling_rate
0,FLOW,aa.aa.aa.aa,258,20,204e71cf1b0f,002438aa0a00,0x0800,3012.0,3012,152.3.219.19,198.71.44.98,50,0x00,245,0.0,0,0x00,1518,1496,2048
1,FLOW,aa.aa.aa.aa,258,20,204e71cf1b0f,002438aa0a00,0x0800,3012.0,3012,152.3.219.19,198.71.44.98,50,0x00,245,0.0,0,0x00,1518,1496,2048
2,FLOW,aa.aa.aa.aa,258,20,204e71cf1b0f,002438aa0a00,0x0800,3012.0,3012,152.3.219.19,198.71.44.98,50,0x00,245,0.0,0,0x00,1518,1496,2048
3,FLOW,aa.aa.aa.aa,258,20,204e71cf1b0f,002438aa0a00,0x0800,3012.0,3012,152.3.219.19,198.71.44.98,50,0x00,245,0.0,0,0x00,1518,1496,2048
4,FLOW,aa.aa.aa.aa,258,20,204e71cf1b0f,002438aa0a00,0x0800,3012.0,3012,152.3.219.19,198.71.44.98,50,0x00,245,0.0,0,0x00,1518,1496,2048
5,FLOW,aa.aa.aa.aa,258,20,204e71cf1b0f,002438aa0a00,0x0800,3012.0,3012,152.3.219.19,198.71.44.98,50,0x00,245,0.0,0,0x00,1518,1496,2048
6,FLOW,aa.aa.aa.aa,200,147,80711fc76001,f4e9d4a46432,0x0800,280.0,900,203.185.93.2,203.30.39.13,6,0x00,61,5614.0,44476,0x10,74,52,2048
7,FLOW,aa.aa.aa.aa,199,3,544b8cf9a7df,00235ed9b680,0x0800,600.0,32,193.62.192.6,202.6.241.101,17,0x00,54,33001.0,45512,0x10,1514,1492,2048
8,FLOW,aa.aa.aa.aa,200,3,80711fc76001,00235ed9b680,0x0800,280.0,32,137.189.133.62,123.136.64.7,6,0x00,58,6270.0,41467,0x10,1422,1400,2048
9,FLOW,aa.aa.aa.aa,193,131,0031466b23cf,00a742233e9e,0x0800,919.0,43,216.58.203.234,192.122.131.36,6,0x00,60,443.0,4920,0x10,58,40,2048


In [5]:
#finding org given ip
def find_org(ip_addr):
    ip = IPWhois(ip_addr)
    result = ip.lookup_rdap()
    return result.get('network',{}).get('name')

In [None]:
#top 5 unique ips by number of packets sent (talkers)
top_talkers = df['src_IP'].value_counts().nlargest(5).to_frame()
org = []
for ip_addr in top_talkers.index:
    org.append(find_org(ip_addr))
top_talkers['Organisation'] = org

top_talkers = top_talkers.reset_index().rename(columns = {'index':'IP Address', 
                                                        'src_IP':'No. of Packets'})
top_talkers

Unnamed: 0,No. of Packets,count,Organisation
0,152.3.219.19,126,DUKE-NET
1,207.241.228.157,66,INTERNET-ARCHIVE-1
2,130.14.250.13,63,NLM-ETHER
3,193.62.192.8,46,EUR-BIO-INST
4,192.122.131.36,40,A-STAR-AS-AP


In [7]:
#top 5 listeners
top_listeners = df['dst_IP'].value_counts().nlargest(5).to_frame()
org = []
for ip_addr in top_listeners.index:
    org.append(find_org(ip_addr))
top_listeners['Organisation'] = org

top_listeners = top_listeners.reset_index().rename(columns = {'index':'IP Address', 
                                                        'dst_IP':'No. of Packets'})
top_listeners

Unnamed: 0,No. of Packets,count,Organisation
0,198.71.44.98,126,INTERNET2
1,103.37.198.100,98,A-STAR-AS-AP
2,210.48.222.9,66,IIUM-MY
3,137.132.228.15,50,NUSNET
4,202.21.159.244,38,RPNET


4b: transport protocol (% of TCP & UDP protocol)

In [22]:
packet_df = df['IP_protocol'].value_counts().to_frame()
packet_df = packet_df.reset_index().rename(columns={'index':'Header Value',
                                                    'IP_protocol':'No. of Packets'})

# Create counts dataframe
packet_df = df['IP_protocol'].value_counts().reset_index()
packet_df.columns = ['Header value', 'No. of Packets']

# Calculate percentage
packet_df['Percentage of Packets'] = packet_df['No. of Packets'] * 100 / len(df)

# Filter for TCP (6) and UDP (17)
finaldf = packet_df[packet_df['Header value'].isin([6, 17])]

print(finaldf)

   Header value  No. of Packets  Percentage of Packets
0             6             879                  73.25
2            17             135                  11.25


4c: application protocol

In [None]:
dest_port_df = df['dst_transport_port'].value_counts().nlargest(5).reset_index()
dest_port_df.columns = ['Destination Port', 'No. of Packets']

port_mapping = {45512: 'Unassigned', 
                443: 'HTTPS',
                80: 'HTTP',
                52866: 'Dynamic/Private Ports',
                56152: 'Dynamic/Private Ports',
                0: 'Reserved Port'}
service = []

# Assign service names (default to 'Unknown' if not in mapping)
for i in dest_port_df['Destination Port']:
    service.append(port_mapping.get(i, 'Unknown'))

dest_port_df['Service'] = service
dest_port_df

4d: traffic

In [29]:
total_traffic = sum(df['IP_size'])
# Assuming IP_size is in number of bits
total_traffic_Mb = total_traffic / (8 * pow(2, 20))
print(f"Total Traffic (Mb) = {total_traffic_Mb:.3f} Mb")

Total Traffic (Mb) = 0.129 Mb


4e: additional analysis

In [30]:
# Top 5 unique communication pairs
comm_pairs_df = df.groupby(['src_IP', 'dst_IP']).size().sort_values(ascending = False).to_frame()
comm_pairs_df.columns = ['No. of Packets']
comm_pairs_df = comm_pairs_df.reset_index()
# comm_pairs_df

top_comm_df = comm_pairs_df[:5]
top_comm_df

Unnamed: 0,src_IP,dst_IP,No. of Packets
0,152.3.219.19,198.71.44.98,126
1,207.241.228.157,210.48.222.9,66
2,130.14.250.13,103.37.198.100,63
3,193.62.192.8,137.132.228.15,46
4,130.14.250.11,103.37.198.100,35


In [31]:
src_org = []
dst_org = []

for i in range(5):
    src_org.append(find_org(top_comm_df['src_IP'][i]))
    dst_org.append(find_org(top_comm_df['dst_IP'][i]))
    
top_comm_df['Source Organisation'] = src_org
top_comm_df['Destination Organisation'] = dst_org

top_comm_df = top_comm_df.reindex(['src_IP', 'Source Organisation', 'dst_IP', 
                                   'Destination Organisation', 
                                   'No. of Packets'], axis = 'columns')
top_comm_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_comm_df['Source Organisation'] = src_org
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_comm_df['Destination Organisation'] = dst_org


Unnamed: 0,src_IP,Source Organisation,dst_IP,Destination Organisation,No. of Packets
0,152.3.219.19,DUKE-NET,198.71.44.98,INTERNET2,126
1,207.241.228.157,INTERNET-ARCHIVE-1,210.48.222.9,IIUM-MY,66
2,130.14.250.13,NLM-ETHER,103.37.198.100,A-STAR-AS-AP,63
3,193.62.192.8,EUR-BIO-INST,137.132.228.15,NUSNET,46
4,130.14.250.11,NLM-ETHER,103.37.198.100,A-STAR-AS-AP,35
