In [2]:
# ――――――――――――――――――――――――――――――――――――――――――――――
# ANALYSIS OF TOR NETWORK TRAFFIC 2023-07-21
# ――――――――――――――――――――――――――――――――――――――――――――――
# Dataset Source: https://www.unb.ca/cic/datasets/tor.html
import pandas as pd

import matplotlib.pyplot as plt

pd.options.display.float_format = "{:.2f}".format
plt.style.use("ggplot")

In [3]:
traffic_data = pd.read_csv(
    "../data/processed/tor-traffic-proc.csv"
)
traffic_data.head()

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,...,Bwd IAT Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label
0,100215,53913,2165820846,80,1,435,0.0,4597.7,435.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,100215,53913,2165820846,80,1,259,0.0,7722.01,259.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,100215,53913,2165820846,80,1,891,0.0,2244.67,891.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,100215,53913,2165820846,80,1,1074,0.0,1862.2,1074.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,100215,53913,2165820846,80,1,315,0.0,6349.21,315.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
traffic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67828 entries, 0 to 67827
Data columns (total 29 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Source IP         67828 non-null  int64  
 1   Source Port       67828 non-null  int64  
 2   Destination IP    67828 non-null  int64  
 3   Destination Port  67828 non-null  int64  
 4   Protocol          67828 non-null  int64  
 5   Flow Duration     67828 non-null  int64  
 6   Flow Bytes/s      67828 non-null  float64
 7   Flow Packets/s    67828 non-null  float64
 8   Flow IAT Mean     67828 non-null  float64
 9   Flow IAT Std      67828 non-null  float64
 10  Flow IAT Max      67828 non-null  int64  
 11  Flow IAT Min      67828 non-null  int64  
 12  Fwd IAT Mean      67828 non-null  float64
 13  Fwd IAT Std       67828 non-null  float64
 14  Fwd IAT Max       67828 non-null  int64  
 15  Fwd IAT Min       67828 non-null  int64  
 16  Bwd IAT Mean      67828 non-null  float6

In [4]:
traffic_data[traffic_data.columns.difference(["Source IP", "Source Port", "Destination IP", "Destination Port", "Protocol", "label"], sort=False)].describe()

Unnamed: 0,Flow Duration,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,...,Bwd IAT Max,Bwd IAT Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,67828.0,67828.0,67828.0,67828.0,67828.0,67828.0,67828.0,67828.0,67828.0,67828.0,...,67828.0,67828.0,67828.0,67828.0,67828.0,67828.0,67828.0,67828.0,67828.0,67828.0
mean,2992148.97,253525.53,3304.67,315620.57,220985.77,898465.16,192460.19,350086.06,230174.96,869726.27,...,476063.7,51590.32,38879.99,0.0,38879.99,38879.99,308532.66,0.0,308532.66,308532.66
std,4063087.13,5624429.23,49559.65,698831.53,640975.61,1738532.73,578054.01,839156.95,679874.25,1752007.1,...,1409703.68,487373.93,315477.14,0.0,315477.14,315477.14,1454014.4,0.0,1454014.4,1454014.4
min,1.0,0.0,0.2,0.5,0.0,1.0,-2255.0,0.0,0.0,0.0,...,0.0,-327.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,44413.75,104.66,4.86,9951.65,0.0,28991.25,26.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,410859.0,1331.27,16.32,83450.0,0.0,178766.0,1373.0,19968.37,0.0,98652.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7328698.0,21686.78,100.7,410609.0,57935.78,464202.5,207536.5,411238.0,30368.32,439586.75,...,82805.25,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,10000000.0,600000000.0,3000000.0,9987113.0,7045491.05,9998126.0,9987113.0,9997140.0,7065085.69,9997140.0,...,9996903.0,9996903.0,4999893.0,0.0,4999893.0,4999893.0,9998126.0,0.0,9998126.0,9998126.0


In [5]:
traffic_data["label"].value_counts(normalize=True) * 100

label
0   88.14
1   11.86
Name: proportion, dtype: float64

## Protocol and Ports 
- Tor uses TCP as its transport protocol  
- TLS port for Tor is 443, but commonly uses ports 9001 and 9030.
- Reference: [https://wiki.wireshark.org/Tor.md](Wireshark)
---
- May also use TCP or UDP on ports 9001,
9030, 9040, 9050, 9051, 9150, TCP: 443* and 8443
- Reference [CISA](https://www.cisa.gov/sites/default/files/publications/AA20-183A_Defending_Against_Malicious_Cyber_Activity_Originating_from_Tor_S508C.pdf)


TCP is more reliable, but tends to be slower than UDP.

In [7]:
traffic_data.groupby("label")["Protocol"].value_counts(normalize=True)*100

label  Protocol
0      0           63.61
       1           36.39
1      1          100.00
Name: proportion, dtype: float64

In [44]:
source_ports = [9001, 9030, 9040, 9050, 9051, 443, 8843]
pd.DataFrame(traffic_data.groupby("label")[["Source Port", "Destination Port"]].value_counts(normalize=True)).query("`Source Port` in @source_ports and label == 1").rename(columns={"proportion": "Percentage"}) * 100

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Percentage
label,Source Port,Destination Port,Unnamed: 3_level_1
1,443,49580,9.55
1,443,37652,7.96
1,443,54130,5.21
1,443,55139,4.87
1,443,41994,4.3
1,443,45788,0.66
1,443,37132,0.1
1,443,40146,0.1
1,443,49924,0.06
1,9001,32788,0.05


In [45]:
flow_desc = traffic_data.groupby("label")[["Flow Duration", "Flow Bytes/s", "Flow Packets/s"]].describe().T
flow_desc.style.background_gradient(axis=1)

Unnamed: 0,label,0,1
Flow Duration,count,59784.0,8044.0
Flow Duration,mean,2262882.639285,8412146.288041
Flow Duration,std,3603284.458456,3062479.726972
Flow Duration,min,1.0,22.0
Flow Duration,25%,38398.75,8906491.5
Flow Duration,50%,297390.5,9965426.0
Flow Duration,75%,3001986.25,9989333.5
Flow Duration,max,9999999.0,10000000.0
Flow Bytes/s,count,59784.0,8044.0
Flow Bytes/s,mean,251213.846851,270706.24294


## Flow Duration
- The flow duration describes the length of the connection in seconds. 
- Looking at the average connection time in hours, TOR looks to be much longer compared to nonTOR traffic. 

In [53]:
flow_desc.loc[("Flow Duration"), [0, 1]].apply(lambda x: x/3600).rename(columns={0: "nonTOR", 1: "TOR"})

label,nonTOR,TOR
count,16.61,2.23
mean,628.58,2336.71
std,1000.91,850.69
min,0.0,0.01
25%,10.67,2474.03
50%,82.61,2768.17
75%,833.89,2774.81
max,2777.78,2777.78
