In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# Start (or reuse) a Spark session running locally with 4 worker threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Darknet")
    .getOrCreate()
)

In [2]:
# Load the flow records; the first CSV row supplies the column names.
data = spark.read.csv(
    "Darknet2.csv",
    header=True,
)

In [3]:
# Total number of rows loaded from the CSV.
data.count()

141530

In [4]:
# List the column names and types. The CSV is read without schema inference,
# so every column is reported as a string.
data.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Flow ID: string (nullable = true)
 |-- Src IP: string (nullable = true)
 |-- Src Port: string (nullable = true)
 |-- Dst IP: string (nullable = true)
 |-- Dst Port: string (nullable = true)
 |-- Protocol: string (nullable = true)
 |-- Timestamp: string (nullable = true)
 |-- Flow Duration: string (nullable = true)
 |-- Total Fwd Packet: string (nullable = true)
 |-- Total Bwd packets: string (nullable = true)
 |-- Total Length of Fwd Packet: string (nullable = true)
 |-- Total Length of Bwd Packet: string (nullable = true)
 |-- Fwd Packet Length Max: string (nullable = true)
 |-- Fwd Packet Length Min: string (nullable = true)
 |-- Fwd Packet Length Mean: string (nullable = true)
 |-- Fwd Packet Length Std: string (nullable = true)
 |-- Bwd Packet Length Max: string (nullable = true)
 |-- Bwd Packet Length Min: string (nullable = true)
 |-- Bwd Packet Length Mean: string (nullable = true)
 |-- Bwd Packet Length Std: string (nullable = true)


In [5]:
# Preview the distinct traffic-type labels, read positionally from the last
# column of each row (at most 15 values are fetched).
rdd = data.rdd
(
    rdd
    .map(lambda row: row[-1])
    .distinct()
    .take(15)
)

['Email',
 'File-transfer',
 'Browsing',
 'Chat',
 'Video-streaming',
 'AUDIO-STREAMING',
 'File-Transfer',
 'P2P',
 'Video-Streaming',
 'Audio-Streaming',
 'VOIP']

In [6]:
# Preview the distinct values of the second-to-last column, which indicates
# whether a flow went through the surface web or the dark web.
(
    rdd
    .map(lambda row: row[-2])
    .distinct()
    .take(15)
)

['Tor', 'Non-Tor', 'NonVPN', 'VPN']

In [7]:
import time
from datetime import datetime
#Returns a timestamp based on the input date (format defaults to d/m/y)
def date_to_timestamp(date, fmt='%d/%m/%Y'):
    """Convert a date string to a POSIX timestamp (seconds since the epoch).

    Args:
        date: Date string to parse, e.g. '23/02/2016'.
        fmt: ``strptime`` format describing ``date``. Defaults to
            day/month/year so existing single-argument calls behave as before.

    Returns:
        float: The timestamp of the parsed date. ``time.mktime`` interprets
        the parsed fields as local time, so the result depends on the time
        zone of the machine running the notebook.

    Raises:
        ValueError: If ``date`` does not match ``fmt``.
    """
    return time.mktime(datetime.strptime(date, fmt).timetuple())

In [8]:
#Select columns we want to preview
df1 = data.select(
    'Src IP',
    'Dst IP',
    'Timestamp',
    'Total Length of Fwd Packet',
    'Total Length of Bwd Packet',
    'Label1',
    'Label2',
)

#Filter Data based on type of traffic on the dark net: keep P2P flows whose
#Label1 is Tor or VPN. Column conditions must be combined with Column
#methods/operators (isin, |, &); Python's `or` cannot be applied to Columns,
#and `=` is assignment rather than comparison.
df1 = df1.where(F.col('Label2') == 'P2P').where(F.col('Label1').isin('Tor', 'VPN'))

start_date = '23/02/2016'
end_date = '24/02/2016'

#Convert the d/m/Y window bounds to epoch seconds
start_timestamp = date_to_timestamp(start_date)
end_timestamp = date_to_timestamp(end_date)

#Filter data within a selected time frame (between() includes both bounds)
df1 = df1.where(F.col('Timestamp').between(start_timestamp, end_timestamp))

In [12]:
# Display up to the first 20 rows of the filtered P2P flows.
df1.show(20)

+--------------+-----------+------------+--------------------------+--------------------------+------+------+
|        Src IP|     Dst IP|   Timestamp|Total Length of Fwd Packet|Total Length of Bwd Packet|Label1|Label2|
+--------------+-----------+------------+--------------------------+--------------------------+------+------+
|195.154.107.23|  10.0.2.15|1456243608.0|                  33081404|                   2191365|   Tor|   P2P|
|     10.0.2.15|37.97.149.8|1456243611.0|                     40556|                      1629|   Tor|   P2P|
|     10.0.2.15|37.97.149.8|1456243838.0|                         0|                         0|   Tor|   P2P|
|195.154.107.23|  10.0.2.15|1456243728.0|                  32947670|                   1805843|   Tor|   P2P|
|195.154.107.23|  10.0.2.15|1456243848.0|                  32432503|                   1759010|   Tor|   P2P|
|195.154.107.23|  10.0.2.15|1456243968.0|                  32478375|                   1697794|   Tor|   P2P|
|195.154.1

In [13]:
# Summary statistics (count, mean, stddev, min, percentiles, ...) for each
# selected column of the filtered flows.
df1.summary().show()

+-------+--------------+-----------+--------------------+--------------------------+--------------------------+------+------+
|summary|        Src IP|     Dst IP|           Timestamp|Total Length of Fwd Packet|Total Length of Bwd Packet|Label1|Label2|
+-------+--------------+-----------+--------------------+--------------------------+--------------------------+------+------+
|  count|            98|         98|                  98|                        98|                        98|    98|    98|
|   mean|          null|       null|1.4562455131020408E9|       2.311138751020408E7|        1746902.4897959183|  null|  null|
| stddev|          null|       null|  1122.8870605135437|      1.4932938765724642E7|        1690018.8331980957|  null|  null|
|    min|     10.0.2.15|  10.0.2.15|        1456243608.0|                         0|                         0|   Tor|   P2P|
|    25%|          null|       null|       1.456244688E9|                   22347.0|                   41286.0|  null|