In [2]:
# Nadjib Achir
# some important imports 
import math
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# WiFi-de-randomization

This document contains the code to match random MAC address coming from the same WiFi devices.

Some of the important steps are :

- STEP 1: Get the datasets
    1. Download the Sapienza datase
    2. Merge pcaps
    3. Extract pcap fields using tshark
- STEP 2: Randomization
    1. Upload the dataset
    2. Groupe all frames per MAC address
    3. Remove Locally Administered MAC addresses
    4. Deduce bursts lists
    5. Randomize the MAC Addresses
    6. Do timing sort
    7. Save the new dataset and the ground truth
- STEP 3: Create the time signature
    - ...

## STEP 1: Get the datasets

**1 - Download the Sapienza dataset**

Use the following link and login to download the datasets:

https://crawdad.org/sapienza/probe-requests/20130910/

**2- Merge pcaps**

Each dataset is composed of many .pcap files. We need to merge all the files. For example, to merge politics1
dataset, use:

```bash
mergecap -w politics1.pcap probes-2013-02-22.pcap*
```


**3- Extract pcap fields using tshark**

Using the following command we can extract from the dataset only relevent informations. Here I kept as much as possible informations from the original dataset, in case we do more analysis in the future. However, if we focus only on timing attack, we can only keep the timestamp. In this example, we collect many fields from each **probe_request** frame:

```bash
chmod +x tshark_extraction.sh
./tshark_extraction.sh dataset_name
```


## STEP 2: Randomization

In [3]:
# some useful imports 
from collections import defaultdict
import random
import os

In [34]:
BURST_TIME_FRAME = 0.01 # the maximum duration of a burst in time (second)
p = 4 # change the mac address every p bursts

#
dataset = 'politics1'

#
dataset_file_path = './' + dataset + '/' + dataset + '_dataset'
print('dataset folder: ', dataset_file_path)
isExist = os.path.exists(dataset_file_path)
assert isExist == True, 'The dataset does not exist'

#
dataset_randomized_path = './' + dataset + '/' + dataset + '_randomized_p' + str(p) + '_btf' + str(BURST_TIME_FRAME) + '_dataset'
print('dataset randomized path: ', dataset_randomized_path)

#
dataset_randomized_label_path = dataset_randomized_path + '_label'
print('dataset randomized path label: ', dataset_randomized_label_path)

#
sanitization = True

dataset folder:  ./politics1/politics1_dataset
dataset randomized path:  ./politics1/politics1_randomized_p4_btf0.01_dataset
dataset randomized path label:  ./politics1/politics1_randomized_p4_btf0.01_dataset_label


**1- Upload the dataset**

In [8]:
# function used to load the frames
def load_dataset(dataset_path: str) -> list:
    """
    Read dataset from disk
    """
    frames = []
    with open(dataset_path) as data:
        for frame in data:
            frame_tmp = frame.strip('\n').split(';')
            frames.append(frame_tmp)
    return frames

In [9]:
# load the frames
frames = load_dataset(dataset_file_path)

In [10]:
len(frames)

1189482

**2- Groupe all frames per MAC address**

In [11]:
# This function group all the frames with the same MAC address into one single dictionary
def separete_macs(frames: list) -> dict:
    """
    Group same MAC address
    """
    macs = defaultdict(list)
    for frame in frames:
        fm = [frame[0]] + frame[2:]
        macs[frame[1]].append(fm)
    return macs


In [12]:
# we create a dictionary per mac address with all frames belonging to the same mac address
mac_dict = separete_macs(frames)

In [13]:
len(mac_dict)

16691

**3- Remove Locally Administered MAC addresses**

![image.png](https://upload.wikimedia.org/wikipedia/commons/9/94/MAC-48_Address.svg)

In [14]:
# This function remove the Locally Administered MAC addresses 
# A locally administered address is assigned to a device by 
# a network administrator, overriding the burned-in address. 
# Here the second-least-significant bit of the first octet of 
# the address is a 1.
# There are 4 ranges of Locally Administered Address Ranges 
# that can be used on a local network:
# x2-xx-xx-xx-xx-xx
# x6-xx-xx-xx-xx-xx
# xA-xx-xx-xx-xx-xx
# xE-xx-xx-xx-xx-xx
def verify_randomized(mac_dict):
    count = 0
    for mac in mac_dict.copy():
        if  mac != '' and bin(int(mac[0:2], 16))[2:].zfill(8)[-2] == '1':
            count += 1
            del mac_dict[mac]
    print("Locally Administered Removed MAC Address: {}%".format((count * 100) / len(mac_dict)))
    return mac_dict

In [15]:
# remove the Locally Administered MAC addresses
mac_dict = verify_randomized(mac_dict)

Locally Administered Removed MAC Address: 0.095952023988006%


In [16]:
len(mac_dict)

16675

In [17]:
# delete frames. No more used
del frames

**4- Deduce bursts lists**

In [18]:
# This function transforme a list of frames belonging to the same MAC address into a list of burst.
# Each burst is also a list of frames belonging to the same burst (all frames alre within the same BURST_TIME_FRAME)
def group_bursts(time_frames: list) -> list:
    """
    Create burst lists based in burst time size
    """
    # list of bursts
    bursts = []
    # list of frames of one burst
    burst = []
    
    i = 0
    # while not processign all frames belonging to the same MAC address
    while i < len(time_frames):
        # add the first frame to the burst
        if len(burst) == 0:
            burst.append(time_frames[i])
            i += 1
        else:
            # if the time separating two consecutive frames is less than a BURST_TIME_FRAME
            if float(time_frames[i][0]) - float(burst[0][0]) <= BURST_TIME_FRAME:
                # add the current frame to the current burst
                burst.append(time_frames[i])
                i += 1
            elif len(burst) > 0:
                # add the burst list to the list of bursts.
                # do not add empty burst
                # remove burst less than two frames ??? do not know if useful yest!!!
                bursts.append(burst)
                # initialize the burst list to empty
                burst = []
            else:
                # initialize the burst list to empty
                burst = []
        # if we processed all the burst. Add the last burst to the list of bursts
        if i >= len(time_frames):
            # check if we have at least 2 frames in that burst
            if len(burst) > 1:
                bursts.append(burst)
            burst = []
    return bursts

# this function process all MAC addresses
def select_bursts(mac_dict: dict) -> dict:
    """
    For each MAC return a list with burst
    """
    bursts = defaultdict(list)
    for mac in mac_dict:
        if len(mac_dict[mac]) >= 1:
            bursts[mac] = group_bursts(mac_dict[mac])
    return bursts


In [21]:
# return a dicionary of bursts 
burst_set = select_bursts(mac_dict)
len(burst_set)

16675

**5- Randomize the MAC Addresses**

In [22]:
# This function generate a new mac address
def generate_mac() -> str:
    # First byte:
    new_mac = '%02X' % int(bin(random.randint(0, 63)) + '11', 2) + ':'

    # Next 4 bytes:
    for i in range(4):
        new_mac += '%02X' % random.randint(0, 255) + ':'

    # Last byte:
    new_mac += '%02X' % random.randint(0, 255)
    return new_mac


# This function randomize the mac address p bursts per p bursts.
def randomize_mac(burst_set: dict, number_of_devices: int, minimum_burst_len: int = 100) -> tuple:
    bursts = defaultdict(list)
    ground_truth = {}
    dev_number = 0

    # for each mac address
    for mac in burst_set:
        # check if the number of bursts is greater then a given number of bursts.
        if len(burst_set[mac]) > minimum_burst_len:
            # if we reached the number of devices
            if dev_number >= number_of_devices:
                print(dev_number)
                # return the bursts and the grount truth
                return bursts, ground_truth
            # increase the number of devices processed by one
            dev_number += 1
            # change the mac addresses every p bursts ...
            for i in range(0, len(burst_set[mac]), p):
                # generate a new mac address
                new_mac = generate_mac()
                # associate the new mac address with the real mac address
                ground_truth[new_mac] = mac
                # extract the set of burst and affect them to the generated mac address
                for burst in burst_set[mac][i:i + p]:
                    bursts[new_mac].append(burst)
    return bursts, ground_truth

In [24]:
#
minimum_burst_len = 0
random_dataset, ground_truth = randomize_mac(burst_set, len(mac_dict), minimum_burst_len)

**6- Do timing sort**

In [25]:
# This function sort the frames according to time whitin each burst
def sort_timing(bursts):
    frames_sorted = []
    for mac in bursts:
        for burst in bursts[mac]:
            for frame in burst:
                frames_sorted.append([float(frame[0])] + [mac] + frame[1:])
    return sorted(frames_sorted)

In [28]:
# sort the frames
sorted_frames = sort_timing(random_dataset)

**7- Save the new dataset and the ground truth**

In [35]:
# This function save the new dataset
def save_dataset(frames: list, dataset_path: str) -> None:
    with open(dataset_path, 'w') as file:
        for frame in frames:
            str_frame = ''
            for field in frame:
                str_frame += str(field) + ';'
            file.write(str_frame + '\n')

# This function save the ground truth dictionary
def save_ground_truth(ground_truth, dataset_path: str):
    with open(dataset_path, 'w') as file:
        for key, val in ground_truth.items():
            file.write(str(key) + ';' + str(val) + '\n')


In [36]:
save_dataset(sorted_frames, dataset_randomized_path)
save_ground_truth(ground_truth, dataset_randomized_label_path)