In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame
import seaborn as sns
from IPython.display import display
import time
from scipy import stats
import hashlib

# Number of phone labels stored in the "Phone" column; the loops below iterate labels 1..NUM_PHONES.
NUM_PHONES= 4

In [2]:
def get_data(filename):
    """Load a tab-separated capture file and derive per-packet transition features.

    The file's first line is consumed as a header by ``read_csv`` and the eight
    columns are then renamed to ATime, Channel, Signal, Source, Dest, Seq, SSID
    and BSSID. SSID and BSSID are dropped, and the frequencies 2412/2437/2462 in
    ``Channel`` are mapped to channel numbers 1/6/11 (other values are left
    untouched).

    Three columns are derived from each row and the row immediately before it:

    * ``RTime`` - difference between this row's ATime and the previous row's.
    * ``Tag``   - ``"<previous channel>-><current channel>"``.
    * ``CType`` - ``"Same"`` if both channels are equal, else ``"Different"``.

    The first row has no predecessor and is removed, so the returned index
    starts at 1.

    Parameters
    ----------
    filename : str or path-like
        Path of the tab-separated file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ATime, Channel, Signal, Source, Dest, Seq, RTime, Tag, CType,
        with ATime, RTime and Signal cast to float32.
    """
    start = time.time()

    df1 = pd.read_csv(filename, sep='\t')
    df1 = df1.set_axis(["ATime", "Channel", "Signal", "Source", "Dest", "Seq", "SSID", "BSSID"], axis=1)
    df1 = df1.drop(["SSID", "BSSID"], axis=1)

    freq_map = {2412: 1, 2437: 6, 2462: 11}
    df1 = df1.replace({"Channel": freq_map})

    # Stringify BEFORE shifting: shifting the numeric column would introduce NaN,
    # promote it to float and turn "1" into "1.0" inside the tags.
    channel = df1["Channel"].astype(str)
    prev_channel = channel.shift(1)

    # Vectorised replacements for the former row-by-row loop; the first row has no
    # predecessor, ends up with missing values and is discarded right below.
    df1["RTime"] = df1["ATime"].diff()
    df1["Tag"] = prev_channel + "->" + channel
    df1["CType"] = np.where(prev_channel == channel, "Same", "Different")

    df1 = df1.iloc[1:, :]

    # NOTE(review): float32 keeps only ~7 significant digits; if ATime holds large
    # absolute timestamps this cast loses sub-second resolution - confirm.
    # RTime is computed above, before the cast, so it is not affected.
    df1 = df1.astype({"ATime": np.float32, "RTime": np.float32, "Signal": np.float32})

    end = time.time()
    print("Total Time: ", end - start)

    return df1
    

In [None]:
# Load the three capture files, one DataFrame per sample, in the order 21, 22, 23.
df1, df2, df3 = (
    get_data(path)
    for path in (
        "../data/text/sample21.txt",
        "../data/text/sample22.txt",
        "../data/text/sample23.txt",
    )
)

In [None]:
# Time is measured from the first row of the unfiltered capture.
df3["Time"] = df3["ATime"] - df3["ATime"].iloc[0]

# Keep only the two source addresses of interest, ordered by elapsed time.
df3 = df3[df3.Source.isin(["e8:50:8b:43:b1:20", "a8:86:60:da:7d:7b"])].sort_values(by="Time")

# Default every remaining row to phone 4, then relabel the e8:... rows as phone 3.
df3["Phone"] = 4
switch = list(df3.loc[df3.Source == "e8:50:8b:43:b1:20"].index)
df3.loc[switch, "Phone"] = 3

First, we need to make sure we keep only the packets that we are interested in. We'll do this by looking for the MAC address that occurs most frequently among packets with a strong signal, i.e. those transmitted near the network cards.

In [None]:
# For each capture, count packets per source address among strong-signal rows
# (Signal > -45) and list the sources from most to least frequent.
separator = "#" * 20
for position, (label, frame) in enumerate([("Phone 1", df1), ("Phone 2", df2)]):
    if position:
        print(separator)
    print(label)
    print(separator)
    strong = frame[frame.Signal > -45]
    display(strong.groupby("Source").size().sort_values(ascending=False))

Let's see what the summary statistics for each phone look like:

In [None]:
# Keep only each phone's own packets (at most the first 100000 rows).
# The explicit .copy() makes each result an independent frame, so the column
# assignments below do not write into a slice of another frame (this is what
# triggers pandas' SettingWithCopyWarning).
df1 = df1[df1.Source == "e8:50:8b:43:b1:20"].iloc[0:100000].copy()
df2 = df2[df2.Source == "a8:86:60:da:7d:7b"].iloc[0:100000].copy()

# Label the rows with their phone number.
df1["Phone"] = 1
df2["Phone"] = 2

# Elapsed time relative to the first retained packet of each phone.
df1["Time"] = df1["ATime"] - df1["ATime"].iloc[0]
df2["Time"] = df2["ATime"] - df2["ATime"].iloc[0]

# Stack all phones (1 and 2 from here, 3 and 4 from df3) into one frame.
df = pd.concat([df1, df2, df3])
display(df)

Let's try taking 10000 samples and seeing if there is a preference in channel by phone:

In [None]:

# Summary statistics of the inter-arrival time per (Tag, Phone). This is wrapped
# in display() because only the last expression of a cell is rendered
# automatically; as a bare expression in the middle of the cell it was discarded.
display(df[["Phone", "Tag", "RTime"]].groupby(["Tag", "Phone"]).describe())

# Draw 10000 rows per phone. A fixed random_state makes the counts below
# reproducible across kernel restarts. Like the previous per-group
# x.sample(10000), this raises ValueError if a phone has fewer than 10000 rows.
ft = df.sort_values("Time").groupby("Phone").sample(n=10000, random_state=0)
ft = ft.reset_index(drop=True)

# Number of sampled packets per channel transition, for each phone.
display(ft.groupby(["Phone", "Tag"]).size())

In [None]:
# Density histograms of inter-arrival times below 0.3, one panel per
# (Phone row, CType column), all sharing both axes.
short = df.loc[df["RTime"] < 0.3, ["RTime", "CType", "Phone"]]

g = sns.FacetGrid(short, row="Phone", col="CType", sharex=True, sharey=True)
g.map(sns.histplot, "RTime", stat="density", binwidth = 0.02)
g.figure.set_size_inches(14, 8)

# Put a tick at every bin edge (0.02 spacing) on every panel.
tick_positions = np.arange(0, 0.3, 0.02)
for ax in g.axes.flat:
    ax.set_xticks(tick_positions)
plt.show()

In [None]:
# Violin plot of inter-arrival times below 0.5, split by CType and coloured by phone.
ds = df.loc[df["RTime"] < 0.5].sort_values(by="Time")
sns.violinplot(data=ds, x="CType", y="RTime", hue="Phone")
plt.show()

In [None]:
# Histogram of inter-arrival times below 0.3, one colour per phone.
# NOTE(review): both `bins` and `binwidth` are passed; seaborn documents
# `binwidth` as overriding `bins` - confirm which one was intended.
fast_packets = df[df.RTime < 0.3]
hist_grid = sns.displot(
    fast_packets,
    x="RTime",
    hue="Phone",
    common_bins=True,
    bins=10,
    palette="deep",
    binwidth=0.05,
)
hist_grid.set(title="Interarrival times under 0.3 seconds")
plt.show()

Let's take a look at the CDFs:

In [None]:
# Empirical CDFs of inter-arrival times below 0.4: one panel per CType, one curve per phone.
below_cutoff = df[df.RTime < 0.4]
sns.displot(
    below_cutoff,
    x="RTime",
    col="CType",
    hue="Phone",
    kind="ecdf",
    palette="deep",
)
plt.show()

There appears to be more variance when switching channels. Given that phones 1/3 and 2/4 are the same, we expect their CDFs to be roughly the same. This appears to be more true when we do not switch channels. Let's try the KS test on each distribution (same-channel probe or different-channel probe):

In [None]:
# Two-sample KS test of the inter-arrival times for every pair of phones,
# run separately for "Same" and "Different" channel transitions.
# Each row records which transition type it belongs to; without that column
# every phone pair appears twice and the two rows cannot be told apart.
ks = []
for dist in ["Same", "Different"]:
    subset = df[df.CType == dist]
    for phone1 in range(1, NUM_PHONES):
        ls = subset.loc[subset.Phone == phone1, "RTime"]
        for phone2 in range(phone1 + 1, NUM_PHONES + 1):
            rs = subset.loc[subset.Phone == phone2, "RTime"]
            x = stats.kstest(ls, rs)
            # x[0] is the KS statistic, x[1] the p-value.
            ks.append([dist, phone1, phone2, x[0], x[1]])

ks = DataFrame(ks, columns=["CType", "Phone1", "Phone2", "Stat", "pValue"])
display(ks)

Since the p-values are approximately 0, it appears that we should be able to classify a pair of packets based on the interarrival time. Since the interarrival time for two packets on the same channel approaches 0, we will end up using the interarrival time between two packets that are on different channels.

In [None]:
# Per-phone summary statistics of RTime, restricted to channel-changing transitions.
df.loc[df.CType == "Different", ["Phone", "RTime"]].groupby("Phone").describe()

In [None]:
# Print the variance-to-mean ratio of RTime for each phone label (over all of that phone's rows).
for i in range(1, NUM_PHONES + 1):
    phone_rtime = df.loc[df.Phone == i, "RTime"]
    mean, std = phone_rtime.mean(), phone_rtime.std()
    print(f"Variance to Mean ratio for phone {i} is {std**2/mean}")

In [None]:
def to_local(string):
    """Return True when a MAC address has its locally-administered bit set.

    The flag is bit 0x02 of the first octet, so the first two characters of
    ``string`` are parsed as a hexadecimal number and that bit is tested.
    Testing the bit on the ASCII code of the second character instead is only
    correct for some hex digits: it is wrong for a, c and e (and A, C, E).

    Parameters
    ----------
    string : str
        A MAC address or just its first octet as two hex digits (e.g. "e8").

    Returns
    -------
    bool
        True if the bit is set. Also True, as a best-effort fallback, when the
        input cannot be interpreted: not a string (e.g. NaN), shorter than two
        characters, or not valid hexadecimal.
    """
    try:
        first_octet = string[:2]
        if len(first_octet) < 2:
            return True
        value = int(first_octet, 16)
    except (TypeError, ValueError):
        # TypeError: non-string input such as NaN/None; ValueError: not hex.
        return True

    return (value & 0x02) != 0
    
# Flag each row by applying to_local to the first two characters of its source address.
df["Random"] = df["Source"].str.slice(0, 2).map(to_local)

In [None]:
def hash_string(string):
    """Return the hexadecimal MD5 digest of an ASCII string.

    Parameters
    ----------
    string : str
        Text to hash; it is encoded as ASCII, so non-ASCII characters raise
        ``UnicodeEncodeError``.

    Returns
    -------
    str
        The 32-character lowercase hex digest.
    """
    return hashlib.md5(string.encode("ascii")).hexdigest()
    
# df["Source"] = df["Source"].map(hash_string)