# Assessing the limits of privacy and data usage for web browsing analytics
## Contact
- Daniel Perdices <daniel.perdices at uam.es>
### Imports

In [1]:
import pandas as pd
import glob
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.preprocessing import LabelEncoder 
import numpy as np
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import *
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
import tensorflow.keras
from sklearn.model_selection import train_test_split
import json
import matplotlib.pyplot as plt
import struct
import socket
import tensorflow as tf
from tensorflow.keras.preprocessing.text import hashing_trick

### Utils

In [2]:
def ip2int(addr):
    """Convert a dotted IPv4 address string into an unsigned 32-bit integer.

    The address is packed with ``socket.inet_aton`` and the resulting four
    bytes are decoded as one network-order (big-endian) unsigned integer.
    """
    packed = socket.inet_aton(addr)
    (value,) = struct.unpack("!I", packed)
    return value

def str2IP(addr):
    """Turn a ``serverIP`` field into its integer form via ``ip2int``.

    If the field contains a comma, every occurrence of ``",172.17.0.3"`` is
    stripped out first (presumably a second address appended by the capture
    environment -- confirm). A value that does not consist of exactly four
    dot-separated parts is printed before the conversion is attempted.
    """
    if addr.count(",") > 0:
        addr = addr.replace(",172.17.0.3", "")
    if addr.count(".") != 3:
        # Debug aid: show the unexpected value; ip2int still runs on it below.
        print(addr)
    return ip2int(addr)

def createDocFromFile(filename, sampling_rate=None):
    """Load one ``.summary`` CSV and describe it as a "document" of server IPs.

    Args:
        filename: Relative path using "/" separators. Its first component has
            the ``pcaps_dns_`` prefix removed and "_" replaced by "." to form
            ``dns_server``; the fourth-from-last component is stored as
            ``UA`` and the parent directory name as the single tag.
        sampling_rate: Optional fraction of the file's total packet count.
            When given, ``int(total_packets * sampling_rate)`` draws are made
            with replacement over the rows, weighted by each row's
            ``totalPackets``, and every row drawn at least once is kept (in
            file order). ``None`` keeps all rows.

    Returns:
        dict with keys ``words`` (the ``serverIP`` column values), ``number``
        (the same addresses converted with ``str2IP`` and cast with
        ``np.uint32``), ``tags`` (``[domain]``), ``filename``, ``UA`` and
        ``dns_server``.
    """
    parts = filename.split("/")
    domain = parts[-2]
    UA = parts[-4]
    dns_server = parts[0].replace("pcaps_dns_", "").replace("_", ".")

    df = pd.read_csv(filename)

    if sampling_rate is not None:
        all_packets = df.totalPackets.sum()
        probabilities = df.totalPackets / all_packets
        # np.random.choice needs an integer number of draws; the product of
        # the packet count and the rate is a float.
        n_draws = int(all_packets * sampling_rate)
        # Draw row *positions* so the positional .iloc lookup below is correct
        # whatever index the frame carries.
        drawn = np.random.choice(len(df), size=n_draws, p=probabilities)
        # Deduplicate and sort: pandas does not accept a set as an indexer,
        # and sorting keeps the surviving rows in their original order.
        selected_positions = sorted(set(drawn))
        filtered = df.iloc[selected_positions, :]
    else:
        filtered = df

    return {
        "words": filtered.serverIP.values,
        "number": np.uint32(filtered.serverIP.apply(str2IP).values),
        "tags": [domain],
        "filename": filename,
        "UA": UA,
        "dns_server": dns_server,
    }

In [3]:
# Defaults
vocab_length = 60000
max_length = 250


def load_dataset(dnss=("150.244.9.100",), vocab_length=vocab_length, max_length=max_length, sampling_rate=None):
    """Build the hashed, padded IP sequences and labels for the requested resolvers.

    The capture data must already be present under the working directory
    (``pcaps_dns_*/U*/PartialFootprints/*/*.summary``) before calling this.

    Args:
        dnss: DNS resolver addresses whose captures should be loaded. A single
            string is accepted as a one-element selection.
        vocab_length: Size of the hashing space passed to ``hashing_trick``.
        max_length: Sequence length passed to ``pad_sequences`` (post-padding).
        sampling_rate: Forwarded to ``createDocFromFile``.

    Returns:
        Tuple ``(padded_sentences, s, classes)``: the padded integer
        sequences, the ``dns_server`` of every document, and the first tag
        (domain) of every document.

    Raises:
        ValueError: If no ``.summary`` file matches the requested resolvers.
    """
    if isinstance(dnss, str):
        dnss = [dnss]
    wanted = set(dnss)

    # Keep only the files whose top-level directory maps to a requested
    # resolver. The directory-to-address mapping is the same one
    # createDocFromFile uses for its "dns_server" field. Filtering on the
    # path avoids parsing CSVs that would be discarded anyway.
    filenames = [
        filename
        for filename in glob.glob("pcaps_dns_*/U*/PartialFootprints/*/*.summary")
        if filename.split("/")[0].replace("pcaps_dns_", "").replace("_", ".") in wanted
    ]
    if not filenames:
        raise ValueError(
            f"No .summary files found for DNS servers {sorted(wanted)}; "
            "download the data before calling load_dataset"
        )

    # Create corpus (TODO: parallel)
    corpus = [createDocFromFile(filename, sampling_rate=sampling_rate) for filename in filenames]

    # Build vector Y
    classes = [doc["tags"][0] for doc in corpus]
    s = [doc["dns_server"] for doc in corpus]

    # Build hash encoding
    # NOTE(review): hashing_trick's default `filters` appear to include ".",
    # so each address is presumably tokenised into its octets before hashing
    # -- confirm this is intended.
    embedded_sentences = [
        hashing_trick(" ".join(doc["words"]), vocab_length, hash_function="md5")
        for doc in corpus
    ]

    # Build padded sentences
    padded_sentences = pad_sequences(embedded_sentences, max_length, padding='post')

    return padded_sentences, s, classes


### Load the data and perform train-test-validation split

In [4]:
# Request the captures of the five resolvers, without packet sampling.
X, _, classes = load_dataset(
    dnss=["150.244.9.100", "1.1.1.1", "8.8.8.8", "9.9.9.9", "208.67.222.222"],
    sampling_rate=None,
)

# Map domain names to integer ids, then to one-hot rows.
class_encoder = LabelEncoder()
class_encoder.fit(sorted(np.unique(classes)))
nclasses = len(class_encoder.classes_)
Y = to_categorical(class_encoder.transform(classes))

# Stratified 80/20 train/test split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, stratify=Y, test_size=0.2, random_state=0
)
# A further stratified 80/20 split of the training part gives a validation
# set. It is meant for hyperparameter search; the hyperparameters are already
# fixed here, so it is only handed to fit() as validation data.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, stratify=Y_train, test_size=0.2, random_state=1
)

### Build the model

In [5]:
# Embedding of the hashed tokens, averaged over the sequence, followed by a
# widening stack of ReLU layers and a softmax over the domain classes.
model = Sequential()
model.add(Embedding(vocab_length, 20, input_length=max_length))
model.add(GlobalAveragePooling1D())
for units in (75, 150, 250, 350, 400):
    model.add(Dense(units, activation="relu"))
model.add(Dense(nclasses, activation="softmax"))
# Accuracy() checks element-wise equality between labels and predictions,
# which is meaningless for one-hot targets against softmax probabilities.
# CategoricalAccuracy compares the argmax of both instead. The metric keeps
# the name "accuracy" so history/log keys are unchanged.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[CategoricalAccuracy(name="accuracy")],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 20)           1200000   
_________________________________________________________________
global_average_pooling1d (Gl (None, 20)                0         
_________________________________________________________________
dense (Dense)                (None, 75)                1575      
_________________________________________________________________
dense_1 (Dense)              (None, 150)               11400     
_________________________________________________________________
dense_2 (Dense)              (None, 250)               37750     
_________________________________________________________________
dense_3 (Dense)              (None, 350)               87850     
_________________________________________________________________
dense_4 (Dense)              (None, 400)               1

In [6]:
# Show the shapes of the train/validation/test feature arrays, then of the
# matching one-hot label arrays.
(X_train.shape, X_val.shape, X_test.shape), (Y_train.shape, Y_val.shape, Y_test.shape)

(((78306, 250), (19577, 250), (24471, 250)),
 ((78306, 500), (19577, 500), (24471, 500)))

### Training

In [8]:
# 30 rounds of 100 silent epochs each; after every round the model is
# evaluated on the training set and then on the test set.
for i in range(30):
    history = model.fit(
        X_train,
        Y_train,
        epochs=100,
        batch_size=1000,
        verbose=0,
        validation_data=(X_val, Y_val),
    )
    for features, labels in ((X_train, Y_train), (X_test, Y_test)):
        model.evaluate(features, labels)



In [None]:
# Write the model to "model.h5" (path relative to the working directory).
model.save("model.h5")