In [1]:
import pandas as pd
from glob import glob
from collections import defaultdict
import itertools, sys, os, math
import utils

# Put the local "webpage_fingerprinting_methods/" directory first on the module
# search path; the import on the next line must run after this tweak.
sys.path.insert(0, "webpage_fingerprinting_methods/")
import webpage_fingerprinting_methods
# Iterable of fingerprinting methods; each one is asked for get_name() when the
# scenarios are generated further down.
methods = webpage_fingerprinting_methods.methods

In [2]:
# Visit log. Columns used later in this notebook: visit_id, host, client_id,
# current_url (presumably one row per recorded visit -- TODO confirm).
df_visits = pd.read_csv("../data/visit_log.csv")
# Paths of all processed visit files, stored one sub-directory deep as
# ../data/visits_processed/<subdir>/<name>.json.
visit_files = glob("../data/visits_processed/*/*.json")

In [3]:
# Index the processed visit files by visit id, i.e. the file's base name with
# its extension removed. os.path is used instead of splitting on "/" and
# chopping five characters, so the lookup does not depend on the path
# separator that glob returns.
visit_id_to_visit_file = {
    os.path.splitext(os.path.basename(visit_file))[0]: visit_file
    for visit_file in visit_files
}

# Fail early, naming the offending ids, when the log references a visit that
# has no processed file (previously this surfaced as a bare KeyError from
# inside apply()).
missing_visit_ids = [
    visit_id
    for visit_id in df_visits['visit_id'].unique()
    if visit_id not in visit_id_to_visit_file
]
if missing_visit_ids:
    raise KeyError(
        "no processed visit file for {} visit id(s), e.g. {!r}".format(
            len(missing_visit_ids), missing_visit_ids[:5]
        )
    )

# Every id is known to be present at this point, so map() cannot produce NaN.
df_visits['visit_file'] = df_visits['visit_id'].map(visit_id_to_visit_file)

# Create Scenarios

In [4]:
# Root directory under which every generated scenario directory is created.
scenarios_dir = "../data/scenarios"

In [5]:
# Distinct hosts present in the visit log.
hosts = df_visits['host'].unique()

# Client ids as plain Python ints, in ascending order.
client_ids = sorted(int(client_id) for client_id in df_visits['client_id'].unique())

# Each distinct URL gets a dense integer label (0, 1, 2, ...) following the
# order in which unique() returns the URLs.
urls = list(df_visits['current_url'].unique())
url_to_id = {url: label for label, url in enumerate(urls)}

In [6]:
# Sanity-check the data set size before any scenario is generated. Each
# message reports the count actually found, so a failure is diagnosable
# without re-running anything.
assert len(hosts) == 7, "expected 7 hosts, found {}".format(len(hosts))
assert len(client_ids) == 19, "expected 19 client ids, found {}".format(len(client_ids))
assert len(url_to_id) == 350, "expected 350 distinct URLs, found {}".format(len(url_to_id))

In [7]:
# Nested index of visit files: client_id -> host -> url -> [visit_file, ...].
# Files are appended in visit-log row order; the train/test split in
# dump_scenario slices these lists by position, so that order matters.
# NOTE: because every level is a defaultdict, looking up an unknown client or
# host silently yields an empty mapping instead of raising KeyError.
client_host_url_visit_files = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

# Walk the four needed columns in parallel rather than using iterrows(), which
# builds a Series for every row and upcasts the values to a common dtype.
visit_columns = zip(
    df_visits['client_id'],
    df_visits['host'],
    df_visits['current_url'],
    df_visits['visit_file'],
)
for client_id, host, current_url, visit_file in visit_columns:
    client_host_url_visit_files[client_id][host][current_url].append(visit_file)

In [14]:
def dump_scenario(scenario, use_21_training_samples=False):
    """Write a scenario description and its train/test split to disk.

    Two files are written into ``scenario['output_dir']``:

    * ``scenario.json`` -- the ``scenario`` dict itself.
    * ``split.pickle`` -- the list
      ``[training_visit_files, test_visit_files, visit_file_label]``, where
      ``visit_file_label`` maps every selected visit file to the integer id of
      its URL (taken from the module-level ``url_to_id``).

    Visits are read from the module-level ``client_host_url_visit_files`` for
    ``scenario['host']``. Per client and URL, list positions 0-20 form the
    training pool and positions 21-27 are the test visits.

    Args:
        scenario: dict with at least the keys ``output_dir``, ``host``,
            ``training_client_ids`` and ``test_client_id``.
        use_21_training_samples: when False (default) each training client
            contributes its whole 21-visit pool for every URL. When True each
            training client contributes only the last
            ``ceil(21 / number_of_training_clients)`` visits of its pool, so
            the per-URL total is about 21 (it exceeds 21 when 21 is not
            divisible by the number of training clients).

    Raises:
        ZeroDivisionError: if ``use_21_training_samples`` is true and
            ``scenario['training_client_ids']`` is empty.
    """
    training_pool_size = 21      # visits per (client, URL) reserved for training
    test_slice = slice(21, 28)   # the visits right after the training pool

    output_dir = scenario['output_dir']
    # exist_ok=True lets the generating cell be re-run over an existing tree;
    # the two files below are then written again (presumably overwriting the
    # old ones -- depends on the utils helpers).
    os.makedirs(output_dir, exist_ok=True)
    utils.dump_json(scenario, os.path.join(output_dir, "scenario.json"))

    host = scenario['host']
    training_client_ids = scenario['training_client_ids']

    if use_21_training_samples:
        samples_per_client = math.ceil(training_pool_size / len(training_client_ids))
    else:
        samples_per_client = training_pool_size
    # Always take the tail of the training pool, i.e. the visits closest to the
    # test visits in list order.
    training_slice = slice(training_pool_size - samples_per_client, training_pool_size)

    training_visit_files = []
    test_visit_files = []
    visit_file_label = {}

    def _collect(client_id, visit_slice, destination):
        # Append the selected visits of every URL of this client and label them.
        for url, visit_files in client_host_url_visit_files[client_id][host].items():
            selected = visit_files[visit_slice]
            destination.extend(selected)
            for visit_file in selected:
                visit_file_label[visit_file] = url_to_id[url]

    for client_id in training_client_ids:
        _collect(client_id, training_slice, training_visit_files)
    _collect(scenario['test_client_id'], test_slice, test_visit_files)

    utils.dump_pickle([training_visit_files, test_visit_files, visit_file_label],
                      os.path.join(output_dir, "split.pickle"))
    
    
def subdir_id_generator(num_sub_dirs):
    """Yield sub-directory ids as strings, cycling forever.

    The sequence starts at "1", counts up to ``num_sub_dirs - 1``, wraps to
    "0" and then repeats, e.g. for 3: "1", "2", "0", "1", ...
    """
    for step in itertools.count(1):
        yield str(step % num_sub_dirs)
        
# Scenario directories are spread round-robin over 100 numbered
# sub-directories of scenarios_dir.
subdir_id = subdir_id_generator(100)


def _build_scenario(name, method, host, training_client_ids, test_client_id, dir_name_ids):
    """Assemble one scenario dict and give it the next round-robin sub-directory.

    The directory name is ``<name>_<method name>_<host>_<ids...>`` where
    ``dir_name_ids`` supplies the trailing client id(s).
    """
    method_name = method.get_name()
    dir_name = "_".join(
        str(part) for part in (name, method_name, host) + tuple(dir_name_ids)
    )
    return {
        "name": name,
        "host": host,
        "method": method_name,
        "training_client_ids": training_client_ids,
        "test_client_id": test_client_id,
        "output_dir": os.path.join(scenarios_dir, next(subdir_id), dir_name),
    }


# Client groups for the same_browser_21_training_samples scenarios (presumably
# clients that share a browser -- verify against the client metadata).
same_browser_client_ids_list = [
    [1,2,3,4,5,6,7,8],
    [10,11,12,13],
    [16,17,18]
]

# Fixed training clients for the 6_browser scenarios.
six_browser_training_client_ids = [6,9,10,15,16,19]

for method in methods:
    for host in hosts:
        # train_one_test_one: every ordered (training client, test client)
        # pair, including a client paired with itself (19 x 19 scenarios).
        for client_1, client_2 in itertools.product(client_ids, client_ids):
            scenario = _build_scenario(
                "train_one_test_one", method, host,
                training_client_ids=[client_1],
                test_client_id=client_2,
                dir_name_ids=(client_1, client_2),
            )
            dump_scenario(scenario)

        # test_one_train_remaining: hold one client out and train on all the
        # others (19 scenarios). sorted() keeps the training list independent
        # of set iteration order.
        for test_client_id in client_ids:
            scenario = _build_scenario(
                "test_one_train_remaining", method, host,
                training_client_ids=sorted(set(client_ids) - {test_client_id}),
                test_client_id=test_client_id,
                dir_name_ids=(test_client_id,),
            )
            dump_scenario(scenario)

        # same_browser_21_training_samples: within each group, test on one
        # client and train on the rest of the group with a reduced number of
        # training samples per client (15 scenarios).
        for same_browser_client_ids in same_browser_client_ids_list:
            for client_id in same_browser_client_ids:
                scenario = _build_scenario(
                    "same_browser_21_training_samples", method, host,
                    training_client_ids=sorted(set(same_browser_client_ids) - {client_id}),
                    test_client_id=client_id,
                    dir_name_ids=(client_id,),
                )
                dump_scenario(scenario, use_21_training_samples=True)

        # 6_browser: train on the six fixed clients and test on every other
        # client (13 scenarios). Iterating in sorted order makes the
        # sub-directory each scenario lands in reproducible.
        training_client_ids = six_browser_training_client_ids
        for test_client_id in sorted(set(client_ids) - set(training_client_ids)):
            scenario = _build_scenario(
                "6_browser", method, host,
                training_client_ids=training_client_ids,
                test_client_id=test_client_id,
                dir_name_ids=(test_client_id,),
            )
            dump_scenario(scenario)