#### For use in observing the effect of random data modification on program output.

Use: Run this notebook, specifying which of the 28 data sets to randomize. Original data can always be pulled from the Git repository, but a backup of each data file is also saved to an `original/` subdirectory the first time that file is modified.

In [None]:
import csv, os, random
from shutil import copyfile # to save backups of csv files


def set_random_delay(dataset, percentage, delay):
    """ Set the RTT of approximately <percentage> of entries in <dataset> to <delay>.

    dataset:    index of the dataset to modify, 0..27 (file "dataset_NN.csv" in the working directory).
    percentage: fraction of rows to modify, between 0 and 1 inclusive. Every row is selected
                independently, so the modified share is only approximately <percentage>.
    delay:      value written (as a string) into the RTT column (index 2) of the selected rows.

    The first time a dataset is modified, a copy is saved to "original/dataset_NN.csv".
    Invalid inputs print an error message and leave everything untouched.
    Raises AssertionError if the dataset file does not exist.
    """
    try:
        # compare the fraction directly: `percentage * 100 in range(0, 101)` rejected valid
        # inputs such as 0.07 (0.07 * 100 == 7.000000000000001) or 0.125 (12.5 is not an integer).
        valid = dataset in range(28) and 0 <= percentage <= 1
    except TypeError: # non-numeric percentage
        valid = False
    if not valid:
        print(f"error: invalid input to set_random_delay. inputs: {dataset}, {percentage}")
        return

    filepath = f"dataset_{int(dataset):02d}.csv" # two digits, leading zero
    backup = f"original/{filepath}"

    # kept as an assertion so callers that expect AssertionError keep working
    assert os.path.isfile(filepath), f"{filepath} does not exist"

    print(f"found file: {filepath}")

    if not os.path.isfile(backup):
        print(f"saving backup in {backup}")
        os.makedirs(os.path.dirname(backup), exist_ok=True) # copyfile does not create directories
        copyfile(filepath, backup)

    with open(filepath, "r", newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        header = next(reader, None) # None if the file has no header line at all
        rows = list(reader)

    if header is None:
        print(f"error: {filepath} is empty. nothing to modify.")
        return

    for row in rows:
        # rows without an RTT column (e.g. blank lines) are written back unchanged
        if len(row) > 2 and random.uniform(0, 1) <= percentage:
            row[2] = str(delay) # index [2] is the RTT value

    # newline='' lets the csv module control line endings (avoids blank rows on Windows)
    with open(filepath, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter=',')
        writer.writerow(header)
        writer.writerows(rows)
    return


def scale_rtt(dataset, scalar, percentage=0.5):
    """ Multiply RTT of approximately <percentage> entries in <dataset> by <scalar>.

    dataset:    index of the dataset to modify, 0..27 (file "dataset_NN.csv" in the working directory).
    scalar:     factor the RTT column (index 2) of each selected row is multiplied by.
    percentage: fraction of rows to modify, between 0 and 1 inclusive. Every row is selected
                independently, so the modified share is only approximately <percentage>.

    The first time a dataset is modified, a copy is saved to "original/dataset_NN.csv".
    Invalid inputs print an error message and leave everything untouched.
    Raises AssertionError if the dataset file does not exist, and ValueError if a selected
    RTT value cannot be parsed as a float (the data file is not rewritten in that case).
    """
    try:
        # compare the fraction directly: `percentage * 100 in range(0, 101)` rejected valid
        # inputs such as 0.29 (0.29 * 100 == 28.999999999999996) or 0.125 (12.5 is not an integer).
        valid = dataset in range(28) and 0 <= percentage <= 1
    except TypeError: # non-numeric percentage
        valid = False
    if not valid:
        print(f"error: invalid input to scale_rtt. inputs: {dataset}, {scalar}, {percentage}")
        return

    filepath = f"dataset_{int(dataset):02d}.csv" # two digits, leading zero
    backup = f"original/{filepath}"

    # kept as an assertion so callers that expect AssertionError keep working
    assert os.path.isfile(filepath), f"{filepath} does not exist"
    print(f"found file: {filepath}")

    if os.path.isfile(backup):
        print(f"error: {backup} already exists. no backup generated.")
    else:
        print(f"saving backup in {backup}")
        os.makedirs(os.path.dirname(backup), exist_ok=True) # copyfile does not create directories
        copyfile(filepath, backup)

    with open(filepath, "r", newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        header = next(reader, None) # None if the file has no header line at all
        rows = list(reader)

    if header is None:
        print(f"error: {filepath} is empty. nothing to modify.")
        return

    for row in rows:
        # rows without an RTT column (e.g. blank lines) are written back unchanged
        if len(row) > 2 and random.uniform(0, 1) <= percentage:
            row[2] = str(float(row[2]) * scalar) # index [2] is the RTT value

    # newline='' lets the csv module control line endings (avoids blank rows on Windows)
    with open(filepath, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter=',')
        writer.writerow(header)
        writer.writerows(rows)

    return


def restore(index):
    """ Copy the backup of dataset <index> over its working file, if such a backup exists. """
    padding = "0" if index < 10 else "" # leading zero for single digits
    filepath = "dataset_" + padding + str(index) + ".csv"
    backup = "original/" + filepath

    if not os.path.isfile(backup):
        # no backup to restore from: skipped without a message
        return

    copyfile(backup, filepath)
    print(f"restored dataset_{index:02d}")

def restore_all():
    """ Call restore() once for each of the 28 dataset indices (0..27). """
    dataset_count = 28
    for dataset_index in range(dataset_count):
        restore(dataset_index)

In [None]:
# start from the backed-up originals (where a backup exists) before modifying anything
restore_all()

for i in range(7):
    set_random_delay(i, 0.10, 0) # set ~10% of RTT's to 0 in each of datasets 0..6
#     scale_rtt(i, 4, percentage=1.0) #  alternative: scale all delays by 4

In [None]:
# undo the modifications: copy every existing backup over its data file
restore_all()