In [None]:
"""
Author: Justin Perez (justin-perez@uiowa.edu)
Date: 2020-09-22
Modified: 2020-09-26 (converted to colab notebook)

don't run this on your local machine, open it in
https://colab.research.google.com

if you plan to commit changes
    * edit -> clear all outputs
    * File -> Download -> ipynb
    * copy the downloaded notebook into your local repository
    * git add/commit/push
"""

import sys
# True only when the google.colab package has already been loaded into this interpreter
COLAB = 'google.colab' in sys.modules
if not COLAB:
    # stop right away instead of continuing outside of Colab
    raise Exception("You shouldn't run this locally.")

#@title Configuration
# number of timed sort runs per (input size, thread count) combination
iterations = 1 #@param {type: "slider", min: 1}
# when True, the java commands are launched with the -ea flag
enable_java_assertions = True #@param {type: "boolean"}
# git branch of the ParallelSort repository that gets checked out and compiled
branch = "main" #@param {type: "string"}

In [None]:
%%bash -s "$branch"
# Clone (or refresh) the ParallelSort repository at the requested branch and
# compile its Java sources. $1 is the branch name passed in from the notebook.
set -euo pipefail

REPO='/content/ParallelSort'
URL='https://github.com/justin-f-perez/ParallelSort'
OUTDIR="out/production/ParallelSort"
JAVA_FILES="src/hw1/*.java"
BRANCH="$1"
echo "$BRANCH"

# the notebook times commands with /usr/bin/time, which the `time` package provides
apt-get update > /dev/null
apt-get install -y time > /dev/null

if [ ! -d "$REPO" ]; then
    echo "cloning repository"
    git clone "$URL" "$REPO"
fi

cd "$REPO"
# fetch BEFORE checkout: in an already-cloned repo, a branch that was created on
# the remote afterwards is unknown until fetched, and checkout would abort the cell
git fetch origin
git checkout "$BRANCH"
# discard any local state so the build always matches the remote branch
git reset --hard "origin/$BRANCH"

mkdir -p "$OUTDIR"
# JAVA_FILES is deliberately left unquoted so the shell expands the glob
javac -d "$OUTDIR" $JAVA_FILES

In [None]:
!pip install altair pandas icecream humanize -qq
import sys
import humanize                  # string formatting for humans, e.g. humanize.intcomma(1000) == "1,000"
from itertools import product    # cross product
from pathlib import Path         # fairly similar to Java's Path class
import pandas as pd              # table-oriented data processing
import altair as alt             # charting
from stat import filemode        # for parsing file permissions (debugging)
from icecream import ic         # debugging
from IPython.display import display  # nicer output in notebook
# display() formatting: no column-width limit, at most 10 rows
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 10)

In [None]:
# thread counts to benchmark: 1, 2, 4, ..., 128
T = [1 << exponent for exponent in range(8)]
# input sizes to benchmark: 10,000 through 10,000,000 in powers of 10
N = [10 ** exponent for exponent in range(4, 8)]
# every (n, t) pairing, grouped by input size:
# [(10000, 1), (10000, 2), ... (10000000, 128)]
combinations = [(n, t) for n in N for t in T]

# column names shared by the results table and the charts;
# refer to these constants instead of retyping the strings
INPUT = "N (# of 64-bit integers)"
THREADS = "T (# of threads)"
REAL = "real time (seconds)"
USER = "user execution time (cpu seconds)"
SYS = "system overhead time (cpu seconds)"
LABELS = [INPUT, THREADS, REAL, USER, SYS]

COLAB = 'google.colab' in sys.modules
# build up paths to things we need to run
base_dir = Path('/content/ParallelSort')
input_dir = base_dir / "tmp"
class_path = (base_dir/"out"/"production"/"ParallelSort").resolve(strict=True)
output_path = (base_dir / 'sorted.bin').resolve(strict=False) # it's ok if input/output don't exist yet

# where to save the timing data from executing all those `combinations` `iterations` times,
# and whether to load an existing save file or collect a fresh set of data
timing_save_path = base_dir / 'observations.csv'

# figure out where java lives
java_command = !which java
assert len(java_command) == 1, java_command
java_command = java_command[0]

# define our runnable commands
ea_flag = '-ea' if enable_java_assertions else ''
base_command = f"{java_command} {ea_flag} -cp {class_path}"
sort_command = f"{base_command} hw1.ParallelExternalLongSorter"
generate_command = f"{base_command} hw1.DataFileGenerator"
time_command = "/usr/bin/time -p"

In [None]:
def generate(filename: Path, length:int):
    output = !$time_command $generate_command $filename $length
    return output
        
def sort(input_path:Path, output_path:Path, threads:int):
    output = !$time_command $sort_command $input_path $output_path $threads
    return output

def get_time(command_output):
    """Pull the `time -p` summary out of a captured shell command's output.

    command_output: the captured output of a `!` shell command; it must support
        `.grep(regex).fields()`.

    Returns a dictionary {"real": seconds, "user": seconds, "sys": seconds} where
    each value is the number of seconds as the string printed by `time`
    (callers convert to float as needed).
        real: the "wall clock time"
        user: time executing the command
        sys: time spent on system overhead

    Raises AssertionError unless exactly three timing lines are found.
    """
    # Raw string so \d and \. reach the regex engine instead of being treated as
    # (invalid) string escapes. Anchored on both ends so only lines that consist
    # solely of a timing entry count; output from the timed program that merely
    # contains e.g. "real 2.5" is ignored.
    regex = r"^(real|user|sys) \d*\.\d*\d$"
    matches = command_output.grep(regex).fields()
    assert len(matches) == 3, ic(command_output, matches)
    return dict(matches)

In [None]:
# one input file per size in N, named <n>.bin inside base_dir
input_paths = {n: base_dir / f"{n}.bin" for n in N}

# files already on disk are reused; only the missing ones are generated
for n, p in input_paths.items():
    if p.exists():
        continue
    generate(p, n)

In [None]:
def get_sort_time(n, t):
    """Sort the input file registered for size `n` using `t` threads.

    Returns a tuple (timing, raw_output): the dictionary produced by `get_time`
    and the captured command output it was parsed from.
    """
    raw_output = sort(input_paths[n], output_path, t)
    return get_time(raw_output), raw_output

In [None]:
# maps each (n, t) combination to the list of timing dicts from its runs
timings = {}

for combo in combinations:
    n, t = combo
    runs = timings.setdefault(combo, [])
    # repeat the same sort `iterations` times to collect several samples
    for i in range(iterations):
        timing, output = get_sort_time(n, t)
        runs.append(timing)

In [None]:
# flatten the per-combination timing dicts into one table row per run:
# [n, t, real, user, sys], with the time values converted to float.
# Built as a comprehension so no loop variable named `sys` is left behind to
# shadow the `sys` module that other cells rely on.
rows = [
    [n, t, *(float(td[key]) for key in ('real', 'user', 'sys'))]
    for (n, t), time_dicts in timings.items()
    for td in time_dicts
]
df = pd.DataFrame(rows, columns=LABELS)
# keep a timestamped copy of the raw observations;
# pd.Timestamp.now() replaces pd.datetime.now(), which pandas 2.0 removed
df.to_csv('saved-{}.csv'.format(pd.Timestamp.now().strftime("%Y-%m-%d_%H%M%S")))

chart_types = [REAL, USER, SYS]
# every layer is colored by input size, treated as a nominal field
color_by_input = f'{INPUT}:N'

# one layered chart per time measurement, keyed by its column label
charts = {}
for chart_type in chart_types:
    # error band computed from the individual observations
    band = alt.Chart(df).mark_errorband(extent='ci').encode(
        x=THREADS,
        y=alt.Y(f'{chart_type}:Q'),
        color=color_by_input,
    )

    # line through the mean of the observations at each thread count
    line = alt.Chart(df).mark_line().encode(
        x=THREADS,
        y=f'mean({chart_type}):Q',
        color=color_by_input,
    )

    charts[chart_type] = line + band

In [None]:
# mean wall-clock time vs. thread count, one colored series per input size
charts[REAL]

In [None]:
# mean user CPU time vs. thread count, one colored series per input size
charts[USER]

In [None]:
# mean system CPU time vs. thread count, one colored series per input size
charts[SYS]