# Benchmark Pandas vs Cudf
- Using *timeit*

### System details

#### GPU

In [1]:
!nvidia-smi -q

/usr/bin/sh: nvidia-smi: command not found


#### CPU

In [2]:
!less /proc/cpuinfo

processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 79
model name      : Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
stepping        : 1
microcode       : 0xb000038
cpu MHz         : 2699.804
cache size      : 46080 KB
physical id     : 0
siblings        : 16
core id         : 0
cpu cores       : 8
apicid          : 0
initial apicid  : 0
fpu             : yes
fpu_exception   : yes
cpuid level     : 13
wp              : yes
[K:[K         : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology aperfmperf eagerfpu pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hy[7m/proc/cpuinfo[m[K

## Benchmark setup

### Installations
Install our v3io-generator to create our 1gb dataset for the benchmark

In [8]:
!pip install -i https://test.pypi.org/simple/ v3io-generator --upgrade

In [12]:
!pip install pytimeparse

In [15]:
!pip install faker

Collecting faker
[?25l  Downloading https://files.pythonhosted.org/packages/d4/ed/2fd5337ed405c4258dde1254e60f4e8ef9f1787576c0a2cd0d750b1716a6/Faker-2.0.3-py2.py3-none-any.whl (892kB)
[K     |████████████████████████████████| 901kB 35.5MB/s eta 0:00:01��██████████████████▋           | 573kB 35.5MB/s eta 0:00:01
Collecting text-unidecode==1.3 (from faker)
[?25l  Downloading https://files.pythonhosted.org/packages/a6/a5/c0b6468d3824fe3fde30dbb5e1f687b291608f9473681bbf7dabbf5a87d7/text_unidecode-1.3-py2.py3-none-any.whl (78kB)
[K     |████████████████████████████████| 81kB 35.1MB/s eta 0:00:01
[?25hInstalling collected packages: text-unidecode, faker
Successfully installed faker-2.0.3 text-unidecode-1.3


### Configurations

In [17]:
# Benchmark configurations
metric_names = ['cpu_utilization', 'latency', 'packet_loss', 'throughput']
nlargest = 10

### Imports

In [18]:
import os
import yaml
import time
import datetime
import json
import itertools

# Generator
from v3io_generator import metrics_generator, deployment_generator

# Dataframes
#import cudf
import pandas as pd


### Create data source
Using our V3IO-Generator we will create a timeseries network-operations dataset for 100 companies including 4 metrics (cpu utilization, latency, throughput, packet loss).

We will then write the dataset to a json file to be used as our source

In [20]:
# Create meta-data factory
dep_gen = deployment_generator.deployment_generator()
faker=dep_gen.get_faker()

# Design meta-data
dep_gen.add_level(name='company',number=100,level_type=faker.company)

# Generate deployment structure
deployment_df = dep_gen.generate_deployment()

# Setup initial values
for metric in metric_names:
    deployment_df[metric] = 0

deployment_df.head()

Unnamed: 0,company,cpu_utilization,latency,packet_loss,throughput
0,Stout_and_Sons,0,0,0,0
1,King_Group,0,0,0,0
2,Davidson-Casey,0,0,0,0
3,Stewart__Mitchell_and_Davies,0,0,0,0
4,Pierce_and_Sons,0,0,0,0


In [21]:
metrics_configuration = yaml.safe_load("""
errors: {length_in_ticks: 50, rate_in_ticks: 150}
timestamps: {interval: 5s, stochastic_interval: false}
metrics:
  cpu_utilization:
    accuracy: 2
    distribution: normal
    distribution_params: {mu: 70, noise: 0, sigma: 10}
    is_threshold_below: true
    past_based_value: false
    produce_max: false
    produce_min: false
    validation:
      distribution: {max: 1, min: -1, validate: false}
      metric: {max: 100, min: 0, validate: true}
  latency:
    accuracy: 2
    distribution: normal
    distribution_params: {mu: 0, noise: 0, sigma: 5}
    is_threshold_below: true
    past_based_value: false
    produce_max: false
    produce_min: false
    validation:
      distribution: {max: 1, min: -1, validate: false}
      metric: {max: 100, min: 0, validate: true}
  packet_loss:
    accuracy: 0
    distribution: normal
    distribution_params: {mu: 0, noise: 0, sigma: 2}
    is_threshold_below: true
    past_based_value: false
    produce_max: false
    produce_min: false
    validation:
      distribution: {max: 1, min: -1, validate: false}
      metric: {max: 50, min: 0, validate: true}
  throughput:
    accuracy: 2
    distribution: normal
    distribution_params: {mu: 250, noise: 0, sigma: 20}
    is_threshold_below: false
    past_based_value: false
    produce_max: false
    produce_min: false
    validation:
      distribution: {max: 1, min: -1, validate: false}
      metric: {max: 300, min: 0, validate: true}
""")

In [22]:
met_gen = metrics_generator.Generator_df(metrics_configuration, 
                                         user_hierarchy=deployment_df, 
                                         initial_timestamp=time.time())

In [23]:
source_file = '/tmp/ops.logs'
metrics = met_gen.generate_range(start_time=datetime.datetime.now(),
                                 end_time=datetime.datetime.now()+datetime.timedelta(hours=62),
                                 as_df=True,
                                 as_iterator=False)

# Generate file from metrics
with open(source_file, 'w') as f:
    metrics_batch = metrics
    metrics_batch.to_json(f,
                          orient='records',
                          lines=True)

## Target file size validation
Set target size (in MB) for the test file

In [10]:
!ls -lah data

total 0
drwxr-xr-x 2 50 nogroup    0 Jul 22 11:32 .ipynb_checkpoints
-rw-r--r-- 1 50 nogroup 1.2G Jul 22 07:17 ops-1gb.logs
-rw-r--r-- 1 50 nogroup 1.2G Jul 22 11:40 ops.logs


In [11]:
!head data/ops.logs

{"company":"Rios__Pope_and_Baird","cpu_utilization":70.6942165035,"cpu_utilization_is_error":false,"latency":3.1373003261,"latency_is_error":false,"packet_loss":0.0,"packet_loss_is_error":false,"throughput":249.7207880994,"throughput_is_error":false,"timestamp":1563795193534}
{"company":"Ross__Calderon_and_Brown","cpu_utilization":56.540474522,"cpu_utilization_is_error":false,"latency":0.0,"latency_is_error":false,"packet_loss":0.0,"packet_loss_is_error":false,"throughput":261.9362588938,"throughput_is_error":false,"timestamp":1563795193534}
{"company":"Jackson_PLC","cpu_utilization":75.7476859549,"cpu_utilization_is_error":false,"latency":0.0,"latency_is_error":false,"packet_loss":1.3991427041,"packet_loss_is_error":false,"throughput":221.8819458316,"throughput_is_error":false,"timestamp":1563795193534}
{"company":"Reyes_Group","cpu_utilization":61.4657850595,"cpu_utilization_is_error":false,"latency":0.0,"latency_is_error":false,"packet_loss":1.7039267608,"packet_loss_is_error":false

## Benchmark

### Flow
- Read file
- Compute aggregations
- get nlargest()

In [3]:
benchmark_file = source_file

#### cudf

In [7]:
%%timeit

# Read file
gdf = cudf.read_json(benchmark_file, lines=True)

# Perform aggregation
ggdf = gdf.groupby(['company']).\
            agg({k: ['min', 'max', 'mean'] for k in metric_names})

# Get N Largest (From original df)
raw_nlargest = gdf.nlargest(nlargest, 'cpu_utilization')

1.44 s ± 23.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Pandas

In [8]:
%%timeit

# Read file
pdf = pd.read_json(benchmark_file, lines=True)

# Perform aggregation
gpdf = pdf.groupby(['company']).\
            agg({k: ['min', 'max', 'mean'] for k in metric_names})

# Get N Largest (From original df)
raw_nlargest = pdf.nlargest(nlargest, 'cpu_utilization')

43.4 s ± 627 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Test loading times

#### cudf

In [35]:
%%timeit
gdf = cudf.read_json(benchmark_file, lines=True)

1.2 s ± 120 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Pandas

In [36]:
%%timeit
gdf = pd.read_json(benchmark_file, lines=True)

41.1 s ± 651 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Test aggregation
Load the files to memory so we can %timeit on the aggregations only

In [4]:
gdf = cudf.read_json(benchmark_file, lines=True)
pdf = pd.read_json(benchmark_file, lines=True)

#### cudf

In [5]:
%%timeit

ggdf = gdf.groupby(['company']).\
            agg({k: ['min', 'max', 'mean'] for k in metric_names})
raw_nlargest = gdf.nlargest(nlargest, 'cpu_utilization')

212 ms ± 14.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Pandas

In [6]:
%%timeit

gpdf = pdf.groupby(['company']).\
            agg({k: ['min', 'max', 'mean'] for k in metric_names})
raw_nlargest = pdf.nlargest(nlargest, 'cpu_utilization')

2.17 s ± 72.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
