# System Information

In [1]:
versioninfo(verbose=true)

Julia Version 1.3.1
Commit 2d5741174c (2019-12-30 21:36 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  uname: Linux 4.15.0-1051-aws #53-Ubuntu SMP Wed Sep 18 13:35:53 UTC 2019 x86_64 x86_64
  CPU: AMD EPYC 7571: 
              speed         user         nice          sys         idle          irq
       #1  2199 MHz      76736 s         92 s       4521 s    1028038 s          0 s
       #2  2199 MHz     108998 s          1 s       4294 s     997057 s          0 s
       #3  2199 MHz      99026 s          0 s       5063 s    1006657 s          0 s
       #4  2199 MHz     109493 s          0 s       4343 s     996895 s          0 s
       #5  2199 MHz     116279 s          0 s       4369 s     990013 s          0 s
       #6  2199 MHz      57962 s          7 s       4620 s    1037769 s          0 s
       #7  2199 MHz     110355 s       1294 s       5515 s     992989 s          0 s
       #8  2199 MHz      99599 s       1276 s       4254 s     994969 s          0 s
       
  Mem

# Packages

In [2]:
using Pkg

In [3]:
Pkg.add(["Distributions", "DataFrames", "DataStructures", "BenchmarkTools"])

[32m[1m  Updating[22m[39m registry at `/opt/julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[?25l[2K[?25h[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `/opt/julia/environments/v1.3/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `/opt/julia/environments/v1.3/Manifest.toml`
[90m [no changes][39m


In [4]:
using Distributions
using DataFrames
using DataStructures
using BenchmarkTools

# Timings

Create the dataset:

In [5]:
N = 6000000

6000000

In [6]:
U = DiscreteUniform(1, N)
small_cap = 'a':'z'
large_cap = 'A':'Z'

col1 = rand(large_cap, N)
col2 = rand(small_cap, N)
col3 = rand(U, N)

df = DataFrame(col1 = col1, col2 = col2, col3 = col3)

first(df, 5)

Unnamed: 0_level_0,col1,col2,col3
Unnamed: 0_level_1,Char,Char,Int64
1,'G','p',4517640
2,'J','x',3734956
3,'B','p',1692118
4,'Y','y',1903604
5,'D','l',2902956


Naive solution:

In [7]:
results1 = counter(Pair{Char,Char})

@btime for (c1, c2, c3) in eachrow(df)
    key, val = c1 => c2, c3
    results1
end

  13.635 s (89997487 allocations: 1.88 GiB)


In [8]:
results1 = counter(Pair{Char,Char})

for (c1, c2, c3) in eachrow(df)
    key, val = c1 => c2, c3
    results1[key] = results1[key] + val
end

Faster solution 1:

In [9]:
function f(data)
    results = counter(Pair{Char,Char})

    for (c1, c2, c3) in data
        key, val = c1 => c2, c3
        results[key] = results[key] + val
    end

    return results
end

data = zip(col1, col2, col3)

results2 = f(data)
;

In [10]:
@btime f($data);

  217.914 ms (19 allocations: 92.33 KiB)


Faster solution 2:

In [11]:
function f(data)
    results = counter(Tuple{Char,Char})

    for (c1, c2, c3) in data
        key, val = (c1, c2), c3
        results[key] = results[key] + val
    end

    return results
end

data = zip(col1, col2, col3)

results3 = f(data)
;

In [12]:
@btime f(data);

  219.675 ms (19 allocations: 92.33 KiB)


Faster solution 3:

In [13]:
results4 = aggregate(df, [:col1, :col2], sum)

first(sort!(results4), 5)

Unnamed: 0_level_0,col1,col2,col3_sum
Unnamed: 0_level_1,Char,Char,Int64
1,'A','a',26159722404
2,'A','b',26671219965
3,'A','c',26953084531
4,'A','d',27163184994
5,'A','e',27244223854


In [14]:
results3[('A', 'a')]

26159722404

In [15]:
@btime aggregate($df, [:col1, :col2], sum);

  268.707 ms (50069 allocations: 250.23 MiB)


Check that all methods give identical results:

In [16]:
@assert results1 == results2

results2 = begin
    c = counter(Tuple{Char, Char})

    for (k,v) in results2
        c[Tuple(k)] = v
    end
    c
end

@assert results2 == results3