In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import itertools
import os
import tempfile
import time

import pandas as pd
from htrc_features import Volume
# Directory holding the input volumes. Set the EF_DATA_DIR environment variable
# to run this notebook on another machine; the default is the original
# hard-coded location, so existing behaviour is unchanged when it is unset.
EF_DATA_DIR = os.environ.get(
    'EF_DATA_DIR',
    '/home/peter.organisciak/htrc-feature-reader/data/PZ-volumes',
)
# One path per entry found directly inside EF_DATA_DIR.
efpaths = glob.glob(os.path.join(EF_DATA_DIR, '*'))

In [3]:
# Two scratch directories: `tdir` receives the baseline parquet files, while
# `rewritedir` is the target that the later cells write into.
tdir, rewritedir = tempfile.TemporaryDirectory(), tempfile.TemporaryDirectory()

# Write basic EF files to the temp dir, one per input volume.
for ef_path in efpaths:
    Volume(ef_path).save_parquet(tdir.name, meta=False)

# Everything that was just written; these are the inputs for the size tests.
ogpaths = glob.glob('{}/*'.format(tdir.name))
pd.read_parquet(ogpaths[0]).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
page,section,token,pos,Unnamed: 4_level_1
1,body,On,IN,1
1,body,Serao,NN,1
1,body,The,DT,1
1,body,and,CC,1
1,body,ballet,NN,1


In [4]:
def statdir(name, dirname):
    """Summarise the combined on-disk size of the entries in a directory.

    Parameters
    ----------
    name : str
        Label placed at the start of the returned summary.
    dirname : str
        Directory to measure. Only entries matched by ``dirname + '/*'`` are
        counted (direct children; the glob pattern skips dot-files).

    Returns
    -------
    str
        ``"<name> Size: <size>M"`` where size is bytes / 1024**2, formatted
        with two decimal places.
    """
    total_bytes = 0
    for entry in glob.glob(dirname+'/*'):
        total_bytes += os.path.getsize(entry)
    megabytes = total_bytes / 1024**2
    return "{} Size: {:.2f}M".format(name, megabytes)
# Baseline: combined size of the files written to `tdir`, labelled "OG".
statdir("OG", tdir.name)

'OG Size: 3.37M'

# Size Comparisons

Rewrite with *pyarrow*:

In [5]:
# Round-trip every baseline file through pandas and write it back out with the
# pyarrow engine, keeping the original file name inside `rewritedir`.
for src in ogpaths:
    target = os.path.join(rewritedir.name, os.path.basename(src))
    pd.read_parquet(src).to_parquet(target, engine='pyarrow')
print(statdir("pyarrow", rewritedir.name))
# Preview one rewritten file.
pd.read_parquet(glob.glob(rewritedir.name+'/*')[0]).head(2)

pyarrow Size: 3.37M


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
page,section,token,pos,Unnamed: 4_level_1
1,body,On,IN,1
1,body,Serao,NN,1


Rewrite with *fastparquet*:

In [6]:
# Same round-trip as the pyarrow cell, but written with the fastparquet engine.
# Files are written under the same names in `rewritedir`.
for src in ogpaths:
    target = os.path.join(rewritedir.name, os.path.basename(src))
    pd.read_parquet(src).to_parquet(target, engine='fastparquet')
print(statdir("fastparquet", rewritedir.name))
# Preview one rewritten file.
pd.read_parquet(glob.glob(rewritedir.name+'/*')[0]).head(2)

fastparquet Size: 6.80M


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
page,section,token,pos,Unnamed: 4_level_1
1,body,On,IN,1
1,body,Serao,NN,1


In [10]:
# Move the index levels into ordinary columns before writing, to see whether
# storing an index affects file size.
for src in ogpaths:
    flat = pd.read_parquet(src).reset_index()
    target = os.path.join(rewritedir.name, os.path.basename(src))
    flat.to_parquet(target, engine='pyarrow')
print(statdir("No index / PyArrow ", rewritedir.name))

No index / PyArrow  Size: 3.37M


In [9]:
# Sort by the existing index (level order unchanged) before writing.
for src in ogpaths:
    ordered = pd.read_parquet(src).sort_index()
    target = os.path.join(rewritedir.name, os.path.basename(src))
    ordered.to_parquet(target, engine='pyarrow')
print(statdir("sorted index", rewritedir.name))

sorted index Size: 3.37M


In [99]:
# Sort rows by section, pos, count, token as plain columns, then keep only
# `page` as the index when writing.
sort_cols = ['section', 'pos', 'count', 'token']
for src in ogpaths:
    flat = pd.read_parquet(src).reset_index()
    ordered = flat.sort_values(sort_cols).set_index(['page'])
    target = os.path.join(rewritedir.name, os.path.basename(src))
    ordered.to_parquet(target, engine='pyarrow')
print(statdir("Sorted By section,pos/count/token / PyArrow ", rewritedir.name))
# Preview one rewritten file.
pd.read_parquet(glob.glob(rewritedir.name+'/*')[0]).head()

Sorted By section,pos/count/token / PyArrow  Size: 2.52M


Unnamed: 0_level_0,section,token,pos,count
page,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
59,body,#,#,1
44,body,"""",'',1
45,body,"""",'',1
48,body,"""",'',1
82,body,"""",'',1


## Effect of sorting on compression size (*snappy*)

In [15]:
# Try every ordering of the four index levels: reorder the MultiIndex, sort by
# it, rewrite all files, and report the resulting directory size.
# (`itertools` is imported in the notebook's import cell.)
for perm in itertools.permutations(['page', 'section', 'token', 'pos']):
    for path in ogpaths:
        df = pd.read_parquet(path).reorder_levels(perm).sort_index()
        # Overwrites the previous permutation's file of the same name, so
        # `rewritedir` always reflects only the current ordering.
        df.to_parquet(os.path.join(rewritedir.name, os.path.basename(path)), engine='pyarrow')
    print(statdir(",".join(perm), rewritedir.name))

page,section,token,pos Size: 3.37M
page,section,pos,token Size: 2.97M
page,token,section,pos Size: 3.38M
page,token,pos,section Size: 3.38M
page,pos,section,token Size: 2.99M
page,pos,token,section Size: 2.99M
section,page,token,pos Size: 3.37M
section,page,pos,token Size: 2.97M
section,token,page,pos Size: 2.85M
section,token,pos,page Size: 2.77M
section,pos,page,token Size: 2.97M
section,pos,token,page Size: 2.65M
token,page,section,pos Size: 2.85M
token,page,pos,section Size: 2.85M
token,section,page,pos Size: 2.84M
token,section,pos,page Size: 2.77M
token,pos,page,section Size: 2.77M
token,pos,section,page Size: 2.77M
pos,page,section,token Size: 2.99M
pos,page,token,section Size: 3.00M
pos,section,page,token Size: 2.96M
pos,section,token,page Size: 2.66M
pos,token,page,section Size: 2.67M
pos,token,section,page Size: 2.67M


In [16]:
# Sum counts over (page, token, pos), which drops `section`, and compare file
# sizes for every ordering of those three keys.
for level_order in itertools.permutations(['page', 'token', 'pos']):
    keys = list(level_order)
    for src in ogpaths:
        counts = (
            pd.read_parquet(src)
            .reset_index()
            .groupby(keys)[['count']]
            .sum()
            .sort_index()
        )
        counts.to_parquet(os.path.join(rewritedir.name, os.path.basename(src)), engine='pyarrow')
    print(statdir(",".join(level_order), rewritedir.name))

page,token,pos Size: 3.35M
page,pos,token Size: 2.95M
token,page,pos Size: 2.83M
token,pos,page Size: 2.76M
pos,page,token Size: 2.96M
pos,token,page Size: 2.66M


In [17]:
# Sum counts over (page, token) only, which drops `section` and `pos`, and
# compare both key orderings.
for level_order in itertools.permutations(['page', 'token']):
    keys = list(level_order)
    for src in ogpaths:
        counts = (
            pd.read_parquet(src)
            .reset_index()
            .groupby(keys)[['count']]
            .sum()
            .sort_index()
        )
        counts.to_parquet(os.path.join(rewritedir.name, os.path.basename(src)), engine='pyarrow')
    print(statdir(",".join(level_order), rewritedir.name))

page,token Size: 2.70M
token,page Size: 2.53M


### Size with `section='body'`

In [18]:
# Keep only rows whose section is 'body', sum counts over
# (page, section, token), and compare every ordering of those keys.
for level_order in itertools.permutations(['page', 'section', 'token']):
    keys = list(level_order)
    for src in ogpaths:
        body_counts = (
            pd.read_parquet(src)
            .reset_index()
            .query("section=='body'")
            .groupby(keys)[['count']]
            .sum()
            .sort_index()
        )
        body_counts.to_parquet(os.path.join(rewritedir.name, os.path.basename(src)), engine='pyarrow')
    print(statdir("Just body: " + ",".join(level_order), rewritedir.name))

Just body: page,section,token Size: 2.65M
Just body: page,token,section Size: 2.65M
Just body: section,page,token Size: 2.65M
Just body: section,token,page Size: 2.49M
Just body: token,page,section Size: 2.49M
Just body: token,section,page Size: 2.49M


In [19]:
# Keep only rows whose section is 'body', then sum counts over (page, token)
# so the `section` key is dropped; compare both key orderings.
for level_order in itertools.permutations(['page', 'token']):
    keys = list(level_order)
    for src in ogpaths:
        body_counts = (
            pd.read_parquet(src)
            .reset_index()
            .query("section=='body'")
            .groupby(keys)[['count']]
            .sum()
            .sort_index()
        )
        body_counts.to_parquet(os.path.join(rewritedir.name, os.path.basename(src)), engine='pyarrow')
    print(statdir("Just body: " + ",".join(level_order), rewritedir.name))

Just body: page,token Size: 2.64M
Just body: token,page Size: 2.48M


## Size with `case=False`

Resaving OG volumes with `section='body', drop_section=True, case=False` token_kwargs:

In [20]:
# Separate scratch directory for the body-only, case-folded export.
lowerdir = tempfile.TemporaryDirectory()
for ef_path in efpaths:
    # A fresh kwargs dict is built on every iteration, as in the original call.
    Volume(ef_path).save_parquet(
        lowerdir.name,
        meta=False,
        token_kwargs=dict(section='body', drop_section=True, case=False),
    )
lowerpaths = glob.glob('{}/*'.format(lowerdir.name))
# Preview one exported file.
pd.read_parquet(lowerpaths[0]).head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
page,lowercase,pos,Unnamed: 3_level_1
1,and,CC,1
1,ballet,NN,1


In [21]:
# Compare file sizes for every ordering of (page, lowercase, pos) on the
# case-folded files, via reset_index + groupby.
for level_order in itertools.permutations(['page', 'lowercase', 'pos']):
    keys = list(level_order)
    for src in lowerpaths:
        counts = (
            pd.read_parquet(src)
            .reset_index()
            .groupby(keys)[['count']]
            .sum()
            .sort_index()
        )
        counts.to_parquet(os.path.join(rewritedir.name, os.path.basename(src)), engine='pyarrow')
    print(statdir(",".join(level_order), rewritedir.name))

page,lowercase,pos Size: 3.15M
page,pos,lowercase Size: 2.77M
lowercase,page,pos Size: 2.68M
lowercase,pos,page Size: 2.60M
pos,page,lowercase Size: 2.77M
pos,lowercase,page Size: 2.50M


### Size comparison without `reset_index`

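The sizes should be the same as in the previous cell, which produced each ordering with `reset_index` + `groupby` instead of `reorder_levels`.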

In [22]:
# Same comparison on the case-folded files, but reordering the existing index
# levels directly instead of going through reset_index + groupby.
for level_order in itertools.permutations(['page', 'lowercase', 'pos']):
    for src in lowerpaths:
        reordered = pd.read_parquet(src).reorder_levels(level_order).sort_index()
        target = os.path.join(rewritedir.name, os.path.basename(src))
        reordered.to_parquet(target, engine='pyarrow')
    print(statdir(",".join(level_order), rewritedir.name))

page,lowercase,pos Size: 3.15M
page,pos,lowercase Size: 2.77M
lowercase,page,pos Size: 2.68M
lowercase,pos,page Size: 2.60M
pos,page,lowercase Size: 2.77M
pos,lowercase,page Size: 2.50M


# Performance

In [None]:
# Questions to answer in this section:
# - speed of reorder_levels > sort_index vs. reset_index > sort_values > set_index
# - speed cost of reading a stored file and re-sorting it into the preferred order

### speed cost of reading and sorting

In [73]:
# Benchmark: for each stored index ordering, write the files in that order
# (untimed), then time reading them back and, where the stored order differs,
# restoring `preferred_order`.
# Requires `time` and `itertools` from the notebook's import cell.
preferred_order = ['page', 'token', 'pos']
for perm in itertools.permutations(preferred_order):
    totaltime = 0
    for path in ogpaths:
        # Setup (not timed): aggregate counts by this permutation and write.
        df = pd.read_parquet(path).reset_index().groupby(list(perm))[['count']].sum().sort_index()
        outpath = os.path.join(rewritedir.name, os.path.basename(path))
        df.to_parquet(outpath, engine='pyarrow')
        # Measure read and sort-if-necessary times. perf_counter() is a
        # monotonic, high-resolution clock intended for timing short intervals.
        starttime = time.perf_counter()
        df = pd.read_parquet(outpath)
        # Convert to a plain list so the comparison does not depend on how the
        # index-names container compares against a list.
        if list(df.index.names) != preferred_order:
            # fastest
            df = df.reorder_levels(preferred_order).sort_index()
            # alt
            #df = df.reset_index().set_index(preferred_order).sort_index()
            # potentially faster, but not sure yet if there's a downstream performance hit
            #df = df.reset_index().sort_values(preferred_order).set_index(preferred_order)
        totaltime += (time.perf_counter() - starttime)
    # Only the preferred ordering skips the re-sort, so label its row differently.
    label = "Reading time" if list(perm) == preferred_order else "Reading+Sorting time"
    print(statdir(",".join(perm), rewritedir.name), "{}: {:.2f}s".format(label, totaltime))

page,token,pos Size: 3.35M Reading time: 0.34s
page,pos,token Size: 2.95M Reading+Sorting time: 1.14s
token,page,pos Size: 2.83M Reading+Sorting time: 1.09s
token,pos,page Size: 2.76M Reading+Sorting time: 1.09s
pos,page,token Size: 2.96M Reading+Sorting time: 1.13s
pos,token,page Size: 2.66M Reading+Sorting time: 1.13s


### size and speed of reorder_levels>sort_index vs reset_index>sort_values>set_index

In [69]:
# Compare two ways of putting the rows into a given order, for the first three
# orderings of the four index levels. Only the sorting step is timed; reading
# and writing are excluded.
# NOTE: the two strategies do not write identical frames. 'reset_index' keeps
# only the first key as the index, while 'reorder_levels' keeps the full
# MultiIndex.
# Requires `time` and `itertools` from the notebook's import cell.
for perm in itertools.islice(itertools.permutations(['page', 'section', 'token', 'pos']), 3):
    sort_cols = list(perm)
    for strategy in ('reset_index', 'reorder_levels'):
        totaltime = 0
        for path in ogpaths:
            df = pd.read_parquet(path)
            # perf_counter() is a monotonic clock suited to interval timing.
            starttime = time.perf_counter()
            if strategy == 'reset_index':
                df = df.reset_index().sort_values(sort_cols).set_index(sort_cols[0])
            elif strategy == 'reorder_levels':
                df = df.reorder_levels(sort_cols).sort_index()
            totaltime += (time.perf_counter() - starttime)

            outpath = os.path.join(rewritedir.name, os.path.basename(path))
            df.to_parquet(outpath, engine='pyarrow')
        print(strategy, '\t', statdir(",".join(sort_cols), rewritedir.name), "Sorting Time: {:.2f}s".format(totaltime))

reset_index 	 page,section,token,pos Size: 3.37M Sorting Time: 0.39s
reorder_levels 	 page,section,token,pos Size: 3.37M Sorting Time: 1.82s
reset_index 	 page,section,pos,token Size: 2.97M Sorting Time: 0.43s
reorder_levels 	 page,section,pos,token Size: 2.97M Sorting Time: 0.99s
reset_index 	 page,token,section,pos Size: 3.38M Sorting Time: 0.40s
reorder_levels 	 page,token,section,pos Size: 3.38M Sorting Time: 0.99s
