In [None]:
from itertools import chain, zip_longest
import os.path
import time

import h5py
import numpy as np
import pandas as pd

In [None]:
# Relative path to the k-mer table that every later cell in this notebook reads.
short_kmer_fp = '../vl/short_clean_bact_kmer_file1.fasta.tab'

In [None]:
# TODO: check whether reading the file in binary mode is any faster.
with open(short_kmer_fp, 'r') as kmer_file:
    # Only the first two lines are read: the header and the first data row.
    header_line = kmer_file.readline()
    first_line = kmer_file.readline()
    n = 40
    # list() of a string slice shows every character separately, so the
    # delimiter characters become visible in the printed output.
    print('{} elements of header     : "{}"'.format(n, list(header_line[:n])))
    print('{} elements of first line : "{}"'.format(n, list(first_line[:n])))
    
    # Split both lines on any whitespace. Note that first_line is rebound
    # here from the raw string to the list of its fields.
    header_columns = header_line.strip().split()
    first_line = first_line.strip().split()
    
    

Now we know the file is tab-delimited.

First, try just reading the file — once with pandas and once with plain Python — and time each approach.

In [None]:
def load_df(fp, nrows):
    """Read the first rows of a tab-delimited table with pandas.

    The first line of the file is used as the header and the first column
    becomes the index of the result.

    Parameters
    ----------
    fp : str or file-like
        Path (or open buffer) of the tab-delimited file.
    nrows : int or None
        Number of data rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The parsed rows. (The previous version built this frame and then
        dropped it, so callers could only time the read, not use it.)
    """
    return pd.read_table(filepath_or_buffer=fp, sep='\t', header=0, index_col=0, nrows=nrows)

In [None]:
# Time the pandas-based reader on the first 10 data rows.
%timeit load_df(short_kmer_fp, nrows=10)

In [None]:
def load_py(fp, mode, nrows):
    """Parse the first rows of a tab-delimited table with plain Python.

    The header line is skipped. For every data row the first and the last
    tab-separated fields are dropped and the remaining fields are converted
    to ``float``.

    Parameters
    ----------
    fp : str
        Path of the tab-delimited file.
    mode : str
        Mode passed to ``open``; must be a text mode because lines are split
        on the ``str`` tab character.
    nrows : int or None
        Maximum number of data rows to parse; ``None`` parses every row.

    Returns
    -------
    list of list of float
        One list of values per parsed row.

    Raises
    ------
    ValueError
        If a retained field cannot be converted to ``float``.
    """
    # Timings below were recorded for the earlier version of this function,
    # which discarded the parsed values instead of returning them.
    # timeit using open with 'rt':
    #   86.9 ms ± 3.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
    # timeit using open with 'r':
    #   86 ms ± 690 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
    rows = []
    with open(fp, mode) as f:
        f.readline()  # skip the header line
        for i, line in enumerate(f):
            if i == nrows:
                break
            columns = line.strip().split('\t')
            # The loop variable is deliberately not called ``f`` so that it
            # cannot be confused with the open file handle.
            rows.append([float(value) for value in columns[1:-1]])
    return rows
            

In [None]:
# Time the pure-Python reader with the explicit text-mode flag 'rt'.
%timeit load_py(short_kmer_fp, mode='rt', nrows=10)

In [None]:
# Same benchmark with mode='r'. In Python, 'r' and 'rt' both open the file in
# text mode, so any difference between the two timings is measurement noise.
%timeit load_py(short_kmer_fp, mode='r', nrows=10)

Next, read the table and write it to an HDF5 file.

In [None]:
# with shuffle=False: 87.5MB -> 3MB
# with shuffle=True: 87.5MB -> 14MB
def read_df_write_h5(fp, h5_fp):
    """Convert a tab-delimited table to a gzip-compressed HDF5 dataset via pandas.

    The input is read in chunks with ``pd.read_table``. The first column is
    used as the index and the last column is not read, so
    ``column_count - 2`` float64 values per row are written to the dataset
    ``/pd/data``. Progress and timing information is printed along the way.

    Parameters
    ----------
    fp : str
        Path of the tab-delimited input file (first line is the header).
    h5_fp : str
        Path of the HDF5 file to create; an existing file is overwritten.

    Returns
    -------
    None
    """
    print('reading "{}" with size {:5.2f}MB'.format(fp, os.path.getsize(fp) / 1e6))
    print('writing "{}"'.format(h5_fp))
    # Count the columns from the header so the dataset width is known up front.
    with open(fp, 'rt') as f:
        column_count = len(f.readline().strip().split('\t'))
        print('file "{}" has {} columns'.format(fp, column_count))

    t0 = time.time()
    initial_dset_rows = 10000
    chunksize = 100
    with h5py.File(h5_fp, 'w') as h5_f, open(fp, 'rt') as f:
        dset_shape = (initial_dset_rows, column_count-2)
        dset = h5_f.create_dataset(
            '/pd/data',
            dset_shape,
            dtype=np.float64,
            maxshape=(None, dset_shape[1]),  # rows may grow or shrink later
            chunks=(1, dset_shape[1]),       # one table row per HDF5 chunk
            shuffle=False,
            compression='gzip')

        chunk_iter = pd.read_table(
            filepath_or_buffer=f,
            sep='\t',
            header=0,
            index_col=0,
            usecols=range(column_count-1),  # skip the first and last columns
            chunksize=chunksize)

        si = 0
        t00 = time.time()
        for i, chunk in enumerate(chunk_iter):
            t11 = time.time()
            sj = si + chunk.shape[0]
            print('read chunk {} with shape {} in {:5.2f}s ({} rows total)'.format(i, chunk.shape, t11-t00, sj))
            if sj > dset.shape[0]:
                # The initial row count is only a guess: grow the dataset
                # (at least doubling it) before writing past its end.
                dset.resize((max(sj, 2 * dset.shape[0]), dset.shape[1]))
            dset[si:sj, :] = chunk.values
            si = sj
            t00 = time.time()
            print('  wrote chunk in {:5.2f}s'.format(t00-t11))

        print('read {} rows'.format(si))
        print('dataset "{}" has shape {}'.format(dset.name, dset.shape))
        # Trim unused rows. ``si`` is used (not ``sj``) because it is defined
        # even when the input contains no data rows at all.
        if si < dset.shape[0]:
            new_shape = (si, dset.shape[1])
            print('resizing dataset from {} to {}'.format(dset.shape, new_shape))
            dset.resize(new_shape)

    # Reopen read-only to confirm what actually landed on disk; the mode is
    # explicit because the default mode differs between h5py versions.
    with h5py.File(h5_fp, 'r') as h5_f:
        dset = h5_f['/pd/data']
        print('dataset "{}" has shape {}'.format(dset.name, dset.shape))

    print('finished writing {} in {:5.2f}s'.format(h5_fp, time.time()-t0))
    print('  file size is {:5.2f}MB'.format(os.path.getsize(h5_fp) / 1e6))

In [None]:
# Convert the k-mer table to 'pd_kmer.h5' using the pandas-based reader.
read_df_write_h5(fp=short_kmer_fp, h5_fp='pd_kmer.h5')

In [None]:
def read_chunk(f_, shape_):
    """Yield fixed-size float arrays parsed from tab-delimited lines.

    For each line the first and last tab-separated fields are dropped and the
    remaining fields are converted to ``float`` and stored as one array row.

    Parameters
    ----------
    f_ : iterable of str
        Source of lines, e.g. an open text file positioned after the header.
    shape_ : tuple of int
        ``(rows_per_chunk, values_per_row)`` of each full chunk.

    Yields
    ------
    numpy.ndarray
        Arrays of shape ``shape_``; the final array holds only the remaining
        rows if the input does not divide evenly. Every yielded array has its
        own buffer, so it stays valid after the generator advances.

    Raises
    ------
    ValueError
        If a field cannot be parsed as ``float`` or a line does not contain
        exactly ``shape_[1]`` retained fields.
    """
    chunk_ = np.zeros(shape_)
    chunk_i = 0
    for line in f_:
        chunk_[chunk_i, :] = [float(value) for value in line.rstrip().split('\t')[1:-1]]
        chunk_i += 1
        if chunk_i == shape_[0]:
            # end of a chunk!
            yield chunk_
            # Start the next chunk in a fresh buffer: reusing the old one
            # would silently overwrite arrays the consumer still holds.
            chunk_ = np.zeros(shape_)
            chunk_i = 0

    if chunk_i > 0:
        # yield a partial chunk
        yield chunk_[:chunk_i, :]

def read_py_write_h5(fp_list, dset, chunksize):
    """Append the rows of several tab-delimited files to one HDF5 dataset.

    Each file is parsed with ``read_chunk`` (header skipped, first and last
    columns dropped) and its rows are written to ``dset`` one after another,
    continuing where the previous file stopped. Progress and timing
    information is printed along the way.

    Parameters
    ----------
    fp_list : iterable of str
        Paths of the input files, processed in order.
    dset : h5py.Dataset
        Open, writable two-dimensional dataset whose second dimension equals
        the number of retained columns. It is resized along the first axis,
        so it is assumed to have been created with an unlimited first
        dimension (``maxshape=(None, ...)``).
    chunksize : int
        Number of rows parsed and written per step.

    Returns
    -------
    None
    """
    t0 = time.time()
    si = 0
    for fp in fp_list:
        with open(fp, 'rt') as f:
            print('reading "{}" with size {:5.2f}MB'.format(fp, os.path.getsize(fp) / 1e6))
            header_line = f.readline()
            print('  header : "{}"'.format(header_line[:30]))
            column_count = len(header_line.strip().split('\t'))
            print('  header has {} columns'.format(column_count))

            t00 = time.time()
            for i, chunk in enumerate(read_chunk(f, shape_=(chunksize, column_count-2))):
                t11 = time.time()
                sj = si + chunk.shape[0]
                print('read chunk {} with shape {} in {:5.2f}s ({} rows total)'.format(i, chunk.shape, t11-t00, sj))
                if sj > dset.shape[0]:
                    # Grow (at least doubling) before writing past the end;
                    # the dataset's initial row count is only a guess.
                    dset.resize((max(sj, 2 * dset.shape[0]), dset.shape[1]))
                dset[si:sj, :] = chunk
                si = sj
                t00 = time.time()
                print('  wrote chunk in {:5.2f}s'.format(t00-t11))

            print('read {} rows'.format(si))
            print('dataset "{}" has shape {}'.format(dset.name, dset.shape))

    # Trim unused rows once, after ALL files: trimming after each file would
    # leave no room for the next one. ``si`` is defined even for empty input.
    if si < dset.shape[0]:
        new_shape = (si, dset.shape[1])
        print('resizing dataset from {} to {}'.format(dset.shape, new_shape))
        dset.resize(new_shape)

    # Take the file name from the dataset itself instead of relying on a
    # notebook-level global, and flush so the reported size reflects the data.
    h5_fp = dset.file.filename
    dset.file.flush()
    print('finished writing {} in {:5.2f}s'.format(h5_fp, time.time()-t0))
    print('  file size is {:5.2f}MB'.format(os.path.getsize(h5_fp) / 1e6))


In [None]:
# Output file for the pure-Python conversion.
h5_fp = 'py_kmer.h5'
# Width of the dataset. It must equal the number of columns in the input
# minus two, because read_py_write_h5 drops the first and last column of
# every row. NOTE(review): 32768 is hard-coded -- confirm it matches the file.
column_count = 32768
with h5py.File(h5_fp, 'w') as h5_f:
    dset = h5_f.create_dataset(
        '/python/data',
        (10000, column_count),
        dtype=np.float64,
        maxshape=(None, column_count),  # rows may be resized later
        chunks=(1, column_count),       # one table row per HDF5 chunk
        shuffle=False,
        compression='gzip')

    print('writing "{}"'.format(h5_fp))
    read_py_write_h5(fp_list=(short_kmer_fp, ), dset=dset, chunksize=1000)

# Reopen read-only to check the final shape on disk; the mode is explicit
# because the default mode of h5py.File differs between h5py versions.
with h5py.File(h5_fp, 'r') as h5_f:
    dset = h5_f['/python/data']
    print('  dataset "{}" has shape {}'.format(dset.name, dset.shape))

