In [1]:
%reload_ext autoreload
%autoreload 1

In [17]:
import tarfile
from tarfile import TarInfo
from glob import glob
import dask.bag as db
import dask.dataframe as dd
import dask
import os
import pandas as pd
from odo import odo
from dask.diagnostics import ProgressBar
import logging
# from utils.facc1_reader import FACC1Reader

# 출력 디렉토리 준비

In [7]:
def get_output_name(fpath):
    """Return the base name of ``fpath`` with every ``.tgz`` occurrence removed.

    Only the final path component is considered; the directory part is dropped.
    """
    archive_name = os.path.basename(fpath)
    # Splitting on the marker and re-joining drops every '.tgz' occurrence.
    return ''.join(archive_name.split('.tgz'))

def get_output_dir(fpath):
    """Return the output directory that belongs to the archive ``fpath``.

    The result is the fixed FACC1 output root joined with the archive's
    output name as computed by ``get_output_name``.
    """
    output_root = '../../../Dataset/FACC1/output/'
    return os.path.join(output_root, get_output_name(fpath))

def get_output_path(fpath, fname):
    """Return the ``<fname>.csv.gz`` path inside the output directory of ``fpath``.

    ``fpath`` is the archive path (it selects the directory) and ``fname`` is
    the name of the table to be written, without extension.
    """
    return os.path.join(get_output_dir(fpath), "%s.csv.gz" % fname)
    
def prepare_output_dirs(fpaths):
    """Create the output directory for every archive path in ``fpaths``.

    Directories that already exist are left untouched. Missing parent
    directories are created as needed.

    Raises:
        FileExistsError: if an output path exists but is not a directory.
    """
    for fpath in fpaths:
        # exist_ok=True avoids the check-then-create race of
        # "if not exists: makedirs" when several workers start together.
        os.makedirs(get_output_dir(fpath), exist_ok=True)

In [21]:
class FACC1Reader(object):
    """Iterate over the ``.tsv`` members of a gzip-compressed tar archive.

    Iterating an instance yields ``(name, DataFrame)`` pairs, one per readable
    ``.tsv`` member, where ``name`` is the member's base name with ``.tsv``
    removed and the frame's columns are ``COLUMNS``. Members that cannot be
    read are logged as warnings and skipped.
    """

    # Column names applied to every member file (the files carry no header row).
    COLUMNS = ['trec_id', 'encoding', 'entity', 'start', 'end', 'posterior', 'posterior_context_only', 'freebase_tag']

    def __init__(self, fpath):
        """Remember the archive path and configure logging to ``facc1.log``.

        ``logging.basicConfig`` has no effect if the root logger already has
        handlers, so constructing several readers is harmless.
        """
        self.fpath = fpath
        logging.basicConfig(filename='facc1.log', level=logging.INFO)

    def __iter__(self):
        """Yield ``(name, DataFrame)`` for each readable ``.tsv`` member."""
        with tarfile.open(self.fpath, 'r:gz') as tar:
            # Iterate the archive lazily instead of calling getmembers(),
            # which has to scan the whole compressed stream up front.
            for member in tar:
                fname = os.path.basename(member.name)
                if not fname.endswith('.tsv'):
                    continue
                try:
                    f = tar.extractfile(member)
                    if f is None:
                        # extractfile() returns None for non-file members.
                        raise ValueError('member is not a regular file')
                    with f:
                        df = pd.read_csv(f, sep='\t', header=None, names=self.COLUMNS)
                except Exception:
                    # Only ordinary errors are swallowed; GeneratorExit and
                    # KeyboardInterrupt must propagate.
                    logging.warning('Error: %s (%s)', fname, self.fpath, exc_info=True)
                    continue
                logging.info('Finished: %s (%s)', fname, self.fpath)
                # The yield sits outside the try block so that closing the
                # generator early is not mistaken for a read failure.
                yield (fname.replace('.tsv', ''), df)

In [19]:
def each_partition(values):
    """Convert every archive path in ``values`` into per-member output files.

    For each path a ``FACC1Reader`` is iterated, and each yielded frame is
    handed to ``odo`` together with the path from ``get_output_path``.
    Returns ``None``.
    """
    for archive_path in values:
        print('Processing: ', archive_path)
        for member_name, frame in FACC1Reader(archive_path):
            odo(frame, get_output_path(archive_path, member_name))

def all_partition(values):
    """Aggregate step placeholder: ignores ``values`` and returns ``None``."""
    return None

In [None]:
# Collect the FACC1 archives in sorted order and make sure each one has an
# output directory before any processing starts.
fpaths = sorted(glob('../../../Dataset/FACC1/tgz/*.tgz'))
prepare_output_dirs(fpaths)

# NOTE: fpaths[:1] limits this run to the first archive only.
# each_partition is passed as the per-partition function and all_partition as
# the aggregate function; neither returns a value, so compute() is called for
# its side effects.
b = db.from_sequence(fpaths[:1]).reduction(each_partition, all_partition)
b.compute()

Processing:  ../../../Dataset/FACC1/tgz/ClueWeb12_01.tgz
