# Settings

In [1]:
dataset_name = "rtatman/questionanswer-dataset"

In [2]:
import pathlib

dir_working = pathlib.Path("/tmp/kaggle") / dataset_name

dir_extraction = dir_working / "extracted"
dir_extraction_splits = dir_extraction / "splits"
dir_extraction_docs = dir_extraction / "docs"

dir_transformation = dir_extraction / "transformed"
dir_transformation = dir_extraction.parent / "transformed"
dir_transformation_splits = dir_transformation / "splits"
dir_transformation_docs = dir_transformation / "docs"

dir_out = pathlib.Path("../data")
dir_out_splits = dir_out / "splits"
dir_out_docs = dir_out / "docs"

# Extract

In [4]:
import shutil

shutil.rmtree(dir_working, ignore_errors=True)

In [5]:
import kaggle

kaggle.api.dataset_download_cli(dataset_name, path=dir_extraction)
filepath_zip = pathlib.Path(next(dir_extraction.glob("*.zip")))
filepath_zip

Downloading questionanswer-dataset.zip to /tmp/kaggle/rtatman/questionanswer-dataset/extracted


100%|██████████| 3.55M/3.55M [00:00<00:00, 4.69MB/s]







PosixPath('/tmp/kaggle/rtatman/questionanswer-dataset/extracted/questionanswer-dataset.zip')

In [6]:
from typing import Dict, Tuple
import zipfile
import shutil
from tqdm.auto import tqdm


def extract_and_route(
    filepath: pathlib.Path,
    extensions_and_routes: Dict[str, Tuple[str, pathlib.Path]],
):
    try:
        with zipfile.ZipFile(filepath) as zf:
            for zfi in tqdm(list(zf.filelist), desc="extracting"):
                if not zfi.is_dir():
                    dirpath, ext = None, None
                    for ending, route_data in extensions_and_routes.items():
                        if zfi.filename.endswith(ending):
                            ext, dirpath = route_data
                            break

                    if dirpath is not None and ext is not None:
                        ext = "." + ext.strip(".")
                        dirpath.mkdir(parents=True, exist_ok=True)
                        filename = ".".join(zfi.filename.split("/")[-1].split(".")[:-1])
                        if not filename.endswith(ext):
                            filename += ext
                        filepath_src = zf.extract(zfi)
                        filepath_dst = dirpath / filename
                        shutil.move(filepath_src, filepath_dst)
    finally:
        shutil.rmtree("./text_data", ignore_errors=True)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
extract_and_route(
    filepath=filepath_zip,
    extensions_and_routes={
        "pairs.txt": (".tsv", dir_extraction_splits),
        ".clean": (".txt", dir_extraction_docs),
    },
)

extracting:  15%|█▍        | 50/334 [00:00<00:00, 473.59it/s]

extracting: 100%|██████████| 334/334 [00:00<00:00, 355.79it/s]


# Transform

## Docs

In [8]:
shutil.copytree(dir_extraction_docs, dir_transformation_docs)

PosixPath('/tmp/kaggle/rtatman/questionanswer-dataset/transformed/docs')

## Splits

In [9]:
records = []
for filepath in dir_extraction_splits.glob("*.tsv"):
    with open(filepath, "r", encoding="latin-1") as fh:
        for line in fh:
            frags = line.split("\t")
            records.append({"question": frags[1], "answer": frags[2]})
len(records)

4001

In [10]:
from sklearn.model_selection import train_test_split

records_train, records_test = train_test_split(records, test_size=0.2, random_state=42)

In [11]:
import pandas as pd

dir_transformation_splits.mkdir(parents=True, exist_ok=True)
pd.DataFrame(records_train).to_csv(dir_transformation_splits / "train.csv", index=False)
pd.DataFrame(records_test).to_csv(dir_transformation_splits / "test.csv", index=False)

# Load

## Docs

In [12]:
shutil.copytree(dir_transformation_docs, dir_out_docs, dirs_exist_ok=True)

PosixPath('../data/docs')

# Splits

In [13]:
shutil.copytree(dir_transformation_splits, dir_out_splits, dirs_exist_ok=True)

PosixPath('../data/splits')