# Making a subset of the data

In this notebook, we create a subset of the big NYC taxi dataset. This subset contains only 0.5% of all rows (one row out of every 200).

The procedure implemented here is memory-efficient: the full ZIP files are never extracted to disk first. Instead, unzipping occurs on the fly, one line at a time. We make heavy use of Python generators and iterators.

In [None]:
import os
import os.path as op
import re
import zipfile
import glob
from itertools import chain, islice

In [None]:
def _csv_filename(zip_filename):
    """Return the filename of the CSV in a ZIP file."""
    return op.splitext(op.basename(zip_filename))[0]

In [None]:
def _iter_lines(zip_filename):
    """Yield every line of the CSV stored inside a ZIP file.

    The CSV member is read directly from the archive, so nothing is
    extracted to disk. The member name is derived from the archive name
    by _csv_filename. A progress message is printed when iteration starts.
    """
    print("Processing file {file}...".format(file=zip_filename))
    member = _csv_filename(zip_filename)
    with zipfile.ZipFile(zip_filename) as archive, \
            archive.open(member) as rows:
        for row in rows:
            yield row

In [None]:
def _iter_all_lines(files, step=None, stop=None):
    """Chain the sampled rows of several zipped CSV files into one iterator.

    files -- ZIP filenames, processed in the given order.
    step  -- keep one row out of every 'step' (None keeps every row).
    stop  -- row index at which reading of each file stops (None reads
             each file to the end).
    """
    sampled = []
    for position, path in enumerate(files):
        # Row 0 is the header: keep it for the first file only, and start
        # at row 1 for every subsequent file.
        first_row = 0 if position == 0 else 1
        sampled.append(islice(_iter_lines(path), first_row, stop, step))
    return chain.from_iterable(sampled)

In [None]:
def _extract_number(filename):
    """Return the month number appearing in a ZIP file."""
    r = re.search(r'([\d]+)', filename)
    if r:
        return int(r.group(1))

In [None]:
def _zip_filenames(name):
    """Return the ZIP filenames of one dataset, ordered by month number.

    name -- dataset name inserted into the 'data/trip_<name>_*.zip' pattern.
    """
    pattern = 'data/trip_{name}_*.zip'.format(name=name)
    matches = glob.glob(pattern)
    # Sort numerically rather than lexically on the number in each name.
    matches.sort(key=_extract_number)
    return matches

In [None]:
def _make_extract(step=None, stop=None):
    """Write the sampled rows of each dataset to its subset CSV file.

    For both the 'fare' and 'data' datasets, rows are pulled from all
    matching ZIP files and written to data/trip_<name>_subset500.csv.

    step -- keep one row out of every 'step' (None keeps every row).
    stop -- row index at which reading of each file stops (None reads
            each file to the end).
    """
    for dataset in ('fare', 'data'):
        # Zipped CSV files that feed this dataset.
        archives = _zip_filenames(dataset)
        # Destination CSV file holding the sampled rows.
        target = 'data/trip_{name}_subset500.csv'.format(name=dataset)
        with open(target, 'wb') as out:
            # Stream the selected rows from all archives into the output.
            out.writelines(_iter_all_lines(archives, step=step, stop=stop))

In [None]:
# Sampling stride: keep only one row out of every 'step' rows.
step = 200
# Per-file limit: stop reading each file once 'stop' rows have been read
# (None = read every file until the end).
stop = None

Make the subset data (**this will take a while**).

In [None]:
# Build both subset files (data/trip_fare_subset500.csv and
# data/trip_data_subset500.csv) with the sampling parameters set above.
_make_extract(step=step, stop=stop)