# File Ingestion and Schema Validation

## Import data

In [None]:
import modin.pandas as mpd

# Raw string literal: in a normal string the "\U" of "C:\Users" starts a
# \UXXXXXXXX unicode escape and the file fails to parse on Python 3.
steam_reviews_csv = r"C:\Users\Richard\Documents\GitHub\DataGlacier\Week 06\steam_reviews.csv"

# Load the CSV through Modin's pandas-compatible API.
df = mpd.read_csv(steam_reviews_csv)

import pandas as pd

# Load the same CSV with plain pandas; this rebinds ``df``.
df = pd.read_csv(steam_reviews_csv)

## Load Time Calculations

In [None]:
from pathlib import Path
from timeit import timeit

import modin.pandas as mpd
import pandas as pd

def create_input_file(filename, content, repetitions):
    """Create ``filename`` holding ``content`` written ``repetitions`` times.

    A file that already exists is left untouched, so repeated runs do not
    regenerate (or grow) previously created inputs.
    """
    target = Path(filename)
    if target.exists():
        return
    with target.open("a", encoding="utf-8") as handle:
        # Stream the copies one at a time instead of building one huge string.
        handle.writelines(content for _ in range(repetitions))

def create_input_files(min_size, max_size, increment):
    """Generate one ``survey<size>MB.csv`` file per size in the given range.

    Sizes run from ``min_size`` to ``max_size`` inclusive in steps of
    ``increment``. Each file is the text of ``survey.csv`` repeated
    ``size // 5`` times.
    """
    # NOTE(review): size // 5 presumably assumes survey.csv is about 5 MB, so
    # the nominal size in the file name is only approximate -- confirm.
    seed_text = Path("survey.csv").read_text(encoding="utf-8")
    for megabytes in range(min_size, max_size + 1, increment):
        target_name = "survey{}MB.csv".format(megabytes)
        copies = megabytes // 5
        create_input_file(
            filename=target_name,
            content=seed_text,
            repetitions=copies,
        )

def time_csv_read(module, filename, description):
    """Time one ``module.read_csv(filename)`` call and print the result.

    ``module`` is any object exposing ``read_csv``; ``description`` is the
    label printed in front of the elapsed time (two decimals, in seconds).
    """
    # The attribute lookup stays inside the timed callable, as before.
    elapsed = timeit(lambda: module.read_csv(filename), number=1)
    print("{}: {:.2f} seconds".format(description, elapsed))

def time_csv_reads(min_size, max_size, increment):
    """Time pandas and Modin reads of every ``survey<size>MB.csv`` in range.

    Sizes run from ``min_size`` to ``max_size`` inclusive in steps of
    ``increment``; for each size pandas is timed first, then Modin.
    """
    for megabytes in range(min_size, max_size + 1, increment):
        csv_name = "survey{}MB.csv".format(megabytes)
        time_csv_read(pd, csv_name, "Pandas {}MB".format(megabytes))
        time_csv_read(mpd, csv_name, "Modin {}MB".format(megabytes))

def main():
    """Create all benchmark inputs, then time reading each of them.

    Two size ranges are used, each given as (min, max, increment) in MB:
    5-95 in steps of 5 and 100-2000 in steps of 100. All files are created
    before any timing starts.
    """
    small_range = (5, 95, 5)
    large_range = (100, 2000, 100)
    size_ranges = (small_range, large_range)

    for bounds in size_ranges:
        create_input_files(*bounds)
    for bounds in size_ranges:
        time_csv_reads(*bounds)


if __name__ == "__main__":
    main()

## File Ingestion and Schema Validation

In [None]:
import argparse
from datetime import datetime

import logging

from ingestion import Ingestor, log_and_exit

import ingestion.config as config
import ingestion.logger as logger

# Top-level parser; the chosen sub-command name is stored in ``args.task``.
parser = argparse.ArgumentParser(description="Ingest data to UFrame.")
subparsers = parser.add_subparsers(dest="task")
# On Python 3 a sub-command is optional by default, which leaves ``args.task``
# as None; the task dispatch and the log file name both need a real task name,
# so make argparse reject a missing sub-command with a usage error instead.
subparsers.required = True

# From CSV (from_csv)
parser_from_csv = subparsers.add_parser('from_csv',
                                        help="Ingest using parameters in a CSV file.")
parser_from_csv.add_argument('files', nargs='*',
                             help="Path to CSV file.")

# From File (from_file)
# Zero or more data files followed by four required positional values.
parser_single_file = subparsers.add_parser('from_file',
                                           help="Ingest a single file.")
parser_single_file.add_argument('files', nargs='*',
                                help="Path to data file.")
parser_single_file.add_argument('uframe_route',
                                help="UFrame route.")
parser_single_file.add_argument('reference_designator',
                                help="Reference Designator.")
parser_single_file.add_argument('data_source',
                                help="Data source (i.e. telemetered, recovered, etc.).")
parser_single_file.add_argument('deployment_number',
                                help="Deployment number.")

# Dummy (dummy)
parser_dummy = subparsers.add_parser('dummy',
                                     help="Create ingestor but don't ingest any data.")

# Optional Arguments
# These belong to the top-level parser, so they are given before the
# sub-command name on the command line. Defaults come from ingestion.config.
parser.add_argument('-v', '--verbose', action='store_true',
                    help="Verbose mode. Logging messages will output to console.")
parser.add_argument('-t', '--test', action='store_true',
                    help="Test mode. No ingestions will be sent to UFrame.")
parser.add_argument('-f', '--force', action='store_true',
                    help="Force mode. EDEX logs will not be checked for previous ingestions of the specified data.")
# '-no-edex' is kept for backward compatibility; '--no-edex' is the spelling
# consistent with the other long options. Both store into ``args.no_edex``.
parser.add_argument('-no-edex', '--no-edex', dest='no_edex', action='store_true',
                    help="Don't check to see if EDEX is alive after every send.")
parser.add_argument('--sleep_timer', type=int, default=config.SLEEP_TIMER, metavar="N",
                    help="Override the sleep timer with a value of N seconds.")
parser.add_argument('--start', default=config.START_DATE, metavar="YYYY-MM-DD",
                    help="Only ingest files newer than the specified date in the YYYY-MM-DD format.")
parser.add_argument('--end', default=config.END_DATE, metavar="YYYY-MM-DD",
                    help="Only ingest files older than the specified date in the YYYY-MM-DD format.")
parser.add_argument('--age', type=int, default=config.MAX_FILE_AGE, metavar="N",
                    help="Only ingest files that are N seconds old or less.")
parser.add_argument('--age_min', type=int, default=config.MIN_FILE_AGE, metavar="N",
                    help="Only ingest files that are N seconds old or more.")
parser.add_argument('--cooldown', type=int, default=config.EDEX['cooldown'], metavar="N",
                    help="Wait N seconds after EDEX services are started before ingesting.")
parser.add_argument('--quick', type=int, default=config.QUICK_LOOK_QUANTITY, metavar="N",
                    help="Ingest a maximum of N files per CSV.")
parser.add_argument('--qpid_host', type=str, default=config.QPID['host'], metavar="host",
                    help="The QPID server hostname.")
parser.add_argument('--qpid_port', type=str, default=config.QPID['port'], metavar="port",
                    help="The QPID server port.")
parser.add_argument('--qpid_user', type=str, default=config.QPID['user'], metavar="username",
                    help="The QPID server username.")
parser.add_argument('--qpid_password', type=str, default=config.QPID['password'], metavar="password",
                    help="The QPID server password.")


class Task(object):
    """ A helper class designed to manage the different types of ingestion tasks.

    The parsed command-line namespace is kept on ``self.args``; the settings
    passed on to every ``Ingestor`` are collected in ``self.options``. Each
    sub-command name (``dummy``, ``from_csv``, ``from_file``) has a method of
    the same name, which ``execute`` looks up and calls. """

    def __init__(self, args):
        """ Store the parsed arguments and build the Ingestor option dict.

        ``args`` is the namespace returned by the argument parser. The
        ``--start`` / ``--end`` values are converted to ``datetime`` objects
        (or ``None`` when empty). """
        self.logger = logging.getLogger('Task')

        def parse_date(date_string):
            # Empty / missing dates mean "no limit" and become None.
            if date_string:
                try:
                    return datetime.strptime(date_string, "%Y-%m-%d")
                except ValueError:
                    self.logger.error("Date must be in YYYY-MM-DD format")
                    # presumably terminates the process; if it returns, the
                    # date falls through to None below -- verify.
                    log_and_exit(5)
            return None

        self.args = args

        self.options = {
            'test_mode': self.args.test,
            'force_mode': self.args.force,
            'no_edex': self.args.no_edex,
            'sleep_timer': self.args.sleep_timer,
            'max_file_age': self.args.age,
            'min_file_age': self.args.age_min,
            'start_date': parse_date(self.args.start),
            'end_date': parse_date(self.args.end),
            'cooldown': self.args.cooldown,
            'quick_look_quantity': self.args.quick,
            'edex_command': config.EDEX['command'],
            'health_check_enabled': config.EDEX['health_check_enabled'],
            'qpid_host': self.args.qpid_host,
            'qpid_port': self.args.qpid_port,
            'qpid_user': self.args.qpid_user,
            'qpid_password': self.args.qpid_password,
        }

    def execute(self):
        """ Run the method named after the selected sub-command (``args.task``)."""
        getattr(self, self.args.task)()

    def dummy(self):
        """ The dummy task is used for testing basic initialization functions. It creates an
            Ingestor (which in turn creates a ServiceManager) and outputs all of the script's
            options to the log. """
        # Built only for its initialization; nothing is ingested.
        ingestor = Ingestor(**self.options)
        self.logger.info("Dummy task was run with the following options:")
        for option in sorted(["%s: %s" % (o, self.options[o]) for o in self.options]):
            self.logger.info(option)

    def from_csv(self):
        """ Ingest from specified CSV files.

        Only arguments ending in ``.csv`` are used. Returns ``False`` when
        none are given, ``True`` once the queue has been ingested. """
        timestamp_logname = "from_csv_" + datetime.today().strftime('%Y_%m_%d_%H_%M_%S')
        csv_files = [f for f in self.args.files if f.endswith('.csv')]
        if not csv_files:
            self.logger.error("No CSV files found.")
            return False

        # Create an instance of the Ingestor class with common options set.
        ingestor = Ingestor(**self.options)

        # Ingest from each CSV file.
        for csv_file in csv_files:
            data_groups = Ingestor.process_csv(csv_file)
            for mask, routes, deployment_number in data_groups:
                ingestor.load_queue(mask, routes, deployment_number)
        ingestor.ingest_from_queue()

        # Write out any failed ingestions from the entire batch to a new CSV file.
        if ingestor.failed_ingestions:
            ingestor.write_failures_to_csv(timestamp_logname)

        self.logger.info('')
        self.logger.info("Ingestion completed.")
        return True

    def from_file(self):
        """ Ingest the data files given directly on the command line.

        Every file is queued with the route, reference designator, data source
        and deployment number supplied as positional arguments of the
        ``from_file`` sub-command. Returns ``True`` when done. """
        ingestor = Ingestor(**self.options)

        # The routing values are parsed arguments, so they live on self.args;
        # Task itself never defines attributes with these names.
        routes = {
            'uframe_route': self.args.uframe_route,
            'reference_designator': self.args.reference_designator,
            'data_source': self.args.data_source,
        }

        for f in self.args.files:
            ingestor.load_queue(
                mask=f,
                routes=routes,
                deployment_number=self.args.deployment_number)
        ingestor.ingest_from_queue()

        self.logger.info('')
        self.logger.info("Ingestion completed.")
        return True


# Parsing and Task construction happen at module level, outside the guard.
args = parser.parse_args()

task = Task(args)

if __name__ == '__main__':
    # Setup Logging: one log file per run, named after the task and start time.
    log_timestamp = datetime.today().strftime('%Y_%m_%d_%H_%M_%S')
    log_file = "_".join(("ingestion", args.task, log_timestamp)) + ".log"
    logger.setup_logging(log_file=log_file, verbose=args.verbose)
    main_logger = logging.getLogger('Main')
    # Only warnings and above from the "requests" logger.
    logging.getLogger("requests").setLevel(logging.WARNING)

    # Run the task with the arguments.
    task_start_time = datetime.now()
    args_string = ", ".join("%s: %s" % pair for pair in vars(args).items())
    banner = "Running ingestion task '%s' with the following options: '%s'" % (
        args.task, args_string)
    main_logger.info(banner)
    main_logger.info('')
    try:
        task.execute()
    except Exception:
        # Top-level boundary: record the traceback, then still report timing.
        main_logger.exception("There was an unexpected error.")

    time_elapsed = datetime.now() - task_start_time
    # Drop the fractional seconds from the printed duration.
    whole_seconds = str(time_elapsed).partition('.')[0]
    main_logger.info("Task completed in %s." % whole_seconds)

## YAML Validation

In [None]:
import sys, os, os.path, pathlib, platform, shutil, tempfile, warnings

# for newer setuptools, enable the embedded distutils before importing setuptools/distutils to avoid warnings
os.environ['SETUPTOOLS_USE_DISTUTILS'] = 'local'

from setuptools import setup, Command, Distribution as _Distribution, Extension as _Extension
from setuptools.command.build_ext import build_ext as _build_ext
# NB: distutils imports must remain below setuptools to ensure we use the embedded version
from distutils import log
from distutils.errors import DistutilsError, CompileError, LinkError, DistutilsPlatformError

# Cython is mandatory when building an sdist or when PYYAML_FORCE_CYTHON=1;
# otherwise it is used only if it happens to be importable.
with_cython = 'sdist' in sys.argv or os.environ.get('PYYAML_FORCE_CYTHON') == '1'
try:
    from Cython.Distutils.extension import Extension as _Extension
    from Cython.Distutils import build_ext as _build_ext
except ImportError:
    # Missing Cython is fatal only when it was required above.
    if with_cython:
        raise
else:
    with_cython = True

# bdist_wheel is optional: fall back to None when the wheel package is absent.
try:
    from wheel.bdist_wheel import bdist_wheel
except ImportError:
    bdist_wheel = None


# on Windows, disable wheel generation warning noise
windows_ignore_warnings = [
    "Unknown distribution option: 'python_requires'",
    "Config variable 'Py_DEBUG' is unset",
    "Config variable 'WITH_PYMALLOC' is unset",
    "Config variable 'Py_UNICODE_SIZE' is unset",
    "Cython directive 'language_level' not set",
]

running_on_windows = platform.system() == 'Windows'
if running_on_windows:
    # Install one "ignore" filter per message pattern.
    for message_pattern in windows_ignore_warnings:
        warnings.filterwarnings('ignore', message_pattern)


class Distribution(_Distribution):
    """Distribution that adds a --with-X / --without-X switch per optional Extension."""

    def __init__(self, attrs=None):
        """Register the global options for every feature ``Extension`` in ``ext_modules``."""
        _Distribution.__init__(self, attrs)
        if not self.ext_modules:
            return
        # Walk back to front: each extension prepends its option pair, so the
        # final option order follows the order of ext_modules.
        for ext in reversed(self.ext_modules):
            if not isinstance(ext, Extension):
                continue
            # None means "not specified on the command line".
            setattr(self, ext.attr_name, None)
            include_help = ("include %s (default if %s is available)"
                            % (ext.feature_description, ext.feature_name))
            exclude_help = "exclude %s" % ext.feature_description
            feature_options = [
                (ext.option_name, None, include_help),
                (ext.neg_option_name, None, exclude_help),
            ]
            self.global_options = feature_options + self.global_options
            # Copy before editing so the mapping being extended is this
            # instance's own.
            self.negative_opt = self.negative_opt.copy()
            self.negative_opt[ext.neg_option_name] = ext.option_name

    def has_ext_modules(self):
        """Return True if at least one extension is enabled or left at its default."""
        if not self.ext_modules:
            return False
        return any(
            status is None or status
            for status in map(self.ext_status, self.ext_modules)
        )

    def ext_status(self, ext):
        """Report whether ``ext`` should be built.

        Returns False on implementations other than CPython/PyPy, True for
        extensions that are not feature ``Extension`` objects, and otherwise
        the command-line / ``PYYAML_FORCE_<FEATURE>`` setting, with None
        meaning "build by default".
        """
        if platform.python_implementation() not in ['CPython', 'PyPy']:
            return False
        if not isinstance(ext, Extension):
            return True
        # the "build by default" behavior is implemented by this returning None
        with_ext = getattr(self, ext.attr_name) or os.environ.get('PYYAML_FORCE_{0}'.format(ext.feature_name.upper()))
        try:
            with_ext = int(with_ext)  # attempt coerce envvar to int
        except TypeError:
            # int(None) lands here; the None is returned unchanged.
            pass
        return with_ext


class Extension(_Extension):
    """Extension tied to an optional, named feature.

    Besides the usual extension arguments it records the feature's name,
    description and check object, and derives the attribute / option names
    (``with_<name>``, ``with-<name>``, ``without-<name>``) from the name.
    """

    def __init__(self, name, sources, feature_name, feature_description,
            feature_check, **kwds):
        """Initialise the extension, swapping .pyx sources for .c without Cython.

        The ``sources`` list passed in by the caller is never modified.
        """
        if not with_cython:
            # Build a fresh list instead of editing the caller's list in place.
            # Resulting order is unchanged from the in-place version: untouched
            # sources first, then the generated .c names in encounter order.
            kept = []
            converted = []
            for filename in sources:
                base, ext = os.path.splitext(filename)
                if ext == '.pyx':
                    converted.append('%s.c' % base)
                else:
                    kept.append(filename)
            sources = kept + converted
        _Extension.__init__(self, name, sources, **kwds)
        self.feature_name = feature_name
        self.feature_description = feature_description
        self.feature_check = feature_check
        self.attr_name = 'with_' + feature_name.replace('-', '_')
        self.option_name = 'with-' + feature_name
        self.neg_option_name = 'without-' + feature_name


class build_ext(_build_ext):
    """build_ext that honours each extension's enabled/disabled/default status."""

    def run(self):
        """Build unless every extension is disabled.

        When no extension was explicitly requested, a DistutilsPlatformError
        is logged as a warning and the build is skipped; otherwise it
        propagates.
        """
        any_enabled = False
        explicitly_requested = False
        for ext in self.extensions:
            status = self.distribution.ext_status(ext)
            if status is None:
                # Default status: build, but treat failure as non-fatal.
                any_enabled = True
                continue
            if status:
                explicitly_requested = True
                any_enabled = True
                break
        if not any_enabled:
            return
        try:
            _build_ext.run(self)
        except DistutilsPlatformError as exc:
            if explicitly_requested:
                raise
            log.warn(str(exc))
            log.warn("skipping build_ext")

    def get_source_files(self):
        """List each extension's sources plus existing .c/.h/.pyx/.pxd siblings."""
        self.check_extensions_list(self.extensions)
        collected = []
        for ext in self.extensions:
            if with_cython:
                self.cython_sources(ext.sources, ext)
            for source in ext.sources:
                collected.append(source)
                stem = os.path.splitext(source)[0]
                for suffix in ['c', 'h', 'pyx', 'pxd']:
                    sibling = '%s.%s' % (stem, suffix)
                    if sibling not in collected and os.path.isfile(sibling):
                        collected.append(sibling)
        return collected

    def get_outputs(self):
        """List the built extension files that exist under ``build_lib``."""
        self.check_extensions_list(self.extensions)
        candidates = (
            os.path.join(self.build_lib,
                         self.get_ext_filename(self.get_ext_fullname(ext.name)))
            for ext in self.extensions
        )
        return [path for path in candidates if os.path.isfile(path)]

    def build_extensions(self):
        """Build every extension that is not explicitly disabled.

        A compile or link error is re-raised for an explicitly enabled
        extension and only logged for one left at its default status.
        """
        self.check_extensions_list(self.extensions)
        for ext in self.extensions:
            status = self.distribution.ext_status(ext)
            if not (status is None or status):
                # Explicitly disabled.
                continue
            if with_cython:
                ext.sources = self.cython_sources(ext.sources, ext)
            try:
                self.build_extension(ext)
            except (CompileError, LinkError):
                if status is not None:
                    raise
                log.warn("Error compiling module, falling back to pure Python")


class test(Command):
    """Command that builds the package and runs ``test_all`` against a copy of the build."""

    user_options = []

    def initialize_options(self):
        """Nothing to initialise; the command defines no options."""

    def finalize_options(self):
        """Nothing to finalise; the command defines no options."""

    def run(self):
        """Build, copy the build output to a temp dir, and run the test suite from there.

        Raises DistutilsError when ``test_all.main`` reports failure.
        """
        build = self.get_finalized_command('build')
        build.run()

        # running the tests this way can pollute the post-MANIFEST build sources
        # (see https://github.com/yaml/pyyaml/issues/527#issuecomment-921058344)
        # until we remove the test command, run tests from an ephemeral copy of the intermediate build sources
        scratch = tempfile.TemporaryDirectory(prefix='test_pyyaml')

        try:
            # have to create a subdir since we don't get dir_exists_ok on copytree until 3.8
            staged_lib = pathlib.Path(scratch.name) / 'pyyaml'
            shutil.copytree(build.build_lib, staged_lib)
            # Inserted in this order so 'tests/lib' ends up first on sys.path.
            for entry in (str(staged_lib), 'tests/lib'):
                sys.path.insert(0, entry)

            import test_all
            passed = test_all.main([])
            if not passed:
                raise DistutilsError("Tests failed")
        finally:
            try:
                # this can fail under Windows; best-effort cleanup
                scratch.cleanup()
            except Exception:
                pass


# Custom command classes handed to setup(); bdist_wheel is added only when
# the wheel package was importable.
cmdclass = dict(build_ext=build_ext, test=test)
if bdist_wheel:
    cmdclass.update(bdist_wheel=bdist_wheel)


if __name__ == '__main__':

    # NOTE(review): NAME, VERSION, DESCRIPTION, LONG_DESCRIPTION, AUTHOR,
    # AUTHOR_EMAIL, LICENSE, PLATFORMS, URL, DOWNLOAD_URL, CLASSIFIERS,
    # PROJECT_URLS and LIBYAML_CHECK are not defined in the visible part of
    # this file -- confirm they are defined before this runs.
    setup_kwargs = dict(
        # Package metadata.
        name=NAME,
        version=VERSION,
        description=DESCRIPTION,
        long_description=LONG_DESCRIPTION,
        author=AUTHOR,
        author_email=AUTHOR_EMAIL,
        license=LICENSE,
        platforms=PLATFORMS,
        url=URL,
        download_url=DOWNLOAD_URL,
        classifiers=CLASSIFIERS,
        project_urls=PROJECT_URLS,

        # Pure-Python packages live under lib/; the optional LibYAML binding
        # is declared as a feature Extension.
        package_dir={'': 'lib'},
        packages=['yaml', '_yaml'],
        ext_modules=[
            Extension('yaml._yaml', ['yaml/_yaml.pyx'],
                'libyaml', "LibYAML bindings", LIBYAML_CHECK,
                libraries=['yaml']),
        ],

        distclass=Distribution,
        cmdclass=cmdclass,
        python_requires='>=3.6',
    )
    setup(**setup_kwargs)