# ingest

> This module contains code to read and validate files

In [None]:
#| default_exp hcingest

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import re
import pandas as pd        
import numpy as np 
import traceback
import datetime
from cerberus import Validator
from urllib.error import URLError

In [None]:
#| export
def read(file_type: str, file_path: str) -> tuple:
    """
    Read a csv, excel or parquet file into a pandas dataframe.

    Parameters
    ----------
    file_type: file type, must be one of 'csv', 'excel' or 'parquet'.
    file_path: path (or URL) handed to the matching pandas reader.

    Returns
    -------
        if errors - tuple (False, errors). `errors` is a dictionary whose
            "error" key holds a plain message string when `file_type` is
            unsupported, or a (message, traceback text) tuple when the
            read itself failed
        if no errors - tuple (True, dataframe)

    """
    # Always hand back a 2-tuple so callers can unpack `ok, result = read(...)`
    # on every path, including the unsupported-type one.
    if file_type not in ('csv', 'excel', 'parquet'):
        return (False, {
            "error": "unsupported file format please pass one of csv,excel or parquet types"
        })

    readers = {
        'csv': pd.read_csv,
        'excel': pd.read_excel,
        'parquet': pd.read_parquet,
    }
    errors = {}

    # Checks on file missing, corrupted, etc. traceback.format_exc() has to be
    # called inside the handler: the exception context is gone afterwards.
    try:
        df = readers[file_type](file_path)
    except FileNotFoundError:
        errors["error"] = (
            "File not found please check file path", traceback.format_exc())
    except URLError:
        errors["error"] = ('URL not found,invalid URL', traceback.format_exc())
    except (UnicodeDecodeError, ValueError):
        # UnicodeDecodeError is a ValueError subclass; it is listed explicitly
        # only to make clear that decoding failures end up here too.
        errors["error"] = (
            "corrupted data or wrong file format", traceback.format_exc())
    except Exception:
        # Catch-all for reader failures; unlike a bare `except:` this lets
        # KeyboardInterrupt and SystemExit propagate.
        errors["error"] = ("UnKnow error occured ", traceback.format_exc())
    else:
        return (True, df)

    return (False, errors)

In [None]:
from pprint import pprint

### Valid test case

In [None]:
# Happy path: read a CSV from an https URL (needs network access) and check
# that the status flag in position 0 of the result is True.
path="https://demo-time-series.s3.us-east-2.amazonaws.com/no_error_file_valid_ex.csv"

rs=read("csv",path)
assert rs[0]==True, 'something went wrong'
# Trailing expression: the notebook displays the second element (the dataframe).
rs[1]

Unnamed: 0,emp_id,emp_name,emp_salary,date
0,1,rakesh,10.55,01-02-2020
1,2,ramesh,23.55,02-02-2020
2,3,radhesh,23.55,03-02-2020
3,4,ravesh,40.45,04-02-2020
4,5,ralesh,45.88,05-02-2020


### Failing test case - invalid URL

In [None]:
# Failure path: same bucket as the valid case but a different object name
# (`_error_file_valid_ex.csv`); the recorded output shows the
# 'URL not found,invalid URL' error.
path="https://demo-time-series.s3.us-east-2.amazonaws.com/_error_file_valid_ex.csv"

rs=read("csv",path)
# Display the (message, traceback) pair stored under the "error" key.
rs[1]['error']

('URL not found,invalid URL',
 'Traceback (most recent call last):\n  File "C:\\Users\\anant\\AppData\\Local\\Temp\\ipykernel_66920\\2984043852.py", line 31, in read\n    df = pd.read_csv(file_path)\n  File "C:\\Users\\anant\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\util\\_decorators.py", line 311, in wrapper\n    return func(*args, **kwargs)\n  File "C:\\Users\\anant\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py", line 586, in read_csv\n    return _read(filepath_or_buffer, kwds)\n  File "C:\\Users\\anant\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py", line 482, in _read\n    parser = TextFileReader(filepath_or_buffer, **kwds)\n  File "C:\\Users\\anant\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py", line 811, in __init__\n    self._engine = self._make_engine(self.engine)\n  File "C:\\Users\\anant\\App

### Failing test case - invalid format

In [None]:
# Failure path: point at a CSV file but request the "excel" reader; the
# recorded output shows the 'corrupted data or wrong file format' error.
path="https://demo-time-series.s3.us-east-2.amazonaws.com/no_error_file_valid_ex.csv"

rs=read("excel",path)
# Display the whole result tuple.
rs

(False,
 {'error': ('corrupted data or wrong file format',
   'Traceback (most recent call last):\n  File "C:\\Users\\anant\\AppData\\Local\\Temp\\ipykernel_66920\\2984043852.py", line 33, in read\n    df = pd.read_excel(file_path)\n  File "C:\\Users\\anant\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\util\\_decorators.py", line 311, in wrapper\n    return func(*args, **kwargs)\n  File "C:\\Users\\anant\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\io\\excel\\_base.py", line 364, in read_excel\n    io = ExcelFile(io, storage_options=storage_options, engine=engine)\n  File "C:\\Users\\anant\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\io\\excel\\_base.py", line 1195, in __init__\n    raise ValueError(\nValueError: Excel file format cannot be determined, you must specify an engine manually.\n')})

### Failing test case - invalid path

In [None]:
# Failure path: a local path given as "some/invalid/path.csv"; the recorded
# output shows the 'File not found please check file path' error.
path="some/invalid/path.csv"

rs=read("csv",path)
# Display the whole result tuple.
rs

(False,
 {'error': ('File not found please check file path',
   'Traceback (most recent call last):\n  File "C:\\Users\\anant\\AppData\\Local\\Temp\\ipykernel_66920\\2984043852.py", line 31, in read\n    df = pd.read_csv(file_path)\n  File "C:\\Users\\anant\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\util\\_decorators.py", line 311, in wrapper\n    return func(*args, **kwargs)\n  File "C:\\Users\\anant\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py", line 586, in read_csv\n    return _read(filepath_or_buffer, kwds)\n  File "C:\\Users\\anant\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py", line 482, in _read\n    parser = TextFileReader(filepath_or_buffer, **kwds)\n  File "C:\\Users\\anant\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py", line 811, in __init__\n    self._engine = self._make_engine(self.engin

In [None]:
#| hide
# NOTE(review): presumably writes the `#| export` cells out to the module named
# by `#| default_exp` at the top of the notebook - confirm against the nbdev docs.
import nbdev; nbdev.nbdev_export()