# Data Integrity Tests

In [30]:
import pandas as pd
import numpy as np
import pytest
import ipytest
import re
from datetime import datetime

In [15]:
# Configure ipytest for use in this notebook; `addopts` supplies extra pytest
# command-line options, here forcing coloured test output.
ipytest.autoconfig(addopts=["--color=yes"])

## Using PyTest in a Jupyter Notebook
Pytest is designed to discover and run tests in .py files, which causes problems when you want to write and run tests inside a notebook. That design is normally a good thing; in this case, however, we want to work in a notebook, so we use the ipytest package to run pytest against tests defined in notebook cells.

In [17]:
%%ipytest

def test_test():
    """Trivial test that always passes; shows what a passing result looks like."""
    assert True

def test_fail():
    """Trivial test that always fails; shows what a failure report looks like."""
    assert False

platform linux -- Python 3.8.10, pytest-7.1.2, pluggy-1.0.0
rootdir: /home/jowi60/TDD_lunch_and_learn/3_data_integrity_applications
collected 2 items

tmptg04i9w0.py [32m.[0m[31mF[0m[31m                                                                            [100%][0m

[31m[1m____________________________________________ test_fail _____________________________________________[0m

    [94mdef[39;49;00m [92mtest_fail[39;49;00m():
>       [94massert[39;49;00m [94mFalse[39;49;00m
[1m[31mE       assert False[0m

[1m[31m/tmp/ipykernel_9217/3225759524.py[0m:5: AssertionError
FAILED tmptg04i9w0.py::test_fail - assert False


In [18]:
# Run the tests from Python code instead of through the %%ipytest cell magic.
# The value displayed below the report is the pytest exit code.
ipytest.run()

platform linux -- Python 3.8.10, pytest-7.1.2, pluggy-1.0.0
rootdir: /home/jowi60/TDD_lunch_and_learn/3_data_integrity_applications
collected 2 items

tmpmppy641a.py [32m.[0m[31mF[0m[31m                                                                            [100%][0m

[31m[1m____________________________________________ test_fail _____________________________________________[0m

    [94mdef[39;49;00m [92mtest_fail[39;49;00m():
>       [94massert[39;49;00m [94mFalse[39;49;00m
[1m[31mE       assert False[0m

[1m[31m/tmp/ipykernel_9217/3225759524.py[0m:5: AssertionError
FAILED tmpmppy641a.py::test_fail - assert False


<ExitCode.TESTS_FAILED: 1>

## Testing Dataframes

We are going to test this dummy dataset of GP appointment figures. We expect the dataset to satisfy several constraints, each of which we will test for. The columns we expect in the dataset are:
* practice_name: [str] Must be one of Northern Wellness, East End Doctors, West Park Practice, and Southern Health. There is an invalid practice_name included in the data.
* practice_post_code: [str] Must be a postcode of a valid format. One practice is missing the second letter of the unit postcode.
* date: [datetime] Must be unique for each practice. East End Doctors has two entries for the same date.
* number_of_appointments: [int] Must be positive. Southern Health has a negative value. All values are stored as strings, which is the wrong datatype.
* evil: [bool] Must be true or false and is required (no null values). West Park Practice is missing the value.

In [41]:
# Dummy appointment records with deliberate data-quality faults for the tests
# to catch. Every number_of_appointments value is a string rather than an int;
# the row-specific faults are flagged inline.
gp_appointments_dict = [
    dict(
        practice_name="Northern Wellness",
        practice_post_code="NW01 1AB",
        date=datetime(2022, 1, 1),
        number_of_appointments="900",
        evil=False,
    ),
    dict(
        practice_name="Northern Wellness",
        practice_post_code="NW01 1AB",
        date=datetime(2022, 1, 2),
        number_of_appointments="1000",
        evil=False,
    ),
    dict(
        practice_name="East End Doctors",
        practice_post_code="EE02 2AB",
        date=datetime(2022, 1, 1),
        number_of_appointments="100",
        evil=False,
    ),
    # Fault: same practice and date as the previous record.
    dict(
        practice_name="East End Doctors",
        practice_post_code="EE02 2AB",
        date=datetime(2022, 1, 1),
        number_of_appointments="100",
        evil=False,
    ),
    # Faults: postcode is missing its final letter, and there is no "evil" key.
    dict(
        practice_name="West Park Practice",
        practice_post_code="WP01 1A",
        date=datetime(2022, 1, 1),
        number_of_appointments="100000",
    ),
    # Fault: negative number of appointments.
    dict(
        practice_name="Southern Health",
        practice_post_code="S01 1AB",
        date=datetime(2022, 1, 1),
        number_of_appointments="-100",
        evil=False,
    ),
    # Fault: practice name is not one of the four valid practices.
    dict(
        practice_name="Snake Oil Cures",
        practice_post_code="B01 1AD",
        date=datetime(2022, 1, 1),
        number_of_appointments="1000000",
        evil=True,
    ),
]

# One row per record; a key absent from a record becomes a missing value.
df_gp_appointments = pd.DataFrame(gp_appointments_dict)

# Show the dtype pandas inferred for each column.
df_gp_appointments.dtypes.to_dict()

{'practice_name': dtype('O'),
 'practice_post_code': dtype('O'),
 'date': dtype('<M8[ns]'),
 'number_of_appointments': dtype('O'),
 'evil': dtype('O')}

In [38]:
# Expected dtype of each column, expressed so that every entry can compare
# equal to the corresponding pandas column dtype (e.g. `df.dtypes.to_dict()`).
expected_schema = {
    "practice_name": np.dtype('O'),
    "practice_post_code": np.dtype('O'),
    # pandas reports this column as datetime64[ns]. The Python `datetime`
    # class never compares equal to that dtype, so it could not be matched.
    "date": np.dtype('datetime64[ns]'),
    "number_of_appointments": np.int64,
    "evil": bool,
}

# Expected (rows, columns) shape of the dataset.
expected_dims = (7, 5)

In [None]:
# The only practice names considered valid in the dataset.
expected_practice_names = [
    "Northern Wellness",
    "East End Doctors",
    "West Park Practice",
    "Southern Health",
]

In [None]:
# Postcode pattern, anchored to the whole string (^...$). Reading left to right:
#   - one or two uppercase letters, a digit, then an optional uppercase letter
#     or digit;
#   - any number of spaces (including none);
#   - a digit followed by exactly two uppercase letters.
# Only A-Z is accepted, so lowercase postcodes fail unless the caller supplies
# a case-insensitive flag.
expected_postcode_regex = r'^(((([A-Z][A-Z]{0,1})[0-9][A-Z0-9]{0,1}) {0,}[0-9])[A-Z]{2})$'

# 