# Data Integrity Tests

In [132]:
import pandas as pd
import numpy as np
import pytest
import ipytest
import re
from datetime import datetime

## Using PyTest in a Jupyter Notebook
Pytest is designed to run tests in .py files which can cause issues when you want to test in a notebook. Normally, this is a good thing, however, in the edge case we are working in, we want to be using a notebook. 
To do this we are using the ipytest package. After importing the module we are going quickly set it up to test using the autoconfig method, however, we are going to manually set the additional options to ensure that the output is colourful.

In [133]:
ipytest.autoconfig(addopts=["--color=yes"])

There are two ways to run pytest using ipytest. The first is with a ipython magic %%ipytest that first executes the cell, then runs the tests found in the cell. It cleans any previously found tests so they don't (re)run. 

In [134]:
%%ipytest

def test_test():
    assert True

def test_fail():
    assert False

platform linux -- Python 3.8.10, pytest-7.1.2, pluggy-1.0.0
rootdir: /home/jowi60/TDD_lunch_and_learn, configfile: pytest.ini
collected 2 items

tmpvdnyx0__.py [32m.[0m[31mF[0m[31m                                                                            [100%][0m

[31m[1m____________________________________________ test_fail _____________________________________________[0m

    [94mdef[39;49;00m [92mtest_fail[39;49;00m():
>       [94massert[39;49;00m [94mFalse[39;49;00m
[1m[31mE       assert False[0m

[1m[31m/tmp/ipykernel_333/3225759524.py[0m:5: AssertionError
FAILED tmpvdnyx0__.py::test_fail - assert False


Another way to run pytest is to with ipytest.run(). This runs any previously found tests that haven't been cleaned away. It is not advisable to mix and match the two methods as it will result in unexpected behaviour.

In [135]:
ipytest.run()

platform linux -- Python 3.8.10, pytest-7.1.2, pluggy-1.0.0
rootdir: /home/jowi60/TDD_lunch_and_learn, configfile: pytest.ini
collected 2 items

tmpxpaqvabc.py [32m.[0m[31mF[0m[31m                                                                            [100%][0m

[31m[1m____________________________________________ test_fail _____________________________________________[0m

    [94mdef[39;49;00m [92mtest_fail[39;49;00m():
>       [94massert[39;49;00m [94mFalse[39;49;00m
[1m[31mE       assert False[0m

[1m[31m/tmp/ipykernel_333/3225759524.py[0m:5: AssertionError
FAILED tmpxpaqvabc.py::test_fail - assert False


<ExitCode.TESTS_FAILED: 1>

## Testing Dataframes

We are going to test this dummy dataset of gp appointment figures. We are expecting the dataset to fit several characteristics which we will test for. The columns we expect in the dataset are:
* practice_name: [str] Must be one of Northern Wellness, East End Doctors, West Park Practice, and Southern Health. There is an invalid practice_name included 
* practice_post_code: [str] Must be a postcode of a valid format. One practice is missing the second letter of the unit postcode.
* date: [datetime] Must unique for each practice. East End Doctors has two entries for the same date.
* number_of_appointments: [int] Must be positive. Southern Health has a negative value. All values are the wrong datatype.
* evil: [bool] Must be true or false and is required (no null values). West Park Practice is missing the value.

In [136]:
gp_appointments_dict = [
    {
        "practice_name": "Northern Wellness",
        "practice_post_code": "NW01 1AB",
        "date": datetime(2022,1,1),
        "number_of_appointments": "900",
        "evil": False
    },
    {
        "practice_name": "Northern Wellness",
        "practice_post_code": "NW01 1AB",
        "date": datetime(2022,1,2),
        "number_of_appointments": "1000",
        "evil": False
    },
    {
        "practice_name": "East End Doctors",
        "practice_post_code": "EE02 2AB",
        "date": datetime(2022,1,1),
        "number_of_appointments": "100",
        "evil": False
    },
    {
        "practice_name": "East End Doctors",
        "practice_post_code": "EE02 2AB",
        "date": datetime(2022,1,1),
        "number_of_appointments": "100",
        "evil": False
    },
    {
        "practice_name": "West Park Practice",
        "practice_post_code": "WP01 1A",
        "date": datetime(2022,1,1),
        "number_of_appointments": "100000"
    },
    {
        "practice_name": "Southern Health",
        "practice_post_code": "S01 1AB",
        "date": datetime(2022,1,1),
        "number_of_appointments": "-100",
        "evil": False
    },
    {
        "practice_name": "Snake Oil Cures",
        "practice_post_code": "B01 1AD",
        "date": datetime(2022,1,1),
        "number_of_appointments": "1000000",
        "evil": True
    },
]

df_gp_appointments = pd.DataFrame(gp_appointments_dict)

### Check if the schema of the dataframe is correct
We can easily compare the dataframe schema against our expected schema by extracting the datatypes fo the columns and converting them to a dictionary. The datatypes will be numpy datatypes, so will look a bit weird but we can lookup online what we should expect.

In [137]:
%%ipytest

expected_schema = {
        "practice_name": np.dtype('O'),
        "practice_post_code": np.dtype('O'),
        "date": np.dtype('datetime64[ns]'),
        "number_of_appointments": np.dtype('float_'),
        "evil": np.dtype('bool')
    }

def test_columns():
    assert expected_schema == df_gp_appointments.dtypes.to_dict()


platform linux -- Python 3.8.10, pytest-7.1.2, pluggy-1.0.0
rootdir: /home/jowi60/TDD_lunch_and_learn, configfile: pytest.ini
collected 1 item

tmpp5iudaf8.py [31mF[0m[31m                                                                             [100%][0m

[31m[1m___________________________________________ test_columns ___________________________________________[0m

    [94mdef[39;49;00m [92mtest_columns[39;49;00m():
>       [94massert[39;49;00m expected_schema == df_gp_appointments.dtypes.to_dict()
[1m[31mE       AssertionError: assert {'date': dtyp...ype('O'), ...} == {'date': dtyp...ype('O'), ...}[0m
[1m[31mE         Omitting 3 identical items, use -vv to show[0m
[1m[31mE         Differing items:[0m
[1m[31mE         {'evil': dtype('bool')} != {'evil': dtype('O')}[0m
[1m[31mE         {'number_of_appointments': dtype('float64')} != {'number_of_appointments': dtype('O')}[0m
[1m[31mE         Use -v to get more diff[0m

[1m[31m/tmp/ipykernel_333/1692329

There are some failures in how the different datatypes were cast when creating the dateframe, however, with some type conversions we can correct these issues.

In [138]:
%%ipytest

df_gp_appointments.evil = df_gp_appointments.\
    evil.astype(bool)

df_gp_appointments.number_of_appointments = df_gp_appointments.\
    number_of_appointments.\
    astype(np.float64)


def test_columns():
    assert expected_schema == df_gp_appointments.dtypes.to_dict()

platform linux -- Python 3.8.10, pytest-7.1.2, pluggy-1.0.0
rootdir: /home/jowi60/TDD_lunch_and_learn, configfile: pytest.ini
collected 1 item

tmpmy3om3vy.py [32m.[0m[32m                                                                             [100%][0m



In [139]:
%%ipytest

expected_dims = (7,5)

def test_dimensions():
    assert df_gp_appointments.shape == expected_dims

platform linux -- Python 3.8.10, pytest-7.1.2, pluggy-1.0.0
rootdir: /home/jowi60/TDD_lunch_and_learn, configfile: pytest.ini
collected 1 item

tmpvh5515t8.py [32m.[0m[32m                                                                             [100%][0m



### Check the practice names are valid.
We can use parametrisation to check all practice names in the dataframe to ensure they are all valid. Snake Oil Cures is invalid and has been flagged as failing.

In [140]:
%%ipytest

expected_practice_names = [
    'Northern Wellness', 
    'East End Doctors', 
    'West Park Practice', 
    'Southern Health'
]


@pytest.mark.parametrize("practice_name", df_gp_appointments["practice_name"].unique())
def test_practice_names(practice_name):
    assert practice_name in expected_practice_names

platform linux -- Python 3.8.10, pytest-7.1.2, pluggy-1.0.0
rootdir: /home/jowi60/TDD_lunch_and_learn, configfile: pytest.ini
collected 5 items

tmpwfewm67j.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[31mF[0m[31m                                                                         [100%][0m

[31m[1m_______________________________ test_practice_names[Snake Oil Cures] _______________________________[0m

practice_name = 'Snake Oil Cures'

    [37m@pytest[39;49;00m.mark.parametrize([33m"[39;49;00m[33mpractice_name[39;49;00m[33m"[39;49;00m, df_gp_appointments[[33m"[39;49;00m[33mpractice_name[39;49;00m[33m"[39;49;00m].unique())
    [94mdef[39;49;00m [92mtest_practice_names[39;49;00m(practice_name):
>       [94massert[39;49;00m practice_name [95min[39;49;00m expected_practice_names
[1m[31mE       AssertionError: assert 'Snake Oil Cures' in ['Northern Wellness', 'East End Doctors', 'West Park Practice', 'Southern Health'][0m

[1m[31m/tmp/ipykernel_333/17795

### Check if the postcodes are valid
We can use a regex available online to check if the postcode is a valid format. There are also libraries and APIs that will check if it is a real postcode.

In [141]:
%%ipytest

expected_postcode_regex = r'^(((([A-Z][A-Z]{0,1})[0-9][A-Z0-9]{0,1}) {0,}[0-9])[A-Z]{2})$'

@pytest.mark.parametrize("practice_post_code", df_gp_appointments["practice_post_code"])
def test_postcode_format(practice_post_code):
    assert re.match(
        pattern= expected_postcode_regex,
        string= practice_post_code
    )

platform linux -- Python 3.8.10, pytest-7.1.2, pluggy-1.0.0
rootdir: /home/jowi60/TDD_lunch_and_learn, configfile: pytest.ini
collected 7 items

tmptpabn2kv.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[31mF[0m[32m.[0m[32m.[0m[31m                                                                       [100%][0m

[31m[1m__________________________________ test_postcode_format[WP01 1A] ___________________________________[0m

practice_post_code = 'WP01 1A'

    [37m@pytest[39;49;00m.mark.parametrize([33m"[39;49;00m[33mpractice_post_code[39;49;00m[33m"[39;49;00m, df_gp_appointments[[33m"[39;49;00m[33mpractice_post_code[39;49;00m[33m"[39;49;00m])
    [94mdef[39;49;00m [92mtest_postcode_format[39;49;00m(practice_post_code):
>       [94massert[39;49;00m re.match(
            pattern= expected_postcode_regex,
            string= practice_post_code
        )
[1m[31mE       AssertionError: assert None[0m
[1m[31mE        +  where None = <function match at 0x7fa542e6

### Check if the dates are correct.
The practice's should only have one submission per day. To check we can group and count submissions from each practice on each day and check there is one per practice per day.

In [142]:
%%ipytest

df_date_count = df_gp_appointments\
    .groupby(["practice_name", "date"])\
    .count()["practice_post_code"]

@pytest.mark.parametrize("count_of_dates", df_date_count)
def test_no_repeated_dates(count_of_dates):
    assert count_of_dates == 1

platform linux -- Python 3.8.10, pytest-7.1.2, pluggy-1.0.0
rootdir: /home/jowi60/TDD_lunch_and_learn, configfile: pytest.ini
collected 6 items

tmp8pbl_3wl.py [31mF[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[31m                                                                        [100%][0m

[31m[1m____________________________________ test_no_repeated_dates[2] _____________________________________[0m

count_of_dates = 2

    [37m@pytest[39;49;00m.mark.parametrize([33m"[39;49;00m[33mcount_of_dates[39;49;00m[33m"[39;49;00m, df_date_count)
    [94mdef[39;49;00m [92mtest_no_repeated_dates[39;49;00m(count_of_dates):
>       [94massert[39;49;00m count_of_dates == [94m1[39;49;00m
[1m[31mE       assert 2 == 1[0m

[1m[31m/tmp/ipykernel_333/2415700076.py[0m:7: AssertionError
FAILED tmp8pbl_3wl.py::test_no_repeated_dates[2] - assert 2 == 1


### Check number of appointments

### Check is evil