In [1]:

# Import Libraries
import pandas as pd
import numpy as np
import os
from dask import dataframe as dd
from collections import Counter

from sklearn.metrics import auc

import matplotlib.pyplot as plt
import seaborn as sns

import datetime
from dateutil.relativedelta import relativedelta

import icd10

def ConvertDate(string):
    date = pd.to_datetime(string, format='%Y-%m-%d')
    return date

def displayDataFrame(df):
    with pd.option_context('display.max_columns', None) and pd.option_context('display.max_colwidth', None):
        display(df)
    
def SummaryDataFrame(df, ConditionName):
    print('-----------------------------')
    print('\033[1m' + ConditionName.upper() + '\033[0m')
    print('\033[1m Columns: \033[0m')
    print(df.columns)
    print('\033[1m Size of dataset: \033[0m', df.shape)
    displayDataFrame(df.head(5))
    
def BoldFont(string):
    return '\033[1m ' + string + ' \033[0m'

# Playing audio prompt
import IPython
from IPython.display import Audio
sound_file = 'https://www.mediacollege.com/downloads/sound-effects/beep/beep-07.wav'

DateToday = datetime.date.today().strftime("%y.%m.%d")

def SaveFigure(plt, FigureName):
    WorkingDir = !pwd
    Path_Figures = '/'.join( WorkingDir[0].split('/')[:-1] ) + '/Figures/'
    DateToday = datetime.date.today().strftime("%y.%m.%d")
    FileName = FigureName + '_' + DateToday + '.png'
    PathName = os.path.join(Path_Figures, FileName)
    plt.savefig(PathName,
                bbox_inches='tight', dpi=300)
    print(BoldFont('FIGURE SAVED.'))


# Reading autism dataset

In [2]:

# Open datasets with flagged errors

def OpenFile(dataset):
    print('Reading', dataset, 'dataset...')
    Path = r"/Users/flavien/Documents/Projects/NHS_DiRAC/Paper_DataQuality/Figures/DQ_Tables.nosync/"
    FileName = dataset + '_SpellLevel_FeatureEngineered_21.12.09.csv'
    FilePath = os.path.join(Path, FileName)
    df_dataset = pd.read_csv(FilePath)
    
    return df_dataset

df_Autism_ini = OpenFile('Autism')


Reading Autism dataset...


In [3]:

# Total number of spells
print("Total number of", BoldFont('spells:'), df_Autism_ini.shape[0])

# Number of susbequent spells
Filter_SubsequentSpells = df_Autism_ini.loc[:, 'SubsequentSpell'] == 1
N_SubsequentSpells = df_Autism_ini[Filter_SubsequentSpells].shape[0]
print('Number of',  BoldFont('SUBSEQUENT spells:'), f"{N_SubsequentSpells:,}")

# Number of snconsistencies
Filter_Inconsistencies = df_Autism_ini.loc[:, 'SpellLevel_Error_Autism'] == 1
N_Inconsistencies = df_Autism_ini[Filter_Inconsistencies].shape[0]
Prop_Inconsistencies = 100 * N_Inconsistencies / N_SubsequentSpells
print('Number of', BoldFont('coding inconsistencies:'), f"{N_Inconsistencies:,}")
print('Percentage of coding inconsistencies: %0.2f' % Prop_Inconsistencies, '%')


Total number of [1m spells: [0m 583873
Number of [1m SUBSEQUENT spells: [0m 390,220
Number of [1m coding inconsistencies: [0m 170,447
Percentage of coding inconsistencies: 43.68 %


In [4]:

# --------------------------------------------------
def ErrorCharacteristics_SpellLevel(df_ini, ConditionName):
    df = df_ini.copy()
    print('-----------------------------------------------------')
    print(BoldFont(ConditionName.upper()), ':')
    
    Filter_SubsequentSpells = (df.loc[:, 'SubsequentSpell']==1)
    df = df.loc[Filter_SubsequentSpells, :]
    
    ErrorName = 'SpellLevel_Error_'+ConditionName
    Filter = (df.loc[:, ErrorName]==1)
    df_Errors = df.loc[Filter, :]
    df_NoErrors = df.loc[~Filter, :]
    
    print('Total number of subsequent spells:', df.shape[0])
    print('Number of inconsistencies at spell level:', df_Errors.shape[0])
    print('Proportion of coding inconsistencies: %0.2f' % (100*df_Errors.shape[0]/df.shape[0]), '%')
    
    # --------------------------------------------
    # Errors overnight stays
    Filter_0LOS = df.loc[:, 'Spell_Los']==0
    df_0LOS = df[Filter_0LOS]
    df_Not0LOS = df[~Filter_0LOS]
    def Errors_Print(df_Sub):
        N_Errors = df_Sub.loc[:, ErrorName].sum()
        N_SubsequentSpells = df_Sub.loc[:, 'SubsequentSpell'].sum()
        Prop_Errors = 100 * N_Errors / N_SubsequentSpells
        print('Number of subsequent spells:', f"{N_SubsequentSpells:,}")
        print('Number of inconsistencies:', f"{N_Errors:,}",
              'corresponding to %0.2f' % Prop_Errors, '%')
        
    print(BoldFont('Overnight stays:'))
    print('WITH overnight stay:')
    Errors_Print(df_Not0LOS)
    print('Day-case:')
    Errors_Print(df_0LOS)
    print('\n')
    
    # --------------------------------------------
    # Errors by method of admission
    Filter_PODNan = df.loc[:, 'POD'].isnull()
    Filter_EM = df.loc[~Filter_PODNan, 'POD'].str.contains('EM|NE')
    Filter_EL = df.loc[~Filter_PODNan, 'POD'].str.contains('EL|DC|RA')
    df_EM = df[(~Filter_PODNan)&Filter_EM]
    df_EL = df[(~Filter_PODNan)&Filter_EL]
    df_PODNan = df[Filter_PODNan]

    print('POD, Emergency:')
    Errors_Print(df_EM)     
    print('POD, Elective:')
    Errors_Print(df_EL)     
    print('POD, Not recorded:')
    Errors_Print(df_PODNan)     
    print('\n')
    
    # --------------------------------------------
    # Errors by age bands
    AgeMin_List = [0, 18, 40, 60, 80] + [999]
    AgeMax_List = [age-1 for age in AgeMin_List[1:] ] + [999]
    for k in range(len(AgeMin_List)):
        AgeMin = AgeMin_List[k]
        AgeMax = AgeMax_List[k]
        FilterAge = (df['age_of_patient']>=AgeMin) & (df['age_of_patient']<=AgeMax)

        df_Age = df.loc[FilterAge, :]
        Sum = df_Age.loc[:, ErrorName].sum()
        Count = df_Age.loc[:, ErrorName].count()
        Prop_Errors = 100 * Sum/Count
        print(BoldFont('Age band:'), AgeMin, '-', AgeMax)
        print('\t Number of inconsistencies:', f"{Sum:,}", 'out of', f"{Count:,}", ', corresponding to %0.2f' % Prop_Errors, '%')

    print('\n')

    # --------------------------------------------
    # Errors by sex
    List = ['Female', 'Male']
    for sex in List:
        Filter_sex = (df.loc[:, 'sex']==sex)
        df_sex =  df.loc[Filter_sex, :]
        Sum = df_sex.loc[:, ErrorName].sum()
        Count = df_sex.loc[:, ErrorName].count()
        Prop_Errors = 100*Sum/Count
        print(BoldFont('Sex:'), sex)
        print('\t Number of inconsistencies:', f"{Sum:,}", 'out of', f"{Count:,}", ', corresponding to %0.2f' % Prop_Errors, '%')

    SexString = '|'.join(List)
    Filter_Sex = df.loc[:, 'sex'].str.contains(SexString)
    df_MissingSex = df.loc[~Filter_Sex, :]
    Sum = df_MissingSex.loc[:, ErrorName].sum()
    Count = df_MissingSex.loc[:, ErrorName].count()
    Prop_Errors = 100*Sum/Count
    print('Sex:', 'Other/Missing')
    print('\t Number of inconsistencies:', f"{Sum:,}", 'out of', f"{Count:,}", ', corresponding to %0.2f' % Prop_Errors, '%')
    print('\n')
    
    print('\n')
    print(BoldFont('Ethnicity'))
    # --------------------------------------------
    # Ethnicity
    if 'Ethnicity' in df.columns:
        df.loc[:, 'Ethnicity'] = df.loc[:, 'Ethnicity'].fillna('not known')
        df.loc[:, 'Ethnicity'] = df.loc[:, 'Ethnicity'].str.lower()
        EthnicityList = [ x for x in set(df.loc[:, 'Ethnicity'].values) ]
#         Count = df.shape[0]
        for ethnicity in EthnicityList:
            Filter = df.loc[:, 'Ethnicity']==ethnicity
            Sum = df.loc[Filter, ErrorName].sum()
            Count = df.loc[Filter, ErrorName].shape[0] 
            Prop = 100*Sum/Count
            print('Number of', ethnicity, ' :', f"{Sum:,}", 'corresponding to %0.2f' % Prop, '%')

    print('\n')
    print(BoldFont('IMD Score'))
    if 'IMD_Decile' in df.columns:
        Decile_List = range(1,10+1)
        for k in range(5):
            Decile_Max = Decile_List[k]*2
            Decile_Min = Decile_Max-1

            Filter_Quintiles = (df["IMD_Decile"]>=Decile_Min) & (df["IMD_Decile"]<=Decile_Max)
            Sum = df.loc[Filter_Quintiles, ErrorName].sum()
            Count = df.loc[Filter_Quintiles, ErrorName].count()
            Prop_Errors = 100*Sum/Count
            print('IMD Deciles:', Decile_Min, '-', Decile_Max)
            print('\t Number of errors:', f"{Sum:,}", 'out of', f"{Count:,}", ', corresponding to %0.2f' % Prop_Errors, '%')
        Filter_IMDNaN = df["IMD_Decile"].isnull()
        df_IMDNaN = df[Filter_IMDNaN]
        print('IMD not recorded')
        Errors_Print(df_IMDNaN)
# --------------------------------------------------

ErrorCharacteristics_SpellLevel(df_Autism_ini, 'Autism')



-----------------------------------------------------
[1m AUTISM [0m :
Total number of subsequent spells: 390220
Number of inconsistencies at spell level: 170447
Proportion of coding inconsistencies: 43.68 %
[1m Overnight stays: [0m
WITH overnight stay:
Number of subsequent spells: 170,120
Number of inconsistencies: 66,251 corresponding to 38.94 %
Day-case:
Number of subsequent spells: 220,100
Number of inconsistencies: 104,196 corresponding to 47.34 %


POD, Emergency:
Number of subsequent spells: 198,748
Number of inconsistencies: 85,380 corresponding to 42.96 %
POD, Elective:
Number of subsequent spells: 190,948
Number of inconsistencies: 84,845 corresponding to 44.43 %
POD, Not recorded:
Number of subsequent spells: 524
Number of inconsistencies: 222 corresponding to 42.37 %


[1m Age band: [0m 0 - 17
	 Number of inconsistencies: 57,317 out of 162,696 , corresponding to 35.23 %
[1m Age band: [0m 18 - 39
	 Number of inconsistencies: 71,936 out of 152,082 , corresponding to 4

  Prop_Errors = 100 * Sum/Count


[1m Sex: [0m Female
	 Number of inconsistencies: 64,650 out of 138,417 , corresponding to 46.71 %
[1m Sex: [0m Male
	 Number of inconsistencies: 105,797 out of 251,803 , corresponding to 42.02 %


  Prop_Errors = 100*Sum/Count


Sex: Other/Missing
	 Number of inconsistencies: 0 out of 0 , corresponding to nan %




[1m Ethnicity [0m
Number of not known  : 34,064 corresponding to 42.88 %
Number of white  : 114,938 corresponding to 44.76 %
Number of mixed  : 2,362 corresponding to 36.29 %
Number of other ethnic groups  : 10,155 corresponding to 41.85 %
Number of black or black british  : 4,195 corresponding to 41.31 %
Number of asian or asian british  : 4,733 corresponding to 36.29 %


[1m IMD Score [0m
IMD Deciles: 1 - 2
	 Number of errors: 50,739 out of 112,451 , corresponding to 45.12 %
IMD Deciles: 3 - 4
	 Number of errors: 40,305 out of 89,978 , corresponding to 44.79 %
IMD Deciles: 5 - 6
	 Number of errors: 30,392 out of 73,208 , corresponding to 41.51 %
IMD Deciles: 7 - 8
	 Number of errors: 26,390 out of 61,298 , corresponding to 43.05 %
IMD Deciles: 9 - 10
	 Number of errors: 20,478 out of 49,005 , corresponding to 41.79 %
IMD not recorded
Number of subsequent spells: 4,280
Number of inconsistencies

In [17]:

df_FirstSpell = df_Autism_ini
df_FirstSpell = df_FirstSpell.sort_values(by=['HESID', 'EPIstart', 'EPIend']).drop_duplicates(subset=['HESID'])
List_Ethnicity = list(set(df_Autism_ini['Ethnicity'].values))
for ethnicity in List_Ethnicity:
    Filter = df_FirstSpell['Ethnicity']==ethnicity
    print(ethnicity, ':', df_FirstSpell[Filter].shape[0])


nan : 0
white : 113146
mixed : 3695
other ethnic groups : 16537
black or black british : 4964
asian or asian british : 6916


In [18]:
df_FirstSpell.shape[0]

172324