In [None]:

# Import Libraries
import pandas as pd
import numpy as np
import os
from dask import dataframe as dd

import matplotlib.pyplot as plt
import seaborn as sns

import datetime
from dateutil.relativedelta import relativedelta

def displayDataFrame(df):
    """Show ``df`` without pandas truncating the list of columns.

    The ``display.max_columns`` option is set to ``None`` only for the duration
    of the call, via ``pd.option_context``.
    """
    # NOTE(review): `display` is not imported in this file; presumably it is the
    # rich-output helper that IPython/Jupyter makes available in notebooks.
    all_columns_visible = pd.option_context('display.max_columns', None)
    with all_columns_visible:
        display(df)
    
def SummaryDataFrame(df, ConditionName):
    """Print a short labelled overview of ``df``.

    Output, in order: a separator line, ``ConditionName`` upper-cased and wrapped
    in ANSI bold escape codes, the column index, the shape, and finally the first
    two rows rendered through ``displayDataFrame``.
    """
    heading = (
        '-----------------------------',
        '\033[1m' + ConditionName.upper() + '\033[0m',
        '\033[1m Columns: \033[0m',
    )
    for line in heading:
        print(line)
    print(df.columns)
    print('\033[1m Size of dataset: \033[0m', df.shape)
    preview = df.head(2)
    displayDataFrame(preview)
    

### Importing original datasets

In [6]:

# Load dataset
# --------------------------------------------------
def OpenFile(Path, FileName):
    """Read ``FileName`` located in directory ``Path`` into a DataFrame.

    Parameters
    ----------
    Path : str
        Directory that contains the file.
    FileName : str
        Name of the file, including its extension.

    Returns
    -------
    pandas.DataFrame
        Loaded with ``pd.read_csv`` when the name ends in ``csv`` (compared
        case-insensitively), otherwise with ``pd.read_excel``.
    """
    FilePath = os.path.join(Path, FileName)
    # Case-insensitive so that "DATA.CSV" is not mistakenly sent to read_excel.
    if FileName.lower().endswith('csv'):
        return pd.read_csv(FilePath)
    return pd.read_excel(FilePath)
# --------------------------------------------------

# When False the three reads below are skipped; the *_ini frames must then
# already be defined, because the summaries further down still use them.
ReadCSV = True

# Location and names of the raw extracts
Path = r"O:\GIRFT DiRAC\FlavienHardy\1_DataConsistency\Data\0_Raw\21.11"
FileName_dementia = "PD dementia 04-11-21 (with ethnicity IMD and frailty).xlsx"
FileName_autism = "Autism with IMD, HFRS and ethnicity 08-11-21.xlsx"
FileName_diabetes = "DM Combined all 16-11-21.csv"

if ReadCSV:
    print('Reading DEMENTIA dataset...')
    df_dementia_ini = OpenFile(Path, FileName_dementia)
    print('Reading AUTISM dataset...')
    df_autism_ini = OpenFile(Path, FileName_autism)
    print('Reading DIABETES dataset...')
    df_diabetes_ini = OpenFile(Path, FileName_diabetes)

# Same overview for each raw dataset: columns, shape and the first two rows
for _label, _frame in (('DEMENTIA', df_dementia_ini),
                       ('AUTISM', df_autism_ini),
                       ('DIABETES', df_diabetes_ini)):
    print('-----------------------------')
    print(_label)
    print('Columns:')
    print(_frame.columns)
    print('Size of dataset:', _frame.shape)
    display(_frame.head(2))


Reading DEMENTIA dataset...
Reading AUTISM dataset...
Reading DIABETES dataset...


  if (await self.run_code(code, result,  async_=asy)):


-----------------------------
DEMENTIA
Columns:
Index(['P_Spell_ID1', 'Epikey', 'procedure_group', 'sex', 'EPIstart', 'EPIend',
       'HESID1', 'epiorder', 'ProvCode', 'Sitecode', 'LSOA_2011_Code',
       'main_specialty_code', 'Main_Specialty_Description', 'age_of_patient',
       'diagnosis_group', 'POD', 'MORT', 'FinY', 'Read30', 'Read90',
       'Admission_date1', 'Discharge_date1', 'Spell_Los', 'HFRS_Score',
       'HFRS_Band', 'Ethnicity'],
      dtype='object')
Size of dataset: (193391, 26)


Unnamed: 0,P_Spell_ID1,Epikey,procedure_group,sex,EPIstart,EPIend,HESID1,epiorder,ProvCode,Sitecode,...,MORT,FinY,Read30,Read90,Admission_date1,Discharge_date1,Spell_Los,HFRS_Score,HFRS_Band,Ethnicity
0,1421,700925000000,-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-...,Male,2020-09-04,2020-09-05,00007773EC3BDCAF9441AFA32057315D,1,RTD,RTD02,...,0,2020/21,,,2020-09-04,2020-09-18,14,0.0,,Not known
1,1421,700925000000,-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-...,Male,2020-09-05,2020-09-07,00007773EC3BDCAF9441AFA32057315D,2,RTD,RTD02,...,0,2020/21,,,2020-09-04,2020-09-18,14,0.0,,Not known


-----------------------------
AUTISM
Columns:
Index(['P_Spell_ID1', 'Epikey', 'procedure_group', 'sex', 'EPIstart', 'EPIend',
       'HESID1', 'epiorder', 'ProvCode', 'Sitecode', 'LSOA_2011_Code',
       'main_specialty_code', 'Main_Specialty_Description', 'age_of_patient',
       'diagnosis_group', 'POD', 'MORT', 'FinY', 'Read30', 'Read90',
       'Admission_date1', 'Discharge_date1', 'Spell_Los', 'IMD_score',
       'IMD_decile', 'Ethnicity', 'HFRS_score', 'HFRS_Band'],
      dtype='object')
Size of dataset: (791789, 28)


Unnamed: 0,P_Spell_ID1,Epikey,procedure_group,sex,EPIstart,EPIend,HESID1,epiorder,ProvCode,Sitecode,...,Read30,Read90,Admission_date1,Discharge_date1,Spell_Los,IMD_score,IMD_decile,Ethnicity,HFRS_score,HFRS_Band
0,106,504000000000,K633~K635~Y534~U201~-1~-1~-1~-1~-1~-1~-1~-1~-1...,Male,2015-09-09,2015-09-11,00000BB5E7DC3BAB1D477FC33FB398CD,1,RH8,RH801,...,,,2015-09-09,2015-09-11,2,14.369,6.0,White,0,
1,107,509000000000,H259~O302~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~...,Male,2018-11-30,2018-11-30,00000BB5E7DC3BAB1D477FC33FB398CD,1,RH8,RH801,...,,,2018-11-30,2018-11-30,0,14.369,6.0,White,0,


-----------------------------
DIABETES
Columns:
Index(['Category', 'Ethnicity_Description', 'P_Spell_ID1', 'Right_Epikey',
       'procedure_group', 'Right_sex', 'EPIstart', 'EPIend', 'HESID1',
       'epiorder', 'Right_ProvCode', 'Sitecode', 'LSOA_2011_Code',
       'main_specialty_code', 'Main_Specialty_Description', 'age_of_patient',
       'diagnosis_group', 'Right_POD', 'MORT', 'Right_FinY', 'Read30',
       'Read90', 'Admission_date1', 'Discharge_date1', 'Spell_Los',
       'HFRS_Score', 'HFRS_Band'],
      dtype='object')
Size of dataset: (1500839, 27)


Unnamed: 0,Category,Ethnicity_Description,P_Spell_ID1,Right_Epikey,procedure_group,Right_sex,EPIstart,EPIend,HESID1,epiorder,...,Right_POD,MORT,Right_FinY,Read30,Read90,Admission_date1,Discharge_date1,Spell_Los,HFRS_Score,HFRS_Band
0,White,British,100000833,501614100000.0,U051~Y981~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~...,Female,2013-04-06,2013-04-09,FUAIUFCSIVOAEJ9,1,...,EM,0,2013/14,N,Y,2013-04-06,2013-04-09,3,61.0,Severe
1,White,British,100000834,501614000000.0,-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-...,Female,2013-05-13,2013-05-14,FUAIUFCSIVOAEJ9,1,...,EM,0,2013/14,N,Y,2013-05-13,2013-05-17,4,61.0,Severe


### Reorganising and renaming columns

In [187]:

print('Importing IMD data...')
Path_IMD = r"O:\GIRFT DiRAC\FlavienHardy\1_DataConsistency\Data\0_Raw"
FileName_IMD = "File_7_ID_2015_All_ranks__deciles_and_scores_for_the_Indices_of_Deprivation__and_population_denominators.csv"
FilePath_IMD = os.path.join(Path_IMD, FileName_IMD)
# OpenFile joins directory and file name itself, so it is given the bare file
# name. Passing the already-joined FilePath_IMD only worked because
# os.path.join drops the directory when its second argument is absolute.
df_IMD = OpenFile(Path_IMD, FileName_IMD)
        

# Pre-process dataset: rename and reorganise features, convert date features
# --------------------------------------------------
def PreProcessing(df_ini, ConditionName, df_IMD):
    """Bring one raw extract into the common column layout used by the analysis.

    Steps, in order:
      1. strip a trailing ``1`` from column names, then a leading ``Right_``;
      2. convert the four date columns to datetime (format ``%Y-%m-%d``);
      3. capitalise the lower-case IMD/HFRS column names;
      4. if ``IMD_Score`` is absent, look it up in ``df_IMD`` by LSOA code;
      5. if a ``Category`` column exists, use it as ``Ethnicity`` and drop
         ``Ethnicity_Description``;
      6. keep only the expected columns, in a fixed order.

    Parameters
    ----------
    df_ini : pandas.DataFrame
        Raw extract. It is not modified.
    ConditionName : str
        Label used in the progress message only.
    df_IMD : pandas.DataFrame
        IMD lookup table; only read when ``df_ini`` has no IMD score column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding exactly the 22 columns listed in ``ColumnList``.

    Raises
    ------
    KeyError
        If an expected column is still missing after renaming (the missing
        names are printed first).
    pandas.errors.MergeError
        If the IMD lookup contains the same LSOA code more than once.
    """
    df = df_ini.copy()
    print('Pre-processing', ConditionName.upper(), 'dataset...')

    # Renaming features: ending in 1 (all three)
    df = df.rename(columns={col: col[:-1] for col in df.columns if col.endswith('1')})

    # Renaming features: beginning with Right_ (diabetes).
    # startswith (rather than "contains") so that only a real prefix is removed.
    RightPrefix = 'Right_'
    df = df.rename(columns={col: col[len(RightPrefix):]
                            for col in df.columns if col.startswith(RightPrefix)})

    # Converting date features.
    # Assign the whole column: `df.loc[:, col] = ...` writes into the existing
    # column and, on recent pandas, can leave it with object dtype.
    DateFormat = '%Y-%m-%d'
    DateFeatures = ['Admission_date', 'Discharge_date', 'EPIstart', 'EPIend']
    for date in DateFeatures:
        df[date] = pd.to_datetime(df[date], format=DateFormat)

    # Renaming existing frailty and deprivation data (autism)
    df = df.rename(columns={'IMD_score': 'IMD_Score',
                            'IMD_decile': 'IMD_Decile',
                            'HFRS_score': 'HFRS_Score',
                            'HFRS_band': 'HFRS_Band'})

    # Adding IMD data if missing (Not required anymore)
    if 'IMD_Score' not in df.columns:
        LsoaKey = 'LSOA code (2011)'
        ScoreName = 'Index of Multiple Deprivation (IMD) Score'
        DecileName = 'Index of Multiple Deprivation (IMD) Decile (where 1 is most deprived 10% of LSOAs)'
        df_IMD_Sub = df_IMD.loc[:, [LsoaKey, ScoreName, DecileName]]

        # many_to_one: a duplicated LSOA code in the lookup would silently
        # duplicate episodes, so fail loudly instead.
        df = pd.merge(df, df_IMD_Sub, left_on='LSOA_2011_Code', right_on=LsoaKey,
                      how='left', validate='many_to_one')
        df = df.drop(columns=[LsoaKey])
        df = df.rename(columns={ScoreName: 'IMD_Score', DecileName: 'IMD_Decile'})

    # for DIABETES: Keep ethnicity Category
    if 'Category' in df.columns:
        df = df.drop(columns=['Ethnicity_Description']).rename(columns={'Category': 'Ethnicity'})

    # Reorganising columns
    ColumnList = ['P_Spell_ID', 'HESID', 'LSOA_2011_Code',
                  'Admission_date', 'Discharge_date', 'EPIstart', 'EPIend',
                  'diagnosis_group',
                  'Spell_Los', 'sex', 'age_of_patient', 'Ethnicity',
                  'IMD_Score', 'IMD_Decile', 'HFRS_Score', 'HFRS_Band',
                  'ProvCode', 'Main_Specialty_Description', 'POD',
                  'MORT', 'Read30', 'Read90']

    MissingColumn = [x for x in ColumnList if x not in df.columns]
    if len(MissingColumn) > 0:
        print('Missing Feature:', MissingColumn)

    # Raises KeyError when any expected column is missing
    df = df.loc[:, ColumnList]

    return df
# --------------------------------------------------

# Harmonise the three raw extracts into the common column layout
df_dementia = PreProcessing(df_ini=df_dementia_ini, ConditionName='Dementia', df_IMD=df_IMD)
df_autism = PreProcessing(df_ini=df_autism_ini, ConditionName='Autism', df_IMD=df_IMD)
df_diabetes = PreProcessing(df_ini=df_diabetes_ini, ConditionName='Diabetes', df_IMD=df_IMD)

# Overview of each harmonised dataset
for _label, _frame in (('Dementia', df_dementia),
                       ('Autism', df_autism),
                       ('Diabetes', df_diabetes)):
    SummaryDataFrame(_frame, _label)


Importing IMD data...
Pre-processing DEMENTIA dataset...
Pre-processing AUTISM dataset...
Pre-processing DIABETES dataset...
-----------------------------
[1mDEMENTIA[0m
[1m Columns: [0m
Index(['P_Spell_ID', 'HESID', 'LSOA_2011_Code', 'Admission_date',
       'Discharge_date', 'EPIstart', 'EPIend', 'diagnosis_group', 'Spell_Los',
       'sex', 'age_of_patient', 'Ethnicity', 'IMD_Score', 'IMD_Decile',
       'HFRS_Score', 'HFRS_Band', 'ProvCode', 'Main_Specialty_Description',
       'POD', 'MORT', 'Read30', 'Read90'],
      dtype='object')
[1m Size of dataset: [0m (193391, 22)


Unnamed: 0,P_Spell_ID,HESID,LSOA_2011_Code,Admission_date,Discharge_date,EPIstart,EPIend,diagnosis_group,Spell_Los,sex,age_of_patient,Ethnicity,IMD_Score,IMD_Decile,HFRS_Score,HFRS_Band,ProvCode,Main_Specialty_Description,POD,MORT,Read30,Read90
0,1421,00007773EC3BDCAF9441AFA32057315D,E01008446,2020-09-04,2020-09-18,2020-09-04,2020-09-05,R410~E86X~R630~G20X~F067~G473~I679~D333~Z922~Z...,14,Male,81,Not known,7.717,9.0,0.0,,RTD,General Internal Medicine,EM,0,,
1,1421,00007773EC3BDCAF9441AFA32057315D,E01008446,2020-09-04,2020-09-18,2020-09-05,2020-09-07,R410~R441~E86X~R630~D529~E559~G20X~F067~I951~M...,14,Male,81,Not known,7.717,9.0,0.0,,RTD,Respiratory Medicine,EM,0,,


-----------------------------
[1mAUTISM[0m
[1m Columns: [0m
Index(['P_Spell_ID', 'HESID', 'LSOA_2011_Code', 'Admission_date',
       'Discharge_date', 'EPIstart', 'EPIend', 'diagnosis_group', 'Spell_Los',
       'sex', 'age_of_patient', 'Ethnicity', 'IMD_Score', 'IMD_Decile',
       'HFRS_Score', 'HFRS_Band', 'ProvCode', 'Main_Specialty_Description',
       'POD', 'MORT', 'Read30', 'Read90'],
      dtype='object')
[1m Size of dataset: [0m (791789, 22)


Unnamed: 0,P_Spell_ID,HESID,LSOA_2011_Code,Admission_date,Discharge_date,EPIstart,EPIend,diagnosis_group,Spell_Los,sex,age_of_patient,Ethnicity,IMD_Score,IMD_Decile,HFRS_Score,HFRS_Band,ProvCode,Main_Specialty_Description,POD,MORT,Read30,Read90
0,106,00000BB5E7DC3BAB1D477FC33FB398CD,E01020243,2015-09-09,2015-09-11,2015-09-09,2015-09-11,I214~I251~I501~E780~F845~F419~J459~-1~-1~-1~-1...,2,Male,52,White,14.369,6.0,0,,RH8,Cardiology,EM,0,,
1,107,00000BB5E7DC3BAB1D477FC33FB398CD,E01020243,2018-11-30,2018-11-30,2018-11-30,2018-11-30,Z121~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1...,0,Male,55,White,14.369,6.0,0,,RH8,Nursing episode,DC,0,,


-----------------------------
[1mDIABETES[0m
[1m Columns: [0m
Index(['P_Spell_ID', 'HESID', 'LSOA_2011_Code', 'Admission_date',
       'Discharge_date', 'EPIstart', 'EPIend', 'diagnosis_group', 'Spell_Los',
       'sex', 'age_of_patient', 'Ethnicity', 'IMD_Score', 'IMD_Decile',
       'HFRS_Score', 'HFRS_Band', 'ProvCode', 'Main_Specialty_Description',
       'POD', 'MORT', 'Read30', 'Read90'],
      dtype='object')
[1m Size of dataset: [0m (1500839, 22)


Unnamed: 0,P_Spell_ID,HESID,LSOA_2011_Code,Admission_date,Discharge_date,EPIstart,EPIend,diagnosis_group,Spell_Los,sex,age_of_patient,Ethnicity,IMD_Score,IMD_Decile,HFRS_Score,HFRS_Band,ProvCode,Main_Specialty_Description,POD,MORT,Read30,Read90
0,100000833,FUAIUFCSIVOAEJ9,E01029075,2013-04-06,2013-04-09,2013-04-06,2013-04-09,J189~E119~N189~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-...,3,Female,75,White,20.836,5.0,61.0,Severe,RA4,General medicine,EM,0,N,Y
1,100000834,FUAIUFCSIVOAEJ9,E01029075,2013-05-13,2013-05-17,2013-05-13,2013-05-14,J189~E119~J459~N189~Z895~-1~-1~-1~-1~-1~-1~-1~...,4,Female,75,White,20.836,5.0,61.0,Severe,RA4,General medicine,EM,0,N,Y


### Identify missing mandatory codes

In [188]:

# Find mentions of mandatory codes in CodeList in ICDfeature_Name
# --------------------------------------------------
def FindMandatoryCode(df_ini, ICDfeature_Name, ConditionName, CodeList):
    """Add a 0/1 column telling whether a row mentions any code of ``CodeList``.

    Parameters
    ----------
    df_ini : pandas.DataFrame
        Input frame. It is not modified; a copy is returned.
    ICDfeature_Name : str
        Name of the string column that holds the diagnosis codes.
    ConditionName : str
        Suffix of the new column, which is named ``"ICD10_" + ConditionName``.
    CodeList : list of str
        Codes to look for. They are joined with ``|`` and used as a regular
        expression, so a code matches wherever it occurs in the string.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_ini`` with the extra flag column (1 = mentioned, 0 = not).
        Rows whose diagnosis string is missing are flagged 0, and an empty
        ``CodeList`` flags every row 0.
    """
    # Work on a copy so that the caller's frame does not silently gain columns
    df = df_ini.copy()
    FlagName = "ICD10_" + ConditionName

    # An empty pattern would match every row
    if len(CodeList) == 0:
        df[FlagName] = 0
        return df

    Codes = '|'.join(CodeList)

    # Look for ANY mention of mandatory code.
    # na=False: a missing diagnosis string would otherwise give NaN, which
    # np.where treats as true.
    Filter = df[ICDfeature_Name].str.contains(Codes, na=False)
    df[FlagName] = np.where(Filter, 1, 0)

    return df
# --------------------------------------------------

# Mandatory codes for dementia: each group is written as a "|"-separated
# string; the groups are then concatenated and split into one list.
CodesDementia_Categories = "F00|F01|F02"
CodesDementia_Unspecified = "F03|F051"
Codes_Dementia_SubCategories = "F000|F001|F002|F009|F010|F011|F012|F013|F018|F019|F020|F021|F022|F023|F024|F028"
Codes_Dementia_Supp = "G301|G302|G308|G309"
CodesDementia = "|".join((
    CodesDementia_Categories,
    CodesDementia_Unspecified,
    Codes_Dementia_SubCategories,
    Codes_Dementia_Supp,
))
CodesDementia_List = CodesDementia.split("|")
Parkinson_List = [
    "G20",
]
ParkinsonsDementia_List = [
    "F023",
]

# Mandatory codes for autism
CodesAutism_List = [
    "F840",
    "F841",
    "F845",
]

# Mandatory codes for diabetes (all types, then type II only)
CodesDiabetes_List = [
    "E10",
    "E11",
    "E14",
]
CodesDiabetesTypeII_List = [
    "E11",
]


# Look for mandatory codes: one ICD10_<name> flag column per code list
df_dementia_2 = (
    df_dementia
    .pipe(FindMandatoryCode, "diagnosis_group", "Dementia", CodesDementia_List)
    .pipe(FindMandatoryCode, "diagnosis_group", "Parkinson", Parkinson_List)
    .pipe(FindMandatoryCode, "diagnosis_group", "ParkinsonsDementia", ParkinsonsDementia_List)
)

df_autism_2 = df_autism.pipe(FindMandatoryCode, "diagnosis_group", "Autism", CodesAutism_List)

df_diabetes_2 = (
    df_diabetes
    .pipe(FindMandatoryCode, "diagnosis_group", "Diabetes", CodesDiabetes_List)
    .pipe(FindMandatoryCode, "diagnosis_group", "DiabetesTypeII", CodesDiabetesTypeII_List)
)


# Overview of each flagged dataset
for _label, _frame in (('Dementia', df_dementia_2),
                       ('Autism', df_autism_2),
                       ('Diabetes', df_diabetes_2)):
    SummaryDataFrame(_frame, _label)



-----------------------------
[1mDEMENTIA[0m
[1m Columns: [0m
Index(['P_Spell_ID', 'HESID', 'LSOA_2011_Code', 'Admission_date',
       'Discharge_date', 'EPIstart', 'EPIend', 'diagnosis_group', 'Spell_Los',
       'sex', 'age_of_patient', 'Ethnicity', 'IMD_Score', 'IMD_Decile',
       'HFRS_Score', 'HFRS_Band', 'ProvCode', 'Main_Specialty_Description',
       'POD', 'MORT', 'Read30', 'Read90', 'ICD10_Dementia', 'ICD10_Parkinson',
       'ICD10_ParkinsonsDementia'],
      dtype='object')
[1m Size of dataset: [0m (193391, 25)


Unnamed: 0,P_Spell_ID,HESID,LSOA_2011_Code,Admission_date,Discharge_date,EPIstart,EPIend,diagnosis_group,Spell_Los,sex,age_of_patient,Ethnicity,IMD_Score,IMD_Decile,HFRS_Score,HFRS_Band,ProvCode,Main_Specialty_Description,POD,MORT,Read30,Read90,ICD10_Dementia,ICD10_Parkinson,ICD10_ParkinsonsDementia
0,1421,00007773EC3BDCAF9441AFA32057315D,E01008446,2020-09-04,2020-09-18,2020-09-04,2020-09-05,R410~E86X~R630~G20X~F067~G473~I679~D333~Z922~Z...,14,Male,81,Not known,7.717,9.0,0.0,,RTD,General Internal Medicine,EM,0,,,0,1,0
1,1421,00007773EC3BDCAF9441AFA32057315D,E01008446,2020-09-04,2020-09-18,2020-09-05,2020-09-07,R410~R441~E86X~R630~D529~E559~G20X~F067~I951~M...,14,Male,81,Not known,7.717,9.0,0.0,,RTD,Respiratory Medicine,EM,0,,,0,1,0


-----------------------------
[1mAUTISM[0m
[1m Columns: [0m
Index(['P_Spell_ID', 'HESID', 'LSOA_2011_Code', 'Admission_date',
       'Discharge_date', 'EPIstart', 'EPIend', 'diagnosis_group', 'Spell_Los',
       'sex', 'age_of_patient', 'Ethnicity', 'IMD_Score', 'IMD_Decile',
       'HFRS_Score', 'HFRS_Band', 'ProvCode', 'Main_Specialty_Description',
       'POD', 'MORT', 'Read30', 'Read90', 'ICD10_Autism'],
      dtype='object')
[1m Size of dataset: [0m (791789, 23)


Unnamed: 0,P_Spell_ID,HESID,LSOA_2011_Code,Admission_date,Discharge_date,EPIstart,EPIend,diagnosis_group,Spell_Los,sex,age_of_patient,Ethnicity,IMD_Score,IMD_Decile,HFRS_Score,HFRS_Band,ProvCode,Main_Specialty_Description,POD,MORT,Read30,Read90,ICD10_Autism
0,106,00000BB5E7DC3BAB1D477FC33FB398CD,E01020243,2015-09-09,2015-09-11,2015-09-09,2015-09-11,I214~I251~I501~E780~F845~F419~J459~-1~-1~-1~-1...,2,Male,52,White,14.369,6.0,0,,RH8,Cardiology,EM,0,,,1
1,107,00000BB5E7DC3BAB1D477FC33FB398CD,E01020243,2018-11-30,2018-11-30,2018-11-30,2018-11-30,Z121~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1...,0,Male,55,White,14.369,6.0,0,,RH8,Nursing episode,DC,0,,,0


-----------------------------
[1mDIABETES[0m
[1m Columns: [0m
Index(['P_Spell_ID', 'HESID', 'LSOA_2011_Code', 'Admission_date',
       'Discharge_date', 'EPIstart', 'EPIend', 'diagnosis_group', 'Spell_Los',
       'sex', 'age_of_patient', 'Ethnicity', 'IMD_Score', 'IMD_Decile',
       'HFRS_Score', 'HFRS_Band', 'ProvCode', 'Main_Specialty_Description',
       'POD', 'MORT', 'Read30', 'Read90', 'ICD10_Diabetes',
       'ICD10_DiabetesTypeII'],
      dtype='object')
[1m Size of dataset: [0m (1500839, 24)


Unnamed: 0,P_Spell_ID,HESID,LSOA_2011_Code,Admission_date,Discharge_date,EPIstart,EPIend,diagnosis_group,Spell_Los,sex,age_of_patient,Ethnicity,IMD_Score,IMD_Decile,HFRS_Score,HFRS_Band,ProvCode,Main_Specialty_Description,POD,MORT,Read30,Read90,ICD10_Diabetes,ICD10_DiabetesTypeII
0,100000833,FUAIUFCSIVOAEJ9,E01029075,2013-04-06,2013-04-09,2013-04-06,2013-04-09,J189~E119~N189~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-...,3,Female,75,White,20.836,5.0,61.0,Severe,RA4,General medicine,EM,0,N,Y,1,1
1,100000834,FUAIUFCSIVOAEJ9,E01029075,2013-05-13,2013-05-17,2013-05-13,2013-05-14,J189~E119~J459~N189~Z895~-1~-1~-1~-1~-1~-1~-1~...,4,Female,75,White,20.836,5.0,61.0,Severe,RA4,General medicine,EM,0,N,Y,1,1


In [189]:

# --------------------------------------------------
def FindErrors(df_ini, ConditionName, StartDate='2013-04-01', EndDate='2021-03-31'):
    """Flag episodes that lack the mandatory code after a patient's first coded episode.

    Parameters
    ----------
    df_ini : pandas.DataFrame
        Episode-level data holding ``HESID``, ``P_Spell_ID``, ``EPIstart``,
        ``EPIend``, ``Admission_date``, ``Discharge_date``, ``ProvCode``, ``POD``,
        ``Main_Specialty_Description`` and the 0/1 column
        ``"ICD10_" + ConditionName``. It is not modified.
    ConditionName : str
        Selects the ``ICD10_<ConditionName>`` input column and names the
        ``Error_<ConditionName>`` output column.
    StartDate, EndDate : str, optional
        Inclusive bounds (``%Y-%m-%d``) on ``Discharge_date``; rows outside the
        window are dropped. The defaults are the cut-off dates of this analysis.

    Returns
    -------
    pandas.DataFrame
        The rows inside the window, sorted by ``HESID``, ``EPIstart``, ``EPIend``,
        with these columns appended:

        * ``FirstEpisode_*`` - details of the patient's earliest episode carrying
          the code (missing when the patient never has it);
        * ``FirstDiagnosis`` - 1 for that earliest coded episode, else 0;
        * ``SubsequentEpisode`` - 1 when the episode is not the first diagnosis
          and ends on or after the end of the first coded episode;
        * ``Error_<ConditionName>`` - 1 when the episode has no code and ends
          strictly after the end of the first coded episode.
    """
    CodeFlag = "ICD10_" + ConditionName
    DateFormat = '%Y-%m-%d'
    Start = pd.to_datetime(StartDate, format=DateFormat)
    End = pd.to_datetime(EndDate, format=DateFormat)

    # Exclude all spells discharged outside [Start, End].
    # The index is reset so that row labels equal row positions; the
    # first-diagnosis flag below relies on this.
    Filter_Window = (df_ini['Discharge_date'] >= Start) & (df_ini['Discharge_date'] <= End)
    df = df_ini.loc[Filter_Window, :].reset_index(drop=True)

    # Keep first occurrence of mandatory code (one row per patient)
    df_CodeFirst = (df.loc[df[CodeFlag] == 1, :]
                      .sort_values(by=['HESID', 'EPIstart', 'EPIend'])
                      .drop_duplicates(subset=['HESID'], keep='first'))

    # Remember which rows ARE the first coded episode before merging. Flagging
    # them by row (instead of re-matching on spell id and dates) marks exactly
    # one episode per patient and cannot duplicate rows when that key repeats.
    IsFirst = df.index.isin(df_CodeFirst.index)

    # Attach the first coded episode's details to every episode of the patient.
    # The columns are prefixed up front so no merge suffixes need to be parsed.
    FirstColumns = ['P_Spell_ID',
                    'EPIstart', 'EPIend', 'Admission_date', 'Discharge_date',
                    'ProvCode', 'POD', 'Main_Specialty_Description']
    df_First = (df_CodeFirst[['HESID'] + FirstColumns]
                .rename(columns={col: 'FirstEpisode_' + col for col in FirstColumns}))
    # Left merge against one row per HESID: row count and order are unchanged
    df = pd.merge(df, df_First, how='left', on='HESID', validate='many_to_one')

    # Flag first episode
    df['FirstDiagnosis'] = IsFirst.astype(int)

    # Flag subsequent episodes
    # NOTE(review): this uses <= while the error test below uses <, so an
    # episode ending on the same day as the first coded one counts in the
    # denominator but can never be an error - confirm this is intended.
    Filter_SubsequentEpisode = (df['FirstEpisode_EPIend'] <= df['EPIend']) & (df['FirstDiagnosis'] == 0)
    df['SubsequentEpisode'] = np.where(Filter_SubsequentEpisode, 1, 0)

    # Find errors in dataset
    Filter_Error = (df[CodeFlag] == 0) & (df['FirstEpisode_EPIend'] < df['EPIend'])
    df['Error_' + ConditionName] = np.where(Filter_Error, 1, 0)

    df = df.sort_values(by=['HESID', 'EPIstart', 'EPIend'])

    return df
# --------------------------------------------------

# Flag first diagnoses, subsequent episodes and coding errors per condition
df_dementia_3 = FindErrors(df_ini=df_dementia_2, ConditionName="Dementia")
df_autism_3 = FindErrors(df_ini=df_autism_2, ConditionName="Autism")
df_diabetes_3 = FindErrors(df_ini=df_diabetes_2, ConditionName="Diabetes")

# Overview of each resulting dataset
for _label, _frame in (('Dementia', df_dementia_3),
                       ('Autism', df_autism_3),
                       ('Diabetes', df_diabetes_3)):
    SummaryDataFrame(_frame, _label)

# displayDataFrame(df_dementia_3.head(2))
# displayDataFrame(df_autism_3.head(2))
# displayDataFrame(df_diabetes_3.head(2))


-----------------------------
[1mDEMENTIA[0m
[1m Columns: [0m
Index(['P_Spell_ID', 'HESID', 'LSOA_2011_Code', 'Admission_date',
       'Discharge_date', 'EPIstart', 'EPIend', 'diagnosis_group', 'Spell_Los',
       'sex', 'age_of_patient', 'Ethnicity', 'IMD_Score', 'IMD_Decile',
       'HFRS_Score', 'HFRS_Band', 'ProvCode', 'Main_Specialty_Description',
       'POD', 'MORT', 'Read30', 'Read90', 'ICD10_Dementia', 'ICD10_Parkinson',
       'ICD10_ParkinsonsDementia', 'FirstEpisode_P_Spell_ID',
       'FirstEpisode_EPIstart', 'FirstEpisode_EPIend',
       'FirstEpisode_Admission_date', 'FirstEpisode_Discharge_date',
       'FirstEpisode_ProvCode', 'FirstEpisode_POD',
       'FirstEpisode_Main_Specialty_Description', 'FirstDiagnosis',
       'SubsequentEpisode', 'Error_Dementia'],
      dtype='object')
[1m Size of dataset: [0m (167081, 36)


Unnamed: 0,P_Spell_ID,HESID,LSOA_2011_Code,Admission_date,Discharge_date,EPIstart,EPIend,diagnosis_group,Spell_Los,sex,age_of_patient,Ethnicity,IMD_Score,IMD_Decile,HFRS_Score,HFRS_Band,ProvCode,Main_Specialty_Description,POD,MORT,Read30,Read90,ICD10_Dementia,ICD10_Parkinson,ICD10_ParkinsonsDementia,FirstEpisode_P_Spell_ID,FirstEpisode_EPIstart,FirstEpisode_EPIend,FirstEpisode_Admission_date,FirstEpisode_Discharge_date,FirstEpisode_ProvCode,FirstEpisode_POD,FirstEpisode_Main_Specialty_Description,FirstDiagnosis,SubsequentEpisode,Error_Dementia
0,1421,00007773EC3BDCAF9441AFA32057315D,E01008446,2020-09-04,2020-09-18,2020-09-04,2020-09-05,R410~E86X~R630~G20X~F067~G473~I679~D333~Z922~Z...,14,Male,81,Not known,7.717,9.0,0.0,,RTD,General Internal Medicine,EM,0,,,0,1,0,1421.0,2020-09-07,2020-09-18,2020-09-04,2020-09-18,RTD,EM,Geriatric Medicine,0,0,0
1,1421,00007773EC3BDCAF9441AFA32057315D,E01008446,2020-09-04,2020-09-18,2020-09-05,2020-09-07,R410~R441~E86X~R630~D529~E559~G20X~F067~I951~M...,14,Male,81,Not known,7.717,9.0,0.0,,RTD,Respiratory Medicine,EM,0,,,0,1,0,1421.0,2020-09-07,2020-09-18,2020-09-04,2020-09-18,RTD,EM,Geriatric Medicine,0,0,0


-----------------------------
[1mAUTISM[0m
[1m Columns: [0m
Index(['P_Spell_ID', 'HESID', 'LSOA_2011_Code', 'Admission_date',
       'Discharge_date', 'EPIstart', 'EPIend', 'diagnosis_group', 'Spell_Los',
       'sex', 'age_of_patient', 'Ethnicity', 'IMD_Score', 'IMD_Decile',
       'HFRS_Score', 'HFRS_Band', 'ProvCode', 'Main_Specialty_Description',
       'POD', 'MORT', 'Read30', 'Read90', 'ICD10_Autism',
       'FirstEpisode_P_Spell_ID', 'FirstEpisode_EPIstart',
       'FirstEpisode_EPIend', 'FirstEpisode_Admission_date',
       'FirstEpisode_Discharge_date', 'FirstEpisode_ProvCode',
       'FirstEpisode_POD', 'FirstEpisode_Main_Specialty_Description',
       'FirstDiagnosis', 'SubsequentEpisode', 'Error_Autism'],
      dtype='object')
[1m Size of dataset: [0m (692810, 34)


Unnamed: 0,P_Spell_ID,HESID,LSOA_2011_Code,Admission_date,Discharge_date,EPIstart,EPIend,diagnosis_group,Spell_Los,sex,age_of_patient,Ethnicity,IMD_Score,IMD_Decile,HFRS_Score,HFRS_Band,ProvCode,Main_Specialty_Description,POD,MORT,Read30,Read90,ICD10_Autism,FirstEpisode_P_Spell_ID,FirstEpisode_EPIstart,FirstEpisode_EPIend,FirstEpisode_Admission_date,FirstEpisode_Discharge_date,FirstEpisode_ProvCode,FirstEpisode_POD,FirstEpisode_Main_Specialty_Description,FirstDiagnosis,SubsequentEpisode,Error_Autism
0,106,00000BB5E7DC3BAB1D477FC33FB398CD,E01020243,2015-09-09,2015-09-11,2015-09-09,2015-09-11,I214~I251~I501~E780~F845~F419~J459~-1~-1~-1~-1...,2,Male,52,White,14.369,6.0,0,,RH8,Cardiology,EM,0,,,1,106.0,2015-09-09,2015-09-11,2015-09-09,2015-09-11,RH8,EM,Cardiology,1,0,0
1,107,00000BB5E7DC3BAB1D477FC33FB398CD,E01020243,2018-11-30,2018-11-30,2018-11-30,2018-11-30,Z121~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1~-1...,0,Male,55,White,14.369,6.0,0,,RH8,Nursing episode,DC,0,,,0,106.0,2015-09-09,2015-09-11,2015-09-09,2015-09-11,RH8,EM,Cardiology,0,1,1


-----------------------------
[1mDIABETES[0m
[1m Columns: [0m
Index(['P_Spell_ID', 'HESID', 'LSOA_2011_Code', 'Admission_date',
       'Discharge_date', 'EPIstart', 'EPIend', 'diagnosis_group', 'Spell_Los',
       'sex', 'age_of_patient', 'Ethnicity', 'IMD_Score', 'IMD_Decile',
       'HFRS_Score', 'HFRS_Band', 'ProvCode', 'Main_Specialty_Description',
       'POD', 'MORT', 'Read30', 'Read90', 'ICD10_Diabetes',
       'ICD10_DiabetesTypeII', 'FirstEpisode_P_Spell_ID',
       'FirstEpisode_EPIstart', 'FirstEpisode_EPIend',
       'FirstEpisode_Admission_date', 'FirstEpisode_Discharge_date',
       'FirstEpisode_ProvCode', 'FirstEpisode_POD',
       'FirstEpisode_Main_Specialty_Description', 'FirstDiagnosis',
       'SubsequentEpisode', 'Error_Diabetes'],
      dtype='object')
[1m Size of dataset: [0m (1500839, 35)


Unnamed: 0,P_Spell_ID,HESID,LSOA_2011_Code,Admission_date,Discharge_date,EPIstart,EPIend,diagnosis_group,Spell_Los,sex,age_of_patient,Ethnicity,IMD_Score,IMD_Decile,HFRS_Score,HFRS_Band,ProvCode,Main_Specialty_Description,POD,MORT,Read30,Read90,ICD10_Diabetes,ICD10_DiabetesTypeII,FirstEpisode_P_Spell_ID,FirstEpisode_EPIstart,FirstEpisode_EPIend,FirstEpisode_Admission_date,FirstEpisode_Discharge_date,FirstEpisode_ProvCode,FirstEpisode_POD,FirstEpisode_Main_Specialty_Description,FirstDiagnosis,SubsequentEpisode,Error_Diabetes
440002,941,0005MWSQ5R4HJ9V,E01030750,2016-09-29,2016-09-29,2016-09-29,2016-09-29,I7020~M314~E115~I792~I340~I071~I252~Z955~E782~...,0,Female,38,Other ethnic groups,19.873,5.0,0.0,,RYJ,General surgery,DC,0,,,1,1,941.0,2016-09-29,2016-09-29,2016-09-29,2016-09-29,RYJ,DC,General surgery,1,0,0
440156,942,0005MWSQ5R4HJ9V,E01030750,2016-10-13,2016-10-13,2016-10-13,2016-10-13,M314~I7020~E115~I792~I340~I071~I252~Z955~E782~...,0,Female,38,Other ethnic groups,19.873,5.0,0.0,,RYJ,General surgery,DC,0,,,1,1,941.0,2016-09-29,2016-09-29,2016-09-29,2016-09-29,RYJ,DC,General surgery,0,1,0


### Count coding errors

In [208]:

# Count number of errors
# --------------------------------------------------
def CountErrors(df_ini, NameCondition):
    """Count coding errors among subsequent episodes.

    Parameters
    ----------
    df_ini : pandas.DataFrame
        Frame with a ``SubsequentEpisode`` column and a 0/1 column named
        ``"Error_" + NameCondition``. It is only read, never modified.
    NameCondition : str
        Suffix of the error column to count.

    Returns
    -------
    tuple
        ``(Sum, Count, Prop)``: the number of flagged errors among rows with
        ``SubsequentEpisode == 1``, the number of such rows, and
        ``100 * Sum / Count``. ``Prop`` is NaN when there are no such rows.
    """
    # Only the error flags of subsequent episodes are needed, so select that
    # column directly instead of copying the whole frame.
    Errors = df_ini.loc[df_ini['SubsequentEpisode'] == 1, 'Error_' + NameCondition]

    Sum = Errors.sum()
    Count = Errors.shape[0]
    # Explicit NaN rather than a division-by-zero warning on an empty selection
    Prop = 100 * Sum / Count if Count > 0 else float('nan')

    return Sum, Count, Prop

# --------------------------------------------------

def _PrintErrorSummary(Title, Sum, Count, Prop):
    """Print the error count, its denominator and the error percentage for one condition."""
    print('-----------------------------')
    print(Title)
    print('Number of errors:', Sum)
    print('Number of subsequent episodes where mandatory codes should be used:', Count)
    print('Proportion of errors: %0.2f' % Prop, '%')


# Dementia, plus the share of subsequent episodes with no Parkinson's code
Sum_dementia, Count_dementia, Prop_dementia = CountErrors(df_dementia_3, 'Dementia')
_PrintErrorSummary('DEMENTIA', Sum_dementia, Count_dementia, Prop_dementia)
Filter_SubsEpisodes = (df_dementia_3['SubsequentEpisode']==1)
_parkinson_flags = df_dementia_3.loc[Filter_SubsEpisodes, 'ICD10_Parkinson']
Sum_G20 = _parkinson_flags.sum()
Count_G20 = _parkinson_flags.shape[0]
Prop_G20 = 100 * (Count_G20-Sum_G20) / Count_G20
print('Proportion of subsequent episodes with no mention of PARKINSON: %0.2f' % Prop_G20, '%')

# Autism
Sum_autism, Count_autism, Prop_autism = CountErrors(df_autism_3, 'Autism')
_PrintErrorSummary('AUTISM', Sum_autism, Count_autism, Prop_autism)

# Diabetes
Sum_diabetes, Count_diabetes, Prop_diabetes = CountErrors(df_diabetes_3, 'Diabetes')
_PrintErrorSummary('DIABETES', Sum_diabetes, Count_diabetes, Prop_diabetes)


-----------------------------
DEMENTIA
Number of errors: 31915
Number of subsequent episodes where mandatory codes should be used: 132113
Proportion of errors: 24.16 %
Proportion of subsequent episodes with no mention of PARKINSON: 14.92 %
-----------------------------
AUTISM
Number of errors: 208971
Number of subsequent episodes where mandatory codes should be used: 491190
Proportion of errors: 42.54 %
-----------------------------
DIABETES
Number of errors: 360396
Number of subsequent episodes where mandatory codes should be used: 1377121
Proportion of errors: 26.17 %
