In [5]:

# Pre-processing dataframe by:
#      - Correcting typos/errors in 'ethnicity_updated_cat'
# Note: excluding trusts with fewer than 500 patients and adding the
#       patient count for each trust are currently disabled (not performed).

def PreProcessing(df_ini):
    """Return a cleaned copy of the input dataframe.

    The only cleaning step currently applied is a capitalisation fix in the
    'ethnicity_updated_cat' column ('black' -> 'Black', 'british' -> 'British').
    The step is skipped when that column is absent.

    Parameters
    ----------
    df_ini : pandas.DataFrame
        Input dataframe. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_ini`` with the corrections applied.
    """
    # Take a real copy: a plain assignment (df = df_ini) only binds a second
    # name to the same object, so the corrections below would otherwise be
    # written into the caller's dataframe as well.
    df = df_ini.copy()

    # Disabled step (following Keith's advice, the condition on a minimum
    # number of records per trust was removed). It used to:
    #   - add 'Patients_Count' = count of 'HESID' per 'ProvCode'
    #   - keep only rows whose trust had Patients_Count >= 500

    # Correcting errors / typos (literal, case-sensitive substring replacement)
    if 'ethnicity_updated_cat' in df.columns:
        corrected = df['ethnicity_updated_cat']
        for wrong, right in (('black', 'Black'), ('british', 'British')):
            corrected = corrected.str.replace(wrong, right, regex=False)
        df['ethnicity_updated_cat'] = corrected

    return df


In [7]:

# Returns df with unique trusts and additional columns for:
#      - Count for use of Code (e.g. Y95)
#      - Percentage use for use of Code (e.g. Y95)

def CodeUseTrust(df_ini, Code):
    """Summarise, per trust, how often ``Code`` appears in 'diagnosis_group'.

    Parameters
    ----------
    df_ini : pandas.DataFrame
        Dataframe with at least 'ProvCode' and 'diagnosis_group' columns.
        NOTE: a 0/1 column named ``Code + '_Use'`` is added to this dataframe
        in place (no copy is made). ComparePopulations reads that column from
        the same dataframe, so this side effect is kept on purpose.
    Code : str
        Code to look for (e.g. 'Y95'). It is passed to ``Series.str.contains``,
        which interprets it as a regular expression by default.

    Returns
    -------
    pandas.DataFrame
        One row per 'ProvCode' (the first row of each trust), with the extra
        columns ``Code + '_Count'`` (rows of the trust containing the code)
        and ``Code + '_Prop'`` (100 * count / 'Patients_Count').
    """
    import numpy as np

    use_col = Code + '_Use'
    count_col = Code + '_Count'
    prop_col = Code + '_Prop'

    # Same object as the input: the '_Use' column is written onto the caller's frame.
    df_Code = df_ini

    # na=False: a missing diagnosis would otherwise give NaN in the mask, and
    # np.where treats NaN as truthy, wrongly flagging the row as using the code.
    Code_Filter = df_Code['diagnosis_group'].str.contains(Code, na=False)
    df_Code[use_col] = np.where(Code_Filter, 1, 0)

    # Explicit copy so the column assignments below act on an independent frame
    # (avoids SettingWithCopyWarning on the de-duplicated selection).
    df_Trust = df_Code.drop_duplicates(subset='ProvCode').copy()

    # Map per-trust totals by ProvCode rather than relying on index alignment,
    # which fails when the input index contains duplicated labels.
    uses_per_trust = df_Code.groupby('ProvCode')[use_col].sum()
    df_Trust[count_col] = df_Trust['ProvCode'].map(uses_per_trust)

    if 'Patients_Count' not in df_Trust.columns:
        # Fallback when the input carries no 'Patients_Count': count of
        # non-null 'HESID' per trust, or the number of rows per trust if
        # 'HESID' is absent too.
        # NOTE(review): confirm this matches the intended denominator.
        if 'HESID' in df_Code.columns:
            patients_per_trust = df_Code.groupby('ProvCode')['HESID'].count()
        else:
            patients_per_trust = df_Code.groupby('ProvCode').size()
        df_Trust['Patients_Count'] = df_Trust['ProvCode'].map(patients_per_trust)

    df_Trust[prop_col] = 100 * df_Trust[count_col] / df_Trust['Patients_Count']

    return df_Trust


In [8]:

# Plots the percentage use of the code (Y95) across trusts:
#      - Bar chart of percentage use for every trust
#      - Histogram of percentage use
#      - Box plot of percentage use

def PlotStatistics(df, Code='Y95'):
    """Show three plots of the percentage use of ``Code`` across trusts.

    The plots are, in order: a bar chart of the percentage per 'ProvCode',
    a histogram of the percentages, and a box plot of the percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Trust-level dataframe with a 'ProvCode' column and a
        ``Code + '_Prop'`` column.
    Code : str, optional
        Code whose ``Code + '_Prop'`` column is plotted and whose name is used
        in the labels. Defaults to 'Y95', which reproduces the previous
        hard-coded behaviour.

    Returns
    -------
    None
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    prop_col = Code + '_Prop'
    prop_label = 'Percentage Use of ' + Code + ' Code (%)'

    # Bar chart: one bar per trust
    _, ax = plt.subplots()
    ax.bar(df['ProvCode'], df[prop_col])
    ax.set_xlabel('ProvCode')
    ax.set_ylabel(prop_label)
    ax.set_title('Percentage Use of ' + Code + ' Code Across Trusts')
    plt.show()

    # Histogram: distribution of the percentage across trusts
    _, ax = plt.subplots()
    ax.hist(df[prop_col])
    ax.set_xlabel(prop_label)
    ax.set_ylabel('Trusts')
    ax.set_title('Histogram of Percentage Use for ' + Code + ' Code Across Trusts')
    plt.show()

    # Box plot of the same distribution
    _, ax = plt.subplots()
    sns.boxplot(x=df[prop_col], ax=ax)
    ax.set_xlabel(prop_label)
    plt.show()
    

In [9]:

# Print table that compares populations WITH/WITHOUT use of Code

def ComparePopulations(df_ini, Code):
    """Print tables comparing the rows WITH and WITHOUT use of ``Code``.

    Rows are split on the ``Code + '_Use'`` column (== 1 means the code is
    used). For each of 'ethnicity_updated_cat', 'sex' and 'ageband' a table of
    within-population proportions is printed; then the mean and standard
    deviation of 'IMD_decile' are printed for both populations.

    Parameters
    ----------
    df_ini : pandas.DataFrame
        Dataframe containing ``Code + '_Use'``, 'ethnicity_updated_cat',
        'sex', 'ageband' and 'IMD_decile'. It is not modified.
    Code : str
        Code name (e.g. 'Y95'); used to locate the ``_Use`` column and to
        label the output columns.

    Returns
    -------
    None
        The tables are written to standard output.
    """
    import pandas as pd

    use_label = Code + '_Use_Prop'
    notuse_label = Code + '_NotUse_Prop'

    Code_Mask = df_ini[Code + '_Use'] == 1
    df_Use = df_ini[Code_Mask]        # rows WITH the code
    df_NotUse = df_ini[~Code_Mask]    # rows WITHOUT the code

    for CatVar in ['ethnicity_updated_cat', 'sex', 'ageband']:
        s1 = df_Use[CatVar].value_counts(normalize=True).rename(use_label)
        s2 = df_NotUse[CatVar].value_counts(normalize=True).rename(notuse_label)
        # Align on the union of categories: a category seen in only one
        # population gets a proportion of 0 in the other instead of being
        # dropped from the comparison.
        df_Merged = (
            pd.concat([s1, s2], axis=1)
            .fillna(0)
            .rename_axis(CatVar)
            .reset_index()
        )
        # Print every category (not just the first five rows).
        print(df_Merged)
        print('---------------')

    print('Statistics for IMD_decile:')
    # Select the statistics by label rather than by position in describe().
    stats = ['mean', 'std']
    s1 = df_Use['IMD_decile'].describe().loc[stats]
    s2 = df_NotUse['IMD_decile'].describe().loc[stats]
    # Column labels follow Code (they were previously fixed to 'Y95').
    df_Merged = pd.DataFrame({
        'IMD_decile': stats,
        use_label: s1.values,
        notuse_label: s2.values,
    })
    print(df_Merged)


In [10]:

# Creates column for NI / Not NI using method 1

def NI_M1_Identification(df_ini):
    """Flag rows whose 'diagnosis_group' pairs Y95 with U071/U072 (method 1).

    A row is flagged (``NC_Method1 = 1``) when its '~'-separated
    'diagnosis_group' string matches one of these layouts, where ``U07x`` is
    U071 or U072:

    * Y95 (optionally plus one character) immediately followed by U07x
    * U07x immediately followed by Y95
    * U07x, one intermediate entry, then Y95
    * U07x, two intermediate entries, then Y95

    Rows with a missing 'diagnosis_group' are flagged 0.

    Parameters
    ----------
    df_ini : pandas.DataFrame
        Dataframe with a 'diagnosis_group' string column. The 'NC_Method1'
        column is added to this dataframe in place (no copy is made).

    Returns
    -------
    pandas.DataFrame
        The same dataframe object, with the 0/1 'NC_Method1' column.
    """
    import numpy as np

    df = df_ini  # same object as the input; the new column is added in place

    Diagnosis_group = df['diagnosis_group']

    # '.?' (optional extra character after a code) is written without
    # parentheses: capturing groups match the same strings but make
    # str.contains emit a "pattern has match groups" UserWarning.

    # Y95 followed by U071/2
    Y95_Regex_1 = ['Y95.?~U071', 'Y95.?~U072']
    # U071/2, Y95
    # NOTE(review): unlike the patterns below, no optional character is
    # allowed after U071/U072 here - confirm this is intended.
    Y95_Regex_2 = ['U071~Y95', 'U072~Y95']
    # U071/2, ---, Y95
    Y95_Regex_3 = ['U071.?~[^~].{0,4}~Y95', 'U072.?~[^~].{0,4}~Y95']
    # U071/2, ---, ---, Y95
    Y95_Regex_4 = ['U071.?~[^~].{0,4}~[^~].{0,4}~Y95', 'U072.?~[^~].{0,4}~[^~].{0,4}~Y95']
    Y95_Regex = '|'.join(Y95_Regex_1 + Y95_Regex_2 + Y95_Regex_3 + Y95_Regex_4)

    # na=False: a missing diagnosis would otherwise produce NaN in the mask,
    # which np.where treats as truthy and would flag the row as 1.
    Filter_Y95 = Diagnosis_group.str.contains(Y95_Regex, regex=True, na=False)
    df['NC_Method1'] = np.where(Filter_Y95, 1, 0)

    return df
