# Data Preparation

## Load and Present Data

In [104]:
# Data sourcing and cleaning
from tabulate import tabulate
import pandas as pd
import numpy as np
from tensorboard.notebook import display

# Load the data set 
df = pd.read_csv('../data/raw_data/Children-and-young-ppl-asthma-organisational-audit-2019-20-Data.csv')
df.info()
print("Original Data")
print(df)
#Returning how many rows in the data frame 
print("\n The amount of organisations in the raw data set is",df.count().values[0],".") 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Columns: 156 entries, orgcode to q8_2hasyourcomm_iatricasthmacare
dtypes: object(156)
memory usage: 173.2+ KB
Original Data
    orgcode                          description  \
0       ADD               Addenbrooke's Hospital   
1       AEI        Royal Albert Edward Infirmary   
2       AIR            Airedale General Hospital   
3       ALC  Royal Alexandra Children's Hospital   
4       BAR   Barnsley District General Hospital   
..      ...                                  ...   
137     WSH                West Suffolk Hospital   
138     WYB           Withybush General Hospital   
139     WYT                 Wythenshawe Hospital   
140     YDH               York District Hospital   
141     YEO             Yeovil District Hospital   

                                                 trust  country emergency_adm  \
0    Cambridge University Hospitals NHS Foundation ...  England             -   
1    Wrightington

## Data Cleaning - Irrelevant Columns

In [105]:
#Dropping irrelevant columns 
#Dropping columns with contain the following keywords which are irrelevant for this data analysis
previous_col_size= len(df.columns)
valsToRemove = ["WTE","q8","q7","q6","q5","q4","q3","othernotlisted","cat"]
listOfColumns = []
for column in df.columns:
    if any(sub in column for sub in valsToRemove):
        listOfColumns.append(column)
    column_name = df[column].astype(str)
    wteString = column_name.str.contains("|".join(valsToRemove), case=False, na=False)
    if wteString.any():
        listOfColumns.append(column) 
df = df.drop(columns = listOfColumns)
succeeding_col_size = len(df.columns)
end_col_size = previous_col_size - succeeding_col_size
print(df)
print("There were", end_col_size,"irrelevant columns in the raw data set")

    orgcode                          description  \
0       ADD               Addenbrooke's Hospital   
1       AEI        Royal Albert Edward Infirmary   
2       AIR            Airedale General Hospital   
3       ALC  Royal Alexandra Children's Hospital   
4       BAR   Barnsley District General Hospital   
..      ...                                  ...   
137     WSH                West Suffolk Hospital   
138     WYB           Withybush General Hospital   
139     WYT                 Wythenshawe Hospital   
140     YDH               York District Hospital   
141     YEO             Yeovil District Hospital   

                                                 trust  country emergency_adm  \
0    Cambridge University Hospitals NHS Foundation ...  England             -   
1    Wrightington, Wigan and Leigh NHS Foundation T...  England           400   
2                        Airedale NHS Foundation Trust  England           110   
3    Brighton and Sussex University Hospitals NHS T

## Data Cleaning - Duplicate Values

In [106]:
# Duplicate values
has_duplicates = df.duplicated().sum()
print("The number of duplicate organisations are ",has_duplicates)  # True if duplicates exist

The number of duplicate organisations are  0


## Data Cleaning - Missing Values 

In [107]:
from sklearn.impute import SimpleImputer

# Handling the different types of na values "-, n/a , null"
print('Replacing the - values with na so it can be detected by dropna \n')
df = df.replace(['-','1-7'],np.nan)

#Number of missing values in whole dataset
total_missing_val= df.isna().sum().sum()

# How many NA values are in a row
print("EMPTY ROWS VALUES")
print(df.isna().sum(axis=1))

# How many NA values are in a column
print("\nEMPTY COLUMNS VALUES")
print(df.isna().sum(axis=0))

# Filling NA values using simple impute and mode(most frequent)
# Separate numeric and string columns
numeric_col = [col for col in df.columns if df[col].dtypes in ['int64', 'float64']]
string_columns = [col for col in df.columns if df[col].dtypes == 'object']

# Impute numeric columns with mean
if numeric_col:
    numeric_imputer = SimpleImputer(strategy='mean')
    df[numeric_col] = numeric_imputer.fit_transform(df[numeric_col])

# Impute string columns with the most frequent value
if string_columns:
    string_imputer = SimpleImputer(strategy='most_frequent')
    df[string_columns] = string_imputer.fit_transform(df[string_columns])

print("\n \n")    
print(df)
print("Total missing values that were replaced by SimpleImputer", total_missing_val )


Replacing the - values with na so it can be detected by dropna 

EMPTY ROWS VALUES
0      39
1       1
2       3
3       3
4       2
       ..
137     1
138    16
139     1
140     1
141     3
Length: 142, dtype: int64

EMPTY COLUMNS VALUES
orgcode                              0
description                          0
trust                                0
country                              0
emergency_adm                       28
                                    ..
asthmanursespecialistunfilledwte    11
nurseconsultant_nurseunfilledwte    12
specialistrespi_rapisunfilledwte    15
paediatricpsych_ogistunfilledwte    13
paediatricpharmacistunfilledwte     14
Length: 63, dtype: int64

 

    orgcode                          description  \
0       ADD               Addenbrooke's Hospital   
1       AEI        Royal Albert Edward Infirmary   
2       AIR            Airedale General Hospital   
3       ALC  Royal Alexandra Children's Hospital   
4       BAR   Barnsley District General H

## Data Cleaning - Data Usability

In [108]:
# Changing data frame column names
col_names = ["Hospital code","Hospital name",]
update_col_names = [f"{name}" for name in col_names]
df.columns = update_col_names


Unnamed: 0,orgcode,description,trust,country,emergency_adm,respiratory_adm,asthma_adm,admsperbed,respadmsperbed,asthmaadmsper1000adms,...,st3andaboveunfilledwte,paediatricconsultantunfilledwte,paediatricrespi_ltantunfilledwte,associatespecialistunfilledwte,staffgradeunfilledwte,asthmanursespecialistunfilledwte,nurseconsultant_nurseunfilledwte,specialistrespi_rapisunfilledwte,paediatricpsych_ogistunfilledwte,paediatricpharmacistunfilledwte
count,142,142,142,142,142,142,142,142,142,142,...,142,142,142,142,142,142,142,142,142,142
unique,142,142,117,3,85,26,14,34,12,54,...,13,7,2,4,6,4,3,4,4,3
top,ADD,Addenbrooke's Hospital,Hywel Dda University LHB,England,220,20,10,9,1,0,...,0,0,0,0,0,0,0,0,0,0
freq,1,1,3,128,35,51,108,43,88,34,...,72,109,138,136,124,129,138,136,131,138


In [None]:
# Comparing sizes of data frames 
# Saving the cleaned data set into a new file 
df.to_csv("cleaned_dataset.csv", index=False)
