# Data Preparation

## Load and Present Data

In [12]:
# Data sourcing and cleaning
import pandas as pd
import numpy as np
from tensorboard.notebook import display

# Reading the data set 
df = pd.read_csv('../data/raw_data/Children-and-young-ppl-asthma-organisational-audit-2019-20-Data.csv')
print("Original Data")
print(df.head(10))
print("\n \n")
print(df.tail(10))
#Returning how many rows in the data frame 
print("\n The amount of organisations in this data set is",df.count().values[0],".") 

Original Data
  orgcode                          description  \
0     ADD               Addenbrooke's Hospital   
1     AEI        Royal Albert Edward Infirmary   
2     AIR            Airedale General Hospital   
3     ALC  Royal Alexandra Children's Hospital   
4     BAR   Barnsley District General Hospital   
5     BAS                    Basildon Hospital   
6     BAT           Royal United Hospital Bath   
7     BCH       Birmingham Children's Hospital   
8     BED                     Bedford Hospital   
9     BFH                Broomfield Chelmsford   

                                               trust  country emergency_adm  \
0  Cambridge University Hospitals NHS Foundation ...  England             -   
1  Wrightington, Wigan and Leigh NHS Foundation T...  England           400   
2                      Airedale NHS Foundation Trust  England           110   
3  Brighton and Sussex University Hospitals NHS T...  England            10   
4             Barnsley Hospital NHS Foun

## Data Cleaning

In [17]:
# Dropping irrelevent columns 

#Dropping columns with contain "WTE"
valToRemove = "WTE"
listOfColumns = []
for column in df.columns:
    column_name = df[column].astype(str)
    wteString = column_name.str.contains("|".join(valToRemove), case=False, na=False)
    if wteString.any():
        listOfColumns.append(column)
df = df.drop(columns = listOfColumns)

print (df)

    emergency_adm respiratory_adm asthma_adm admsperbed respadmsperbed  \
0               -               -          -          -              -   
1             400              35         10         20              2   
2             110              15        1-7          6              1   
3              10             1-7        1-7          1              0   
4             325              20        1-7         20              1   
..            ...             ...        ...        ...            ...   
137           200              25         10         20              3   
138            26               3          0          -              -   
139           390              65         10         12              2   
140          1510            1160         75         69             53   
141           180              15        1-7          8              1   

    asthmaadmsper1000adms howmanypaediatr_icasthmapatients  \
0                       -                        

Via inspection I have noticed that some values which are - are also equlient to Nan values, but since python is dropna() function only detects nan values I would have to convert the NaN or Null values 

In [19]:
# Data Cleaning - missing values

# Dropping unuseful columns 
# Handling the different types of na values "-, n/a , null"
print('Replacing the - values with na so it can be detected by dropna \n')
df = df.replace('-',np.nan)

# How many NA values are in a row
print("EMPTY ROWS VALUES")
print(df.head(50).isna().sum(axis=1))
print(df.tail(50).isna().sum(axis=1))

# How many NA values are in a column
print("\nEMPTY COLUMNS VALUES")
print(df.isna().sum(axis=0))




Replacing the - values with na so it can be detected by dropna 

EMPTY ROWS VALUES
0     43
1      2
2      3
3      1
4      1
5      3
6      2
7      0
8      1
9      2
10    34
11     2
12     2
13     1
14     2
15    18
16     2
17    34
18     4
19     3
20     2
21     3
22    33
23    47
24     1
25     1
26     1
27     2
28     2
29     3
30     2
31     2
32     1
33     2
34     0
35     1
36    17
37     3
38    46
39     1
40     2
41     1
42     2
43     2
44     1
45    48
46    33
47    32
48     2
49    33
dtype: int64
92     46
93      3
94      1
95      2
96      2
97      0
98     50
99      1
100     2
101     2
102    47
103     0
104     1
105     3
106     2
107    32
108     2
109     2
110     2
111     1
112     0
113     1
114     1
115     2
116     1
117     2
118    33
119     1
120    35
121    47
122    45
123     3
124     1
125     2
126     2
127     1
128    34
129    14
130     1
131     1
132     1
133     1
134    47
135    15
136     1
137 

The statement which I have chosen takes into consideration the amount of na values which is in each row in the different format. Based on the inspection there are many na values in the format - or na, so I have replaced the - values with na so it can be detected. I then checked how many na values are in each row and how many na values are in each column

So it would be appropriate to drop a row or column I would need to have a threshold, 143 by 156 respectively. I believe that if a row or column has too many missing values it would be acceptable to drop them as it wouldn't be useful 

In [None]:
# duplicate values

In [None]:
#formatting

In [None]:
#headers

In [None]:
# Data transformation - normalise, Discretize or aggregate , new attributes 


In [None]:
# Data reduction

In [None]:
# Saving the cleaned data set into a new file 
df.to_csv("cleaned_dataset.csv", index=False)
