# stage_02_Objective_0_PreProcessing_004.py
"""
What this file does:
This workbook undertakes pre-processing of the service's data set.
The output from this .py file is 1 or more 'processed' data files for each of the
objectives in scope.
The file checks for the presence of a directory structure, and if not present, creates this.
The file saves the processed files into the relevant folder of that created directory
structure, as applicable for each component of the project.
Currently, the processing that takes place serves to clean up the messy data.
At this point in time, this is done in one of 2 ways, dependant whether the field in question
is a numeric (float or integer) field or a string (text) field. If numeric, the code calculates
the median of that field and replaces (imputes) all Null (missing) values with the median. If string
the code replaces (imputes) the Null (missing) value with the word 'missing'. This is because Machine
Learning algorithms can't handle missing values.
It is important to note though that while the code will identify what fields are numeric
and what string, it gives the user the option of confirming what fields are in scope for
imputing missing values (median or word 'missing').
"""
# ---------------------------------------
#<<< Importing Libraries >>>
# ---------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import os
# ---------------------------------------
# Check whether the "Assets_produced_by_code folder exists in the current directory.
# If it doesn't exist, create it with the following directory structure:
# Assets_produced_by_code (top level)
# 01_pre_processing_assets<br>
# 02_HEA_assets
# 03_DNA_ML_assets
# 04_Carbon_emissions_assets
#subdirectory folder names for assets folder
preprocessing_assets_path = 'Assets_produced_by_code/01_pre_processing_assets'
hea_assets_path = 'Assets_produced_by_code/02_HEA_assets'
dna_assets_path = 'Assets_produced_by_code/03_DNA_ML_assets'
carbon_emissions_assets_path = 'Assets_produced_by_code/04_Carbon_emissions_assets'
#logic to check if Assets_produced_by_code folder exists, if not, create it with subdirs too
if not os.path.exists('Assets_produced_by_code'):
    os.makedirs('Assets_produced_by_code')
    os.makedirs(preprocessing_assets_path)
    os.makedirs(hea_assets_path)
    os.makedirs(dna_assets_path)
    os.makedirs(carbon_emissions_assets_path)
    print("New directories have been created to store the outputs from the code.")
else:
    print("Existing 'Assets_produced_by_code' directory located.")
# ---------------------------------------
#<<< Create branding colours >>>
# ---------------------------------------
#Taken from https://www.england.nhs.uk/nhsidentity/identity-guidelines/colours/
"""
New code to use user-entered branding colours, taken from the user_params file.
"""
#identify the colour values to use from the Branding tab of the user parameters file
filename = "raw_data/user_and_data_parameters/user_and_data_params.xlsx"
#read-in user param for level_1_colour
level_1_colour = pd.read_excel(filename, 'Branding', index_col=None, usecols = "D", header = 1, nrows=0)
level_1_colour = list(level_1_colour)[0]
#read-in user param for level_2_colour
level_2_colour = pd.read_excel(filename, 'Branding', index_col=None, usecols = "D", header = 2, nrows=0)
level_2_colour = list(level_2_colour)[0]
#read-in user param for level_3_colour
level_3_colour = pd.read_excel(filename, 'Branding', index_col=None, usecols = "D", header = 3, nrows=0)
level_3_colour = list(level_3_colour)[0]
#read-in user param for level_4_colour
level_4_colour = pd.read_excel(filename, 'Branding', index_col=None, usecols = "D", header = 4, nrows=0)
level_4_colour = list(level_4_colour)[0]
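#How the reads above work: nrows=0 reads no data rows, and header=N treats row N of the
#'Branding' sheet as the header row, so the single cell in column D surfaces as a column
#name; list(df)[0] then pulls that column name out as the colour value string.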
# ---------------------------------------
#<<< Define Functions >>>
# ---------------------------------------
#Identify any duplicate rows and allow user to decide whether to remove
def identify_duplicate_rows(df):
    #Could use strtobool here - it recognises commonly used yes/no responses and converts them to 1 or 0
    #(note: distutils is deprecated and removed in Python 3.12, so an alternative would be needed there)
    """
    >> from distutils.util import strtobool
    >> strtobool('yes')
    """
    original_df_num_rows = len(df)
    print(f'\nOriginal number of rows: {original_df_num_rows}')
    df_dedup = df.drop_duplicates()
    df_dedup_num_rows = len(df_dedup)
    print(f'\nNumber of unique rows if duplicates removed: {df_dedup_num_rows}')
    num_duplicates = original_df_num_rows - df_dedup_num_rows
    #print(f'Number of duplicates removed: {num_duplicates}')
    #Remove duplicates - user decision if duplicates detected
    if num_duplicates != 0:
        print(f'\nWARNING: This data set has {num_duplicates} duplicate rows:')
        df_dup_rows = df.duplicated()
        print(df[df_dup_rows])
        print('\nWould you like to remove these duplicate rows?')
        while True:
            try:
                remove_dups = int(input('\nPress 1 to remove or 2 to retain >>>> '))
                if remove_dups == 1:
                    df_to_use = df_dedup.copy() #DataFrame.append was removed in pandas 2.0, so take a copy instead
                    break
                elif remove_dups == 2:
                    df_to_use = df.copy()
                    break
                else:
                    print('That is not a valid selection.')
            except ValueError:
                print('That is not a valid selection.')
        return df_to_use
    else:
        print('This data set has no duplicate rows.')
        return df
# ---------------------------------------
#Allow user to define what column heading / data field name is a unique identifier for patients
#such as customer ID, patient ID etc.
#Assign to variable name of "patient_id", which is then used throughout the rest of the program
#Attempts to control for typos / name variations etc. in data sets for different use cases
def user_field_selection(field_list, field_name):
dict_of_fields = {}
key_counter = 1
for value in field_list:
dict_of_fields[key_counter] = value
key_counter += 1
print("\nThe data fields present in the data set are listed below")
for key_number in dict_of_fields:
print(str(key_number) + " : " + str(dict_of_fields[key_number]))
print(f"\nPlease enter the number that represents the {field_name} field in the above list:\n")
    while True:
        try:
            patient_id_num = int(input('Make a selection: '))
            if patient_id_num in dict_of_fields:
                break
            print('Invalid input. Please enter one of the listed numbers.')
        except ValueError:
            print('Invalid Input. Please enter a number.')
    print(f"\nGreat. The entered number was: {patient_id_num}")
    print(f"\nThe {field_name} field that the program will use is: {dict_of_fields[patient_id_num]}")
    return dict_of_fields[patient_id_num]
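#Usage sketch (illustrative only) - list a loaded DataFrame's columns and return the user's pick:
"""
>> patient_id = user_field_selection(list(df.columns), "patient ID")
"""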
# ---------------------------------------
def data_quality_summary(df, field_list):
present_values = []
missing_values = []
percent_missing = []
list_of_lists = [present_values, missing_values, percent_missing]
list_names = ["Present", "Missing", "% Missing"]
for col_name in field_list:
col_missing = df[col_name].isna().sum()
missing_values.append(col_missing)
col_present = df[col_name].count()
present_values.append(col_present)
        try:
            percent = (col_missing / (col_missing + col_present)) * 100
            percent_missing.append(round(percent, 1))
        except ZeroDivisionError: #field has no rows at all (0 present, 0 missing)
            percent_missing.append(0.0)
data_quality_summary_df = pd.DataFrame(list_of_lists, columns = field_list, index = list_names)
#create new lists, representing counts of present / missing data by
#variable, and percent missing
#To enable summary table beneath chart (subsequent cell)
present = data_quality_summary_df.loc["Present"].values.tolist()
missing = data_quality_summary_df.loc["Missing"].values.tolist()
missing_pct = data_quality_summary_df.loc["% Missing"].values.tolist()
return list_names, present, missing, missing_pct, data_quality_summary_df
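#Illustrative output (sketch): for a two-column df with one Null in 'age',
#data_quality_summary(df, ['age', 'gender']) would produce a summary frame like:
"""
>>             age  gender
>> Present       9      10
>> Missing       1       0
>> % Missing  10.0     0.0
"""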
# ---------------------------------------
#consider adding second axis plot for % missing
#This could improve interpretation (where the missing count is small, it's invisible on the chart...)
def present_missing_stacked_bar(
fields_list,
present,
missing,
present_color,
missing_color,
title,
filename,
file_location):
#width = 0.35
width = 0.5
fig, ax = plt.subplots()
ax.bar(fields_list, present, width, label="Present Values", color=present_color)
ax.bar(fields_list, missing, width, bottom=present, label="Missing Values", color=missing_color)
ax.set_ylabel("Value count")
ax.set_title(f"Missing Values Summary: {title}")
ax.tick_params(axis='x', labelrotation = 90)
ax.legend(loc="lower right")
plt.savefig(file_location+filename+".png", bbox_inches="tight")
plt.show()
# ---------------------------------------
#https://www.pythonpool.com/matplotlib-table/
#data_quality_summary_df.loc[data_quality_summary_df['column_name'] == some_value]
def present_missing_stacked_bar_with_table(
list_names,
fields_list,
present,
missing,
missing_pct,
present_color,
missing_color,
pct_color,
title,
filename,
file_location):
plt.rcParams['figure.figsize'] = [10, 5]
data = [present, missing, missing_pct]
columns = fields_list
rows = list_names #present, missing, % missing
colors = [present_color, missing_color, pct_color]
n_rows = len(data) #3
index = np.arange(len(columns)) + 0.3
bar_width = 0.4
# Initialize the vertical-offset for the stacked bar chart.
y_offset = np.zeros(len(columns))
# Plot bars and create text labels for the table
cell_text = []
    for row in range(n_rows):
        cell_text.append([f'{x:.1f}' for x in data[row]])
        if "%" in rows[row]:
            continue #the % row appears in the table only, not as a bar
        plt.bar(index, data[row], bar_width, bottom=y_offset, color=colors[row])
        y_offset = y_offset + data[row] #stack this row's heights so the next row's bars sit on top
# Add a table at the bottom of the axes
the_table = plt.table(cellText=cell_text,
rowLabels=rows,
rowColours=colors,
colLabels=columns,
loc='bottom')
the_table.auto_set_font_size(False)
the_table.set_fontsize(10)
# Adjust layout to make room for the table:
plt.subplots_adjust(left=0.2, bottom=0.2)
plt.ylabel("Count of rows in the data set used")
#plt.yticks(values * value_increment, ['%d' % val for val in values])
plt.xticks([])
plt.title(f"Missing Data Summary: {title}.")
plt.savefig(file_location+filename+".png", bbox_inches="tight")
plt.show()
# ---------------------------------------
#Function to visually check whether data is missing at random or not
#Define function to visualise matrix of missing values
#Missing data appears as white space in the chart
#Development idea: set fig size for this (and possibly all) chart(s) to ensure consistency, esp. if creating a report at the end
def dq_matrix(df, filename, file_location):
matrix = msno.matrix(df)
matrix_copy = matrix.get_figure() #new line
matrix_copy.savefig(file_location+filename+".png", bbox_inches="tight") #changed from matrix to matrix_copy
return matrix
# ---------------------------------------
#The missingno correlation heatmap measures nullity correlation:
#how strongly the presence or absence of one variable affects the presence of another:
#(source: https://github.com/ResidentMario/missingno)
def heatmap(df, filename, file_location):
print("WARNING: Results in the below chart that show '<1' or '>-1' suggest the presence of a correlation that is close to being exactly negative or positive (but is still not quite perfectly so) and cases requiring special attention.")
heatmap = msno.heatmap(df)
heatmap_copy = heatmap.get_figure() #newline
heatmap_copy.savefig(file_location+filename+".png", bbox_inches="tight") #change name from heatmap
return heatmap
# ---------------------------------------
def missing_values_field_names(fields_list, missing):
"""
dict datafields (keys) and missing series (values)
iterate over values in datafields dict, check if >0
if so, create new dict of the subset where there are missing values (misisng == >0)
then, iterate over that, and use each key/value pair, plug into impute function
"""
keys = fields_list
values = missing
#dict_fieldname_missing_values = dict(zip(keys, values))
#dict_fieldname_missing_values
#Create dictionary of field names (keys) and county of missing values (values)
iterator_fieldname_missing_values = zip(keys, values)
dict_fieldname_missing_values = dict(iterator_fieldname_missing_values)
#Use this dictionary to create a list, consisting of the subset of field names, where there are missing values
#This will be used later in the program to identify which fields require missing values to be accounted for in some way
missing_values_fields = []
fields_none_missing = []
for field in fields_list:
if dict_fieldname_missing_values[field] >0:
missing_values_fields.append(field)
else:
fields_none_missing.append(field)
return dict_fieldname_missing_values, missing_values_fields
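#Illustrative sketch: with fields_list = ['age', 'gender'] and missing = [3, 0],
#this function would return ({'age': 3, 'gender': 0}, ['age']).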
# ---------------------------------------
#Function adjusted from the titanic example by Mike Allen
def impute_missing_with_median(series):
"""
Replace missing values in a Pandas series with median,
Returns a completed series, and a series showing which values are imputed
Required because ML algorithms like Log regression cannot handle missing / Null values.
By imputing missing values we are replaced all Null values in a numeric series with the
median of that series.
"""
median = series.median(skipna = True) #ignore missing values in calculation
missing = series.isna()
# Copy the series to avoid change to the original series.
series_copy = series.copy()
series_copy[missing] = median
return series_copy, missing
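#Illustrative example (sketch, not executed):
"""
>> s = pd.Series([1.0, None, 5.0])
>> filled, missing = impute_missing_with_median(s)
>> filled.tolist()   #[1.0, 3.0, 5.0] - the Null is replaced by the median, 3.0
>> missing.tolist()  #[False, True, False] - flags which positions were imputed
"""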
# ---------------------------------------
def group_fields_by_data_type(df, fields_list, dict_fieldname_missing_values):
"""
Identify which fields both have missing values and are an appropriate data type to have their median calculated.
This doesn't consider whether an int/float field is contextually appropriate to have a median calculated.
E.g. if appt_id is an incremental numeric field, it will be logically identified and assigned to int_float_fields
    but it is contextually meaningless to calculate a median appt_id.
Therefore, the output of this step needs some form of user validation / approval to proceed.
This is to be handled in the next cell.
"""
#df_with_imd_unique_pts.dtypes
data_type_dict = dict(df.dtypes)
int_float_fields = []
str_fields = []
datetime_fields = []
    #original - possible error: referenced the global df_with_imd_unique_pts rather than the df parameter
"""
#identify which fields both have missing values and are int or float type. Append to list.
for field in fields_list:
if dict_fieldname_missing_values[field] > 0:
if df_with_imd_unique_pts[field].dtype == "float" or df_with_imd_unique_pts[field].dtype == "int":
int_float_fields.append(field)
elif df_with_imd_unique_pts[field].dtype == "<M8[ns]":
datetime_fields.append(field)
else:
str_fields.append(field)
return int_float_fields, str_fields, datetime_fields, data_type_dict
"""
    #revised code correcting the error above: operate on the df parameter passed in
#identify which fields both have missing values and are int or float type. Append to list.
for field in fields_list:
if dict_fieldname_missing_values[field] > 0:
if df[field].dtype == "float" or df[field].dtype == "int":
int_float_fields.append(field)
elif df[field].dtype == "<M8[ns]":
datetime_fields.append(field)
else:
str_fields.append(field)
return int_float_fields, str_fields, datetime_fields, data_type_dict
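#Note: "<M8[ns]" is the low-level numpy dtype string for datetime64[ns], so the dtype
#comparison above is equivalent to checking df[field].dtype == "datetime64[ns]".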
# ---------------------------------------
"""
User confirmation these are all logically / contextually relevant
(e.g. numeric patient ID could have median calculated but this doesnt make sense and
should be removed before the process to control for missing values takes place)
This function requires two input parameters:
1. a field_list, and
2. a data type as a string ("int" or "string")
"""
def identify_fields_to_impute(fields_list, data_type):
#tell python which variables from the global scope to update
global int_float_fields_keep
global int_float_fields_drop
global str_fields_keep
global str_fields_drop
global datetime_fields_keep
global datetime_fields_drop
#Create a dictionary of incremental numbers starting from 1 (keys) and
#field names identified in the last step as being int/float data type and having missing values (values)
dict_of_missing_fields = {}
key_counter = 1
for value in fields_list:
dict_of_missing_fields[key_counter] = value
key_counter += 1
#Print to screen a numbered 'list' to the user, listing the fields that have been identified as int/float and missing values
if data_type == "int":
data_type_descriptor = "numeric (integer/decimal)"
elif data_type == "str":
data_type_descriptor = "string (text)"
else:
data_type_descriptor = "date time type"
print(f"\nThe data fields with missing data that look to be {data_type_descriptor} are listed below")
for key_number in dict_of_missing_fields:
print(str(key_number) + " : " + str(dict_of_missing_fields[key_number]))
#Get user confirmation that it is contextually appropriate to proceed with these fields for missing value control
print("\nDo you want to impute missing data for all of these fields?")
print("\nPlease enter 'y' to continue with all fields in scope, or 'n' to choose which fields to drop")
while True:
try:
selection = input("Please make a selection (y or n): ").lower()
if selection == "y" or selection == "n":
break
print("Invalid selection")
except ValueError:
print("\nThat is not a valid selection. Please try again (y or n).")
#loop to choose which fields to drop from the missing value process
if selection == "n":
while True:
print("\nThe data fields currently in scope are listed below:")
for key_number in dict_of_missing_fields:
print(f"{key_number} : {dict_of_missing_fields[key_number]}")
print("0 : To continue with the current selection")
            while True:
                try:
                    drop_selection = int(input("\nPlease enter the number of the field to remove, or 0 to continue with the current selection:"))
                    if drop_selection == 0 or drop_selection in dict_of_missing_fields:
                        break
                    else:
                        print("\nThat is not a correct selection.") #also rejects numbers whose fields were already removed
                except ValueError:
                    print("Value Error - try again.")
if drop_selection == 0:
break
else:
if data_type == "int":
int_float_fields_drop.append(dict_of_missing_fields[drop_selection]) #create new list of fields user does not want to control for missing values
del dict_of_missing_fields[drop_selection]
elif data_type == "str":
str_fields_drop.append(dict_of_missing_fields[drop_selection]) #create new list of fields user does not want to control for missing values
del dict_of_missing_fields[drop_selection]
else:
datetime_fields_drop.append(dict_of_missing_fields[drop_selection]) #create new list of fields user does not want to control for missing values
del dict_of_missing_fields[drop_selection]
else:
print("Please enter a valid selection.")
if data_type == "int":
int_float_fields_keep = fields_list
for item in int_float_fields_drop:
int_float_fields_keep.remove(item)
elif data_type == "str":
str_fields_keep = fields_list
for item in str_fields_drop:
str_fields_keep.remove(item)
else:
datetime_fields_keep = fields_list
for item in datetime_fields_drop:
datetime_fields_keep.remove(item)
print()
print(f"\nGreat. The following {data_type_descriptor} fields will be used when controlling for missing data:")
if data_type == "int":
for item in int_float_fields_keep:
print(item)
elif data_type == "str":
for item in str_fields_keep:
print(item)
else:
for item in datetime_fields_keep:
print(item)
# ---------------------------------------
"""
Decision / consideration needed:
>Need to decide whether to remove df rows based on missing values in
int_float_fields_drop and str_fields_drop lists or exclude the df columns these fields
relate to entirely, as ML can't handle Nulls.
Current method involves removing the rows.
"""
def remove_inappropriate_appt_statuses(df, df_field, field_name):
print(f"\nThe {field_name} field that the program will use is: {df_field}")
print("\nThe program requires a binary outcome measure (e.g. attended or did not attend).")
ls = list(set(df[df_field]))
    unique_options_attend_status = [item for item in ls if type(item) != float] #drop NaN (a float) from the unique status values
dict_of_unique_options = {}
appt_status_to_drop = []
appt_status_to_keep = []
if len(unique_options_attend_status) == 2:
print(f"{df_field} is a binary measure - perfect!")
unique_key_counter = 1
for option in unique_options_attend_status:
dict_of_unique_options[unique_key_counter] = option
unique_key_counter += 1
elif len(unique_options_attend_status) > 2:
print(f"\n{df_field} has too many possible appointment statuses.")
print("\nThe unique appointment statuses are listed below.")
print("One by one, please enter the number for each option that represents a staus other than attended and did not attend (e.g. patient cancelled, etc.)")
print("This will remove rows for those appointments from the data set to be used in the analysis.")
print("Once the list only includes 2 appointment statuses, enter 0 to continue.")
#dict_of_unique_options = {}
unique_key_counter = 1
for option in unique_options_attend_status:
dict_of_unique_options[unique_key_counter] = option
unique_key_counter += 1
#appt_status_to_drop = []
#appt_status_to_keep = []
appt_status_to_drop.append("nan")
#loop to choose which fields to drop from the missing value process
while True:
print("\nThe appointment statuses currently in scope are listed below:")
for key_number in dict_of_unique_options:
print(f"{key_number} : {dict_of_unique_options[key_number]}")
print("0 : To continue with the current selection")
            while True:
                try:
                    drop_selection = int(input("\nPlease enter the number of the appointment status to remove, or 0 to continue with the current selection:"))
                    if drop_selection == 0 or drop_selection in dict_of_unique_options:
                        break
                    else:
                        print("\nThat is not a correct selection.") #also rejects numbers whose statuses were already removed
                except ValueError:
                    print("Value Error - try again.")
if drop_selection != 0:
appt_status_to_drop.append(dict_of_unique_options[drop_selection]) #create new list of appt statuses user wants to exclude
del dict_of_unique_options[drop_selection]
elif drop_selection == 0:
break
# else:
# appt_status_to_keep.append(dict_of_unique_options[drop_selection]) #create new list of appt statuses user wants to include
# del dict_of_unique_options[drop_selection]
else:
print("Please enter a valid selection.")
for key in dict_of_unique_options:
appt_status_to_keep.append(dict_of_unique_options[key])
# else:
# continue #WARNING possible point of failure if there is only one appointment status in the column, not handled currently
return appt_status_to_drop, appt_status_to_keep
# ---------------------------------------
#Consider how to change the terminology used here, in case the term "missing" is used in the actual data set passed
#Taken from titanic - 01. pre-processing example:
def impute_missing_with_missing_label(series):
"""Replace missing values in a Pandas series with the text 'missing'"""
missing = series.isna()
# Copy the series to avoid change to the original series.
series_copy = series.copy()
series_copy[missing] = 'missing'
return series_copy, missing
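#Illustrative example (sketch, not executed):
"""
>> s = pd.Series(['White British', None, 'Asian'])
>> filled, missing = impute_missing_with_missing_label(s)
>> filled.tolist()  #['White British', 'missing', 'Asian']
"""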
# ---------------------------------------
#REPLACED WITH NEW FUNCTION BELOW
'''
def impute_missing(field_list, data_type):
#Tell the function to update variables that exist in the global space, and not locally within the function
global dict_ints_imputed
global dict_ints_imputed_missing
global dict_str_imputed
global dict_str_imputed_missing
if data_type == "int":
for field in field_list:
series, missing = impute_missing_with_median(obj_1a_df_unique_pts[field])
dict_ints_imputed[field] = series
dict_ints_imputed_missing[field] = missing
else:
for field in field_list:
series, missing = impute_missing_with_missing_label(obj_1a_df_unique_pts[field])
dict_str_imputed[field] = series
dict_str_imputed_missing[field] = missing
'''
def impute_missing(df, field_list, data_type):
#Tell the function to update variables that exist in the global space, and not locally within the function
global dict_ints_imputed
global dict_ints_imputed_missing
global dict_str_imputed
global dict_str_imputed_missing
if data_type == "int" or data_type == 'float':
for field in field_list:
series, missing = impute_missing_with_median(df[field])
dict_ints_imputed[field] = series
dict_ints_imputed_missing[field] = missing
else:
for field in field_list:
series, missing = impute_missing_with_missing_label(df[field])
dict_str_imputed[field] = series
dict_str_imputed_missing[field] = missing
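#Usage sketch (illustrative): assumes the four global dicts above have been initialised
#(e.g. dict_ints_imputed = {}) before the call, which is assumed to happen elsewhere in this file:
"""
>> impute_missing(df, int_float_fields_keep, "int")  #median-imputes each numeric field
>> impute_missing(df, str_fields_keep, "str")        #label-imputes each string field
"""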
# ---------------------------------------
#duplicate of impute_missing for objective 1b - calling the original a second time caused an error,
#so this variant writes to separate _1b dictionaries as a workaround
def impute_missing_1b(df, field_list, data_type):
#Tell the function to update variables that exist in the global space, and not locally within the function
global dict_ints_imputed_1b
global dict_ints_imputed_missing_1b
global dict_str_imputed_1b
global dict_str_imputed_missing_1b
if data_type == "int" or data_type == 'float':
for field in field_list:
series, missing = impute_missing_with_median(df[field])
dict_ints_imputed_1b[field] = series
dict_ints_imputed_missing_1b[field] = missing
else:
for field in field_list:
series, missing = impute_missing_with_missing_label(df[field])
dict_str_imputed_1b[field] = series
dict_str_imputed_missing_1b[field] = missing
# ---------------------------------------
'''
#original function
#identify the fields that are labels and not numbers (as the encoding will be done on label data only)
def fields_to_encode_or_drop(df):
final_processed_data_fields = list(df.columns)
fields_to_not_encode = []
fields_to_encode = []
fields_to_remove = [] #unsure if this is needed? had thought so for bools, but now removed those via alternative method
for field in final_processed_data_fields:
if field != patient_id and field != appt_id:
if df[field].dtype == "float64" or df[field].dtype == "int" or df[field].dtype == "<M8[ns]":
fields_to_not_encode.append(field)
#elif df[field].dtype == "<M8[ns]":
# fields_to_remove.append(field)
else:
fields_to_encode.append(field)
return fields_to_not_encode, fields_to_encode, fields_to_remove
'''
#revised function to remove columns from the df where they are all blank
#revised function 18/9
#identify the fields that are labels and not numbers (as the encoding will be done on label data only)
def fields_to_encode_or_drop(df):
#revised to include the global variables
global patient_id
global appt_id
global age
global lsoa_residence
final_processed_data_fields = list(df.columns)
fields_to_not_encode = []
fields_to_encode = []
fields_to_remove = [] #unsure if this is needed? had thought so for bools, but now removed those via alternative method
for field in final_processed_data_fields:
#if field != patient_id and field != appt_id and field != age and field != lsoa_residence: #revised to exclude age and lsoa from encoding
if field != patient_id and field != appt_id and field != lsoa_residence: #revised to exclude age and lsoa from encoding
if df[field].dtype == "float64" or df[field].dtype == "int" or df[field].dtype == "<M8[ns]" or df[field].dtype == 'int64':
fields_to_not_encode.append(field)
else:
            try:
                if df[field].value_counts()['missing'] == df.shape[0]:
                    fields_to_remove.append(field) #every value is 'missing', so the column carries no information
                else:
                    fields_to_encode.append(field) #field has real values alongside some 'missing' labels
            except KeyError:
                fields_to_encode.append(field) #no 'missing' values at all in this field
return fields_to_not_encode, fields_to_encode, fields_to_remove
# ---------------------------------------
#create function to encode all relevant fields for a given data frame and list of fields to encode, passed into the function as parameters
def encoding_dataframe(dataframe, field_list_to_encode, field_list_to_not_encode):
unencoded_df = pd.DataFrame()
for field in field_list_to_not_encode:
unencoded_df = pd.concat([unencoded_df, dataframe[field]], axis=1)
#Create blank dataframe that will be populated with encoded series
encoded_dataframe = pd.DataFrame()
    #iterate through all fields in the encode list passed to the function
    for field in field_list_to_encode:
        #encode the current field in the loop and create a dataframe for the current field's encoding
        field_encoded = pd.get_dummies(dataframe[field], prefix=f"encoded_{field}")
        #concatenate the current field's encoding with the growing encoded dataframe created above
        encoded_dataframe = pd.concat([encoded_dataframe, field_encoded], axis=1)
combined_df = pd.concat([unencoded_df, encoded_dataframe], axis=1)
return combined_df
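#Illustrative example of the one-hot encoding performed above (sketch, not executed):
"""
>> pd.get_dummies(pd.Series(['van', 'clinic', 'van']), prefix='encoded_clinic_format')
>>    encoded_clinic_format_clinic  encoded_clinic_format_van
>> 0                             0                          1
>> 1                             1                          0
>> 2                             0                          1
"""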
# ---------------------------------------
#Function to remove bool series from final produced df
def remove_booleans_from_df(df):
final_processed_df_bools_removed = pd.DataFrame()
for column in df:
if df[column].dtype != 'bool':
final_processed_df_bools_removed[column] = df[column]
return final_processed_df_bools_removed
# ---------------------------------------
#function to create subset dataframes from main df, for each objective in project
#def make_df(df, dict_field_to_keep):
# subset_df = df.copy()
# for key in dict_field_to_keep.keys():
# if dict_field_to_keep[key] == 'n':
# subset_df = subset_df.drop(key, 1)
# return subset_df
#new function to control for capitalised field names in raw source data - replaces the commented-out function above (17/09)
#function to create subset dataframes from main df, for each objective in project
def make_df(df, dict_field_to_keep):
subset_df = df.copy()
for key in dict_field_to_keep.keys():
if dict_field_to_keep[key] == 'n':
            subset_df = subset_df.drop(columns=key.lower()) #the positional axis argument was removed in pandas 2.0
return subset_df
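#Usage sketch (illustrative, with hypothetical keep/drop flags): the dict maps a field
#name to 'y' (keep) or 'n' (drop):
"""
>> obj_df = make_df(df, {'age': 'y', 'religion': 'n'})  #returns df minus the 'religion' column
"""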
# ---------------------------------------
def assign_field_name_to_variable(filename, sheetname, row_num):
"""
function to assign a given user-provided field names in the 'user_data_and_params.xlsx file,
to a variable name.
"""
field_name = pd.read_excel(filename, sheetname, index_col=None, usecols = "B", header = row_num, nrows=0)
field_name = field_name.columns.values[0]
try:
field_name = (field_name.split("."))[0]
except:
pass
return(field_name)
# ---------------------------------------
def press_any_key_to_continue():
"""
Function to pause the code from running, requiring a user to hit any key for code
processing to resume.
"""
print("")
print("-" * 44)
print("<<<<< Press any key to continue >>>>> ")
print("-" * 44)
input()
# ---------------------------------------
#function to provide numbered prefixes to file names for files saved as outputs from the code
def number_saved_file_preprocessing(counter):
"""
To be used as follows:
file_prefix = number_saved_file(counter)
"""
#check if counter between 1 and 9, if so return counter prefixed by two leading zeros.
#e.g. if counter = 1, this would return 001
if counter in range (1,10):
return(f"00{counter}")
elif counter in range(10,100):
return(f"0{counter}")
else:
return(counter)
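#Note (sketch): for counters 1-99 this matches f"{counter:03d}" / str(counter).zfill(3);
#those always return a string, whereas the branch above returns the raw int once the
#counter reaches 100.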
# ---------------------------------------
# ---------------------------------------
# <<< CODE STARTS! >>>
# ---------------------------------------
# ---------------------------------------
#setting up parameters
#counters for file names
df_counter = 1
chart_counter = 1
# Read in user_and_data_params file assign to filename variable
filename = "raw_data/user_and_data_parameters/user_and_data_params.xlsx"
# ---------------------------------------
#Assign all field names from the data set to variable names, for use throughout the code
patient_id = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 1).lower()
appt_id = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 2).lower()
age = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 3).lower()
lsoa_residence = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 4).lower()
ethnicity = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 5).lower()
gender_at_birth = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 6).lower()
attend_status = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 7).lower()
appt_type = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 8).lower()
appt_date_time = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 9).lower()
gp_practice_code = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 10).lower()
religion = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 11).lower()
disability = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 12).lower()
pregnant = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 13).lower()
clinic_name = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 14).lower()
attend_reason = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 15).lower()
activity_in_van = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 16).lower()
in_out_area_classifier = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 17).lower()
imd_decile = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 18).lower()
clinic_format = assign_field_name_to_variable(filename, "AssignFieldNamesToVars", 19).lower()
# ---------------------------------------
#sense checking cell - include in final. The print at the end of this section exists to show the user that the code has read the right field name into each variable.
list_var_names = [
"patient_id",
"appt_id",
"age",
"lsoa_residence",
"ethnicity",
"gender_at_birth",
"attend_status",
"appt_type",
"appt_date_time",
"gp_practice_code",
"religion",
"disability",
"pregnant",
"clinic_name",
"attend_reason",
"activity_in_van",
"in_out_area_classifier",
"imd_decile",
"clinic_format"
]
dict_var_names_original = {
"patient_id": patient_id,
"appt_id": appt_id,
"age": age,
"lsoa_residence": lsoa_residence,
"ethnicity": ethnicity,
"gender_at_birth": gender_at_birth,
"attend_status": attend_status,
"appt_type": appt_type,
"appt_date_time": appt_date_time,
"gp_practice_code": gp_practice_code,
"religion": religion,
"disability": disability,
"pregnant": pregnant,
"clinic_name": clinic_name,
"attend_reason": attend_reason,
"activity_in_van": activity_in_van,
"in_out_area_classifier": in_out_area_classifier,
"imd_decile": imd_decile,
"clinic_format": clinic_format
}
print("The following lower-case field names have been assigned to each variable (variable name : field name):")
for var_string in list_var_names:
print(f"{var_string} : {dict_var_names_original[var_string]}")
#run press_any_key_to_continue function, to pause the code to allow the user to check the fields have aligned
press_any_key_to_continue()
# ---------------------------------------
#Read in the data set / create DataFrame
csv_files_in_folder = []
# traverse the directory
for root, dirs, files in os.walk(r'raw_data/local_service_data_test'):
# select file name
for file in files:
# check the extension of files
if file.endswith('.csv'):
            # record the file name
csv_files_in_folder.append(file)
if len(csv_files_in_folder) > 1: #when more than one csv file is present, have the user choose which file to use
dict_int_filename = {}
for num in range(len(csv_files_in_folder)):
dict_int_filename[num] = csv_files_in_folder[num]
for num in range(len(csv_files_in_folder)):
print(f"{num}: {csv_files_in_folder[num]}")
user_file_selection = int(input("Please enter the number associated with the data file you wish to use, from the printed list."))
df = pd.read_csv(f'raw_data/local_service_data_test/{csv_files_in_folder[user_file_selection]}') #extended dataset
df.columns = map(str.lower, df.columns) #18/09 addition: normalise all column names to lower case