-
Notifications
You must be signed in to change notification settings - Fork 1
/
stage_04_Objective_1b_HEA_unmet_need_v004.py
2374 lines (1747 loc) · 136 KB
/
stage_04_Objective_1b_HEA_unmet_need_v004.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# link to stack overflow post for set up issue I had when running this first time as .py file and getting an error of 'Importing the numpy c-extensions failed'
# https://stackoverflow.com/questions/58868528/importing-the-numpy-c-extensions-failed
# 1. Open VS Code's Command Palette menu by pressing Ctrl+Shift+P or F1
# 2. Choose "Terminal: Select Default Profile" entry
# 3. Then pick "Command Prompt" option
# 4. Restart VS Code
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns
import os
#import openpyxl
import copy
#import stats library for chi square test
from scipy import stats
# -----------------------------------------------------
# <<< set up folder structures >>>
# -----------------------------------------------------
# Check whether the "Assets_produced_by_code" folder exists in the current directory.
# If it doesn't exist, create it with the following directory structure:
# Assets_produced_by_code (top level)
# 01_pre_processing_assets
# 02_HEA_assets
# 03_DNA_ML_assets
# 04_Carbon_emissions_assets
#subdirectory folder names for assets folder
preprocessing_assets_path = 'Assets_produced_by_code/01_pre_processing_assets'
hea_assets_path = 'Assets_produced_by_code/02_HEA_assets'
dna_assets_path = 'Assets_produced_by_code/03_DNA_ML_assets'
carbon_emissions_assets_path = 'Assets_produced_by_code/04_Carbon_emissions_assets'
#logic to check if Assets_produced_by_code folder exists, if not, create it with subdirs too
#BUG FIX: the original printed "Existing 'Assets_produced_by_code' directory located."
#unconditionally, immediately after (possibly) creating the directories. The message now
#only prints when the folder genuinely pre-existed.
if not os.path.exists('Assets_produced_by_code'):
    #os.makedirs creates intermediate directories, so the top-level
    #'Assets_produced_by_code' folder is created implicitly by the first call
    os.makedirs(preprocessing_assets_path)
    os.makedirs(hea_assets_path)
    os.makedirs(dna_assets_path)
    os.makedirs(carbon_emissions_assets_path)
    print("New directories have been created to store the outputs from the code.")
else:
    print("Existing 'Assets_produced_by_code' directory located.")
# -----------------------------------------------------
#set up parameters
# -----------------------------------------------------
#identify the value for num_bins from the user parameters file
#this will be the number of bin-ranges when splitting up the eligible population in age bands,
# for the diff in proportion calculation.
filename = "raw_data/user_and_data_parameters/user_and_data_params.xlsx"
#nrows=0 reads only the header row of column C, so the parameter's value is held in the
#header cell itself; iterating the DataFrame (list(df)) yields column labels, hence [0]
#recovers that value below
num_bins = pd.read_excel(filename, 'HEA_parameters', index_col=None, usecols = "C", header = 1, nrows=0)
num_bins = list(num_bins)[0]
#standardised field names used in the processed service-level data
attend_reason_field_name = 'attend_reason'
age_with_median_field_name = 'age' #above is no longer needed as standardised field names in processing file
#local authority of interest, read via the same header-cell technique as num_bins above
la_of_interest_user_list_selection = pd.read_excel(filename, 'HEA_parameters', index_col=None, usecols = "C", header = 3, nrows=0) #ref to change zxcv
la_of_interest_user_list_selection = list(la_of_interest_user_list_selection)[0]
#counters for file names (used to give saved dataframes/charts ordered numeric prefixes)
df_counter = 1
chart_counter = 1
# -----------------------------------------------------
#define functions
# -----------------------------------------------------
#define functions
def subset_pop_estimate(source_df):
    """
    Preparation-stage helper used when quantifying the population size for the
    demographic in scope.

    Retains every row of source_df but only its first 7 columns. The ONS public
    population estimates by single year of age and LSOA place the
    single-year-of-age columns after the identifier/metadata columns (true as
    of Aug 2022), so this keeps just the identifier columns.

    NOTE(review): the original docstring said "first 6 columns" but the code
    keeps 7 (iloc stop index is exclusive); the code's behaviour is preserved.
    """
    return source_df.iloc[:, 0:7]
# -----------------------------------
def sum_age_range(source_df, target_df, min_age, max_age, condition):
    """
    Add a column to target_df holding, for each row, the sum of source_df's
    columns labelled from min_age to max_age (label-based slice, inclusive at
    both ends). The new column is named "Total_<min_age>_to_<max_age>_<condition>".
    Returns target_df (also mutated in place).
    """
    new_col_name = f"Total_{min_age}_to_{max_age}_{condition}"
    row_totals = source_df.loc[:, min_age:max_age].sum(axis=1)
    target_df[new_col_name] = row_totals
    return target_df
# -----------------------------------
def sum_per_age_group(source_df, target_df, min_age, max_age, condition):
    """
    Add a per-row population column to target_df named
    "<condition>_PopAgeRange:_<min_age>-<max_age>", summing source_df's columns
    from min_age through max_age (label slice, inclusive at both ends).
    Returns target_df (also mutated in place).
    """
    age_band = source_df.loc[:, min_age:max_age]
    target_df[f"{condition}_PopAgeRange:_{min_age}-{max_age}"] = age_band.sum(axis=1)
    return target_df
# -----------------------------------
def calc_prevalence_rate(prevalence_type):
    """
    Interactively ask the user for the numerator and denominator of their
    prevalence rate and return the resulting multiplier.

    Parameters
    ----------
    prevalence_type : int
        1 means an age-standardised rate; any other value is treated as crude.

    Returns
    -------
    float
        numerator / denominator (e.g. 100 per 100,000 -> 0.001).
    """
    if prevalence_type == 1: #age standardised
        prev_rate_type = "age standardised"
    else: #crude
        prev_rate_type = "crude"
    prevalence_per_1k_pop = int(input(f"\nPlease enter the numerator for your {prev_rate_type} prevalence figure for the service in scope. E.g. if the prevalence is 100 per 100,000 population, type 100, then press [ENTER]\n"))
    #BUG FIX: this prompt was missing its f-string prefix, so the literal text
    #"{prev_rate_type}" was displayed to the user instead of the rate type.
    denominator = int(input(f"\nPlease enter the denominator for your {prev_rate_type} prevalence rate (e.g. 1000, 10000, 100000, etc.). Don't use commas to separate the figure.\n"))
    prev_multiplier = prevalence_per_1k_pop / denominator
    return prev_multiplier
# -----------------------------------
def calc_list_percents_for_list_ints(list_of_ints):
    """
    Turn a list of integers into a list of floats giving each item's share of
    the list total (as a fraction of 1, preserving order). Used downstream to
    calculate the difference between proportions.
    """
    counts = np.array(list_of_ints)
    #numpy division keeps the original behaviour for a zero total (nan, not an exception)
    shares = counts / sum(list_of_ints)
    return shares.tolist()
# -----------------------------------
def calc_diff_between_proportions(list_prop_1, list_prop_2):
    """
    Element-wise difference (list_prop_1 - list_prop_2) between two
    equal-length lists of proportions, returned as a plain Python list.
    """
    difference = np.asarray(list_prop_1) - np.asarray(list_prop_2)
    return difference.tolist()
# -----------------------------------
#working code below:
def sum_pop_in_each_age_range_test(num_bins, dict_matched_utlas, la_to_subset_population, dict_condition_age_ranges, condition, dict_condition_bin_range, dict_condition_start_ages, source_df, target_df):
    """
    Build per-age-band population columns, for one condition, from the ONS
    LSOA single-year-of-age population estimates.

    Keeps the first 6 identifier columns of source_df, adds a 'UTLA_Name'
    column, filters rows to the UTLA mapped from la_to_subset_population via
    dict_matched_utlas, then for each of num_bins age bands sums the relevant
    single-year-of-age columns into a new "<condition>_PopAged_<range>" column.

    Returns (list of the new column names, the filtered/augmented dataframe).

    NOTE(review): the target_df argument is immediately overwritten, so the
    value passed in is unused — presumably retained for signature consistency
    with related helpers; confirm before removing.
    """
    #take first 6 cols from source df to retain lsoa names, codes etc
    target_df = source_df.iloc[:,:6]
    target_df['UTLA_Name'] = source_df['UTLA19NM']
    #subset the df to only contain rows associated with the UTLA of interest
    mask = target_df['UTLA_Name'] == dict_matched_utlas[la_to_subset_population]
    target_df = target_df[mask]
    #number of single years of age that make up one band for this condition
    bin_size = dict_condition_bin_range[condition]
    #create list to contain lists of single year of age field names - these are the ages (24, 25, 26.. etc.) col headers in the ons pop estimate lsoa to single year of age file
    list_of_syoa_field_names = []
    #um_items = range(len(dict_condition_start_ages['GUM']))
    for item_num in dict_condition_start_ages[condition]:
        temp_list = []
        for num in range(bin_size):
            #age column headers are strings in the ONS file, hence the str() cast
            temp_list.append(str(item_num + num))
        list_of_syoa_field_names.append(temp_list)
    #list_of_sum_cols = []
    list_col_names = []
    for num in range(num_bins):
        list_col_names.append(f"{condition}_PopAged_{dict_condition_age_ranges[condition][num]}")
        #sum the single-year-of-age columns for this band, row-wise (i.e. per LSOA)
        target_df[f"{condition}_PopAged_{dict_condition_age_ranges[condition][num]}"] = source_df.loc[:,list_of_syoa_field_names[num]].sum(axis=1)
    return (list_col_names, target_df)
# -----------------------------------
#experimental function code below for more straightforward approach to reading in LA of interest:
def sum_pop_in_each_age_range_revised(num_bins, la_to_subset_population, dict_condition_age_ranges, condition, dict_condition_bin_range, dict_condition_start_ages, source_df, target_df):
    """
    Revised variant of sum_pop_in_each_age_range_test: the UTLA of interest is
    passed in directly (la_to_subset_population) instead of being looked up via
    a matched-UTLAs dictionary; all other behaviour is identical.

    Returns (list of the new column names, the filtered/augmented dataframe).

    NOTE(review): the target_df argument is immediately overwritten, so the
    value passed in is unused — presumably retained for signature consistency;
    confirm before removing.
    """
    #take first 6 cols from source df to retain lsoa names, codes etc
    target_df = source_df.iloc[:,:6]
    target_df['UTLA_Name'] = source_df['UTLA19NM']
    #subset the df to only contain rows associated with the UTLA of interest
    mask = target_df['UTLA_Name'] == la_to_subset_population
    target_df = target_df[mask]
    #number of single years of age that make up one band for this condition
    bin_size = dict_condition_bin_range[condition]
    #create list to contain lists of single year of age field names - these are the ages (24, 25, 26.. etc.) col headers in the ons pop estimate lsoa to single year of age file
    list_of_syoa_field_names = []
    #um_items = range(len(dict_condition_start_ages['GUM']))
    for item_num in dict_condition_start_ages[condition]:
        temp_list = []
        for num in range(bin_size):
            #age column headers are strings in the ONS file, hence the str() cast
            temp_list.append(str(item_num + num))
        list_of_syoa_field_names.append(temp_list)
    #list_of_sum_cols = []
    list_col_names = []
    for num in range(num_bins):
        list_col_names.append(f"{condition}_PopAged_{dict_condition_age_ranges[condition][num]}")
        #sum the single-year-of-age columns for this band, row-wise (i.e. per LSOA)
        target_df[f"{condition}_PopAged_{dict_condition_age_ranges[condition][num]}"] = source_df.loc[:,list_of_syoa_field_names[num]].sum(axis=1)
    return (list_col_names, target_df)
# -----------------------------------
def calc_95_ci_diff_proportions(list_of_conditions, num_bins, dict_condition_service_percents,dict_condition_diff_percents, dict_service_counts_condition_age_range):
    """
    For each condition, compute the 95% confidence interval half-width around
    each age-bin service proportion (normal approximation, z = 1.96,
    n = total service count for that condition) and flag whether the observed
    service-vs-population difference in proportion falls outside that interval
    ('SIG') or within it ('ns').

    Returns a tuple of two dicts keyed by condition:
    (condition -> list of CI half-widths, condition -> list of 'SIG'/'ns' flags).
    """
    z_score = 1.96
    dict_condition_95_confidence_interval = {}
    dict_condition_95_confidence_interval_sig_bool = {}
    for condition in list_of_conditions:
        pop_size = dict_service_counts_condition_age_range[condition].sum()
        half_widths = []
        sig_flags = []
        for idx in range(num_bins):
            p = dict_condition_service_percents[condition][idx]
            half_width = z_score * ((p * (1 - p) / pop_size) ** 0.5)
            half_widths.append(half_width)
            #difference smaller than the CI half-width -> not significant
            sig_flags.append('ns' if abs(dict_condition_diff_percents[condition][idx]) < half_width else 'SIG')
        dict_condition_95_confidence_interval[condition] = half_widths
        dict_condition_95_confidence_interval_sig_bool[condition] = sig_flags
    return (dict_condition_95_confidence_interval, dict_condition_95_confidence_interval_sig_bool)
# -----------------------------------
def calc_95_ci_diff_proportions_gender(condition, num_bins, dict_condition_service_percents,dict_condition_diff_percents, dict_service_counts_condition_gender):
    """
    Single-condition gender variant of calc_95_ci_diff_proportions: computes
    the 95% CI half-width (z = 1.96) for each gender-category service
    proportion and flags whether the observed difference exceeds it ('SIG') or
    not ('ns').

    The per-condition result dictionaries are assembled by the caller rather
    than here, because when multiple conditions are in scope each may use a
    different population gender (persons / males only / females only).

    Returns (list of CI half-widths, list of 'SIG'/'ns' flags).
    """
    z_score = 1.96
    pop_size = dict_service_counts_condition_gender[condition].sum()
    half_widths = []
    sig_flags = []
    for idx in range(num_bins):
        p = dict_condition_service_percents[condition][idx]
        half_width = z_score * ((p * (1 - p) / pop_size) ** 0.5)
        half_widths.append(half_width)
        #difference smaller than the CI half-width -> not significant
        sig_flags.append('ns' if abs(dict_condition_diff_percents[condition][idx]) < half_width else 'SIG')
    return (half_widths, sig_flags)
# -----------------------------------
#function to provide numbered prefixes to file names for files saved as outputs from the code
def number_saved_file(counter):
    """
    Return a zero-padded, 3-character file-name prefix for the given counter,
    e.g. 1 -> "001", 42 -> "042", 123 -> "123".

    To be used as follows:
    file_prefix = number_saved_file(counter)

    BUG FIX: the original returned an int (not a str) for counters >= 100,
    giving an inconsistent return type. Formatted output is unchanged because
    callers interpolate the result into f-strings; counters in this file start
    at 1, so the padded-zero case for 0 never arises in practice.
    """
    return f"{counter:03d}"
# -----------------------------------
def create_bar_plot(chart_counter, string_for_title, string_for_file_name, list_of_conditions, dict_condition_demographic_labels, dict_condition_service_percents, dict_condition_population_percents, x_tick_rotation_int, dict_condition_95_confidence_interval, service_colour, pop_colour, dict_condition_95_confidence_interval_sig_bool, dict_condition_demographic_file_path):
    """
    Function to plot a chart for each condition the user is using the code for
    and compare the service proportionate make up for the given feature of interest
    to the population make up for the same feature.
    note: this code runs on matplotlib version 3.2.2
    more recent versions of matplotlib use a different method for setting xticklabels.
    if the code doesn't run and fails in this function, check the version of matplotlib.
    if the version is >3.2.2 then add the 'label' kwarg to the ax.bar calls in the function,
    with the value dict_condition_demographic_labels[condition] as in label = dict_condition_demographic_labels[condition]

    Saves one PNG per condition into Assets_produced_by_code/02_HEA_assets and
    returns (updated chart_counter, dict_condition_demographic_file_path with a
    'charts' key mapping each condition to its saved chart file path).
    """
    #inner helper: annotate each service bar with its 'SIG'/'ns' significance flag
    def addlabels(x,service,sig_or_not):
        for i in range(x):
            plt.text(i,service[i],sig_or_not[i])
    dict_condition_chart_file_paths = {}
    for condition in list_of_conditions:
        N = len(dict_condition_demographic_labels[condition])
        service = dict_condition_service_percents[condition]
        #95% CI half-widths used as error bars on the service series
        service_std = dict_condition_95_confidence_interval[condition]
        fig, ax = plt.subplots()
        ind = np.arange(N)    # the x locations for the groups
        width = 0.35         # the width of the bars
        #c = '#41B6E6' #NHS Light Blue
        c = service_colour #NHS Light Blue
        ax.bar(ind, service, width, bottom=0, yerr=service_std, label='Service', color = c)
        pop = dict_condition_population_percents[condition]
        #c = '#768692' #NHS Mid Grey
        c = pop_colour
        ax.bar(ind + width, pop, width, bottom=0, label='Population', color = c)
        ax.set_title(f"{string_for_title} ({condition})")
        addlabels(N,service,dict_condition_95_confidence_interval_sig_bool[condition])
        #centre the tick between the paired service/population bars
        ax.set_xticks(ind + width / 2)
        ax.set_xticklabels(dict_condition_demographic_labels[condition])
        plt.xticks(rotation = x_tick_rotation_int) #new line to set rotation of x labels
        ax.set_ylabel('Proportion per category')
        ax.legend()
        ax.autoscale_view()
        #numbered file prefix keeps saved charts in creation order
        file_prefix = number_saved_file(chart_counter)
        temp_filename = f"chart{file_prefix}_{string_for_file_name}-{condition}.png"
        file_path = f'Assets_produced_by_code/02_HEA_assets/{temp_filename}'
        #plt.savefig(f'Assets_produced_by_code/02_HEA_assets/{temp_filename}')
        plt.savefig(file_path, bbox_inches='tight')
        dict_condition_chart_file_paths[condition] = file_path
        chart_counter+=1
        print("File saved.")
        plt.show()
    dict_condition_demographic_file_path['charts'] = dict_condition_chart_file_paths
    return (chart_counter, dict_condition_demographic_file_path)
# -----------------------------------
#now we need to get the total Male and Female population, respectively, for the relevant age ranges, for each condition in scope
def calc_male_and_female_pop_size(df_males, la_of_interest_user_list_selection, df_females, dict_min_age_for_each_condition, dict_max_age_for_each_condition, list_of_conditions, dict_prev_for_each_condition):
    """
    Subset the male and female single-year-of-age dataframes to the local
    authority of interest, then for each condition sum the population across
    that condition's age range and apply its prevalence multiplier.

    Returns
    -------
    dict
        condition -> [female_estimate, male_estimate] (rounded ints;
        female first, male second — the F_M in the original name).

    Improvements over the original:
    - the LA filter is applied once, before the loop (it was re-applied on the
      already-filtered frame every iteration — harmless but redundant);
    - the result lists are built in the same loop rather than a second pass;
    - the comment on the female sum wrongly said "male" and is fixed.
    """
    #filter both dataframes to just the LA of interest (loop-invariant, so done once)
    df_males = df_males[df_males['UTLA19NM'] == la_of_interest_user_list_selection]
    df_females = df_females[df_females['UTLA19NM'] == la_of_interest_user_list_selection]
    dict_condition_gender_list_F_M = {}
    for condition in list_of_conditions:
        min_age = dict_min_age_for_each_condition[condition]
        max_age = dict_max_age_for_each_condition[condition]
        prev_multiplier = dict_prev_for_each_condition[condition]
        #sub-total male population for the age range relevant to this condition
        male_pop_total = df_males.loc[:, min_age:max_age].sum(axis=1).sum()
        #sub-total female population for the same age range
        female_pop_total = df_females.loc[:, min_age:max_age].sum(axis=1).sum()
        #female first, then male, to match downstream F_M ordering
        dict_condition_gender_list_F_M[condition] = [
            int(round(female_pop_total * prev_multiplier, 0)),
            int(round(male_pop_total * prev_multiplier, 0)),
        ]
    return(dict_condition_gender_list_F_M)
# -----------------------------------
# NEW CODE ENTERED 07/09/22 TO READ-IN USER PARAMS AND SIGNIFICANTLY REDUCE USER INTERACTION WITH THE CODE
# NEW CODE ENTERED 07/09/22 TO READ-IN USER PARAMS AND SIGNIFICANTLY REDUCE USER INTERACTION WITH THE CODE
def create_condition_to_param_dict(df, desired_col_list, value_column_string, number_of_conditions,cast_as_type):
    """
    Build a dictionary mapping each condition name to the value of a chosen
    parameter column, cast to the requested type.

    Parameters
    ----------
    df : pd.DataFrame
        Source user-parameters dataframe; must contain a 'condition' column.
    desired_col_list : list[int]
        Positional indices of the columns to keep (the condition column plus
        the value column of interest).
    value_column_string : str
        Name of the value column within the subset.
    number_of_conditions : int
        Number of condition rows to read (rows assumed indexed 0..n-1).
    cast_as_type : type
        Callable used to cast each value (e.g. str, int, float).

    Returns
    -------
    dict
        condition name -> cast parameter value.

    FIX: removed leftover debug prints (the whole temp dict and each key's
    type were printed on every call, cluttering console output); return value
    is unchanged.
    """
    subset_df = df.iloc[ : , desired_col_list]
    #convert to a nested dict: {column_name: {row_index: value}}
    temp_dict = subset_df.to_dict()
    #populate a dict of condition (key) to the value of the column of interest
    dict_condition_to_col_of_interest = {}
    for num in range(number_of_conditions):
        #extract required content from temp nested dict created above
        condition = temp_dict['condition'][num]
        col_of_interest = cast_as_type(temp_dict[value_column_string][num])
        dict_condition_to_col_of_interest[condition] = col_of_interest
    return dict_condition_to_col_of_interest
# -----------------------------------
def run_chi_square_test(
    dict_condition_category_labels_1,
    dict_condition_category_service_counts_2,
    dict_condition_category_pop_counts_4,
    dict_condition_category_pop_counts_as_percents_5
    ):
    """
    Run a chi-square goodness-of-fit test per condition, comparing the observed
    service counts per demographic category against the counts expected if the
    population's category proportions applied to the service total.

    The output of this function has been validated using the following website: https://www.statskingdom.com/310GoodnessChi.html
    The results from this function matched identically the results from the above website.

    Returns
    -------
    tuple of 4 dicts, each keyed by condition:
        (full scipy chisquare result, test statistic, p-value,
         bool: significant at alpha = 0.05).
    """
    #total patients seen in the SERVICE per condition (sum of category counts)
    dict_service_total_count_per_condition_3 = {key: dict_condition_category_service_counts_2[key].sum() for key in dict_condition_category_service_counts_2.keys()}
    #derive the EXPECTED counts in the service data IF the proportions seen in the population were applied to the total patients seen in the service
    dict_expected_service_counts_per_condition_6 = {}
    for key in dict_service_total_count_per_condition_3.keys():
        list_service_expected_counts = [dict_service_total_count_per_condition_3[key] * x for x in dict_condition_category_pop_counts_as_percents_5[key]]
        dict_expected_service_counts_per_condition_6[key] = pd.Series(list_service_expected_counts)
    #total POPULATION count per condition (retained from the original; not used by the test itself)
    dict_pop_total_count_per_condition = {key: pd.Series(dict_condition_category_pop_counts_4[key]).sum() for key in dict_condition_category_pop_counts_4.keys()}
    #significance level. BUG FIX: the original defined a misspelt, unused
    #variable ("ahlpa") and hard-coded 0.05 in the comparison; the named
    #constant is now actually used.
    alpha = 0.05
    #run chi square test for each condition
    dict_chi_square_test_results_per_condition = {}
    dict_chi_square_test_statistic_per_condition = {}
    dict_chi_square_test_pvalue_per_condition = {}
    dict_chi_square_test_sig_at_alpha_per_condition = {}
    for key in dict_condition_category_labels_1.keys():
        chi_square_test = stats.chisquare(dict_condition_category_service_counts_2[key], dict_expected_service_counts_per_condition_6[key])
        #assign the 2 outputs from the chi square test above to their own variables
        chi_square_statistic = chi_square_test[0]
        chi_square_pvalue = chi_square_test[1]
        #True when the result is significant at the chosen alpha
        chi_square_sig_at_alpha = chi_square_pvalue < alpha
        dict_chi_square_test_results_per_condition[key] = chi_square_test
        dict_chi_square_test_statistic_per_condition[key] = chi_square_statistic
        dict_chi_square_test_pvalue_per_condition[key] = chi_square_pvalue
        dict_chi_square_test_sig_at_alpha_per_condition[key] = chi_square_sig_at_alpha
    #check the chi square assumption that all observed and expected counts are >= 5.
    #NOTE: computed but not returned (preserving the original 4-tuple interface
    #that callers unpack); kept so the assumption check is available for debugging.
    dict_chi_square_conditions_met = {}
    for key in dict_expected_service_counts_per_condition_6.keys():
        dict_chi_square_conditions_met[key] = dict_expected_service_counts_per_condition_6[key].min() >= 5 and dict_condition_category_service_counts_2[key].min() >= 5
    return(dict_chi_square_test_results_per_condition, dict_chi_square_test_statistic_per_condition, dict_chi_square_test_pvalue_per_condition, dict_chi_square_test_sig_at_alpha_per_condition)
# -----------------------------------
def summarise_chi_square_results_into_df(
    list_of_conditions,
    dict_chi_square_test_results_per_condition,
    dict_chi_square_test_statistic_per_condition,
    dict_chi_square_test_pvalue_per_condition,
    dict_chi_square_test_sig_at_alpha_per_condition,
    master_list_of_data_frames,
    demographic_label
    ):
    """
    Summarise the per-condition chi-square results for one demographic into
    single-row dataframes and append each to master_list_of_data_frames, so
    that once all demographics have been processed the caller can concatenate
    the whole list into a single results table. Conditions absent from the
    results dict are skipped. Statistic and p-value are rounded to 5 d.p.
    Returns the (mutated) master list.
    """
    for condition in list_of_conditions:
        #skip conditions the chi-square test was not run for
        if condition not in dict_chi_square_test_results_per_condition.keys():
            continue
        summary_row = {
            'Condition': condition,
            'Demographic': demographic_label,
            'Chi_square_statistic': round(dict_chi_square_test_statistic_per_condition[condition],5),
            'P_value': round(dict_chi_square_test_pvalue_per_condition[condition],5),
            'Significant?': 'yes' if dict_chi_square_test_sig_at_alpha_per_condition[condition] == True else 'no',
        }
        master_list_of_data_frames.append(pd.DataFrame.from_dict(summary_row, orient='index').T)
    return(master_list_of_data_frames)
# -----------------------------------
#Read-in user parameters
# -----------------------------------
#code to identify the number of conditions the user is running the tool for
filename = "raw_data/user_and_data_parameters/user_and_data_params.xlsx"
number_of_conditions_modelling_for = pd.read_excel(filename, 'HEA_condition_details', index_col=None, usecols = "D", header = 1, nrows=1)
number_of_conditions_modelling_for = int(number_of_conditions_modelling_for.iloc[0,0])
#read in the params defined by the user for each condition in scope. This will be used in subsequent steps to create a number of dictionaries required for HEA code to run.
#This approach removes a number of steps of user interaction with the code in the development environment, and enables separation of concerns in terms of sourcing ref material for the model
# and actually running the model.
df_condition_params_original = pd.read_excel(filename, 'HEA_condition_details', index_col=None, usecols = "B, C, D, E, F, G, H, I, J", header = 5, nrows=number_of_conditions_modelling_for)
#blank cells default to 1 so the multiplicative prevalence arithmetic below is unaffected by missing proportions
df_condition_params_original = df_condition_params_original.fillna(1)
#ID the column descriptions used in the params file
column_names_from_condition_df = list(df_condition_params_original.columns)
#create variables containing the text for each column name from the user params df read in above. By doing this, if there is a change to the word of the column, so long as the shape and order doesnt change
#the code will still run as we aren't hard-coding the text as a reference point.
temp_condition_col_name = column_names_from_condition_df[0]
temp_prev_type_col_name = column_names_from_condition_df[1]
temp_numerator_col_name = column_names_from_condition_df[2]
temp_denominator_col_name = column_names_from_condition_df[3]
temp_min_age_col_name = column_names_from_condition_df[4]
temp_max_age_col_name = column_names_from_condition_df[5]
temp_gender_col_name = column_names_from_condition_df[6]
temp_proportion_this_need_represents = column_names_from_condition_df[7]
temp_proportion_seen_by_this_service = column_names_from_condition_df[8]
#prevalence multiplier = ((numerator / denominator) / proportion this need represents) * proportion seen by this service
df_condition_params_original['prevalence_multiplier'] = ((df_condition_params_original[temp_numerator_col_name] / df_condition_params_original[temp_denominator_col_name]) / df_condition_params_original[temp_proportion_this_need_represents]) * df_condition_params_original[temp_proportion_seen_by_this_service]
#rename the df cols to have shorter meaningful names rather than the instructional column names from the user params file.
df_condition_params_original.rename(columns= {
    temp_condition_col_name: "condition",
    temp_prev_type_col_name: "prevalence_type",
    temp_numerator_col_name: "numerator",
    temp_denominator_col_name: "denominator",
    temp_min_age_col_name: "min_age",
    temp_max_age_col_name: "max_age",
    temp_gender_col_name: "gender",
    temp_proportion_this_need_represents: "proportion_need_represents",
    temp_proportion_seen_by_this_service: "proportion_seen_by_service",
    },
    inplace=True)
#create variables containing the text for the shorter column names we just assigned
revised_column_names_from_condition_df = list(df_condition_params_original.columns)
condition_col_name = revised_column_names_from_condition_df[0]
prev_type_col_name = revised_column_names_from_condition_df[1]
numerator_col_name = revised_column_names_from_condition_df[2]
denominator_col_name = revised_column_names_from_condition_df[3]
min_age_col_name = revised_column_names_from_condition_df[4]
max_age_col_name = revised_column_names_from_condition_df[5]
gender_col_name = revised_column_names_from_condition_df[6]
#index 9 is the derived 'prevalence_multiplier' column appended above (indices 7-8 are the proportion columns)
prev_multiplier_col_name = revised_column_names_from_condition_df[9]
#create list_of_conditions variable
list_of_conditions = df_condition_params_original[condition_col_name].tolist()
#zxcv
#15/6/23 revising max_age values wherever these are == 90 to instead be 90+
#this is because this is the value in the associated cell in the ONS population estimate data tables
"""print(df_condition_params_original)
print()
df_condition_params_original['max_age'] = df_condition_params_original['max_age'].replace([90], '90+')
print(df_condition_params_original)
print(print())"""
#Map each condition to its prevalence-rate type string (e.g. "1: Age standardised rate")
dict_condition_to_prevalence_rate = create_condition_to_param_dict(df_condition_params_original, [0, 1], prev_type_col_name, number_of_conditions_modelling_for, str)
#Derive the prevalence-rate reference integer for each condition from the
#leading "N:" prefix of its type string, where:
#1 = Age standardised rate
#2 = Crude rate
#3 = Census or no rate
dict_prev_ref_for_each_condition = {
    condition: int(dict_condition_to_prevalence_rate[condition].split(":")[0])
    for condition in list_of_conditions
}
#Map each condition to its prevalence-rate multiplier
dict_prev_for_each_condition = create_condition_to_param_dict(df_condition_params_original, [0, 9], prev_multiplier_col_name, number_of_conditions_modelling_for, float)
#Map each condition to the minimum and maximum ages the service sees (kept as
#strings here; converted with int() at the point of use)
dict_min_age_for_each_condition = create_condition_to_param_dict(df_condition_params_original, [0, 4], min_age_col_name, number_of_conditions_modelling_for, str)
dict_max_age_for_each_condition = create_condition_to_param_dict(df_condition_params_original, [0, 5], max_age_col_name, number_of_conditions_modelling_for, str)
#zxcv #15/6
print()
print(dict_max_age_for_each_condition)
print()
print(dict_min_age_for_each_condition)
print()
#Map each condition to the gender text recorded in the user parameters file
dict_pop_gender_for_each_condition_text = create_condition_to_param_dict(df_condition_params_original, [0, 6], gender_col_name, number_of_conditions_modelling_for, str)
#Translate the gender text into the integer code used downstream:
#1 = Persons
#2 = Males only
#3 = Females only
#(any text other than "Persons"/"Males only" falls through to 3, matching the
# original if/elif/else behaviour)
_gender_text_to_code = {"Persons": 1, "Males only": 2}
dict_pop_gender_for_each_condition = {
    condition: _gender_text_to_code.get(dict_pop_gender_for_each_condition_text[condition], 3)
    for condition in list_of_conditions
}
# -----------------------------------
# <<< Code Begins >>>
# -----------------------------------
#Read in the 2020 ONS single-year-of-age population estimates at LSOA level.
#Note: trailing underscore on these names because the upper-tier LA lookup is
#merged onto each frame just below (to bring in the UTLA name).
df_all_persons_ = pd.read_csv("raw_data/ons_population_data/2020_persons_pop_lsoa_syoa.csv")
df_males_ = pd.read_csv("raw_data/ons_population_data/2020_males_pop_lsoa_syoa.csv", encoding='latin-1')  #males file is latin-1 encoded
df_females_ = pd.read_csv("raw_data/ons_population_data/2020_females_pop_lsoa_syoa.csv")
#Lower-tier to upper-tier local authority lookup from the Open Geography Portal
df_lower_tier_to_upper_tier_la = pd.read_csv("raw_data/open_geography_portal_lookups/Lower_Tier_Local_Authority_to_Upper_Tier_Local_Authority__April_2019__Lookup_in_England_and_Wales.csv")
#Join the upper-tier LA code/name onto the right-hand side of each population frame
df_all_persons = df_all_persons_.merge(df_lower_tier_to_upper_tier_la, left_on='LA Code (2018 boundaries)', right_on='LTLA19CD')
df_males = df_males_.merge(df_lower_tier_to_upper_tier_la, left_on='LA Code (2018 boundaries)', right_on='LTLA19CD')
df_females = df_females_.merge(df_lower_tier_to_upper_tier_la, left_on='LA Code (2018 boundaries)', right_on='LTLA19CD')
#LSOA to IMD decile data, available from:
#https://data-communities.opendata.arcgis.com/datasets/communities::indices-of-multiple-deprivation-imd-2019-1/about
df_lsoa_imd_full = pd.read_csv('raw_data/ministry_of_housing_communities_local_gov/Indices_of_Multiple_Deprivation_(IMD)_2019.csv')
#Keep only the LSOA code and decile, under the column names used downstream
df_lsoa_imd_only = df_lsoa_imd_full[['lsoa11cd', 'IMD_Decile']].rename(columns={'IMD_Decile': 'imd_decile'})
#df_lsoa_imd_only.head()
#Merge the IMD decile into each population frame on LSOA code
df_all_persons = df_all_persons.merge(df_lsoa_imd_only, left_on='LSOA Code', right_on='lsoa11cd')
df_males = df_males.merge(df_lsoa_imd_only, left_on='LSOA Code', right_on='lsoa11cd')
df_females = df_females.merge(df_lsoa_imd_only, left_on='LSOA Code', right_on='lsoa11cd')
#Column name collections for each population type, used later in the file
all_persons_fields = df_all_persons.columns
males_fields = df_males.columns
females_fields = df_females.columns
#Empty frames, built up in subsequent steps with the selected population age range
df_all_persons_selected_age = pd.DataFrame()
df_males_selected_age = pd.DataFrame()
df_females_selected_age = pd.DataFrame()
#---------------------------------------------------
#altered dict content - check hasn't thrown any code out in UAT
dict_prevalence_rate = {
1: "Age standardised rate",
2: "Crude rate",
3: "No rate - census only"
}
#---------------------------------------------------
#create list of conditions to include in the model
"""#add error control (e.g. must be at least 1 condition ? )
number_of_conditions_modelling_for = int(input("Please enter the number of conditions/services you would like to include in this Health Equity Assessment. \nIn doing this, consider how granular you would like to estimate unmet need. \nIt may be easier to start with high level groupings rather than individual conditions. For example, for sexual health services, there are two primary uses of the service, genitourinary conditions and contraceptive requirements. \n>>> "))
list_of_conditions = []
for condition_number in range(number_of_conditions_modelling_for):
condition_name = input(f"Please enter ONE word to describe the nature of condition/service {condition_number + 1}. PLEASE NOTE: The word you use will appear in the final report and all charts produced related to this condition.: >>> ")
list_of_conditions.append(condition_name)
"""
#---------------------------------------------------
#ITEMS TO SAVE INTO THE WORD REPORT
#Build a one-sentence summary of the conditions being modelled, phrased
#naturally for 1, 2, or 3+ conditions.
if number_of_conditions_modelling_for == 1:
    text_condition_summary = f"The HEA model has been used to model one condition ({list_of_conditions[0]})."
elif number_of_conditions_modelling_for == 2:
    text_condition_summary = f"The HEA model has been used to model two conditions ({' and '.join(list_of_conditions)})."
else:
    #removed stray f-prefix from the plain ', ' separator string (it contained
    #no placeholders, so the prefix was redundant)
    text_condition_summary = f"The HEA model has been used to model {number_of_conditions_modelling_for} conditions ({', '.join(list_of_conditions[:-1]) + ' and ' + list_of_conditions[-1]})."
#Persist the summary sentence so it can be inserted into the Word report later.
temp_filename = "text_condition_summary.txt"
filepath = f'{hea_assets_path}/{temp_filename}'
#use a context manager so the file handle is always closed, even if the
#write raises (the original open()/close() pair leaked the handle on error)
with open(filepath, "w") as text_file:
    text_file.write(text_condition_summary)
print("Summary of conditions being modelled saved as txt file.")
#---------------------------------------------------
#DEV NOTE: "while try except" needed here to control for data entry error
#NOTE(review): this pluralisation is keyed on the number of CONDITIONS but the
#word it pluralises is "gender" - confirm that is intentional.
if number_of_conditions_modelling_for > 1:
    plural = "genders"
else:
    plural = "gender"
#---------------------------------------------------
#Preparation finished
#---------------------------------------------------
# <<< HEA Starts Here : >>>
#---------------------------------------------------
#This code is used to calculate equal age ranges (bin_range) between the min and max age the user has said the service sees.
#The // in the calculation provides a rounded down answer. This ensures whole ages and equal ranges are returned.
#The trade-off is where a service_age_range cannot be divided equally by the num_bins parameter, the last (oldest) age range will not extend all
#the way to the oldest age the service sees. The margin of error will be anything from 1 to bin_range-1 years.
#So, for example, if bin_range is equal to 10, the above trade-off means the data used to calculate any statistical difference between proportions
#could exclude upto the oldest 9 years of patients in the service (but, it would also exclude this same 9 years from the population comparison,
# so we are still 'comparing apples with apples')
#dictionary to store the bin range for each condition
dict_condition_bin_range = {}
#dictionary of user_entered condition names (keys) to a list of the start ages for the age ranges (values)
dict_condition_start_ages = {}
#dictionary of user_entered condition names (keys) to a list of the end ages for the age ranges (values)
dict_condition_end_ages = {}
# -----------------------------
#AGE SECTION STARTS
# -----------------------------
#loop over all conditions the user has entered, derive start and end ages for
#equal age ranges, and record them in the dictionaries above
for condition in list_of_conditions:
    min_age = int(dict_min_age_for_each_condition[condition])
    max_age = int(dict_max_age_for_each_condition[condition])
    service_age_range = max_age - min_age
    bin_range = service_age_range // num_bins
    if bin_range < 1:
        #range(..., step=0) below would otherwise fail with an opaque
        #"range() arg 3 must not be zero" error
        raise ValueError(
            f"Age range for condition '{condition}' ({service_age_range} years) "
            f"is smaller than the number of age bins requested ({num_bins})."
        )
    dict_condition_bin_range[condition] = bin_range  #store the derived bin range
    print(f"The interval for each age range for the condition type '{condition}' is : {bin_range} years.")
    #Derive the start and end ages with ordinary list comprehensions.
    #(The originals called .append inside a comprehension and discarded the
    #comprehension's own result - same output, misleading idiom.)
    offsets = range(0, service_age_range, bin_range)
    list_start_ages = [min_age + offset for offset in offsets]
    list_end_ages = [min_age + offset + (bin_range - 1) for offset in offsets]
    dict_condition_start_ages[condition] = list_start_ages[:num_bins + 1]
    dict_condition_end_ages[condition] = list_end_ages[:num_bins + 1]
#If a condition has exactly num_bins start/end ages, append one more of each
#using that condition's bin interval. This guarantees bins and labels differ
#in length by one, as required when the age ranges are applied with pd.cut.
for condition in list_of_conditions:
    if len(dict_condition_start_ages[condition]) == num_bins:
        dict_condition_start_ages[condition].append(dict_condition_start_ages[condition][-1] + dict_condition_bin_range[condition])
    if len(dict_condition_end_ages[condition]) == num_bins:
        dict_condition_end_ages[condition].append(dict_condition_end_ages[condition][-1] + dict_condition_bin_range[condition])
#Concatenated "start-end" label strings for each bin, for later use in charts etc.
dict_condition_age_ranges = {}
for condition in list_of_conditions:
    dict_condition_age_ranges[condition] = [
        f"{dict_condition_start_ages[condition][num]}-{dict_condition_end_ages[condition][num]}"
        for num in range(num_bins)
    ]
#test print - delete in final
print()
print(dict_condition_age_ranges)
print(dict_condition_start_ages)
print(dict_condition_end_ages)
print()
#---------------------------------------------------
#read in the processed HEA file (not encoded) from the pre-processing file
processed_hea_file = pd.read_csv('processed_data/1A_HEA/processed_data_1A_HEA_not_encoded_standard_col_names.csv')
#---------------------------------------------------
#Revised to have >1 attendance reason associated with each condition in scope.
#This solves the problem for sexual health services, for example, where attendance
#reason may be GU, Con, or Both - total contraception activity is Con + Both and
#total GU activity is GU + Both, as 'Both' is a dual appointment reason.
#Maps user-entered conditions to the attend_reasons present in the dataset (an
#alternative could be to incorporate this into the user params xlsx file).
print(text_condition_summary)
print()
print(f"We now need to subset the dataset so we have a new dataset for each of these conditions.")
print()
print("Below is a numbered list of all attendance reasons present in the dataset:")
dict_condition_list_attend_reasons = {}
for condition in list_of_conditions:
    #Numbered menu of attendance reasons. Sorting first guarantees the menu is
    #in the same (alphabetical) order every time the code is run.
    dict_attend_reasons = {}
    counter = 1  #was 0
    for reason in list(dict.fromkeys(sorted(processed_hea_file[attend_reason_field_name].squeeze()))):
        dict_attend_reasons[counter] = reason
        counter += 1
    unique_options_attend_reason = list(dict.fromkeys(sorted(processed_hea_file[attend_reason_field_name].squeeze())))
    attend_reasons_for_this_condition = []
    #interactive loop: the user picks reasons one at a time, 0 to finish
    while True:
        print("\nThe attendance reasons to choose from are listed below:")
        for key_number in dict_attend_reasons:
            print(f"{key_number} : {dict_attend_reasons[key_number]}")
        print("0 : To continue with the current selection")
        while True:
            try:
                attend_reason_selection = int(input(f"\nPlease enter the number of the attendance reason associated with the {condition.upper()} condition, or press 0 to continue with the current selection:"))
                #BUG FIX: the original validated against range(counter), which
                #accepted numbers for options already chosen and deleted from
                #dict_attend_reasons, raising a KeyError below. Validate against
                #the dict's remaining keys instead.
                if attend_reason_selection == 0 or attend_reason_selection in dict_attend_reasons:
                    break
                else:
                    print("\nThat is not a correct selection.")
            except ValueError:
                print("Value Error - try again.")
        if attend_reason_selection != 0:
            attend_reasons_for_this_condition.append(dict_attend_reasons[attend_reason_selection])  #record the chosen reason
            del dict_attend_reasons[attend_reason_selection]  #remove it from the menu so it cannot be picked twice
        elif attend_reason_selection == 0:
            break
    dict_condition_list_attend_reasons[condition] = attend_reasons_for_this_condition
#---------------------------------------------------
#converted below to doc string as doesnt look to be used, if breaks code, un-doc and fix
"""
column_headers = list(processed_hea_file.columns)
temp_df = pd.DataFrame(columns=column_headers)
#temp_df
for condition in list_of_conditions:
    for reason in dict_condition_list_attend_reasons[condition]:
        print(f"{condition} : {reason}")
"""
#---------------------------------------------------
#read in the processed file output from the pre-processing code
#processed_hea_file = pd.read_csv('processed_data/1A_HEA/processed_data_1A_HEA_not_encoded.csv') #original file read in, included col names with the "_with_missing" and "_with_median" suffixes
processed_hea_file = pd.read_csv('processed_data/1A_HEA/processed_data_1A_HEA_not_encoded_standard_col_names.csv') #new file identical content as above line, but uses original col names
#Nested dictionaries: condition -> attendance reason -> subset dataframe.
#The first holds rows binned into the condition's age groups; the second holds
#all rows whose age is within the condition's min-max range.
nested_dict_condition_attend_reason = {}
nested_dict_condition_attend_reason_all_ages = {}
for condition in list_of_conditions:
    temp_dict = {}
    temp_dict_all_ages = {}
    #hoist the per-condition values out of the inner loop - they do not vary by attendance reason
    bins = dict_condition_start_ages[condition]
    labels = dict_condition_age_ranges[condition]
    condition_min_age = int(dict_min_age_for_each_condition[condition])
    condition_max_age = int(dict_max_age_for_each_condition[condition])
    for attendance_reason in dict_condition_list_attend_reasons[condition]:
        mask = processed_hea_file[attend_reason_field_name] == attendance_reason
        #FIX: .copy() so the age_group assignment below writes to an
        #independent frame rather than a view of processed_hea_file
        #(the original triggered pandas' SettingWithCopyWarning)
        select_attend_reason_df = processed_hea_file[mask].copy()
        #label each row with the bin age range relevant to this condition
        select_attend_reason_df['age_group'] = pd.cut(select_attend_reason_df[age_with_median_field_name], bins=bins, labels=labels, right=False)
        #pd.cut returns NaN for ages beyond the upper bin edge, so drop those rows
        select_attend_reason_df_dropna = select_attend_reason_df.dropna(subset=['age_group'])
        temp_dict[attendance_reason] = select_attend_reason_df_dropna
        #subset to ages within the user-entered min-max range in one step
        #(between() is inclusive at both ends, matching the original >= / <= pair)
        ages_in_range = select_attend_reason_df[age_with_median_field_name].between(condition_min_age, condition_max_age)
        temp_dict_all_ages[attendance_reason] = select_attend_reason_df[ages_in_range]
    #after iterating over all attendance reasons for this condition, store both
    #dictionaries and repeat for the next condition
    nested_dict_condition_attend_reason[condition] = temp_dict
    nested_dict_condition_attend_reason_all_ages[condition] = temp_dict_all_ages
#Combine the per-reason dataframes so there is a single dataframe per condition
dict_condition_dataset = {}
dict_condition_dataset_all_ages_in_range = {}
for condition in list_of_conditions:
    dict_condition_dataset[condition] = pd.concat(nested_dict_condition_attend_reason[condition].values(), ignore_index=True)
    dict_condition_dataset_all_ages_in_range[condition] = pd.concat(nested_dict_condition_attend_reason_all_ages[condition].values(), ignore_index=True)
#---------------------------------------------------
#Copied this cell from the statistical difference between proportions (gender) section.
#Placing it here lets us reliably subset the df on the selected gender
#(persons, males, females), giving accurate counts per age group and the
#unmet-need estimate in due course. #when running the code from scratch,
#check this doesnt have independancies with variables created later in the code
#DEV NOTE: check if 'missing' appears in gender_with_missing col, when using
#'persons' and not male or female. if it does, update the relevant branch to
#remove rows with gender of 'missing'.
#read in the gender sheet from user params as a new df
df_gender_groups_to_high_level = pd.read_excel(filename, 'Gender_mapping', skiprows=1)
#Map the service data's granular gender categories to the high-level ones.
#The ONS single year of age estimates are only available for Male and Female,
#so the user params file is limited accordingly; any gender value without an
#assigned Male/Female string in the params file maps through as NaN.
dict_service_gender_groups_to_high_level = dict(
    zip(df_gender_groups_to_high_level['Service_data_categories'],
        df_gender_groups_to_high_level['High_level_categories'])
)
# ----------------------------
#string used as the field name for the gender field
gender_field_name = 'gender_at_birth'
print(f"The field name that will be used for the gender field is: {gender_field_name}")
# --------------------------------