# Readmission Risk for Heart Failure Patients

In [1]:
import mimicfouretl.bigquery_utils as bq
from mimicfouretl.data_insights import display_datasets
from mimicfouretl.query_builder import QueryBuilder
from mimicfouretl.feature_engineering import FeatureEngineering

## Build BigQuery Spark session

In [2]:
bq.set_credentials_file('../bq_credentials/client_secret.json')
bq.set_project_id('mimic-iv-418015')
# bq.set_project_id('micro-vine-412020')

In [3]:
client = bq.get_client(use_service_account_auth=False)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=236933606679-n0530hpv6li2upvr6ibubbd7f3hik03j.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A55513%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=JsdbqqTF22RrzPeHe1B4JJQOKztb27&access_type=offline


In [4]:
# tables = bq.list_tables('mimiciv_icu', client)
tables = bq.list_tables('physionet-data.mimiciv_hosp', client)
tables

['physionet-data.mimiciv_hosp.admissions',
 'physionet-data.mimiciv_hosp.d_hcpcs',
 'physionet-data.mimiciv_hosp.d_icd_diagnoses',
 'physionet-data.mimiciv_hosp.d_icd_procedures',
 'physionet-data.mimiciv_hosp.d_labitems',
 'physionet-data.mimiciv_hosp.diagnoses_icd',
 'physionet-data.mimiciv_hosp.drgcodes',
 'physionet-data.mimiciv_hosp.emar',
 'physionet-data.mimiciv_hosp.emar_detail',
 'physionet-data.mimiciv_hosp.hcpcsevents',
 'physionet-data.mimiciv_hosp.labevents',
 'physionet-data.mimiciv_hosp.microbiologyevents',
 'physionet-data.mimiciv_hosp.omr',
 'physionet-data.mimiciv_hosp.patients',
 'physionet-data.mimiciv_hosp.pharmacy',
 'physionet-data.mimiciv_hosp.poe',
 'physionet-data.mimiciv_hosp.poe_detail',
 'physionet-data.mimiciv_hosp.prescriptions',
 'physionet-data.mimiciv_hosp.procedures_icd',
 'physionet-data.mimiciv_hosp.provider',
 'physionet-data.mimiciv_hosp.services',
 'physionet-data.mimiciv_hosp.transfers']

In [5]:
display_datasets()

Dropdown(description='Dataset:', options=('hosp.provider', 'hosp.services', 'hosp.d_icd_procedures', 'hosp.pre…

Output()

In [6]:
spark = bq.get_spark_session()

24/03/29 18:10:25 WARN Utils: Your hostname, KGMSurface resolves to a loopback address: 127.0.1.1; using 10.0.0.136 instead (on interface wlp0s20f3)
24/03/29 18:10:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/kevin/anaconda3/envs/mimic-iv-etl/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/kevin/.ivy2/cache
The jars for the packages stored in: /home/kevin/.ivy2/jars
com.google.cloud.spark#spark-bigquery-with-dependencies_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-85d54891-c610-42e1-be00-a5ec0a968c57;1.0
	confs: [default]
	found com.google.cloud.spark#spark-bigquery-with-dependencies_2.12;0.37.0 in central
	[0.37.0] com.google.cloud.spark#spark-bigquery-with-dependencies_2.12;latest.version
:: resolution report :: resolve 1771ms :: artifacts dl 2ms
	:: modules in use:
	com.google.cloud.spark#spark-bigquery-with-dependencies_2.12;0.37.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   1   |   1   |   0   |   0   

## Get relevant ICD codes and Lab Item IDs

### Get ICD codes for heart failure diagnoses

In [7]:
qb_heart_failure_codes = QueryBuilder(dataset='hosp.d_icd_diagnoses', 
                                      columns=['icd_code', 'icd_version', 'long_title'])
qb_heart_failure_codes.apply_filters("LOWER(long_title) LIKE '%heart failure%'")
heart_failure_codes_query = qb_heart_failure_codes.generate_query()
print(heart_failure_codes_query)

SELECT icd_code, long_title, icd_version
FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE '%heart failure%'


In [8]:
heart_failure_icd_items = bq.run_query(spark, heart_failure_codes_query)

In [9]:
heart_failure_icd_items.toPandas()

                                                                                

Unnamed: 0,icd_code,long_title,icd_version
0,40200,Malignant hypertensive heart disease without heart failure,9
1,42830,"Diastolic heart failure, unspecified",9
2,42841,Acute combined systolic and diastolic heart failure,9
3,42831,Acute diastolic heart failure,9
4,40290,Unspecified hypertensive heart disease without heart failure,9
...,...,...,...
68,I50,Heart failure,10
69,I5021,Acute systolic (congestive) heart failure,10
70,I5031,Acute diastolic (congestive) heart failure,10
71,I97131,Postprocedural heart failure following other surgery,10


In [10]:
heart_failure_icd_codes_list = [row['icd_code'] for row in heart_failure_icd_items.select('icd_code').distinct().collect()]

In [11]:
heart_failure_icd_codes_str = "'" + "', '".join(heart_failure_icd_codes_list) + "'"

### Get itemids for BNP labs

In [12]:
qb_bnp_labs = QueryBuilder(dataset='hosp.d_labitems', 
                                      columns=['itemid', 'label', 'fluid', 'category'])
qb_bnp_labs.apply_filters(["LOWER(label) LIKE '%bnp%'", "fluid = 'Blood'"])
bnp_labs_query = qb_bnp_labs.generate_query()
print(bnp_labs_query)

SELECT category, itemid, fluid, label
FROM `physionet-data.mimiciv_hosp.d_labitems`
WHERE LOWER(label) LIKE '%bnp%' AND fluid = 'Blood'


In [13]:
bnp_lab_items = bq.run_query(spark, bnp_labs_query)

In [14]:
bnp_lab_items.toPandas()

Unnamed: 0,category,itemid,fluid,label
0,Chemistry,50963,Blood,NTproBNP


In [15]:
bnp_lab_itemids_list = [row['itemid'] for row in bnp_lab_items.select('itemid').distinct().collect()]

In [16]:
bnp_lab_itemids_str = ', '.join(str(itemid) for itemid in bnp_lab_itemids_list)

## Query MIMIC IV database for Heart Failure diagnoses and BNP labs

In [17]:
# Initialize QueryBuilders
qb_diagnoses = QueryBuilder(dataset='hosp.diagnoses_icd', 
                            columns=['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version'])
qb_diagnoses.apply_filters(f'icd_code IN ({heart_failure_icd_codes_str})')

qb_labevents = QueryBuilder(dataset='hosp.labevents', 
                            columns=['subject_id', 'hadm_id', 'itemid', 'valuenum', 'ref_range_lower', 'ref_range_upper'])
qb_labevents.apply_filters(f'itemid IN ({bnp_lab_itemids_str})')

# Join datasets
qb_diagnoses.join_with(qb_labevents, join_type='inner', columns=['subject_id', 'hadm_id'])

# Generate query for joined data
qualifying_hosp_admissions_query = qb_diagnoses.generate_query(limit=100000)
# This query can now be used to extract the relevant joined data
print(qualifying_hosp_admissions_query)

SELECT icd_code, icd_version, seq_num, `physionet-data.mimiciv_hosp.diagnoses_icd`.hadm_id, itemid, ref_range_lower, ref_range_upper, `physionet-data.mimiciv_hosp.diagnoses_icd`.subject_id, valuenum
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
INNER JOIN `physionet-data.mimiciv_hosp.labevents` ON `physionet-data.mimiciv_hosp.diagnoses_icd`.subject_id = `physionet-data.mimiciv_hosp.labevents`.subject_id AND `physionet-data.mimiciv_hosp.diagnoses_icd`.hadm_id = `physionet-data.mimiciv_hosp.labevents`.hadm_id
WHERE icd_code IN ('I119', 'I5021', 'I5033', 'I509', 'I50814', '40413', '42842', '42830', 'I130', '40403', 'I50', 'I9713', '4289', '40412', 'I5032', '4280', '40291', 'I50811', 'I5042', '42833', 'I50812', 'I1311', 'I5041', 'I5043', '40211', '40411', 'I503', '40492', 'I110', 'I5083', '42821', '40401', '40410', 'I131', 'I5089', '40493', '42840', 'I5022', 'I0981', '40402', 'I97130', 'I50810', 'I97131', 'I508', 'I5081', 'I5084', '40400', 'I5020', '42832', 'I5040', 'I5023', '42841', '4

In [18]:
qualifying_hosp_admissions_df = bq.run_query(spark, qualifying_hosp_admissions_query)

In [19]:
feature_engineer = FeatureEngineering(qualifying_hosp_admissions_df)

In [20]:
condition_str = "(valuenum < ref_range_lower) OR (valuenum > ref_range_upper)"

# Assuming your DataFrame object is named df
feature_engineer = FeatureEngineering(qualifying_hosp_admissions_df)
feature_engineer.create_conditional_feature(condition_str, "bnp_outside_ref_range")

In [21]:
qualifying_hosp_admissions_df = feature_engineer.get_processed_data()
qualifying_hosp_admissions_df.toPandas()

Unnamed: 0,icd_code,icd_version,seq_num,hadm_id,itemid,ref_range_lower,ref_range_upper,subject_id,valuenum,bnp_outside_ref_range
0,42833,9,2,23778490,50963,0.0,192.0,13455616,2639.0,1
1,I5033,10,2,23991472,50963,0.0,192.0,13455616,296.0,1
2,I5033,10,2,23991472,50963,0.0,192.0,13455616,30.0,0
3,I5033,10,1,20753261,50963,0.0,192.0,13455616,273.0,1
4,I5033,10,2,20663270,50963,0.0,192.0,13455616,224.0,1
...,...,...,...,...,...,...,...,...,...,...
22461,I110,10,1,29511412,50963,0.0,852.0,11119871,2603.0,1
22462,I5023,10,15,29511412,50963,0.0,852.0,11119871,4068.0,1
22463,I5033,10,3,25829101,50963,0.0,852.0,15592191,1705.0,1
22464,I5033,10,1,26453292,50963,0.0,852.0,15592191,973.0,1


## Get admissions data, filtered by Subject ID of Qualifying Patients

In [22]:
qualifying_hosp_admissions_subject_ids_list = [row['subject_id'] for row in qualifying_hosp_admissions_df.select('subject_id').distinct().collect()]

In [23]:
len(qualifying_hosp_admissions_subject_ids_list)

7536

In [24]:
qualifying_hosp_admissions_subject_ids_str = ', '.join(str(subject_id) for subject_id in qualifying_hosp_admissions_subject_ids_list)

In [25]:
# Initialize QueryBuilders
qb_admissions = QueryBuilder(dataset='hosp.admissions')
qb_admissions.apply_filters(f'subject_id IN ({qualifying_hosp_admissions_subject_ids_str})')

# Generate query for joined data
admissions_query = qb_admissions.generate_query(limit=100000)
# This query can now be used to extract the relevant joined data
#print(admissions_query)

In [26]:
admissions_df = bq.run_query(spark, admissions_query)

In [27]:
admissions_df.toPandas()

                                                                                

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,17960448,27662227,2157-09-01T16:00,2157-09-07T17:10,,URGENT,P491WZ,TRANSFER FROM HOSPITAL,HOME,Medicare,ENGLISH,MARRIED,WHITE,,,0
1,16651008,25588349,2144-06-13T19:50,2144-06-14T14:50,,URGENT,P611A0,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Medicaid,ENGLISH,SINGLE,HISPANIC/LATINO - PUERTO RICAN,,,0
2,15014144,29599966,2189-04-06T04:37,2189-04-14T16:10,,URGENT,P3529J,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Medicare,ENGLISH,MARRIED,WHITE,2189-04-06T03:18,2189-04-06T06:06,0
3,17636096,27092866,2172-02-24T21:50,2172-03-30T16:20,,URGENT,P29CGZ,TRANSFER FROM HOSPITAL,HOME,Medicaid,ENGLISH,SINGLE,WHITE,,,0
4,18034432,29407045,2159-08-10T20:14,2159-08-16T15:45,,URGENT,P4507L,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Medicare,ENGLISH,WIDOWED,UNKNOWN,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45754,10850815,29529191,2143-08-02T08:00,2143-08-18T12:00,,SURGICAL SAME DAY ADMISSION,P49EON,PHYSICIAN REFERRAL,REHAB,Medicare,ENGLISH,MARRIED,WHITE,,,0
45755,10264575,27726486,2183-01-10T07:15,2183-01-16T14:45,,SURGICAL SAME DAY ADMISSION,P41R5N,PHYSICIAN REFERRAL,HOME HEALTH CARE,Other,ENGLISH,MARRIED,WHITE,,,0
45756,16233215,26325135,2125-04-12T01:36,2125-04-14T15:47,,SURGICAL SAME DAY ADMISSION,P872K3,PHYSICIAN REFERRAL,HOME,Medicare,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0
45757,19978239,23112915,2122-05-29T07:30,2122-06-01T13:22,,SURGICAL SAME DAY ADMISSION,P98DW0,PHYSICIAN REFERRAL,SKILLED NURSING FACILITY,Other,ENGLISH,WIDOWED,WHITE,,,0


## Count Previous Admissions for Qualifying Patients

In [28]:
feature_engineer = FeatureEngineering(admissions_df)

In [29]:
feature_engineer.count_previous_events(
    partition_column='subject_id',
    order_column='admittime',
    event_column='hadm_id'
)

In [30]:
admissions_df = feature_engineer.get_processed_data()
admissions_df.toPandas()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag,previous_hadm_id
0,10000980,29654838,2188-01-03T17:41,2188-01-05T17:30,,EW EMER.,P20N5X,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,MARRIED,BLACK/AFRICAN AMERICAN,2188-01-03T12:23,2188-01-03T18:42,0,0
1,10000980,26913865,2189-06-27T07:38,2189-07-03T03:00,,EW EMER.,P30KEH,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,MARRIED,BLACK/AFRICAN AMERICAN,2189-06-27T06:25,2189-06-27T08:42,0,1
2,10000980,24947999,2190-11-06T20:57,2190-11-08T15:58,,EW EMER.,P434W4,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,MARRIED,BLACK/AFRICAN AMERICAN,2190-11-06T15:30,2190-11-06T23:16,0,2
3,10000980,25242409,2191-04-03T18:48,2191-04-11T16:21,,EW EMER.,P33K8A,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,ENGLISH,MARRIED,BLACK/AFRICAN AMERICAN,2191-04-03T12:36,2191-04-03T20:29,0,3
4,10000980,25911675,2191-05-23T15:33,2191-05-24T17:14,,EW EMER.,P29CGZ,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,MARRIED,BLACK/AFRICAN AMERICAN,2191-05-22T16:06,2191-05-23T17:56,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45754,19967684,25782860,2113-12-20T00:19,2113-12-23T17:36,,OBSERVATION ADMIT,P536JC,EMERGENCY ROOM,HOME HEALTH CARE,Other,ENGLISH,WIDOWED,WHITE,2113-12-19T15:53,2113-12-20T01:47,0,0
45755,19973083,22962012,2123-09-21T13:46,2123-09-28T00:00,,URGENT,P07HDB,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Other,ENGLISH,SINGLE,BLACK/CARIBBEAN ISLAND,,,0,0
45756,19973083,21885760,2123-10-04T15:53,2123-10-09T20:09,,OBSERVATION ADMIT,P47E1G,PHYSICIAN REFERRAL,HOME HEALTH CARE,Other,ENGLISH,SINGLE,BLACK/CARIBBEAN ISLAND,,,0,1
45757,19973083,20741363,2123-10-12T04:00,2123-10-20T16:35,,OBSERVATION ADMIT,P47E1G,WALK-IN/SELF REFERRAL,HOME HEALTH CARE,Other,ENGLISH,SINGLE,BLACK/CARIBBEAN ISLAND,2123-10-12T01:51,2123-10-12T04:53,0,2


## Check for Readmission within 30, 90, and 180 days

In [31]:
feature_engineer.check_event_within_timeframe(
    partition_column='subject_id',
    event_column='admittime', 
    timeframe=30
)

In [32]:
feature_engineer.check_event_within_timeframe(
    partition_column='subject_id',
    event_column='admittime', 
    timeframe=90
)

In [33]:
feature_engineer.check_event_within_timeframe(
    partition_column='subject_id',
    event_column='admittime', 
    timeframe=180
)

In [34]:
admissions_df = feature_engineer.get_processed_data()
admissions_df.toPandas()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,...,race,edregtime,edouttime,hospital_expire_flag,previous_hadm_id,next_admittime_date,days_to_next_admittime,admittime_within_30_days,admittime_within_90_days,admittime_within_180_days
0,10000980,29654838,2188-01-03T17:41,2188-01-05T17:30,,EW EMER.,P20N5X,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,BLACK/AFRICAN AMERICAN,2188-01-03T12:23,2188-01-03T18:42,0,0,2189-06-27T07:38,541.0,0,0,0
1,10000980,26913865,2189-06-27T07:38,2189-07-03T03:00,,EW EMER.,P30KEH,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,BLACK/AFRICAN AMERICAN,2189-06-27T06:25,2189-06-27T08:42,0,1,2190-11-06T20:57,497.0,0,0,0
2,10000980,24947999,2190-11-06T20:57,2190-11-08T15:58,,EW EMER.,P434W4,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,BLACK/AFRICAN AMERICAN,2190-11-06T15:30,2190-11-06T23:16,0,2,2191-04-03T18:48,148.0,0,0,1
3,10000980,25242409,2191-04-03T18:48,2191-04-11T16:21,,EW EMER.,P33K8A,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,...,BLACK/AFRICAN AMERICAN,2191-04-03T12:36,2191-04-03T20:29,0,3,2191-05-23T15:33,50.0,0,1,1
4,10000980,25911675,2191-05-23T15:33,2191-05-24T17:14,,EW EMER.,P29CGZ,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,BLACK/AFRICAN AMERICAN,2191-05-22T16:06,2191-05-23T17:56,0,4,2191-07-16T14:21,54.0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45754,19967684,25782860,2113-12-20T00:19,2113-12-23T17:36,,OBSERVATION ADMIT,P536JC,EMERGENCY ROOM,HOME HEALTH CARE,Other,...,WHITE,2113-12-19T15:53,2113-12-20T01:47,0,0,,,0,0,0
45755,19973083,22962012,2123-09-21T13:46,2123-09-28T00:00,,URGENT,P07HDB,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Other,...,BLACK/CARIBBEAN ISLAND,,,0,0,2123-10-04T15:53,13.0,1,1,1
45756,19973083,21885760,2123-10-04T15:53,2123-10-09T20:09,,OBSERVATION ADMIT,P47E1G,PHYSICIAN REFERRAL,HOME HEALTH CARE,Other,...,BLACK/CARIBBEAN ISLAND,,,0,1,2123-10-12T04:00,8.0,1,1,1
45757,19973083,20741363,2123-10-12T04:00,2123-10-20T16:35,,OBSERVATION ADMIT,P47E1G,WALK-IN/SELF REFERRAL,HOME HEALTH CARE,Other,...,BLACK/CARIBBEAN ISLAND,2123-10-12T01:51,2123-10-12T04:53,0,2,2123-10-26T02:05,14.0,1,1,1


## Merge Admissions data with Qualifying Admissions data

In [35]:
merged_df = qualifying_hosp_admissions_df.join(
    admissions_df,
    (qualifying_hosp_admissions_df['subject_id'] == admissions_df['subject_id']) & 
    (qualifying_hosp_admissions_df['hadm_id'] == admissions_df['hadm_id']),
    how='left'
)

In [36]:
merged_df.toPandas()

Unnamed: 0,icd_code,icd_version,seq_num,hadm_id,itemid,ref_range_lower,ref_range_upper,subject_id,valuenum,bnp_outside_ref_range,...,race,edregtime,edouttime,hospital_expire_flag,previous_hadm_id,next_admittime_date,days_to_next_admittime,admittime_within_30_days,admittime_within_90_days,admittime_within_180_days
0,42833,9,2,23778490,50963,0.0,192.0,13455616,2639.0,1,...,BLACK/AFRICAN AMERICAN,2182-05-24T10:07,2182-05-24T20:11,0,0,2182-06-06T06:33,13.0,1,1,1
1,I5033,10,2,23991472,50963,0.0,192.0,13455616,296.0,1,...,BLACK/AFRICAN AMERICAN,,,0,8,2186-01-17T00:00,180.0,0,0,1
2,I5033,10,2,23991472,50963,0.0,192.0,13455616,30.0,0,...,BLACK/AFRICAN AMERICAN,,,0,8,2186-01-17T00:00,180.0,0,0,1
3,I5033,10,1,20753261,50963,0.0,192.0,13455616,273.0,1,...,BLACK/AFRICAN AMERICAN,,,0,6,2185-06-28T18:32,76.0,0,1,1
4,I5033,10,2,20663270,50963,0.0,192.0,13455616,224.0,1,...,BLACK/AFRICAN AMERICAN,,,0,3,2184-10-07T21:05,20.0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22461,I110,10,1,29511412,50963,0.0,852.0,11119871,2603.0,1,...,WHITE,,,0,0,,,0,0,0
22462,I5023,10,15,29511412,50963,0.0,852.0,11119871,4068.0,1,...,WHITE,,,0,0,,,0,0,0
22463,I5033,10,3,25829101,50963,0.0,852.0,15592191,1705.0,1,...,WHITE - OTHER EUROPEAN,,,0,2,,,0,0,0
22464,I5033,10,1,26453292,50963,0.0,852.0,15592191,973.0,1,...,WHITE - OTHER EUROPEAN,,,0,0,2189-06-27T10:49,375.0,0,0,0


In [38]:
value_counts = merged_df.groupBy('race').count().orderBy('count', ascending=False)

# Show the result
value_counts.show()

+--------------------+-----+
|                race|count|
+--------------------+-----+
|               WHITE|14652|
|BLACK/AFRICAN AME...| 2941|
|             UNKNOWN|  884|
|               OTHER|  614|
|     WHITE - RUSSIAN|  559|
|WHITE - OTHER EUR...|  530|
|HISPANIC/LATINO -...|  395|
|     ASIAN - CHINESE|  223|
|HISPANIC/LATINO -...|  213|
|  BLACK/CAPE VERDEAN|  190|
|BLACK/CARIBBEAN I...|  146|
|               ASIAN|  144|
|  HISPANIC OR LATINO|  129|
|       BLACK/AFRICAN|  105|
|          PORTUGUESE|   96|
|ASIAN - SOUTH EAS...|   76|
|AMERICAN INDIAN/A...|   65|
|HISPANIC/LATINO -...|   63|
|WHITE - EASTERN E...|   62|
|    UNABLE TO OBTAIN|   61|
+--------------------+-----+
only showing top 20 rows

