# COVID-19 Data Cleaning Script

### Imports

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Loading Data

In [10]:
# Load the raw patient records into a DataFrame.
# NOTE(review): the path is relative, so this must be run from the directory that contains the CSV.
df = pd.read_csv("Covid Data.csv")

### Print functions to understand data

In [11]:
def printColumnValueCounts(df):
    """Print the value counts (NaN included) of every column, one section per column."""
    divider = "-" * 50
    for column_name in df.columns:
        print(f"Value counts for '{column_name}':")
        frequencies = df[column_name].value_counts(dropna=False)
        print(frequencies)
        print(divider)  # visual separator between columns
        
def summarize_dataframe(df, num_unique_threshold=20):
    """
    Build a per-column summary of a DataFrame.

    Every column gets its raw value counts (NaN included, reported under the
    key 'NaN'). A column is then classified as:
      - 'Numeric' when its dtype is numeric AND it has more than
        `num_unique_threshold` distinct non-null values; its mean and standard
        deviation are reported.
      - 'Categorical' otherwise; the percentage share of each value
        (NaN included, rounded to 2 decimals, formatted as "<value>%") is reported.

    Args:
      df: The DataFrame to summarize.
      num_unique_threshold: Distinct-value count above which a numeric column
        is treated as continuous rather than categorical.

    Returns:
      A dictionary keyed by column name; each value is that column's summary dictionary.
    """
    def _label(raw_key):
        # NaN makes an awkward dictionary key, so report it under a readable string.
        return 'NaN' if pd.isna(raw_key) else raw_key

    report = {}
    for column_name in df.columns:
        series = df[column_name]

        # Frequencies with NaN kept, and the same data expressed as percentages.
        frequencies = series.value_counts(dropna=False)
        shares = (frequencies / frequencies.sum() * 100).round(2)

        entry = {
            'Value Counts': {_label(key): count for key, count in frequencies.items()},
        }
        share_labels = {_label(key): f"{share}%" for key, share in shares.items()}

        # Few distinct values means the numbers are really category codes.
        distinct_values = series.nunique(dropna=True)
        looks_continuous = (
            pd.api.types.is_numeric_dtype(series)
            and distinct_values > num_unique_threshold
        )
        if looks_continuous:
            entry['Type'] = 'Numeric'
            entry['Mean'] = series.mean(skipna=True)
            entry['Standard Deviation'] = series.std(skipna=True)
        else:
            entry['Type'] = 'Categorical'
            entry['Percentage Breakdown'] = share_labels

        report[column_name] = entry
    return report

def printSummary(summaryDict):
    """
    Pretty-print a summary dictionary shaped like the output of summarize_dataframe.

    For each column this prints its value counts and its type, followed by
    mean / standard deviation (2 decimals) for 'Numeric' columns or the
    percentage breakdown for every other type, then a 50-dash separator.
    """
    for column_name, details in summaryDict.items():
        print(f"Summary for '{column_name}':")

        print("  Value Counts:")
        for label, count in details['Value Counts'].items():
            print(f"    {label}: {count}")

        kind = details['Type']
        print(f"  Type: {kind}")
        if kind != 'Numeric':
            print("  Percentage Breakdown:")
            for label, share in details['Percentage Breakdown'].items():
                print(f"    {label}: {share}")
        else:
            print(f"  Mean: {details['Mean']:.2f}")
            print(f"  Standard Deviation: {details['Standard Deviation']:.2f}")

        print("-" * 50)


### Cleaning Columns
In the Boolean features, 1 means "yes" and 2 means "no". Values 97–99 indicate missing data.

In the Boolean categorical columns, 97–99 mean the value is unknown, so we set those to NaN.

For PREGNANT, all males should have NaN. We do not allow the value to be unknown for women, so we set unknown values to -1 to mark them as invalid. Rows that contain -1 will be dropped later.

DATE_DIED gets remapped to DIED. If a person has an invalid date of death (9999-99-99), they did not die.

PNEUMONIA is an output column. It cannot have unknown values. Mark those as -1 to be dropped later. 

The dataset specifies that a CLASIFFICATION_FINAL value higher than 3 means the patient did not have COVID; we recode those values to 0.

In [12]:
# Columns where 97, 98, and 99 should be set to NaN
columns_to_nan = [
    "ASTHMA", "CARDIOVASCULAR", "COPD", "DIABETES", "HIPERTENSION", "ICU", 
    "INTUBED", "INMSUPR", "OBESITY", "OTHER_DISEASE", "RENAL_CHRONIC", "TOBACCO"
]
df[columns_to_nan] = df[columns_to_nan].replace({97: np.nan, 98: np.nan, 99: np.nan})

# Handle 'PREGNANT' column
df.loc[df["SEX"] == 2, "PREGNANT"] = np.nan  # If male, set PREGNANT to NaN
# If female and 97 or 98, flag the row with -1 so the later "-1 anywhere" filter drops it
df.loc[(df["SEX"] == 1) & (df["PREGNANT"].isin([97, 98])), "PREGNANT"] = -1

# Process 'DATE_DIED': the placeholder date "9999-99-99" becomes 2, every other
# value becomes 1. Vectorized comparison instead of a per-row Python lambda.
# NOTE: this step is not idempotent - re-running it on the already converted
# column would turn every value into 1, because no value equals the placeholder any more.
df["DATE_DIED"] = np.where(df["DATE_DIED"] == "9999-99-99", 2, 1)

# Process 'PNEUMONIA': 99 is flagged with -1 so the row is dropped later
df["PNEUMONIA"] = df["PNEUMONIA"].replace(99, -1)

# Process 'CLASIFFICATION_FINAL': keep codes 1-3, collapse everything else to 0
df["CLASIFFICATION_FINAL"] = df["CLASIFFICATION_FINAL"].where(
    df["CLASIFFICATION_FINAL"].isin([1, 2, 3]), 0
)

### Removing any rows where we had invalid values


In [13]:
# Drop every row that carries the -1 "invalid" marker in any column.
# .copy() makes the filtered result an independent frame, so the column
# assignments made on `df` afterwards do not trigger SettingWithCopyWarning.
df = df[~df.isin([-1]).any(axis=1)].copy()

### Creating new target column 

This turns the CLASIFFICATION_FINAL into a binary classification of COVID-19 PRESENCE

In [14]:
# Create 'COVID-19 PRESENCE' from 'CLASIFFICATION_FINAL' (which is renamed to
# 'COVID-19 SEVERITY' further down): codes 1-3 map to 1, code 0 maps to 2.
# Any other value would become NaN, since Series.map leaves unmapped keys as NaN.
df["COVID-19 PRESENCE"] = df["CLASIFFICATION_FINAL"].map({1: 1, 2: 1, 3: 1, 0: 2})

### Renaming Columns for Clarity

In [15]:
# Give the misspelled / re-purposed columns their final names.
df = df.rename(
    columns=dict(
        HIPERTENSION="HYPERTENSION",
        CLASIFFICATION_FINAL="COVID-19 SEVERITY",
        DATE_DIED="DEATH",
    )
)

# Prediction targets, kept together as the right-most columns.
output_columns = ["COVID-19 PRESENCE", "COVID-19 SEVERITY", "DEATH", "PNEUMONIA"]

# Everything that is not a target, in alphabetical order.
sorted_columns = sorted(name for name in df.columns if name not in output_columns)

# Feature columns first, targets last.
new_column_order = [*sorted_columns, *output_columns]

# Apply the new ordering.
df = df[new_column_order]

### Data Summarization

We view the data from the following subgroups:

Original Data

All Male Data

All Female Data

Non-pregnant Females

Pregnant Females

In [16]:
# Summarize and print every column of the full cleaned DataFrame.
all_data_summary = summarize_dataframe(df)
printSummary(all_data_summary)

Summary for 'AGE':
  Value Counts:
    30: 26570
    31: 25534
    28: 24856
    29: 24705
    34: 24535
    32: 24507
    37: 24348
    35: 24316
    36: 24290
    33: 24219
    38: 24159
    27: 23911
    40: 23609
    39: 23504
    26: 22823
    46: 22413
    45: 22177
    41: 22133
    47: 22081
    43: 22079
    42: 21952
    44: 21869
    48: 21703
    25: 20668
    49: 20539
    50: 19913
    51: 18741
    52: 18233
    24: 17883
    53: 16921
    54: 16073
    55: 15952
    56: 15619
    23: 15017
    57: 14456
    58: 13584
    59: 13122
    22: 12469
    60: 12396
    21: 11073
    61: 10864
    63: 9757
    20: 9744
    62: 9704
    64: 8685
    65: 8566
    66: 7912
    19: 7808
    67: 7329
    68: 6902
    69: 6173
    70: 6155
    71: 5416
    72: 5285
    18: 5162
    73: 4872
    74: 4413
    17: 4323
    75: 4310
    1: 4168
    76: 3821
    0: 3811
    16: 3764
    77: 3728
    78: 3512
    15: 3323
    80: 2945
    79: 2932
    14: 2894
    2: 2717
    13: 2697
    

In [17]:
# Subgroup: rows with SEX == 2 (the code this script treats as male when cleaning PREGNANT).
male_df = df[df["SEX"] == 2]
male_summary = summarize_dataframe(male_df)
printSummary(male_summary)

Summary for 'AGE':
  Value Counts:
    30: 12907
    31: 12471
    34: 12033
    35: 12007
    28: 12000
    29: 11995
    38: 11982
    37: 11948
    32: 11891
    33: 11779
    36: 11712
    40: 11652
    39: 11615
    27: 11466
    45: 11107
    41: 11081
    46: 11080
    43: 10980
    26: 10891
    44: 10818
    47: 10767
    42: 10702
    48: 10627
    49: 10004
    25: 9896
    50: 9826
    51: 9272
    52: 9108
    53: 8495
    24: 8390
    55: 8210
    54: 8157
    56: 8127
    57: 7525
    58: 7160
    59: 7114
    23: 7005
    60: 6612
    22: 6106
    61: 5923
    21: 5435
    63: 5291
    62: 5207
    64: 4762
    20: 4759
    65: 4745
    66: 4330
    67: 4082
    68: 3852
    19: 3650
    70: 3492
    69: 3428
    71: 3002
    72: 2920
    73: 2643
    18: 2540
    74: 2489
    1: 2373
    75: 2372
    76: 2159
    0: 2085
    77: 2069
    17: 2005
    78: 1909
    16: 1796
    15: 1714
    79: 1621
    80: 1583
    2: 1523
    14: 1440
    13: 1371
    81: 1350
    82: 

In [18]:
# Subgroup: rows with SEX == 1 (the code this script treats as female when cleaning PREGNANT).
# female_df is reused below to split pregnant / non-pregnant patients.
female_df = df[df["SEX"] == 1]
female_summary = summarize_dataframe(female_df)
printSummary(female_summary)

Summary for 'AGE':
  Value Counts:
    30: 13663
    31: 13063
    28: 12856
    29: 12710
    32: 12616
    36: 12578
    34: 12502
    27: 12445
    33: 12440
    37: 12400
    35: 12309
    38: 12177
    40: 11957
    26: 11932
    39: 11889
    46: 11333
    47: 11314
    42: 11250
    43: 11099
    48: 11076
    45: 11070
    41: 11052
    44: 11051
    25: 10772
    49: 10535
    50: 10087
    24: 9493
    51: 9469
    52: 9125
    53: 8426
    23: 8012
    54: 7916
    55: 7742
    56: 7492
    57: 6931
    58: 6424
    22: 6363
    59: 6008
    60: 5784
    21: 5638
    20: 4985
    61: 4941
    62: 4497
    63: 4466
    19: 4158
    64: 3923
    65: 3821
    66: 3582
    67: 3247
    68: 3050
    69: 2745
    70: 2663
    18: 2622
    71: 2414
    72: 2365
    17: 2318
    73: 2229
    16: 1968
    75: 1938
    74: 1924
    1: 1795
    0: 1726
    76: 1662
    77: 1659
    15: 1609
    78: 1603
    14: 1454
    80: 1362
    13: 1326
    79: 1311
    2: 1194
    12: 1188
    81

In [19]:
# Subgroup: female rows with PREGNANT == 1 (1 means "yes" in the Boolean columns).
pregnant_female_df = female_df[female_df["PREGNANT"] == 1]
pregnant_female_summary = summarize_dataframe(pregnant_female_df)
printSummary(pregnant_female_summary)

Summary for 'AGE':
  Value Counts:
    27: 494
    28: 464
    30: 449
    26: 449
    29: 439
    25: 417
    31: 414
    32: 378
    24: 366
    23: 359
    33: 350
    22: 343
    21: 334
    34: 304
    20: 301
    35: 270
    19: 251
    36: 233
    18: 188
    37: 185
    17: 140
    38: 136
    39: 121
    40: 99
    16: 87
    41: 56
    42: 52
    15: 43
    43: 37
    44: 20
    45: 11
    14: 10
    47: 9
    46: 8
    48: 5
    0: 5
    52: 4
    59: 4
    66: 2
    54: 2
    1: 2
    55: 2
    57: 2
    12: 2
    87: 2
    61: 2
    65: 2
    49: 2
    51: 2
    3: 1
    72: 1
    11: 1
    13: 1
    71: 1
    76: 1
    73: 1
    50: 1
    53: 1
    80: 1
    89: 1
    56: 1
    63: 1
  Type: Numeric
  Mean: 28.09
  Standard Deviation: 6.76
--------------------------------------------------
Summary for 'ASTHMA':
  Value Counts:
    2.0: 7648
    1.0: 214
    NaN: 8
  Type: Categorical
  Percentage Breakdown:
    2.0: 97.18%
    1.0: 2.72%
    NaN: 0.1%
--------------------

In [20]:
# Subgroup: female rows with PREGNANT == 2 (2 means "no" in the Boolean columns).
not_pregnant_female_df = female_df[female_df["PREGNANT"] == 2]
not_pregnant_female_summary = summarize_dataframe(not_pregnant_female_df)
printSummary(not_pregnant_female_summary)

Summary for 'AGE':
  Value Counts:
    30: 13214
    31: 12649
    28: 12392
    36: 12345
    29: 12271
    32: 12238
    37: 12215
    34: 12198
    33: 12090
    38: 12041
    35: 12039
    27: 11951
    40: 11858
    39: 11768
    26: 11483
    46: 11325
    47: 11305
    42: 11198
    48: 11071
    43: 11062
    45: 11059
    44: 11031
    41: 10996
    49: 10533
    25: 10355
    50: 10086
    51: 9467
    24: 9127
    52: 9121
    53: 8425
    54: 7914
    55: 7740
    23: 7653
    56: 7491
    57: 6929
    58: 6424
    22: 6020
    59: 6004
    60: 5784
    21: 5304
    61: 4939
    20: 4684
    62: 4497
    63: 4465
    64: 3923
    19: 3907
    65: 3819
    66: 3580
    67: 3247
    68: 3050
    69: 2745
    70: 2663
    18: 2434
    71: 2413
    72: 2364
    73: 2228
    17: 2178
    75: 1938
    74: 1924
    16: 1881
    1: 1793
    0: 1721
    76: 1661
    77: 1659
    78: 1603
    15: 1566
    14: 1444
    80: 1361
    13: 1325
    79: 1311
    2: 1194
    12: 1186
    81

In [21]:
# Save the Cleaned Data (index=False keeps the pandas row index out of the CSV)
df.to_csv("Cleaned Covid Data.csv", index=False)

# Converting clinical data CSV to textual prompts

LLMs need to take in textual prompts. Here we generate those in a human-readable way. The hope is that this helps the LLM better understand the data we pass it and allows it to make better predictions.

In [22]:
# Phrase pair for each yes/no clinical column: (text when value == 1, text when value == 2).
_CONDITION_PHRASES = {
    "ASTHMA": ("has a history of asthma", "does not have a history of asthma"),
    "CARDIOVASCULAR": ("has cardiovascular issues", "does not have cardiovascular issues"),
    "COPD": ("has COPD", "does not have COPD"),
    "DIABETES": ("has diabetes", "does not have diabetes"),
    "HYPERTENSION": ("has hypertension", "does not have hypertension"),
    "ICU": ("was admitted to the ICU", "was not admitted to the ICU"),
    "INTUBED": ("was intubated", "was not intubated"),
    "INMSUPR": ("is immunosuppressed", "is not immunosuppressed"),
    "OBESITY": ("is obese", "is not obese"),
    "OTHER_DISEASE": ("has other underlying diseases", "does not have other underlying diseases"),
    "RENAL_CHRONIC": ("has chronic renal issues", "does not have chronic renal issues"),
    "TOBACCO": ("uses tobacco", "does not use tobacco"),
}

# Wording for the USMR facility-level codes.
_USMR_LEVELS = {1: "first-level", 2: "second-level", 3: "third-level"}


def _format_number(value):
    """Render an integer-valued number without a trailing '.0' (30.0 -> '30')."""
    try:
        as_float = float(value)
    except (TypeError, ValueError):
        return str(value)
    if as_float.is_integer():
        return str(int(as_float))
    return str(value)


def create_prompt(row, target=None):
    """
    Generate a human-readable paragraph-style prompt from a single patient datapoint, omitting unknown values.

    The first sentence covers age, sex/pregnancy, patient type, medical unit
    and facility level; the second lists the known clinical conditions. Values
    that are missing (NaN) or whose column is absent are left out.

    Numeric values are written without a trailing ".0". Rows produced by
    DataFrame.apply(axis=1) are upcast to float when the frame mixes integer
    and float columns, which would otherwise yield text such as "30.0 years old".

    Args:
      row (pd.Series): A row from the DataFrame representing patient data.
      target (str): Optional prediction target column (one of: "COVID-19 PRESENCE",
                    "COVID-19 SEVERITY", "DEATH", "PNEUMONIA"). When given, the
                    other output columns are excluded from consideration. No
                    output column is ever written into the prompt, so this
                    argument currently does not change the returned text.

    Returns:
      str: A formatted natural language prompt as a paragraph ("" when nothing is known).
    """
    output_columns = {"COVID-19 PRESENCE", "COVID-19 SEVERITY", "DEATH", "PNEUMONIA"}

    # Determine which columns may be used in the prompt
    if target is not None:
        include_columns = set(row.index) - (output_columns - {target})
    else:
        include_columns = set(row.index)

    def known(column):
        # A value is usable only if its column is allowed and it is not NaN.
        return column in include_columns and not pd.isna(row[column])

    # ---- Sentence 1: Demographics & Treatment Information ----
    basic_info = []

    if known("AGE"):
        basic_info.append(f"{_format_number(row['AGE'])} years old")

    if known("SEX"):
        if row["SEX"] == 1:
            # Any pregnancy value other than 1 (2, missing, or an unexpected code)
            # still describes a female patient, so the sex is never dropped.
            is_pregnant = known("PREGNANT") and row["PREGNANT"] == 1
            basic_info.append("pregnant female" if is_pregnant else "female")
        elif row["SEX"] == 2:
            basic_info.append("male")

    if known("PATIENT_TYPE"):
        basic_info.append(f"categorized as patient type {_format_number(row['PATIENT_TYPE'])}")

    treatment_info = []

    if known("MEDICAL_UNIT"):
        treatment_info.append(f"treated at medical unit number {_format_number(row['MEDICAL_UNIT'])}")

    if known("USMR") and row["USMR"] in _USMR_LEVELS:
        treatment_info.append(f"in a {_USMR_LEVELS[row['USMR']]} facility")

    if basic_info:
        sentence1 = "The patient is " + " ".join(basic_info)
        if treatment_info:
            sentence1 += " who was " + " and ".join(treatment_info)
        sentence1 += "."
    elif treatment_info:
        # No demographics known: avoid the dangling "The patient is  who was ...".
        sentence1 = "The patient was " + " and ".join(treatment_info) + "."
    else:
        sentence1 = ""

    # ---- Sentence 2: Clinical Conditions ----
    conditions = []
    for col, (positive_text, negative_text) in _CONDITION_PHRASES.items():
        if known(col):
            if row[col] == 1:
                conditions.append(positive_text)
            elif row[col] == 2:
                conditions.append(negative_text)

    # Join all condition phrases in natural language.
    if len(conditions) > 1:
        sentence2 = "The patient " + ", ".join(conditions[:-1]) + ", and " + conditions[-1] + "."
    elif conditions:
        sentence2 = "The patient " + conditions[0] + "."
    else:
        sentence2 = ""

    # Combine the non-empty sentences into one paragraph.
    return " ".join(part for part in (sentence1, sentence2) if part)


def generate_target_prompt(target):
    """
    Generate a prediction instruction prompt for a given target output column.

    The label descriptions follow the encoding produced by this script's
    cleaning steps: yes/no columns use 1 = yes and 2 = no, COVID-19 PRESENCE is
    1 when the classification code is 1-3 and 2 otherwise, and DEATH is 2 when
    no date of death was recorded.

    Args:
      target (str): The name of the output column.

    Returns:
      str: A string that instructs the LLM on what to predict.

    Raises:
      ValueError: If `target` is not one of the four supported output columns.
    """
    instructions = {
        "COVID-19 PRESENCE": (
            "Please provide your prediction for COVID-19 presence, where 1 indicates "
            "COVID-19 positive and 2 indicates COVID-19 negative."
        ),
        "COVID-19 SEVERITY": (
            "Please provide your prediction for COVID-19 severity, where 0 indicates "
            "no COVID-19 and 1, 2, or 3 indicate increasing levels of symptom severity."
        ),
        "DEATH": (
            "Please provide your prediction for death outcome, where 2 indicates the patient "
            "is alive and 1 indicates the patient is deceased."
        ),
        "PNEUMONIA": (
            "Please provide your prediction for pneumonia, where 1 indicates pneumonia is present "
            "and 2 indicates pneumonia is not present."
        ),
    }
    try:
        return instructions[target]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments; both cases are simply an unknown target.
        raise ValueError(f"Invalid Target! Found: {target}") from None

def add_target_prompts(df, targets=None):
    """
    Attach one instruction column per prediction target to the DataFrame.

    Each added column is named "<target> PROMPT" and holds the instruction
    string returned by generate_target_prompt for that target in every row.
    The DataFrame is modified in place and also returned.

    Args:
      df (pd.DataFrame): The DataFrame to extend.
      targets (list of str, optional): List of target column names.
        Defaults to ["COVID-19 PRESENCE", "COVID-19 SEVERITY", "DEATH", "PNEUMONIA"].

    Returns:
      pd.DataFrame: The same DataFrame, with the additional prompt columns.
    """
    default_targets = ["COVID-19 PRESENCE", "COVID-19 SEVERITY", "DEATH", "PNEUMONIA"]
    selected = default_targets if targets is None else targets

    for name in selected:
        column_label = name + " PROMPT"
        # A scalar string is broadcast to every row of the new column.
        df[column_label] = generate_target_prompt(name)

    return df


In [23]:
def create_prompt_dataframe(df):
    """
    Create a new DataFrame with columns:
    - 'SEX' (original)
    - 'PREGNANT' (original)
    - 'PATIENT DATAPOINT PROMPT' (natural language patient summary from create_prompt)
    - Output columns ('COVID-19 PRESENCE', 'COVID-19 SEVERITY', 'DEATH', 'PNEUMONIA'),
      for those that exist in `df`

    The input DataFrame is left unmodified: the prompts are built into a
    separate Series rather than written back onto `df` as an extra column.

    Args:
      df (pd.DataFrame): Original patient dataset. Must contain 'SEX' and 'PREGNANT'.

    Returns:
      pd.DataFrame: Transformed DataFrame with formatted data.
    """
    output_columns = ["COVID-19 PRESENCE", "COVID-19 SEVERITY", "DEATH", "PNEUMONIA"]

    # Generate one textual prompt per row without touching the caller's frame
    prompts = df.apply(create_prompt, axis=1)

    # Initialize the final dataframe with core columns
    final_df = df[["SEX", "PREGNANT"]].copy()
    final_df["PATIENT DATAPOINT PROMPT"] = prompts

    # The target-specific prompt columns are currently disabled:
    # final_df = add_target_prompts(final_df)

    # Add output columns at the end
    for col in output_columns:
        if col in df.columns:
            final_df[col] = df[col]

    return final_df

# Build the prompt-style dataset from the cleaned frame and write it to CSV without the row index.
text_prompt_df = create_prompt_dataframe(df)
text_prompt_df.to_csv("Cleaned Prompt Covid Data.csv", index=False)


# Considering INTUBED as Prediction Column 

Because INTUBED has a high proportion of NaN values, we did not want to consider it as both a potential feature and a prediction column. To use it as a prediction column, we must drop all rows where INTUBED is NaN. This greatly reduces our dataset size. Additionally, it removes all data points where the patient returned home (PATIENT_TYPE=1), leaving only data points where the patient was hospitalized (PATIENT_TYPE=2).

In [24]:
# # List of output columns that should be at the end
# output_columns = ["COVID-19 PRESENCE", "COVID-19 SEVERITY", "DATE_DIED", "INTUBED", "PNEUMONIA"]

# # Get all columns sorted alphabetically, excluding the output columns
# sorted_columns = sorted([col for col in df.columns if col not in output_columns])

# # Create the new column order with output columns at the end
# new_column_order = sorted_columns + output_columns

# # Reorder the DataFrame
# df = df[new_column_order]
# df = df.dropna(subset=['INTUBED'])

### Data Summarization

We view the data from the following subgroups:

Original Data

All Male Data

All Female Data

Non-pregnant Females

Pregnant Females

In [25]:
# all_data_summary = summarize_dataframe(df)
# printSummary(all_data_summary)

In [26]:
# male_df = df[df["SEX"] == 2]
# male_summary = summarize_dataframe(male_df)
# printSummary(male_summary)

In [27]:
# female_df = df[df["SEX"] == 1]
# female_summary = summarize_dataframe(female_df)
# printSummary(female_summary)

In [28]:
# pregnant_female_df = female_df[female_df["PREGNANT"] == 1]
# pregnant_female_summary = summarize_dataframe(pregnant_female_df)
# printSummary(pregnant_female_summary)

In [29]:
# not_pregnant_female_df = female_df[female_df["PREGNANT"] == 2]
# not_pregnant_female_summary = summarize_dataframe(not_pregnant_female_df)
# printSummary(not_pregnant_female_summary)

In [30]:
# # Save the Cleaned Data
# df.to_csv("Cleaned Covid Data (Intubed as Prediction Col.).csv", index=False)