# COVID-19 Data Cleaning Script

### Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Loading Data

In [None]:
# Load the CSV
df = pd.read_csv("Covid Data.csv")

### Print functions to understand data

In [None]:
def printColumnValueCounts(df):
    """Print the value counts of every column in ``df``, NaN included.

    Each column's report is followed by a 50-character dashed separator.
    """
    separator = "-" * 50  # visual divider between consecutive columns
    for column_name in df.columns:
        frequencies = df[column_name].value_counts(dropna=False)
        print(f"Value counts for '{column_name}':")
        print(frequencies)
        print(separator)
        
def summarize_dataframe(df, num_unique_threshold=20):
    """
    Build a per-column summary of a DataFrame.

    Every column gets its raw value counts (NaN included, stored under the
    string key 'NaN'). A column whose dtype is numeric and which has more than
    `num_unique_threshold` distinct non-null values is labelled 'Numeric' and
    gets its mean and standard deviation. Any other column is labelled
    'Categorical' and gets the percentage share of each value (NaN included),
    formatted as strings such as "25.0%".

    Returns:
      A dictionary mapping each column name to its summary dictionary.
    """
    def _label(key):
        # NaN makes an awkward dictionary key, so show it as a readable string.
        return 'NaN' if pd.isna(key) else key

    summary = {}
    for col in df.columns:
        series = df[col]
        counts = series.value_counts(dropna=False)

        col_summary = {
            'Value Counts': {_label(key): count for key, count in counts.items()}
        }

        # Share of each value in percent, rounded to two decimals.
        shares = (counts / counts.sum() * 100).round(2)
        share_by_value = {_label(key): f"{share}%" for key, share in shares.items()}

        distinct_values = series.nunique(dropna=True)
        looks_continuous = (
            pd.api.types.is_numeric_dtype(series)
            and distinct_values > num_unique_threshold
        )
        if looks_continuous:
            col_summary['Type'] = 'Numeric'
            col_summary['Mean'] = series.mean(skipna=True)
            col_summary['Standard Deviation'] = series.std(skipna=True)
        else:
            col_summary['Type'] = 'Categorical'
            col_summary['Percentage Breakdown'] = share_by_value

        summary[col] = col_summary
    return summary

def printSummary(summaryDict):
    """Print a summary dictionary produced by ``summarize_dataframe``.

    For each column this prints the raw value counts, the detected type, and
    then either the mean / standard deviation (two decimals) for 'Numeric'
    columns or the percentage breakdown for all other columns. A dashed line
    closes each column's section.
    """
    divider = "-" * 50
    for column, details in summaryDict.items():
        print(f"Summary for '{column}':")

        print("  Value Counts:")
        for value, count in details['Value Counts'].items():
            print(f"    {value}: {count}")

        kind = details['Type']
        print(f"  Type: {kind}")
        if kind != 'Numeric':
            print("  Percentage Breakdown:")
            for value, share in details['Percentage Breakdown'].items():
                print(f"    {value}: {share}")
        else:
            print(f"  Mean: {details['Mean']:.2f}")
            print(f"  Standard Deviation: {details['Standard Deviation']:.2f}")

        print(divider)


### Cleaning Columns
In the Boolean features, 1 means "yes" and 2 means "no". Values 97-99 denote missing data.

In the boolean categorical columns, 97-99 mean unknown, so we set those values to NaN.

For PREGNANT, all males should have NaN. Women are not allowed to have an unknown pregnancy status, so we set those values to -1 to mark them as invalid. Rows that contain -1 will be dropped later.

DATE_DIED gets remapped to a died/survived flag (renamed to DEATH later on). If a person has an invalid date of death (9999-99-99), they did not die (coded 2); otherwise they died (coded 1).

PNEUMONIA is an output column. It cannot have unknown values. Mark those as -1 to be dropped later. 

Dataset specified that if CLASIFFICATION_FINAL was higher than 3, it meant a patient did not have COVID.

In [None]:
# Yes/no columns in which the codes 97, 98 and 99 are treated as "unknown"
columns_to_nan = [
    "ASTHMA", "CARDIOVASCULAR", "COPD", "DIABETES", "HIPERTENSION", "ICU",
    "INTUBED", "INMSUPR", "OBESITY", "OTHER_DISEASE", "RENAL_CHRONIC", "TOBACCO"
]
unknown_to_nan = dict.fromkeys((97, 98, 99), np.nan)  # {97: nan, 98: nan, 99: nan}
df[columns_to_nan] = df[columns_to_nan].replace(unknown_to_nan)

# PREGNANT: not applicable for males (SEX == 2) -> NaN
df.loc[df["SEX"] == 2, "PREGNANT"] = np.nan
# PREGNANT: a female (SEX == 1) with code 97/98 is marked -1 so the row can be dropped later
female_with_unknown_pregnancy = (df["SEX"] == 1) & df["PREGNANT"].isin([97, 98])
df.loc[female_with_unknown_pregnancy, "PREGNANT"] = -1

# DATE_DIED: the placeholder date "9999-99-99" becomes 2, every other value becomes 1
df["DATE_DIED"] = df["DATE_DIED"].map(lambda died_on: 1 if died_on != "9999-99-99" else 2)

# PNEUMONIA: code 99 is marked -1 so the row can be dropped later
df["PNEUMONIA"] = df["PNEUMONIA"].replace({99: -1})

# CLASIFFICATION_FINAL: keep codes 1-3 as they are, collapse everything else to 0
df["CLASIFFICATION_FINAL"] = df["CLASIFFICATION_FINAL"].map(
    lambda code: code if code in [1, 2, 3] else 0
)

### Removing any rows where we had invalid values


In [None]:
# A row is invalid when any of its cells holds the -1 marker; keep only the other rows.
row_has_invalid_marker = df.isin([-1]).any(axis=1)
df = df[~row_has_invalid_marker]

### Creating new target column 

This turns the CLASIFFICATION_FINAL into a binary classification of COVID-19 PRESENCE

In [None]:
# Create 'COVID-19 PRESENCE' from 'CLASIFFICATION_FINAL' (renamed to
# 'COVID-19 SEVERITY' further down): codes 1-3 -> 1, code 0 -> 2.
presence_by_classification = {0: 2, 1: 1, 2: 1, 3: 1}
df["COVID-19 PRESENCE"] = df["CLASIFFICATION_FINAL"].map(presence_by_classification)

### Renaming Columns for Clarity

In [None]:
# Give three columns clearer names
column_renames = {
    "HIPERTENSION": "HYPERTENSION",
    "CLASIFFICATION_FINAL": "COVID-19 SEVERITY",
    "DATE_DIED": "DEATH",
}
df = df.rename(columns=column_renames)

# Output columns, which are placed after all other columns
output_columns = ["COVID-19 PRESENCE", "COVID-19 SEVERITY", "DEATH", "PNEUMONIA"]

# Every remaining column, in alphabetical order
sorted_columns = sorted(name for name in df.columns if name not in output_columns)

# Final layout: alphabetical non-output columns first, output columns last
new_column_order = [*sorted_columns, *output_columns]
df = df[new_column_order]

### Data Summarization

We view the data from the following subgroups:

Original Data

All Male Data

All Female Data

Non-pregnant Females

Pregnant Females

In [None]:
# Summarize every column of the full cleaned DataFrame and print the report.
all_data_summary = summarize_dataframe(df)
printSummary(all_data_summary)

In [None]:
# Subgroup: rows with SEX == 2 (male)
male_df = df.loc[df["SEX"] == 2]
male_summary = summarize_dataframe(male_df)
printSummary(male_summary)

In [None]:
# Subgroup: rows with SEX == 1 (female)
female_df = df.loc[df["SEX"] == 1]
female_summary = summarize_dataframe(female_df)
printSummary(female_summary)

In [None]:
# Subgroup: females with PREGNANT == 1
pregnant_female_df = female_df.loc[female_df["PREGNANT"] == 1]
pregnant_female_summary = summarize_dataframe(pregnant_female_df)
printSummary(pregnant_female_summary)

In [None]:
# Subgroup: females with PREGNANT == 2
not_pregnant_female_df = female_df.loc[female_df["PREGNANT"] == 2]
not_pregnant_female_summary = summarize_dataframe(not_pregnant_female_df)
printSummary(not_pregnant_female_summary)

In [None]:
# Save the cleaned data. index=False leaves the DataFrame index out of the file.
df.to_csv("Cleaned Covid Data.csv", index=False)

# Converting clinical data CSV to textual prompts

LLMs need to take in textual prompts. Here we generate those in a human-readable way. The hope is that this helps the LLM better understand the data we pass it and allows it to make better predictions.

In [None]:
# Binary clinical columns in the order they appear in the prompt. Each entry is
# (column, phrase when the value is 1, phrase when it is 2, phrase otherwise).
_CONDITION_PHRASES = (
    ("ASTHMA",
     "has a history of asthma",
     "does not have a history of asthma",
     "has an unknown history of asthma"),
    ("CARDIOVASCULAR",
     "has cardiovascular issues",
     "does not have cardiovascular issues",
     "has an unknown cardiovascular status"),
    ("COPD",
     "has COPD",
     "does not have COPD",
     "has an unknown COPD status"),
    ("DIABETES",
     "has diabetes",
     "does not have diabetes",
     "has an unknown diabetes status"),
    ("HYPERTENSION",
     "has hypertension",
     "does not have hypertension",
     "has an unknown hypertension status"),
    ("ICU",
     "was admitted to the ICU",
     "was not admitted to the ICU",
     "has an unknown ICU admission status"),
    ("INTUBED",
     "was intubated",
     "was not intubated",
     "has an unknown intubation status"),
    ("INMSUPR",
     "is immunosuppressed",
     "is not immunosuppressed",
     "has an unknown immunosuppression status"),
    ("OBESITY",
     "is obese",
     "is not obese",
     "has an unknown obesity status"),
    ("OTHER_DISEASE",
     "has other underlying diseases",
     "does not have other underlying diseases",
     "has an unknown status regarding other diseases"),
    ("RENAL_CHRONIC",
     "has chronic renal issues",
     "does not have chronic renal issues",
     "has an unknown chronic renal status"),
    ("TOBACCO",
     "uses tobacco",
     "does not use tobacco",
     "has an unknown tobacco use status"),
)

# Facility level wording for the USMR code.
_USMR_LEVELS = {1: "first-level", 2: "second-level", 3: "third-level"}


def _format_code(value):
    """Render a cell value as text, dropping a spurious trailing ".0".

    Rows produced by ``DataFrame.apply(axis=1)`` are upcast to float as soon as
    any column holds NaN, so an age of 45 would otherwise print as "45.0".
    """
    try:
        as_float = float(value)
    except (TypeError, ValueError):
        return str(value)
    # is_integer() is False for NaN and infinities, so those print unchanged.
    if as_float.is_integer():
        return str(int(as_float))
    return str(value)


def create_prompt(row, target=None):
    """
    Generate a human-readable paragraph-style prompt from a single patient datapoint.

    The first sentence covers age, sex / pregnancy, patient type, medical unit
    and facility level; the second lists the clinical yes/no columns. Only
    columns present in ``row`` are mentioned. Codes other than 1 or 2
    (including NaN) are described as unknown.

    Args:
      row (pd.Series): A row from the DataFrame representing patient data.
      target (str): The prediction target column (one of: "COVID-19 PRESENCE",
                    "COVID-19 SEVERITY", "DEATH", "PNEUMONIA"). When given, the
                    other output columns are excluded from consideration. No
                    output column is ever written into the text, so this
                    argument currently does not change the returned prompt.

    Returns:
      str: A formatted natural language prompt as a paragraph.
    """
    output_columns = {"COVID-19 PRESENCE", "COVID-19 SEVERITY", "DEATH", "PNEUMONIA"}

    # Determine which columns may be used in the prompt
    if target is not None:
        include_columns = set(row.index) - (output_columns - {target})
    else:
        include_columns = set(row.index)

    # ---- Sentence 1: Demographics & Treatment Information ----
    basic_info = []
    if "AGE" in include_columns:
        basic_info.append(f"{_format_code(row['AGE'])} years old")

    if "SEX" in include_columns:
        if row["SEX"] == 1:
            # Pregnancy status is only described for females.
            if "PREGNANT" not in include_columns or row["PREGNANT"] == 2:
                basic_info.append("female")
            elif row["PREGNANT"] == 1:
                basic_info.append("pregnant female")
            else:
                basic_info.append("female (pregnancy status unknown)")
        elif row["SEX"] == 2:
            basic_info.append("male")
        else:
            basic_info.append("of unknown gender")

    if "PATIENT_TYPE" in include_columns:
        basic_info.append(
            f"categorized as patient type {_format_code(row['PATIENT_TYPE'])}"
        )

    treatment_info = []
    if "MEDICAL_UNIT" in include_columns:
        treatment_info.append(
            f"treated at medical unit number {_format_code(row['MEDICAL_UNIT'])}"
        )

    # Facility level.
    # NOTE(review): the public dataset appears to spell this column "USMER";
    # both spellings are accepted so the information is not silently skipped.
    # Confirm against the CSV header.
    usmr_column = next(
        (name for name in ("USMR", "USMER") if name in include_columns), None
    )
    if usmr_column is not None:
        usmr_level = _USMR_LEVELS.get(row[usmr_column])
        if usmr_level is not None:
            treatment_info.append(f"in a {usmr_level} facility")
        else:
            treatment_info.append("in a facility of unknown level")

    if basic_info:
        sentence1 = "The patient is " + " ".join(basic_info)
        if treatment_info:
            sentence1 += " who was " + " and ".join(treatment_info)
        sentence1 += "."
    elif treatment_info:
        # No demographic columns at all: avoid the dangling "The patient is ".
        sentence1 = "The patient was " + " and ".join(treatment_info) + "."
    else:
        sentence1 = ""

    # ---- Sentence 2: Clinical Conditions ----
    conditions = []
    for column, yes_phrase, no_phrase, unknown_phrase in _CONDITION_PHRASES:
        if column not in include_columns:
            continue
        value = row[column]
        if value == 1:
            conditions.append(yes_phrase)
        elif value == 2:
            conditions.append(no_phrase)
        else:
            conditions.append(unknown_phrase)

    if len(conditions) > 1:
        sentence2 = (
            "The patient " + ", ".join(conditions[:-1]) + ", and " + conditions[-1] + "."
        )
    elif conditions:
        sentence2 = "The patient " + conditions[0] + "."
    else:
        sentence2 = ""

    # Combine the non-empty sentences; this avoids a trailing space when
    # there are no condition phrases.
    return " ".join(sentence for sentence in (sentence1, sentence2) if sentence)


def generate_target_prompt(target):
    """
    Generate a prediction instruction prompt for a given target output column.

    The label meanings follow the encoding used by the cleaning steps:
    "COVID-19 PRESENCE" is 1 when the classification code was 1-3 and 2
    otherwise, "DEATH" is 2 for the placeholder date and 1 otherwise, and
    "PNEUMONIA" keeps the dataset's convention of 1 = yes, 2 = no.

    Args:
      target (str): The name of the output column.

    Returns:
      str: A string that instructs the LLM on what to predict.

    Raises:
      ValueError: If ``target`` is not one of the four output columns.
    """
    prompts = {
        "COVID-19 PRESENCE": (
            "Please provide your prediction for COVID-19 presence, where 1 indicates "
            "COVID-19 positive and 2 indicates COVID-19 negative."
        ),
        "COVID-19 SEVERITY": (
            "Please provide your prediction for COVID-19 severity, where 0 indicates "
            "no COVID-19 and 1, 2, or 3 indicate increasing levels of symptom severity."
        ),
        "DEATH": (
            "Please provide your prediction for death outcome, where 2 indicates the patient "
            "is alive and 1 indicates the patient is deceased."
        ),
        "PNEUMONIA": (
            "Please provide your prediction for pneumonia, where 1 indicates pneumonia is present "
            "and 2 indicates pneumonia is not present."
        ),
    }
    # The isinstance guard keeps unhashable inputs from raising TypeError in the lookup.
    prompt = prompts.get(target) if isinstance(target, str) else None
    if prompt is None:
        # ValueError is a subclass of Exception, so existing handlers still catch it.
        raise ValueError(f"Invalid Target! Found: {target}")
    return prompt

def add_target_prompts(df, targets=None):
    """
    For each target output column, add a new column to the DataFrame with the corresponding
    prediction prompt.

    The new column is named "<target> PROMPT" and holds the same instruction
    text in every row. The columns are added to ``df`` itself (in place); the
    same object is returned for convenience.

    Args:
      df (pd.DataFrame): The DataFrame to extend. It is modified in place.
      targets (list of str, optional): List of target column names.
        Defaults to ["COVID-19 PRESENCE", "COVID-19 SEVERITY", "DEATH", "PNEUMONIA"].

    Returns:
      pd.DataFrame: The same DataFrame, now with the additional prompt columns.
    """
    if targets is None:
        targets = ["COVID-19 PRESENCE", "COVID-19 SEVERITY", "DEATH", "PNEUMONIA"]

    for target in targets:
        # A scalar string is broadcast to every row of the new column.
        df[target + " PROMPT"] = generate_target_prompt(target)

    return df


In [None]:
def create_prompt_dataframe(df):
    """
    Create a new DataFrame with columns:
    - 'SEX' (original)
    - 'PREGNANT' (original)
    - 'PATIENT DATAPOINT PROMPT' (natural language patient summary from create_prompt)
    - Output columns ('COVID-19 PRESENCE', 'COVID-19 SEVERITY', 'DEATH', 'PNEUMONIA'),
      for those that exist in ``df``

    The input DataFrame is left unchanged.

    Args:
      df (pd.DataFrame): Original patient dataset.

    Returns:
      pd.DataFrame: Transformed DataFrame with formatted data.
    """
    output_columns = ["COVID-19 PRESENCE", "COVID-19 SEVERITY", "DEATH", "PNEUMONIA"]

    # Start from a copy of the core columns so nothing is written to `df`.
    final_df = df[["SEX", "PREGNANT"]].copy()

    # One textual prompt per row; the result shares df's index, so it aligns.
    final_df["PATIENT DATAPOINT PROMPT"] = df.apply(create_prompt, axis=1)

    # NOTE: target-specific prompt columns are currently not added.
    # final_df = add_target_prompts(final_df)

    # Add output columns at the end
    for col in output_columns:
        if col in df.columns:
            final_df[col] = df[col]

    return final_df

# Build the prompt table from the cleaned data and save it without the index column.
text_prompt_df = create_prompt_dataframe(df)
text_prompt_df.to_csv("Text Prompts.csv", index=False)


In [None]:
def split_data_dual(df_tab, df_prompt, dataset_type="Original", test_size=0.2, random_state=42):
    """
    Splits the original tabular DataFrame and its associated prompt DataFrame into training and test sets.
    The split is done on df_tab (which is assumed to have the same index as df_prompt) and then the same
    indices are used to extract rows from df_prompt.

    The test set is balanced: it holds the same number of males (SEX == 2) and females (SEX == 1),
    and together they make up `test_size` of all male/female rows. Rows with any other SEX value
    are left out of both sets.

    The training set is then filtered based on the dataset_type:
      - "Original": use the full training set.
      - "Gender-Balance": sample equal numbers of males and females.
      - "Male-Only": keep only male data points.
      - "Female-Only": keep only female data points.
      - "Non-pregnant Females": from females, select rows where PREGNANT == 2.
      - "Pregnant Females": from females, select rows where PREGNANT == 1.

    Args:
      df_tab (pd.DataFrame): Tabular data with 'SEX' and 'PREGNANT' columns.
      df_prompt (pd.DataFrame): Prompt data sharing df_tab's index.
      dataset_type (str): One of the training-set variants listed above.
      test_size (float): Fraction of the male/female rows to hold out for testing.
      random_state (int): Seed for the splits and for the "Gender-Balance" sampling.

    Returns:
      A tuple of tuples:
        ((train_tab, train_prompt), (test_tab, test_prompt))

    Raises:
      ValueError: If dataset_type is not recognised, or if test_size would leave a gender
        with no test rows or no training rows.
    """
    valid_types = (
        "Original", "Gender-Balance", "Male-Only", "Female-Only",
        "Non-pregnant Females", "Pregnant Females",
    )
    if dataset_type not in valid_types:
        raise ValueError("Invalid dataset_type provided. Choose from: 'Original', 'Gender-Balance', 'Male-Only', 'Female-Only', 'Non-pregnant Females', 'Pregnant Females'.")

    # Split df_tab by gender so each gender can contribute the same number of test rows.
    male_data = df_tab[df_tab['SEX'] == 2]
    female_data = df_tab[df_tab['SEX'] == 1]

    # Half of the requested test rows come from each gender. Using an absolute
    # count (rather than the same fraction of each group) keeps the test set at
    # exactly 50/50 even when the genders are not equally represented.
    total_rows = len(male_data) + len(female_data)
    per_gender_test = int(round(test_size * total_rows / 2))
    smallest_group = min(len(male_data), len(female_data))
    if per_gender_test < 1 or per_gender_test >= smallest_group:
        raise ValueError(
            f"test_size={test_size} requires {per_gender_test} test rows per gender, "
            f"but the smaller gender group has only {smallest_group} rows."
        )

    # An integer test_size tells train_test_split the absolute number of test rows.
    male_train, male_test = train_test_split(
        male_data, test_size=per_gender_test, random_state=random_state
    )
    female_train, female_test = train_test_split(
        female_data, test_size=per_gender_test, random_state=random_state
    )

    # Combine the test sets to form a balanced test set.
    test_df_tab = pd.concat([male_test, female_test])

    # Filter the training set based on dataset_type.
    if dataset_type == "Original":
        filtered_train_tab = pd.concat([male_train, female_train])
    elif dataset_type == "Gender-Balance":
        min_count = min(len(male_train), len(female_train))
        filtered_train_tab = pd.concat([
            male_train.sample(n=min_count, random_state=random_state),
            female_train.sample(n=min_count, random_state=random_state),
        ])
    elif dataset_type == "Male-Only":
        filtered_train_tab = male_train
    elif dataset_type == "Female-Only":
        filtered_train_tab = female_train
    elif dataset_type == "Non-pregnant Females":
        filtered_train_tab = female_train[female_train['PREGNANT'] == 2]
    else:  # "Pregnant Females" (validated above)
        filtered_train_tab = female_train[female_train['PREGNANT'] == 1]

    # Select the corresponding rows from the prompt DataFrame via the shared index.
    train_df_prompt = df_prompt.loc[filtered_train_tab.index]
    test_df_prompt = df_prompt.loc[test_df_tab.index]

    return (filtered_train_tab, train_df_prompt), (test_df_tab, test_df_prompt)

# Example usage:
# Reload the saved tabular data and the matching prompt table from disk.
df_tabular = pd.read_csv("Cleaned Covid Data.csv")
df_prompts = pd.read_csv("Text Prompts.csv")

# For example, train on the "Pregnant Females" subset only:
(train_tab, train_prompt), (test_tab, test_prompt) = split_data_dual(
    df_tabular,
    df_prompts,
    dataset_type="Pregnant Females",
    test_size=0.2,
)

In [None]:
# Show which columns the training prompt DataFrame contains.
train_prompt.columns

# Considering INTUBED as Prediction Column 

Due to INTUBED having a high number of NaN values, we did not want to consider it as both a potential feature and prediction column. In order to do so, we must drop all rows where INTUBED is NaN. This greatly reduces our dataset size. Additionally, it removes all data points where the patient returned home (PATIENT_TYPE=1), leaving only data points where a patient was hospitalized (PATIENT_TYPE=2).

In [None]:
# # List of output columns that should be at the end
# output_columns = ["COVID-19 PRESENCE", "COVID-19 SEVERITY", "DATE_DIED", "INTUBED", "PNEUMONIA"]

# # Get all columns sorted alphabetically, excluding the output columns
# sorted_columns = sorted([col for col in df.columns if col not in output_columns])

# # Create the new column order with output columns at the end
# new_column_order = sorted_columns + output_columns

# # Reorder the DataFrame
# df = df[new_column_order]
# df = df.dropna(subset=['INTUBED'])

### Data Summarization

We view the data from the following subgroups:

Original Data

All Male Data

All Female Data

Non-pregnant Females

Pregnant Females

In [None]:
# all_data_summary = summarize_dataframe(df)
# printSummary(all_data_summary)

In [None]:
# male_df = df[df["SEX"] == 2]
# male_summary = summarize_dataframe(male_df)
# printSummary(male_summary)

In [None]:
# female_df = df[df["SEX"] == 1]
# female_summary = summarize_dataframe(female_df)
# printSummary(female_summary)

In [None]:
# pregnant_female_df = female_df[female_df["PREGNANT"] == 1]
# pregnant_female_summary = summarize_dataframe(pregnant_female_df)
# printSummary(pregnant_female_summary)

In [None]:
# not_pregnant_female_df = female_df[female_df["PREGNANT"] == 2]
# not_pregnant_female_summary = summarize_dataframe(not_pregnant_female_df)
# printSummary(not_pregnant_female_summary)

In [None]:
# # Save the Cleaned Data
# df.to_csv("Cleaned Covid Data (Intubed as Prediction Col.).csv", index=False)