# COVID-19 Data Cleaning Script

### Imports

In [None]:
import pandas as pd
import numpy as np

### Loading Data

In [None]:
# Load the raw dataset into a DataFrame. The path is relative, so
# "Covid Data.csv" must be in the notebook's working directory.
df = pd.read_csv("Covid Data.csv")

### Print functions to understand data

In [None]:
def printColumnValueCounts(df):
    """
    Print the value counts of every column in `df`.

    For each column a header line is printed, followed by the column's
    value counts (NaN is counted as its own value) and a 50-dash separator.

    Returns:
      None; the output goes to stdout.
    """
    separator = "-" * 50  # Visual divider between columns
    for name in df.columns:
        frequencies = df[name].value_counts(dropna=False)
        print(f"Value counts for '{name}':")
        print(frequencies)
        print(separator)
        
def summarize_dataframe(df, num_unique_threshold=20):
    """
    Build a per-column summary of a DataFrame.

    Every column gets its raw value counts, NaN included (a NaN value is
    stored under the string key 'NaN').

    A column is labelled 'Numeric' when its dtype is numeric and it has more
    than `num_unique_threshold` distinct non-null values; its mean and
    standard deviation (NaNs skipped) are then reported. Any other column is
    labelled 'Categorical' and gets the percentage share of each value
    (NaN included), rounded to two decimals and formatted like '25.0%'.

    Parameters:
      df: the DataFrame to summarize.
      num_unique_threshold: number of distinct non-null values a numeric
        column must exceed to be summarized as 'Numeric'.

    Returns:
      A dictionary keyed by column name. Each value is a dictionary with the
      keys 'Value Counts' and 'Type', plus either 'Mean' and
      'Standard Deviation' (Numeric) or 'Percentage Breakdown' (Categorical).
    """
    def _label(key):
        # NaN cannot be looked up reliably as a dict key, so use a readable string
        return 'NaN' if pd.isna(key) else key

    summary = {}

    for col in df.columns:
        series = df[col]

        # Frequencies of every value, with NaN counted as its own value
        counts = series.value_counts(dropna=False)
        col_summary = {
            'Value Counts': {_label(key): n for key, n in counts.items()},
        }

        # Share of each value as a percentage of all rows
        shares = (counts / counts.sum() * 100).round(2)
        breakdown = {_label(key): f"{pct}%" for key, pct in shares.items()}

        # Numeric dtype with many distinct values -> numeric; otherwise categorical
        distinct = series.nunique(dropna=True)
        is_numeric = pd.api.types.is_numeric_dtype(series)
        if is_numeric and distinct > num_unique_threshold:
            col_summary['Type'] = 'Numeric'
            col_summary['Mean'] = series.mean(skipna=True)
            col_summary['Standard Deviation'] = series.std(skipna=True)
        else:
            col_summary['Type'] = 'Categorical'
            col_summary['Percentage Breakdown'] = breakdown

        summary[col] = col_summary

    return summary

def printSummary(summaryDict):
    """
    Print a summary dictionary (as built by `summarize_dataframe`) as text.

    For each column this prints its value counts and its type, then either
    the mean and standard deviation to two decimals (type 'Numeric') or the
    percentage breakdown (any other type), followed by a 50-dash separator.

    Returns:
      None; the output goes to stdout.
    """
    divider = "-" * 50
    for column, info in summaryDict.items():
        print(f"Summary for '{column}':")

        print("  Value Counts:")
        for value, n in info['Value Counts'].items():
            print(f"    {value}: {n}")

        kind = info['Type']
        print(f"  Type: {kind}")
        if kind != 'Numeric':
            print("  Percentage Breakdown:")
            for value, share in info['Percentage Breakdown'].items():
                print(f"    {value}: {share}")
        else:
            print(f"  Mean: {info['Mean']:.2f}")
            print(f"  Standard Deviation: {info['Standard Deviation']:.2f}")

        print(divider)

In [None]:
# Inspect the value distribution of every column (NaN included) before cleaning.
printColumnValueCounts(df)

### Cleaning Columns
In the Boolean features, 1 means "yes" and 2 means "no". Values of 97-99 indicate missing data.

In the Boolean categorical columns, 97-99 mean unknown, so we set those to NaN.

For PREGNANT, all males should have NaN. We do not allow women to have an unknown value (97 or 98), so we set those to -1 to mark them as invalid. Rows that contain -1 will be dropped later.

DATE_DIED gets remapped to a DIED flag (the column keeps the name DATE_DIED): 1 means the patient died and 2 means they did not. If a person has an invalid date of death (9999-99-99), they did not die.

PNEUMONIA is an output column. It cannot have unknown values (coded 99). We mark those as -1 so they are dropped later.

The dataset specifies that if CLASIFFICATION_FINAL is higher than 3, the patient did not have COVID. We set those values to 0.

In [None]:
# Columns where the codes 97, 98, and 99 mean "missing" and are set to NaN
columns_to_nan = [
    "ASTHMA", "CARDIOVASCULAR", "COPD", "DIABETES", "HIPERTENSION", "ICU",
    "INTUBED", "INMSUPR", "OBESITY", "OTHER_DISEASE", "RENAL_CHRONIC", "TOBACCO"
]
df[columns_to_nan] = df[columns_to_nan].replace({97: np.nan, 98: np.nan, 99: np.nan})

# Handle 'PREGNANT' column
# Cast to float up front so NaN can be stored explicitly; writing NaN into an
# integer column through .loc relies on silent upcasting, which newer pandas
# releases deprecate.
df["PREGNANT"] = df["PREGNANT"].astype("float64")
df.loc[df["SEX"] == 2, "PREGNANT"] = np.nan  # If male, set PREGNANT to NaN
# If female and 97 or 98 (unknown), flag the row with -1 so it is dropped later
df.loc[(df["SEX"] == 1) & (df["PREGNANT"].isin([97, 98])), "PREGNANT"] = -1

# Process 'DATE_DIED': the placeholder date "9999-99-99" becomes 2, any other
# value becomes 1.
# NOTE: this overwrites the column in place. Re-running this cell on the
# already-recoded column would turn every row into 1, because no value equals
# the placeholder string any more; reload the CSV before re-running.
df["DATE_DIED"] = df["DATE_DIED"].eq("9999-99-99").map({True: 2, False: 1})

# Process 'PNEUMONIA': flag the unknown code 99 with -1 so the row is dropped later
df["PNEUMONIA"] = df["PNEUMONIA"].replace(99, -1)

# Process 'CLASIFFICATION_FINAL': keep classes 1-3, collapse everything else to 0
df["CLASIFFICATION_FINAL"] = df["CLASIFFICATION_FINAL"].where(
    df["CLASIFFICATION_FINAL"].isin([1, 2, 3]), 0
)

In [None]:
# Re-inspect the value counts after recoding to check the NaN and -1 markers.
printColumnValueCounts(df)

### Removing any rows where we had invalid values


In [None]:
# Drop every row in which any column holds the -1 "invalid" marker. The
# explicit copy makes the result an independent DataFrame, so the column
# assignments in later cells never act on a slice of the unfiltered frame.
df = df[~df.isin([-1]).any(axis=1)].copy()

In [None]:
# Re-inspect the value counts after dropping the rows flagged with -1.
printColumnValueCounts(df)

### Creating new target column 

This turns the CLASIFFICATION_FINAL into a binary classification of COVID-19 PRESENCE

In [None]:
# Create 'COVID-19 PRESENCE' from 'CLASIFFICATION_FINAL' (that column is only
# renamed to 'COVID-19 SEVERITY' in a later cell). Classes 1-3 map to 1 and
# class 0 (the value the cleaning cell assigns to all other classes) maps to 2.
# A value missing from the mapping would become NaN.
df["COVID-19 PRESENCE"] = df["CLASIFFICATION_FINAL"].map({1: 1, 2: 1, 3: 1, 0: 2})

### Renaming Columns for Clarity

In [None]:
# Give two columns clearer names
df = df.rename(
    columns={
        "CLASIFFICATION_FINAL": "COVID-19 SEVERITY",
        "HIPERTENSION": "HYPERTENSION",
    }
)

# Output columns, kept in this order at the right-hand end of the DataFrame
output_columns = ["COVID-19 PRESENCE", "COVID-19 SEVERITY", "DATE_DIED", "PNEUMONIA"]

# Every other column, in alphabetical order
sorted_columns = sorted(col for col in df.columns if col not in output_columns)

# Remaining columns first, output columns last
new_column_order = [*sorted_columns, *output_columns]

# Apply the new column order
df = df.loc[:, new_column_order]

### Data Summarization

We view the data from the following subgroups:

Original Data

All Male Data

All Female Data

Non-pregnant Females

Pregnant Females

In [None]:
# Summarize every column of the full cleaned DataFrame and print the report.
all_data_summary = summarize_dataframe(df)
printSummary(all_data_summary)

In [None]:
# Male patients only (SEX == 2): summarize and print
male_df = df.loc[df["SEX"].eq(2)]
male_summary = summarize_dataframe(male_df)
printSummary(male_summary)

In [None]:
# Female patients only (SEX == 1): summarize and print
female_df = df.loc[df["SEX"].eq(1)]
female_summary = summarize_dataframe(female_df)
printSummary(female_summary)

In [None]:
# Female patients with PREGNANT == 1 ("yes"): summarize and print
pregnant_female_df = female_df.loc[female_df["PREGNANT"].eq(1)]
pregnant_female_summary = summarize_dataframe(pregnant_female_df)
printSummary(pregnant_female_summary)

In [None]:
# Female patients with PREGNANT == 2 ("no"): summarize and print
not_pregnant_female_df = female_df.loc[female_df["PREGNANT"].eq(2)]
not_pregnant_female_summary = summarize_dataframe(not_pregnant_female_df)
printSummary(not_pregnant_female_summary)

In [None]:
# # Save the Cleaned Data
# df.to_excel("Cleaned Covid Data.xlsx", index=False)

# Considering INTUBED as Prediction Column 

Because INTUBED has a high proportion of NaN values, we did not want to consider it as both a potential feature and a prediction column; here we treat it as a prediction column. In order to do so, we must drop all rows where INTUBED is NaN. This greatly reduces our dataset size. Additionally, it removes all data points where the patient returned home (PATIENT_TYPE=1), leaving only data points where a patient was hospitalized (PATIENT_TYPE=2).

In [None]:
# Output columns (now including INTUBED), kept in this order at the right-hand end
output_columns = ["COVID-19 PRESENCE", "COVID-19 SEVERITY", "DATE_DIED", "INTUBED", "PNEUMONIA"]

# Every other column, in alphabetical order
sorted_columns = sorted(col for col in df.columns if col not in output_columns)

# Remaining columns first, output columns last
new_column_order = [*sorted_columns, *output_columns]

# Apply the new column order, then discard rows with no INTUBED value
df = df.loc[:, new_column_order].dropna(subset=['INTUBED'])

### Data Summarization

We view the data from the following subgroups:

Original Data

All Male Data

All Female Data

Non-pregnant Females

Pregnant Females

In [None]:
# Summarize every column of the DataFrame that remains after rows with a
# missing INTUBED value were dropped, and print the report.
all_data_summary = summarize_dataframe(df)
printSummary(all_data_summary)

In [None]:
# Male patients only (SEX == 2): summarize and print
male_df = df.loc[df["SEX"].eq(2)]
male_summary = summarize_dataframe(male_df)
printSummary(male_summary)

In [None]:
# Female patients only (SEX == 1): summarize and print
female_df = df.loc[df["SEX"].eq(1)]
female_summary = summarize_dataframe(female_df)
printSummary(female_summary)

In [None]:
# Female patients with PREGNANT == 1 ("yes"): summarize and print
pregnant_female_df = female_df.loc[female_df["PREGNANT"].eq(1)]
pregnant_female_summary = summarize_dataframe(pregnant_female_df)
printSummary(pregnant_female_summary)

In [None]:
# Female patients with PREGNANT == 2 ("no"): summarize and print
not_pregnant_female_df = female_df.loc[female_df["PREGNANT"].eq(2)]
not_pregnant_female_summary = summarize_dataframe(not_pregnant_female_df)
printSummary(not_pregnant_female_summary)

In [None]:
# # Save the Cleaned Data
# df.to_excel("Cleaned Covid Data (Intubed as Prediction Col.).xlsx", index=False)