In [None]:
import pandas as pd
import numpy as np
import missingno as msno
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format='retina'

 ___
 # Load clean data

In [None]:
# Input locations. The values shown are placeholders; set them before running.
data_date = "XXX"
raw_data_path = f"XXX"
data_path = f"XXX"
dict_path = f"XXX"
col_meta_path = "XXX"

# low_memory=False makes pandas read each file in one pass instead of chunks.
data_df_raw = pd.read_csv(raw_data_path, low_memory=False)
data_df = pd.read_csv(data_path, low_memory=False)
# Lookup table queried by explain_col() via its "Variable / Field Name" column.
dict_df = pd.read_csv(dict_path)
# Per-column metadata; the "column" and "data_type" fields are used further
# down to split features into numerical and categorical groups.
col_meta = pd.read_excel(col_meta_path, sheet_name="columns_metadata")

In [None]:
def explain_col(col):
    """
    Print the rows of `dict_df` whose "Variable / Field Name" equals `col`.
    """
    is_match = dict_df["Variable / Field Name"] == col
    print(dict_df.loc[is_match])
    
def data_to_use(df):
    """
    Return `df` without the columns that must not feed predictions/clustering.

    A column is excluded when its name contains 'label' or 'record_id';
    all remaining columns are returned in their original order.
    """
    col_names = df.columns.str
    excluded = col_names.contains('label') | col_names.contains('record_id')
    return df.loc[:, ~excluded]

In [None]:
# Display the frame read from data_path as the cell output.
data_df

___
# Impute missing data

In [None]:
# Split the feature names by the data type recorded in the column metadata.
num_cols = col_meta.loc[col_meta["data_type"] == "numerical", "column"].values
cat_cols = col_meta.loc[col_meta["data_type"] == "categorical", "column"].values

In [None]:
# Column names whose metadata data_type is "numerical".
num_cols

In [None]:
# Column names whose metadata data_type is "categorical".
cat_cols

In [None]:
# Make a copy for imputation and remove record_id and _label columns
df_imputed = data_to_use(data_df).copy()

# Filter cat_cols and num_cols only to include those that exist in data_df
cat_cols_present = [col for col in cat_cols if col in df_imputed.columns]
num_cols_present = [col for col in num_cols if col in df_imputed.columns]

# Create imputers (SimpleImputer treats NaN as the missing marker by default)
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Numerical columns: mean imputation.
# Columns without a single observed value are skipped: SimpleImputer drops
# such columns from its output, which would break the column assignment.
num_cols_imputable = [col for col in num_cols_present if df_imputed[col].notna().any()]
if num_cols_imputable:
    df_imputed[num_cols_imputable] = num_imputer.fit_transform(df_imputed[num_cols_imputable])

# Categorical columns: most-frequent imputation on string-coded values.
if cat_cols_present:
    cat_observed = df_imputed[cat_cols_present].notna()

    # Treat coded categoricals as strings, but keep missing cells as NaN.
    # A plain astype(str) turns NaN into the literal 'nan', which the imputer
    # sees as a regular category, so nothing would actually be imputed.
    df_imputed[cat_cols_present] = (
        df_imputed[cat_cols_present].astype(str).where(cat_observed, np.nan)
    )

    # Same guard as for the numerical columns: skip entirely empty columns.
    cat_cols_imputable = [col for col in cat_cols_present if cat_observed[col].any()]
    if cat_cols_imputable:
        df_imputed[cat_cols_imputable] = cat_imputer.fit_transform(df_imputed[cat_cols_imputable])

    # Restore categorical columns as string
    df_imputed[cat_cols_present] = df_imputed[cat_cols_present].astype(str)

Visualize effects of imputed values:

In [None]:
# Plot gaps before imputation
# Missingness matrix of data_df_raw; the figure is written to figures/ at 300 dpi
# before being shown.
msno.matrix(data_df_raw)
plt.title("Missing Data (Raw Data)")
plt.savefig("figures/2_data_gaps_overview_raw_data.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Plot gaps before imputation
# Missingness matrix of data_df (the frame the imputation step starts from);
# the figure is written to figures/ at 300 dpi before being shown.
msno.matrix(data_df)
plt.title("Missing Data (Clean Data Before Imputation)")
plt.savefig("figures/2_data_gaps_overview_clean.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Rank features by how many of their values had to be imputed.
# A value counts as imputed when it is missing (NaN) in data_df.
missing_counts = (
    data_df.isna()
    .sum()                                   # missing values per column
    .loc[lambda counts: counts > 0]          # keep only columns with gaps
    .sort_values(ascending=False)            # most gaps first
)

# The 20 columns with the most gaps
top_missing = missing_counts.head(20)

# Horizontal bar chart of those columns
top_missing.plot(kind='barh', figsize=(10, 6), color='salmon')
plt.gca().invert_yaxis()  # Highest on top
plt.title("Top 20 Columns with Most Imputed Values")
plt.xlabel("Number of Missing (Imputed) Values")
plt.tight_layout()

plt.savefig("figures/2_top_imputed_features.png", dpi=300, bbox_inches="tight")

plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

def plot_imputed(feature, feature_type='cat', save=False, fontsize=14, bins=30, figsize=(10, 5)):
    """
    Visualizes original vs imputed values for a given feature.

    Reads two notebook-level frames: `data_df`, whose missing values mark the
    rows that were imputed, and `df_imputed`, which supplies the final values.

    Parameters:
        feature (str): The feature column to plot
        feature_type (str): 'cat' for categorical, 'num' for numerical
        save (bool): Whether to save the figure to
            figures/2_imputation_overview_<feature>.png
        fontsize (int): Font size for all labels
        bins (int): Number of bins for histogram (numerical only)
        figsize (tuple): Figure size passed to matplotlib/pandas

    Raises:
        ValueError: If feature_type is neither 'cat' nor 'num'.
    """
    # Validate first so a bad feature_type fails before any data access or plotting
    if feature_type not in ('cat', 'num'):
        raise ValueError("feature_type must be either 'cat' or 'num'")

    # Mask for missing (i.e., imputed) rows in the original data
    imputed_mask = data_df[feature].isna()

    # Final values from imputed data, split by whether the row was imputed
    final_vals = df_imputed[feature]
    original_vals = final_vals[~imputed_mask]
    imputed_vals = final_vals[imputed_mask]

    # Categorical feature: stacked bar
    if feature_type == 'cat':
        # Count occurrences
        vc_original = original_vals.dropna().astype(str).value_counts()
        vc_imputed = imputed_vals.dropna().astype(str).value_counts()

        # Combine into DataFrame; categories absent from one source get a count of 0
        categories = sorted(set(vc_original.index).union(vc_imputed.index))
        plot_df = pd.DataFrame(index=categories)
        plot_df['Original'] = vc_original
        plot_df['Imputed'] = vc_imputed
        plot_df = plot_df.fillna(0)

        # Plot
        ax = plot_df[['Original', 'Imputed']].plot(
            kind='bar',
            stacked=True,
            color=['blue', 'orange'],
            figsize=figsize
        )
        ax.set_xlabel("Category", fontsize=fontsize)
        ax.set_ylabel("Number of Samples", fontsize=fontsize)
        ax.set_title(f"Final Distribution of '{feature}' (Original vs. Imputed)", fontsize=fontsize)
        ax.tick_params(axis='both', labelsize=fontsize)
        ax.legend(title="Source", fontsize=fontsize, title_fontsize=fontsize)
        plt.xticks(rotation=45, ha='right')

    # Numerical feature: overlaid histogram
    else:
        plt.figure(figsize=figsize)

        # Histogram of original values, return bin edges
        _, bin_edges, _ = plt.hist(
            original_vals, bins=bins, color='blue', alpha=0.7, label='Original'
        )

        # Count of imputed values (should only be a single value (mean) for numerical categories)
        imputed_present = imputed_vals.dropna()
        num_imputed = imputed_present.shape[0]
        unique_imputed_vals = imputed_present.unique()

        if len(unique_imputed_vals) == 1:
            imputed_val = unique_imputed_vals[0]

            # Estimate bin width
            bin_width = bin_edges[1] - bin_edges[0]

            # Add a bar representing the imputed value
            plt.bar(
                x=imputed_val,
                height=num_imputed,
                width=bin_width,
                color='orange',
                alpha=0.8,
                align='center',
                label='Imputed'
            )
        else:
            # Multiple imputed values – fallback to histogram.
            # Reuse the bin edges of the original histogram so both sets of
            # bars have the same width and their heights are comparable.
            # NOTE: imputed values outside the original value range would not
            # be drawn; mean imputation always stays inside that range.
            plt.hist(imputed_vals, bins=bin_edges, color='orange', alpha=0.5, label='Imputed')

        plt.xlabel(feature, fontsize=fontsize)
        plt.ylabel("Count", fontsize=fontsize)
        plt.title(f"Final Distribution of '{feature}' (Original vs. Imputed)", fontsize=fontsize)
        plt.legend(title="Source", fontsize=fontsize, title_fontsize=fontsize)
        plt.xticks(fontsize=fontsize)
        plt.yticks(fontsize=fontsize)

    plt.tight_layout()
    if save:
        plt.savefig(f"figures/2_imputation_overview_{feature}.png", dpi=300, bbox_inches="tight")

    plt.show()

In [None]:
# Categorical feature: stacked bars of original vs. imputed counts (not saved)
feature = 'rodent_touch'
plot_imputed(feature, feature_type='cat', save=False)

In [None]:
# Categorical feature: stacked bars of original vs. imputed counts (not saved)
feature = 'funeral_body'
plot_imputed(feature, feature_type='cat', save=False)

In [None]:
# Categorical feature; the figure is also written to figures/
feature = 'pregnancy_crf'
plot_imputed(feature, feature_type='cat', save=True)

In [None]:
# Numerical feature: histogram with the imputed values overlaid; figure is saved
feature = 'dbp'
plot_imputed(feature=feature, feature_type='num', save=True)

### Save data

In [None]:
# Add pathogen label and record_id columns back.
# data_to_use() removes every column whose name contains 'label' or
# 'record_id', so restore exactly the columns that are missing from
# df_imputed rather than only those ending in '_label'; otherwise a column
# such as 'label_x' would silently be lost from the saved file.
dropped_cols = [col for col in data_df.columns if col not in df_imputed.columns]

# Same ordering as before: label columns first, record_id last
label_cols = [col for col in dropped_cols if col != 'record_id']
cols_to_copy = label_cols + [col for col in dropped_cols if col == 'record_id']

# Nothing to restore when this cell is re-run on an already completed frame
if cols_to_copy:
    # .values copies by row position; df_imputed has the same rows as data_df
    df_imputed[cols_to_copy] = data_df[cols_to_copy].values

df_imputed.to_csv("XXX", index=False)