In [None]:
#%pip install defusedxml
#%pip install openpyxl
#%pip install scikit-bio
#%pip install scikit-learn
import sys
import numpy as np
import pandas as pd
from skbio.stats.composition import clr, multiplicative_replacement

### Join Reference Columns with Validation Data ###

In [None]:
# NB: File can be found at /manitou/pmg/projects/korem_lab/Data/Freedberg_inulin_trial/validation_data/metadata
# Load the validation-cohort reference workbook into a DataFrame.
val_ref = pd.read_excel("/Users/mcarrion/Korem_Lab/ICU_Reference.xlsx")
# Preview the first rows to sanity-check the load.
val_ref.head()

In [None]:
#val_abundance = pd.read_csv("/Users/mcarrion/Korem_Lab/combined/filtered_seqtab_overlap.csv",index_col=0)
# Load the unified scrubbed count table; the first CSV column is used as the index.
# The following cell keeps only the rows whose name starts with "val_".
val_abundance = pd.read_csv("/Users/mcarrion/Korem_Lab/scrub_results/unified_scrubbed_counts.csv",index_col=0)
# Preview the first rows to sanity-check the load.
val_abundance.head()

In [None]:
# Prepare the validation abundance table (no pivot is performed here).
# Reset index to make sample names (SB1, SB2, ...) a column; reset_index calls the
# new column "index" when the CSV index is unnamed, hence the rename.
val_abundance = val_abundance.reset_index().rename(columns={"index": "sample"})

# Keep only validation-cohort rows. Take an explicit copy so the assignments
# below write into an independent frame instead of a slice of the full table
# (avoids pandas' SettingWithCopyWarning / chained-assignment ambiguity).
val_abundance = val_abundance[val_abundance["sample"].str.startswith("val_")].copy()

# Drop the "val_" prefix: keep the token between the first and second underscore.
val_abundance["sample"] = val_abundance["sample"].str.split("_").str[1]

# Remove/Rename erroneous samples
val_abundance.loc[val_abundance["sample"] == "BS199", "sample"] = "SB199"
# NOTE(review): the excluded prefixes look like PCR / extraction / positive controls
# and unnamed entries -- confirm against the sequencing sample sheet.
val_abundance = val_abundance[~val_abundance["sample"].astype(str).str.startswith(("PCR", "Ext", "Pos","Unnamed"))]

val_abundance.head()

In [None]:
# Reference columns that carry sample identifiers (their names begin with "samplenumber")
sample_cols = list(filter(lambda name: name.startswith("samplenumber"), val_ref.columns))

# Wide -> long: one output row per (reference row, sample column) pair.
# Every non-sample column is carried along unchanged as an identifier column;
# the originating column name lands in "sample_type", its value in "sample".
val_ref_long = pd.melt(
    val_ref,
    id_vars=[name for name in val_ref.columns if name not in sample_cols],
    value_vars=sample_cols,
    var_name="sample_type",
    value_name="sample",
)

# Preview the reshaped reference table
val_ref_long.head()

In [None]:
# Left-join the long-format reference rows onto the abundance table via the
# shared "sample" key, so every abundance row is kept even without a match.
merged_df_val = pd.merge(val_abundance, val_ref_long, how="left", on="sample")

# Preview the first 100 merged rows
merged_df_val.head(100)

In [None]:
# Derive 0/1 outcome flags from whether the corresponding date is recorded,
# then rename the key column "sample" -> "Sample".
merged_df_val = merged_df_val.assign(
    death=merged_df_val["Date of death"].notna().astype(int),
    infection=merged_df_val["infectiondate1"].notna().astype(int),
).rename(columns={"sample": "Sample"})
merged_df_val.head()

### Join Reference Columns with Original Data ###

In [None]:
# orig_abundance = pd.read_csv("/Users/mcarrion/Korem_Lab/combined/filtered_seqtab_overlap.csv",index_col=0)
# Load the unified scrubbed count table and move the sample names from the index
# into a regular "sample" column.
orig_abundance = pd.read_csv("/Users/mcarrion/Korem_Lab/scrub_results/unified_scrubbed_counts.csv",index_col=0)
orig_abundance = orig_abundance.reset_index().rename(columns={"index": "sample"})
# Keep only original-cohort rows. The explicit copy makes the column assignments
# that follow (here and in later cells) operate on an independent frame rather
# than on a slice of the full table (avoids SettingWithCopyWarning).
orig_abundance = orig_abundance[orig_abundance["sample"].str.startswith("orig_")].copy()
orig_abundance["sample"] = orig_abundance["sample"].str.removeprefix("orig_")
orig_abundance.head()

In [None]:
#NB: File can be found at /manitou/pmg/projects/korem_lab/Data/Freedberg_inulin_trial/metadata
# Load the original-cohort outcomes spreadsheet; it is joined to the abundance
# table on its "id" column in the next cell.
orig_ref = pd.read_excel("/Users/mcarrion/Korem_Lab/orig_data_outcomes.xls")
# Preview the first rows to sanity-check the load.
orig_ref.head()

In [None]:
# Join key: the portion of "sample" that precedes the first dash
orig_abundance["id"] = orig_abundance["sample"].str.split("-", n=1).str[0]

# Inner-join with the outcomes table on "id" (rows without a match on either
# side are dropped), then trim "sample" at its first underscore
merged_df_orig = pd.merge(orig_abundance, orig_ref, how="inner", on="id")
merged_df_orig["sample"] = merged_df_orig["sample"].str.split("_", n=1).str[0]

# Preview the merged table
merged_df_orig.head()

In [None]:
# Save both merged tables to disk as pickle files.
# NOTE: pickle files should only ever be loaded back from trusted locations.
merged_df_orig.to_pickle("/Users/mcarrion/Korem_Lab/combined/merged_df_orig.pkl")
merged_df_val.to_pickle("/Users/mcarrion/Korem_Lab/combined/merged_df_val.pkl")

Add in SOFA Scores (& other feature engineering) - For original cohort

In [None]:
# NB: File can be found at /manitou/pmg/projects/korem_lab/Data/Freedberg_inulin_trial/metadata
sofa = pd.read_csv('/burg/pmg/users/mc5672/prev_analysis/data/metadata/sofascores.csv', index_col=0)

# Only the score column is joined. The SOFA table's index is matched against the
# 'sample' column; a left join keeps samples that have no score (value is NaN).
sofa_scores = sofa.loc[:, ['sofascore']]
merged_df_orig = pd.merge(
    merged_df_orig,
    sofa_scores,
    how='left',
    left_on='sample',
    right_index=True,
)
merged_df_orig.head()

In [None]:
# Sampling day: the integer that follows "-D" in the sample name
merged_df_orig['day'] = merged_df_orig['sample'].str.extract(r'-D(\d+)', expand=False).astype(int)

# Outcome flags: True when death_day falls in [day, day + N], both ends inclusive.
# Rows with a missing death_day compare as False.
merged_df_orig['death_next10'] = merged_df_orig['death_day'].between(
    merged_df_orig['day'], merged_df_orig['day'] + 10
)
merged_df_orig['death_next7'] = merged_df_orig['death_day'].between(
    merged_df_orig['day'], merged_df_orig['day'] + 7
)

In [None]:
# Do we need to balance data?
# Show the proportion of rows in each death_next10 class (normalize=True returns fractions, not counts).
merged_df_orig['death_next10'].value_counts(normalize=True)

In [None]:
# 'days_since_icu': the number that follows "-D" in the sample name, stored as float
# (the same extraction as the 'day' column, which is stored as int).
merged_df_orig['days_since_icu'] = merged_df_orig['sample'].str.extract(r'-D(\d+)').astype(float)


Add in SOFA scores (& other feature engineering) - For validation cohort

In [None]:
def compute_sofa(row):
    """Compute a SOFA-style severity score for one row of clinical values.

    Six components (respiratory, coagulation, liver, cardiovascular, CNS, renal)
    each contribute 0-4 points; the result is their sum. A component whose input
    is missing or unusable contributes 0 points.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` or ``dict``)
        Must support ``row[key]`` and ``row.get(key)``. Keys read:
        'Sofa_PaO2', 'Sofa_FiO2', 'Vent (Y=1, No=2)', 'Sofa_platelets',
        'Sofa_bilirubin', 'Pressors (Low dose=1, Medium=2, High=3)',
        'MAP < 70 (Yes=1, No=2)', 'GCS ' (note the trailing space) and
        'Sofa_creatinine'.

    Returns
    -------
    int
        Total score.
    """
    score = 0

    # Respiratory (PaO2/FiO2)
    # NOTE(review): FiO2 is divided by 100, i.e. it is assumed to be a percentage -- confirm.
    try:
        pfr = row['Sofa_PaO2'] / (row['Sofa_FiO2'] / 100)
        # The two highest tiers additionally require ventilation (coded as 1).
        if pfr < 100 and row.get('Vent (Y=1, No=2)') == 1:
            score += 4
        elif pfr < 200 and row.get('Vent (Y=1, No=2)') == 1:
            score += 3
        elif pfr < 300:
            score += 2
        elif pfr < 400:
            score += 1
    except (KeyError, TypeError, ValueError, ZeroDivisionError):
        # Best effort: absent columns, non-numeric values or FiO2 == 0 simply
        # add nothing. Only data-related errors are swallowed, so interrupts
        # and unrelated failures still propagate.
        pass

    # Coagulation
    plts = row.get('Sofa_platelets')
    if pd.notnull(plts):
        if plts < 20:
            score += 4
        elif plts < 50:
            score += 3
        elif plts < 100:
            score += 2
        elif plts < 150:
            score += 1

    # Liver
    bili = row.get('Sofa_bilirubin')
    if pd.notnull(bili):
        if bili >= 12:
            score += 4
        elif bili >= 6:
            score += 3
        elif bili >= 2:
            score += 2
        elif bili >= 1.2:
            score += 1

    # Cardiovascular
    pressor = row.get('Pressors (Low dose=1, Medium=2, High=3)')
    if pd.notnull(pressor):
        if pressor == 3:
            score += 4
        elif pressor == 2:
            score += 3
        elif pressor == 1:
            score += 2
    else:
        # The MAP flag is consulted only when no pressor value is recorded.
        if row.get('MAP < 70 (Yes=1, No=2)') == 1:
            score += 1

    # CNS
    gcs = row.get('GCS ')
    if pd.notnull(gcs):
        if gcs < 6:
            score += 4
        elif gcs < 9:
            score += 3
        elif gcs < 12:
            score += 2
        elif gcs < 15:
            score += 1

    # Renal
    cr = row.get('Sofa_creatinine')
    if pd.notnull(cr):
        if cr >= 5.0:
            score += 4
        elif cr >= 3.5:
            score += 3
        elif cr >= 2.0:
            score += 2
        elif cr >= 1.2:
            score += 1

    return score


In [None]:
import re

def get_sample_date(row):
    """Return the sampling date that belongs to this row's sample slot.

    The first run of digits in ``row["sample_type"]`` (e.g. the ``2`` in
    ``"samplenumber2"``) selects the matching ``sampledate<N>`` entry of the
    same row.

    Parameters
    ----------
    row : pandas.Series or dict
        Must contain "sample_type"; may contain "sampledate<N>" entries.

    Returns
    -------
    The value stored under ``sampledate<N>``, or ``np.nan`` when
    "sample_type" is null, contains no digits, or the row has no matching
    date entry.
    """
    sample_type = row["sample_type"]
    if pd.isnull(sample_type):
        return np.nan

    match = re.search(r'(\d+)', sample_type)
    if match is None:
        return np.nan

    col_name = f'sampledate{match.group(1)}'
    # Look the column up on the row itself rather than on a notebook-level
    # DataFrame, so the function works for any frame it is applied to.
    if col_name in row:
        return row[col_name]
    return np.nan

# Row-wise derived columns: the sampling date and the SOFA score
merged_df_val['date_of_sample'] = merged_df_val.apply(get_sample_date, axis=1)
merged_df_val['sofascore'] = merged_df_val.apply(compute_sofa, axis=1)

# Coerce both date columns to datetime; entries that cannot be parsed become NaT
merged_df_val['Date of death'] = pd.to_datetime(merged_df_val['Date of death'], errors='coerce')
merged_df_val['date_of_sample'] = pd.to_datetime(merged_df_val['date_of_sample'], errors='coerce')

# Whole days from sampling to death (NaN when either date is missing)
merged_df_val['days_until_death'] = (
    merged_df_val['Date of death'].sub(merged_df_val['date_of_sample']).dt.days
)

# Binary outcome: death 0 to 10 days after sampling, both ends inclusive
merged_df_val['death_next10'] = (
    merged_df_val['days_until_death'].ge(0) & merged_df_val['days_until_death'].le(10)
)

In [None]:
# Parse the two timestamps needed below (no coercion: unparseable values raise)
for date_col in ('date_of_sample', 'Date of ICU admission'):
    merged_df_val[date_col] = pd.to_datetime(merged_df_val[date_col])

# Whole days elapsed between ICU admission and sampling
merged_df_val['days_since_icu'] = (
    merged_df_val['date_of_sample'].sub(merged_df_val['Date of ICU admission']).dt.days
)
# Rows for which that interval could not be computed are discarded
merged_df_val = merged_df_val.dropna(subset=['days_since_icu'])

CLR Transform

In [None]:
# Get list of shared ASVs between the two cohorts
# (taken as the column names of the unified scrubbed count table; the first CSV
# column is consumed as the index and is therefore not part of the list).
features = pd.read_csv("/manitou/pmg/users/mc5672/post_processing_data/unified_scrubbed_counts.csv",index_col=0).columns.to_list()


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class AbundanceScaler(BaseEstimator, TransformerMixin):
    """Transformer that divides every row of a DataFrame by that row's total."""

    def fit(self, X, y=None):
        """Record the input column names; nothing is learned from the data."""
        self.feature_names = [*X.columns]
        return self

    def transform(self, X):
        """Return ``X`` with each row divided by its own sum."""
        row_totals = X.sum(axis=1)
        return X.div(row_totals, axis=0)

    def get_feature_names_out(self, input_features=None):
        """Return the column names captured by ``fit``."""
        return self.feature_names

class LogRAScaler(BaseEstimator, TransformerMixin):
    """Transformer returning ``log1p`` of each row divided by that row's total."""

    def fit(self, X, y=None):
        """Record the input column names; nothing is learned from the data."""
        self.feature_names = [*X.columns]
        return self

    def transform(self, X):
        """Divide each row of ``X`` by its own sum, then apply ``np.log1p``."""
        row_totals = X.sum(axis=1)
        return np.log1p(X.div(row_totals, axis=0))

    def get_feature_names_out(self, input_features=None):
        """Return the column names captured by ``fit``."""
        return self.feature_names

In [None]:
from PredictionPipelineV3.PipelineSteps.Preprocessor import CLRScaler
from sklearn.preprocessing import StandardScaler

# Set this to one of: 'clr', 'abun', 'log_ra'
scaler = 'clr'

# Map scaler string to actual scaler class
scaler_map = {
    'clr': CLRScaler,
    'abun': AbundanceScaler,
    'log_ra': LogRAScaler
}

# Fail fast on an unknown option, before any frame is modified
if scaler not in scaler_map:
    raise ValueError(f"Unknown scaler type: {scaler}")

# Set index. Guarded so the cell can be re-run: once 'id' has become the index
# it is no longer a column, and an unconditional set_index('id') raises KeyError.
if merged_df_orig.index.name != 'id':
    merged_df_orig = merged_df_orig.set_index('id')
X_day = merged_df_orig[['day']].copy()

# Split features
abundance_features = [f for f in features if f.lower() != 'sofascore']
sofa_feature = ['sofascore']
X_abundance = merged_df_orig[abundance_features].copy()
if scaler == 'clr':
    # For 'clr' the counts are first converted to per-row relative abundance and
    # only then handed to CLRScaler.
    # NOTE(review): CLRScaler is defined in PredictionPipelineV3 (not visible here);
    # confirm it expects relative abundances rather than raw counts.
    #X_abundance = np.log1p(X_abundance.div(X_abundance.sum(axis=1), axis=0))  #convert to log_rel_abundance
    X_abundance = X_abundance.div(X_abundance.sum(axis=1), axis=0)  #convert to rel_abundance

X_sofa = merged_df_orig[sofa_feature].copy()

# Initialize and apply selected scaler; the fitted instance is kept in
# `abundance_scaler` so the validation cell can reuse it.
abundance_scaler = scaler_map[scaler]()
X_abundance_proc = pd.DataFrame(
    abundance_scaler.fit_transform(X_abundance),
    index=X_abundance.index,
    columns=X_abundance.columns
)

# Preprocess SOFA (standardised; the fitted `sofa_scaler` is reused for validation)
sofa_scaler = StandardScaler()
X_sofa_proc = pd.DataFrame(
    sofa_scaler.fit_transform(X_sofa),
    index=X_sofa.index,
    columns=X_sofa.columns
)

# Combine features; 'day' is exposed under the name 'days_since_icu'
X_new = pd.concat([X_abundance_proc, X_sofa_proc, X_day], axis=1)
X_new = X_new.rename(columns={'day': 'days_since_icu'})

# Recombine with metadata: drop the raw versions of the processed columns first
# so they are not duplicated, then put 'id' back as a regular column.
merged_meta = merged_df_orig.drop(columns=X_new.columns, errors='ignore')
merged_df_orig_processed = pd.concat([merged_meta, X_new], axis=1).reset_index(drop=False)

# Save if needed
#output_path = f"/manitou/pmg/users/mc5672/post_processing_data/merged_df_orig_{scaler}_logra.csv"
#merged_df_orig_processed.to_csv(output_path, index=False)

Apply the scalers fitted on the original cohort to the validation data

In [None]:
# Ensure 'Sample' is set as index for matching row alignment.
# Guarded so the cell can be re-run: once 'Sample' has become the index it is no
# longer a column, and an unconditional set_index('Sample') raises KeyError.
if merged_df_val.index.name != 'Sample':
    merged_df_val = merged_df_val.set_index('Sample')

# Define abundance and sofa features
abundance_features = [f for f in features if f.lower() != 'sofascore']
sofa_feature = ['sofascore']
X_abundance_val = merged_df_val[abundance_features].copy()
if scaler == 'clr':
    # Same pre-step as for the original cohort: per-row relative abundance
    # before the CLR scaler is applied.
    # X_abundance_val = np.log1p(X_abundance_val.div(X_abundance_val.sum(axis=1), axis=0))  #convert to log_rel_abundance
    X_abundance_val = X_abundance_val.div(X_abundance_val.sum(axis=1), axis=0)  #convert to rel_abundance

X_sofa_val = merged_df_val[sofa_feature].copy()
X_day_val = merged_df_val[['days_since_icu']].copy()

# Apply the already-fitted scalers from training (transform only, no re-fitting)
X_abundance_val_proc = pd.DataFrame(
    abundance_scaler.transform(X_abundance_val),
    index=X_abundance_val.index,
    columns=X_abundance_val.columns
)

X_sofa_val_proc = pd.DataFrame(
    sofa_scaler.transform(X_sofa_val),
    index=X_sofa_val.index,
    columns=X_sofa_val.columns
)

# Combine processed validation data
X_val_new = pd.concat([X_abundance_val_proc, X_sofa_val_proc, X_day_val], axis=1)

# Recombine with metadata, avoiding column duplication; 'Sample' returns as a column
val_meta = merged_df_val.drop(columns=X_val_new.columns, errors='ignore')
merged_df_val_processed = pd.concat([val_meta, X_val_new], axis=1).reset_index(drop=False)

# Optionally save to CSV
#output_path_val = f"/manitou/pmg/users/mc5672/post_processing_data/merged_df_val_{scaler}_for_testing.csv"
#merged_df_val_processed.to_csv(output_path_val, index=False)
