# Preprocess Conditions Input Data


Create a single, unified file containing the annotated conditions and the other Conditions information required by the INCLUDE LinkML model. This file is used as the input to linkml-map.

This notebook is used with the Brain Power study.

In [None]:
# Imports
import pandas as pd

# Show full column contents (no truncation) when DataFrames are displayed
pd.options.display.max_colwidth = None

In [None]:
# Load the annotated conditions file (tab-separated) into a DataFrame
annotations_tsv = "../data/BrainPower-STUDY/harmonica_conditions_data/tmp/output/hp_mondo_maxo-combined_ontology_annotations-20250730-210251.tsv"

annotated_df = pd.read_csv(annotations_tsv, sep="\t")

# annotated_df[annotated_df["condition_name"] == "vsd"].head(5)

In [None]:
# The annotated file holds several rows per participant (id): one row for each set of
# ontology annotation results. Collapse those rows into a single row per group.

# Key columns identifying one collapsed row
group_cols = ["id", "timepoint", "condition_name", "has_condition", "UUID"]

# Annotation columns whose values are merged across the grouped rows
agg_cols = [
    "hpo_result_curie",
    "hpo_result_label",
    "hpo_result_match_type",
    "mondo_result_curie",
    "mondo_result_label",
    "mondo_result_match_type",
    "annotation_source",
    "annotation_method",
    "ontology",
]


def _join_unique_values(values):
    """Return the distinct non-null values of ``values`` as one sorted, ", "-joined string."""
    distinct = set(values.dropna().astype(str))
    return ", ".join(sorted(distinct))


# Every annotation column is reduced with the same function.
# NOTE(review): each column is de-duplicated and sorted independently, so the i-th curie and
# the i-th label inside a joined cell are not guaranteed to belong together — confirm that
# nothing downstream relies on positional pairing.
aggregations = dict.fromkeys(agg_cols, _join_unique_values)

# Group (dropna=False keeps groups whose key columns contain NaN) and aggregate
annotated_combined_df = (
    annotated_df.groupby(group_cols, dropna=False)
    .agg(aggregations)
    .reset_index()
)


# annotated_combined_df[annotated_combined_df['condition_name'] == 'vsd'].head(5)

In [None]:
# Restrict the combined annotations to the identifier columns and the
# HPO / Mondo curie and label columns

columns_to_keep = [
    "id",
    "timepoint",
    "condition_name",
    "hpo_result_curie",
    "hpo_result_label",
    "mondo_result_curie",
    "mondo_result_label",
]

annotated_trimmed_df = annotated_combined_df.loc[:, columns_to_keep]


# annotated_trimmed_df[annotated_trimmed_df["condition_name"] == "vsd"].head(5)

In [None]:
# Load the Health Conditions data dictionary; its 'description' text is the basis for
# "Condition or Measure Source Text"
data_dictionary_csv = "../data/BrainPower-STUDY/data_dictionary/Health Conditions Data Dictionary.csv"
conditions_dataDict_df = pd.read_csv(data_dictionary_csv)

In [None]:
# Build conditionMeasureSourceText from the 'description' text: keep everything before the
# first "(", strip surrounding whitespace, then append " - Yes".
# Only rows with index >= 2 are assigned (per the original note these are the rows after the
# 'timepoint' variable, e.g. vsd, sd, etc. — this relies on the row order of the dictionary).

is_condition_row = conditions_dataDict_df.index >= 2

description_head = (
    conditions_dataDict_df.loc[is_condition_row, "description"]
    .str.split("(", n=1)
    .str.get(0)
    .str.strip()
)

conditions_dataDict_df.loc[is_condition_row, "conditionMeasureSourceText"] = description_head + " - Yes"

In [None]:
# Merge the conditionMeasureSourceText into annotated_trimmed_df
#
# This cell overwrites annotated_trimmed_df, so it must be safe to re-run: a
# conditionMeasureSourceText column left over from a previous execution is dropped first,
# otherwise the merge would emit conditionMeasureSourceText_x / _y columns.

source_text_lookup = conditions_dataDict_df[["variable", "conditionMeasureSourceText"]]

conditions_without_source_text = annotated_trimmed_df.drop(
    columns="conditionMeasureSourceText", errors="ignore"
)

merged_conditions_df = conditions_without_source_text.merge(
    source_text_lookup,
    how="left",
    left_on="condition_name",
    right_on="variable",
)

# A left join only changes the row count when a 'variable' value matches more than once;
# fail loudly instead of silently duplicating condition rows.
if len(merged_conditions_df) != len(conditions_without_source_text):
    raise ValueError(
        "Merging conditionMeasureSourceText changed the number of rows "
        f"({len(conditions_without_source_text)} -> {len(merged_conditions_df)}); "
        "check the data dictionary for duplicate 'variable' values."
    )

# Drop the now redundant 'variable' join key (returns a new frame; no in-place mutation)
annotated_trimmed_df = merged_conditions_df.drop(columns="variable")

# annotated_trimmed_df[annotated_trimmed_df["condition_name"] == "vsd"].head(5)

In [None]:
# REVIEW the merged results
# Multiple rows per participant are expected, one for each 'condition_name' value.
# For a given condition, a row may have values in both the HPO and the Mondo columns
# when an annotation was found in both ontologies.

# Display example data for id=1303
# display(annotated_trimmed_df[annotated_trimmed_df['id'] == 1303].head())

In [None]:
# Load the "ageateventandlatency.tsv" file (tab-separated)
age_events_tsv = "../data/BrainPower-STUDY/raw_data/TSV/ageateventandlatency.tsv"
age_events_df = pd.read_csv(age_events_tsv, sep="\t")
# age_events_df.head()

In [None]:
# Build one age column per timepoint, named 'age_at_timepoint_<N>' (for example
# 'age_at_timepoint_1'), from the 'age_at_visit' values in age_events_df
# (the "ageateventandlatency.tsv" data).

# One row per id, one column per timepoint value
age_by_timepoint = age_events_df.pivot(index="id", columns="timepoint", values="age_at_visit")

# Replace the raw timepoint values with descriptive column names
timepoint_labels = [f"age_at_timepoint_{int(tp)}" for tp in age_by_timepoint.columns]
pivoted = age_by_timepoint.set_axis(timepoint_labels, axis=1)

# Turn the 'id' index back into a regular column
age_events_pivot_df = pivoted.reset_index()

# age_events_pivot_df.head()

In [None]:
# Attach the per-timepoint age columns from age_events_pivot_df to the rows of
# annotated_trimmed_df (left join on 'id', so every condition row is kept)
annotated_trimmed_with_ages_df = pd.merge(annotated_trimmed_df, age_events_pivot_df, how="left", on="id")

# Write the joined table as a TSV; the path is relative, so it lands in the current working directory
annotated_trimmed_with_ages_df.to_csv("annotated_trimmed_with_ages_df.tsv", sep="\t", index=False)

# annotated_trimmed_with_ages_df[annotated_trimmed_with_ages_df['id'] == 1303].head()

In [None]:
# Load anthropometrics.tsv (tab-separated) --> remember this file has one row per timepoint value
anthropometrics_tsv = "../data/BrainPower-STUDY/raw_data/TSV/anthropometrics.tsv"
anthropometrics_df = pd.read_csv(anthropometrics_tsv, sep="\t")

In [None]:
# Spread each anthropometric metric into one column per timepoint value (e.g. 1, 3 and 5),
# for example height_timepoint_1, height_timepoint_3 and height_timepoint_5.
# Variables of interest from the INCLUDE LinkML model: height, weight, bmi, waist, sbp, dbp, resting_hr
# (note that every non-id column present in the file is melted, not only these).

# NOTE: These should have values for Other Label as: Body Height, Body Weight, Body Mass Index,
# Waist Circumference at umbilicus by Tape measure, Systolic blood pressure, Diastolic blood pressure, Heart rate --resting


# Long format: one row per (id, timepoint, variable)
melted_anthropometrics_df = anthropometrics_df.melt(
    id_vars=["id", "timepoint"], var_name="variable", value_name="value"
)

# Target column name: "<variable>_timepoint_<timepoint>"
timepoint_suffix = "_timepoint_" + melted_anthropometrics_df["timepoint"].astype(str)
melted_anthropometrics_df["new_col"] = melted_anthropometrics_df["variable"] + timepoint_suffix

# Wide format: one row per id, one column per new_col.
# pivot_table uses its default aggregation (mean) if an id has several values for a column.
pivoted_anthropometrics_df = pd.pivot_table(
    melted_anthropometrics_df, index="id", columns="new_col", values="value"
).reset_index()

# pivoted_anthropometrics_df.head()

In [None]:
# Left-join "pivoted_anthropometrics_df" onto "annotated_trimmed_with_ages_df" by 'id'
# (every condition/age row is kept; ids without anthropometrics get missing values)
all_cols_df = pd.merge(annotated_trimmed_with_ages_df, pivoted_anthropometrics_df, how="left", on="id")

# all_cols_df.head()

In [None]:
# Replace every missing value with an empty string
all_cols_df = all_cols_df.fillna(value="")

In [None]:
# Review results

# all_cols_df[all_cols_df['id'] == 1302]

In [None]:
# Write the combined table as a tab-separated file, without the DataFrame index
output_tsv = "../data/BrainPower-STUDY/raw_data/TSV_Transformed/healthconditions_all_cols.tsv"
all_cols_df.to_csv(output_tsv, sep="\t", index=False)