# Phenome-Wide analysis on TOPMed studies

In [None]:
import json
from pprint import pprint

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Pandas DataFrame display options.
# Use the fully-qualified option name: option keys are matched as regex
# patterns, and the abbreviated "max.rows" is ambiguous on pandas versions that
# also define "styler.render.max_rows", which makes set_option raise OptionError.
pd.set_option("display.max_rows", 435)

# Matplotlib defaults: 14 x 8 figure size and a bold, size-12 font.
fig_size = plt.rcParams["figure.figsize"]
fig_size[:2] = [14, 8]
plt.rcParams["figure.figsize"] = fig_size
font = dict(weight="bold", size=12)
plt.rc("font", **font)

# Phenome-wide analysis using Harmonized Variables

In [None]:
# Load the harmonized-variables stats CSV and drop every row whose variable
# path is the consent-code entry.
consent_path = "\\_Consents\\Short Study Accession with Consent Code\\"
harmonized = pd.read_csv("../studies_stats/harmonized_details_stats.csv", index_col=None)
harmonized = harmonized.loc[harmonized["harmonized variables"] != consent_path, :]

# Split each "\"-separated variable path into columns; fields 2 and 3 are kept
# as "category" and "variable_name", then rows whose variable name starts with
# "age" are dropped.
# expand=True keeps `harmonized`'s own index on the split frame. `harmonized`
# has been row-filtered, so its index has gaps; rebuilding the split frame from
# a plain list would renumber rows 0..n-1 and the index-based join below would
# pair paths with the wrong rows.
# na=False: a missing variable_name (path with fewer than 4 fields) is treated
# as "does not start with age" instead of breaking the boolean negation.
harmonized["harmonized variables"].str.split("\\", expand=True)\
.rename({2: "category", 3: "variable_name"}, axis=1)\
.loc[:, ["category", "variable_name"]]\
.join(harmonized)\
.loc[lambda df: ~ df["variable_name"].str.startswith("age", na=False), :]

## Harmonized Variables Info

- Quality checking:
    - Discarding the following variables:
        - Identifiers
        - Only unique values / only null values
- Variable type conversion to categorical
- Counting non-null values to select studies

In [None]:
# Load the studies table and keep the rows whose "harmonized" flag equals True.
studies_info = pd.read_csv("../env_variables/studies_info.csv")
mask_harmonized = studies_info["harmonized"].eq(True)
harmonized_studies_info = studies_info.loc[mask_harmonized]

In [None]:
# Display the subset of studies flagged as harmonized.
harmonized_studies_info

# Results

In [None]:
# Print the official names of the studies whose `phs` value is in `studies`.
# NOTE(review): `studies` is only assigned in a later cell (Plots section), so
# that cell has to be run before this one — consider moving it up.
included = studies_info["phs"].isin(studies)
official_names = studies_info.loc[included, "official_study_name"]
pprint(official_names.values.tolist())

In [None]:
# Load the p-values, relabel the raw columns and index the frame by the
# (dependent, independent) variable name pair.
pvalue_labels = {
    "level_1": "Dependent Variable Complete Name",
    "level_2": "Independent Variable Complete Name",
    "pvalues": "pvalue",
}
df_pvalues = pd.read_csv("../results/df_results/df_pvalues.csv", usecols=list(pvalue_labels))
df_pvalues = df_pvalues.rename(columns=pvalue_labels)
df_pvalues = df_pvalues.set_index(["Dependent Variable Complete Name", "Independent Variable Complete Name"])

In [None]:
# Load the long-format parameter table and give the raw columns readable names.
param_labels = {
    "level_1": "Dependent Variable Complete Name",
    "level_2": "Independent Variable Complete Name",
    "level_3": "Categorical binary variable",
    "level_4": "indicator",
    "param": "value",
}
df_params = pd.read_csv("../results/df_results/df_params.csv", usecols=list(param_labels))
df_params = df_params.rename(columns=param_labels)

In [None]:
# Bonferroni-style threshold: 0.05 divided by the number of rows of
# `wide_df_params`.
# `wide_df_params` is only built in the following cell, so referencing it here
# raises NameError on a top-to-bottom run. Its row count is derived from
# `df_params` instead: unstacking "indicator" leaves one row per distinct
# combination of the three remaining key columns.
n_param_rows = len(
    df_params[["Dependent Variable Complete Name",
               "Independent Variable Complete Name",
               "Categorical binary variable"]].drop_duplicates()
)
adjusted_alpha = 0.05 / n_param_rows

In [None]:
# Reshape the long parameter table: one row per
# (dependent, independent, categorical binary variable) and one column per
# "indicator" value.
param_keys = [
    "Dependent Variable Complete Name",
    "Independent Variable Complete Name",
    "Categorical binary variable",
    "indicator",
]
wide_df_params = df_params.set_index(param_keys).unstack("indicator")
# Drop the outer column level left by unstack, keeping the indicator names.
wide_df_params = wide_df_params.droplevel(0, axis=1)
wide_df_params = wide_df_params.rename(columns={"pvalue": "pvalue_subcategory"})
# Move the categorical level back to a regular column.
wide_df_params = wide_df_params.reset_index("Categorical binary variable", drop=False)

In [None]:
# Keep only the text after the last backslash of the path; the pattern does not
# match (giving NaN) when that trailing text is empty or contains a ".".
trailing_component = r"((?<=\\)[^\\.]+?$)"
wide_df_params["Categorical binary variable"] = (
    wide_df_params["Categorical binary variable"].str.extract(trailing_component)
)

In [None]:
# pos_OR: the odds ratio itself when it is >= 1, otherwise its reciprocal.
odds_ratio = wide_df_params["OR"]
wide_df_params["pos_OR"] = np.where(odds_ratio >= 1, odds_ratio, 1 / odds_ratio)

In [None]:
# Round the odds ratio, its bounds and pos_OR to 3 decimals.
rounded_cols = ["OR", "lb", "ub", "pos_OR"]
wide_df_params[rounded_cols] = wide_df_params[rounded_cols].round(decimals=3)

In [None]:
# Render the lb/ub bounds as a "[lb; ub]" text column.
lower_txt = wide_df_params["lb"].astype("str")
upper_txt = wide_df_params["ub"].astype("str")
wide_df_params["OR_CI"] = "[" + lower_txt + "; " + upper_txt + "]"

In [None]:
# Attach the wide parameter columns to every p-value row (left join on the
# shared index), then turn the index levels back into columns.
wide_df_results = df_pvalues.join(wide_df_params, how="left")
wide_df_results = wide_df_results.reset_index(drop=False)

In [None]:
# Bonferroni-style adjustment: scale the raw p-values by the number of rows in
# `wide_df_params`, and flag rows whose raw p-value is below `adjusted_alpha`.
n_param_rows = len(wide_df_params)
wide_df_results["adjusted pvalue"] = wide_df_results["pvalue"] * n_param_rows
# The adjusted sub-category p-value is derived from the raw
# "pvalue_subcategory" column. The previous code read
# "adjusted pvalue_subcategory" before that column existed (KeyError).
wide_df_results["adjusted pvalue_subcategory"] = wide_df_results["pvalue_subcategory"] * n_param_rows
wide_df_results["significant"] = wide_df_results["pvalue"] < adjusted_alpha
wide_df_results["adjusted alpha"] = adjusted_alpha

In [None]:
# Split the dependent variable path on "\" and append fields 2 and 3 as the
# harmonized variable category and name columns.
dependent_parts = pd.DataFrame(
    wide_df_results["Dependent Variable Complete Name"].str.split("\\").tolist()
)
harmonized_labels = dependent_parts.loc[:, [2, 3]]
results_formated = pd.concat([wide_df_results, harmonized_labels], axis=1)
results_formated = results_formated.rename(
    columns={
        2: "Harmonized Variable Category",
        3: "Harmonized Variable Name",
    }
)

In [None]:
# Load the variables dictionary, keep the columns used below, relabel two of
# them and index the table by the complete variable name.
dict_columns = ["name", "simplified_name", "categoryValues", "observationCount", "categorical", "nb_modalities", "level_0"]
variablesDict = pd.read_csv("../env_variables/multiIndex_variablesDict.csv", low_memory=False)
variablesDict = variablesDict[dict_columns]
variablesDict = variablesDict.rename(
    columns={
        "level_0": "BDC study",
        "simplified_name": "Dependent Variable Name",
    }
)
variablesDict = variablesDict.set_index("name")

In [None]:
# Extract the quoted text that sits immediately before the closing "]" at the
# end of the categoryValues string (no match if that text contains a ".").
# The pattern is a raw string: "\]" inside a normal literal is an invalid
# escape sequence, which Python 3.12+ reports as a SyntaxWarning.
variablesDict["categorical_ref_value"] = variablesDict["categoryValues"].str.extract(r"((?<=')[^'.]+?(?='\]$))")

In [None]:
# Attach the dictionary columns to each result row by looking up the
# independent variable's complete name in the dictionary index; rows without a
# dictionary entry are dropped (inner join).
results_formated = results_formated.join(
    variablesDict,
    on="Independent Variable Complete Name",
    how="inner",
)

In [None]:
# Write the formatted results to CSV with a fixed column order and no index.
export_columns = [
    "BDC study",
    "Harmonized Variable Category",
    "Harmonized Variable Name",
    "Dependent Variable Name",
    "Categorical binary variable",
    "categorical_ref_value",
    "pvalue",
    "adjusted pvalue",
    "OR",
    "OR_CI",
    "pvalue_subcategory",
    "adjusted pvalue_subcategory",
    "lb",
    "ub",
    "pos_OR",
    "significant",
    "Dependent Variable Complete Name",
    "Independent Variable Complete Name",
    "observationCount",
    "categorical",
    "nb_modalities",
]
results_formated[export_columns].to_csv("../results/results_formated.csv", index=False)

## Plots

In [None]:
# Histogram (100 bins) of the finite, non-missing odds ratios that are <= 10.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
wide_df_params["OR"].replace({np.inf: np.nan, -np.inf: np.nan}).dropna().loc[lambda x: x<=10].plot(kind="hist", bins=100)

In [None]:
# Histogram (100 bins) of the finite, non-missing pos_OR values that are <= 50.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
wide_df_params["pos_OR"].replace({np.inf: np.nan, -np.inf: np.nan}).dropna().loc[lambda x: x<=50].plot(kind="hist", bins=100)

In [None]:
# Studies that have at least one non-null p-value.
# `df_pvalues` as loaded earlier in this notebook has no "pvalues" or "level_0"
# column (the former is renamed to "pvalue", the latter is not read), so the
# lookup below would raise KeyError on it. Read the two raw columns straight
# from the CSV instead.
# NOTE(review): assumes the CSV has a "level_0" column, as this cell always
# relied on — confirm.
raw_pvalues = pd.read_csv("../results/df_results/df_pvalues.csv", usecols=["level_0", "pvalues"])
studies = raw_pvalues.loc[raw_pvalues["pvalues"].notnull(), "level_0"].unique()
number_included_studies = len(studies)

In [None]:
# Report how many studies were counted in `number_included_studies`.
summary = "Number of studies with at least one non null pvalue {}".format(number_included_studies)
print(summary)

In [None]:
# Number of p-value rows per "level_0" value, re-indexed on the `phs` keys of
# the studies table; keys without any row are dropped.
# `df_pvalues` as loaded earlier in this notebook does not carry "level_0", so
# grouping it by that column raises KeyError. The raw column is read from the
# CSV instead.
# NOTE(review): assumes the CSV has a "level_0" column, as this cell always
# relied on — confirm.
pvalue_rows = pd.read_csv("../results/df_results/df_pvalues.csv", usecols=["level_0"])
pvalue_rows.groupby("level_0").size().sort_values().reindex(studies_info.set_index("phs")["official_study_name"].to_dict()).dropna()

In [None]:
# Show the rows that have a p-value. The column is called "pvalue" here: the
# raw "pvalues" column is renamed when `df_pvalues` is loaded, so the old name
# raises KeyError.
df_pvalues.loc[df_pvalues["pvalue"].notnull(),:]

In [None]:
# One row per (level_0, level_1, level_2) with one column per "level_4"
# indicator, sorted by the "pvalue" column.
# `df_params` as loaded earlier in this notebook has renamed columns and no
# "level_0", so pivoting it on the raw names raises KeyError. The raw columns
# are read from the CSV instead.
# NOTE(review): assumes the CSV has a "level_0" column, as this cell always
# relied on — confirm.
raw_params = pd.read_csv("../results/df_results/df_params.csv",
                         usecols=["level_0", "level_1", "level_2", "level_4", "param"])
results_grouped = raw_params.pivot_table(columns="level_4", index=["level_0", "level_1", "level_2"], values="param")\
.sort_values("pvalue")