# Phenome-Wide analysis on TOPMed studies

In [None]:
import json
from pprint import pprint

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Pandas DataFrame display options
# Use the fully qualified option name: pandas resolves option names by
# regular-expression search, so a partial pattern such as "max.rows" becomes
# ambiguous (and is rejected) as soon as more than one option matches it.
pd.set_option("display.max_rows", 435)

# Matplotlib display parameters
fig_size = [14, 8]  # width, height of every figure created afterwards
plt.rcParams["figure.figsize"] = fig_size
font = {'weight' : 'bold',
        'size'   : 12}
plt.rc('font', **font)

# Phenome-wide analysis using Harmonized Variables

In [None]:
# Load the harmonized-variable statistics, leaving out the consent-code row.
harmonized = pd.read_csv("../studies_stats/harmonized_details_stats.csv", index_col=None)\
.loc[lambda df: df["harmonized variables"] != "\\_Consents\\Short Study Accession with Consent Code\\",:]

# Split the variable path into its components. `expand=True` keeps the index
# of `harmonized`, so the index-based join below pairs every row with its own
# path components (a frame rebuilt from a plain list would be renumbered from 0
# and drift out of step with the filtered rows).
# `na=False` keeps rows whose path has no fourth component instead of failing
# on the negation of a missing value.
harmonized["harmonized variables"].str.split("\\", expand=True)\
.rename({2: "category", 3: "variable_name"}, axis=1)\
.loc[:, ["category", "variable_name"]]\
.join(harmonized)\
.loc[lambda df: ~ df["variable_name"].str.startswith("age", na=False), :]

## Harmonized Variables Info

- Quality checking:
    - Discarding the following variables:
        - Identifiers
        - Only unique values / only null values
- Variable type conversion to categorical
- Counting non-null values to select studies

In [None]:
# Study metadata, and the subset of studies flagged as harmonized.
studies_info = pd.read_csv("../env_variables/studies_info.csv")
mask_harmonized = studies_info["harmonized"].eq(True)
harmonized_studies_info = studies_info.loc[mask_harmonized]

In [None]:
harmonized_studies_info

# Results

In [None]:
# Print the official names of the studies whose "phs" accession is listed in `studies`.
# NOTE(review): `studies` is only assigned further down, in the Plots section —
# run that cell first or move its definition above this one.
pprint(studies_info.loc[studies_info["phs"].isin(studies), "official_study_name"].values.tolist())

In [None]:
# P-values per (dependent, independent) variable pair, indexed by that pair.
df_pvalues = (
    pd.read_csv("../results/df_results/df_pvalues.csv", usecols=["level_1", "level_2", "pvalues"])
    .rename(columns={
        "level_1": "Dependent Variable Complete Name",
        "level_2": "Independent Variable Complete Name",
        "pvalues": "pvalue",
    })
    .set_index(["Dependent Variable Complete Name", "Independent Variable Complete Name"])
)

In [None]:
# Model parameters in long format: one row per
# (dependent, independent, categorical level, indicator) with its value.
df_params = (
    pd.read_csv("../results/df_results/df_params.csv", usecols=["level_1", "level_2", "level_3", "level_4", "param"])
    .rename(columns={
        "level_1": "Dependent Variable Complete Name",
        "level_2": "Independent Variable Complete Name",
        "level_3": "Categorical binary variable",
        "level_4": "indicator",
        "param": "value",
    })
)

In [None]:
# Long -> wide: one row per (dependent, independent, categorical level) and one
# column per indicator; the per-level p-value is renamed so it cannot be
# confused with the variable-level "pvalue".
wide_df_params = df_params.set_index(["Dependent Variable Complete Name", 
                                      "Independent Variable Complete Name",
                                      "Categorical binary variable",
                                      "indicator"])\
.unstack("indicator")\
.droplevel(0, axis=1)\
.rename({"pvalue": "pvalue_subcategory"}, axis=1)\
.reset_index("Categorical binary variable", drop=False)

# Bonferroni-corrected significance threshold. It is computed here, after
# `wide_df_params` exists, because it depends on the number of rows of that table.
adjusted_alpha = 0.05 / len(wide_df_params)

In [None]:
# Keep only the last path component: the run of characters (no backslash, no
# dot) between the final backslash and the end of the string. Values that do
# not match this shape (e.g. a dot in the last component) become NaN.
wide_df_params["Categorical binary variable"] = wide_df_params["Categorical binary variable"].str.extract(r"((?<=\\)[^\\.]+?$)")

In [None]:
# "pos_OR" folds every odds ratio onto [1, +inf): values below 1 are inverted.
wide_df_params = wide_df_params.assign(
    pos_OR=lambda df: np.where(df["OR"] >= 1, df["OR"], 1 / df["OR"])
)

In [None]:
# Round the odds ratio, its "lb"/"ub" bounds and pos_OR to 3 decimals.
wide_df_params[["OR", "lb", "ub", "pos_OR"]] = wide_df_params[["OR", "lb", "ub", "pos_OR"]].round(3)

In [None]:
# Text rendering of the interval as "[lb; ub]".
wide_df_params["OR_CI"] = "[" + wide_df_params["lb"].astype("str") + "; " + wide_df_params["ub"].astype("str") + "]"

In [None]:
# Left-join the per-level parameters onto the p-values through their shared
# (dependent, independent) index, then move that index back into columns.
wide_df_results = df_pvalues.join(wide_df_params, how="left").reset_index(drop=False)

In [None]:
# Bonferroni correction: scale each raw p-value by the number of rows of
# `wide_df_params`, and flag the associations below the corrected threshold.
wide_df_results["adjusted pvalue"] = wide_df_results["pvalue"] * len(wide_df_params)
# The adjusted per-level p-value derives from the raw "pvalue_subcategory"
# column; the "adjusted ..." column does not exist before this assignment.
wide_df_results["adjusted pvalue_subcategory"] = wide_df_results["pvalue_subcategory"] * len(wide_df_params)
wide_df_results["significant"] = wide_df_results["pvalue"] < adjusted_alpha
wide_df_results["adjusted alpha"] = adjusted_alpha

In [None]:
# Append the 3rd and 4th components of the dependent variable's path as the
# harmonized variable category and name.
results_formated = pd.concat(
    [
        wide_df_results,
        pd.DataFrame(
            wide_df_results["Dependent Variable Complete Name"].str.split("\\").tolist()
        )[[2, 3]],
    ],
    axis=1,
).rename(columns={2: "Harmonized Variable Category", 3: "Harmonized Variable Name"})

In [None]:
# Variable dictionary restricted to the columns needed for reporting,
# indexed by the variable's complete name.
variablesDict = (
    pd.read_csv("../env_variables/multiIndex_variablesDict.csv", low_memory=False)
    .loc[:, ["name", "simplified_name", "categoryValues", "observationCount", "categorical", "nb_modalities", "level_0"]]
    .rename(columns={"level_0": "BDC study", "simplified_name": "Dependent Variable Name"})
    .set_index("name")
)

In [None]:
# Last quoted value of the "categoryValues" list text, i.e. the characters
# between the final pair of single quotes right before the closing "]".
# Raw string: "\]" is not a valid escape sequence in a normal string literal.
variablesDict["categorical_ref_value"] = variablesDict["categoryValues"].str.extract(r"((?<=')[^'.]+?(?='\]$))")

In [None]:
# Inner-join the variable dictionary on the independent variable's complete
# name; result rows without a dictionary entry are dropped.
# NOTE(review): the dictionary's "simplified_name" is exposed as
# "Dependent Variable Name" although the join key is the *independent*
# variable — confirm the intended column label.
results_formated = results_formated.join(variablesDict, on="Independent Variable Complete Name", how="inner")

In [None]:
# Columns written to the results file, in report order.
export_columns = [
    "BDC study",
    "Harmonized Variable Category",
    "Harmonized Variable Name",
    "Dependent Variable Name",
    "Categorical binary variable",
    "categorical_ref_value",
    "pvalue",
    "adjusted pvalue",
    "OR",
    "OR_CI",
    "pvalue_subcategory",
    "adjusted pvalue_subcategory",
    "lb",
    "ub",
    "pos_OR",
    "significant",
    "Dependent Variable Complete Name",
    "Independent Variable Complete Name",
    "observationCount",
    "categorical",
    "nb_modalities",
]
results_formated.loc[:, export_columns].to_csv("../results/results_formated.csv", index=False)

## Plots

In [None]:
# Histogram of the finite odds ratios not larger than 10.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.
wide_df_params["OR"].replace({np.inf: np.nan, -np.inf: np.nan}).dropna().loc[lambda x: x<=10].plot(kind="hist", bins=100)

In [None]:
# Histogram of the finite folded odds ratios (pos_OR) not larger than 50.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.
wide_df_params["pos_OR"].replace({np.inf: np.nan, -np.inf: np.nan}).dropna().loc[lambda x: x<=50].plot(kind="hist", bins=100)

In [None]:
# Distinct "level_0" values (studies) that have at least one non-null p-value.
# NOTE(review): this reads the raw "pvalues" and "level_0" columns, whereas the
# Results section loads df_pvalues without "level_0" and renames "pvalues" to
# "pvalue" — presumably an un-renamed copy of the CSV is expected here; confirm.
studies = df_pvalues.loc[df_pvalues["pvalues"].notnull(), "level_0"].unique()
number_included_studies = len(studies)

In [None]:
# Report how many studies contribute at least one usable p-value.
print(f"Number of studies with at least one non null pvalue {number_included_studies}")

In [None]:
# Number of p-value rows per "level_0" (study), re-indexed on the accessions
# known to studies_info; accessions without any row become NaN and are dropped.
# NOTE(review): `reindex` receives a dict (phs -> official name); presumably only
# its keys are used, so the official names never appear — confirm whether a
# rename of the index was intended.
df_pvalues.groupby("level_0").size().sort_values().reindex(studies_info.set_index("phs")["official_study_name"].to_dict()).dropna()

In [None]:
df_pvalues.loc[df_pvalues["pvalues"].notnull(),:]

In [None]:
# Pivot the long parameter table to one column per "level_4" value and order
# the rows by the resulting "pvalue" column.
# NOTE(review): this relies on the raw level_0..level_4 / "param" column names,
# which the Results section renames (and "level_0" is not loaded there) —
# confirm which version of df_params this cell expects.
results_grouped = df_params.pivot_table(columns="level_4", index=["level_0", "level_1", "level_2"], values="param")\
.sort_values("pvalue")

In [None]:
from typing import List, Optional, Tuple

def get_plt_grid_indices(nb_values: Optional[int]=None, 
                         nb_cols  : Optional[int]=None, 
                         nb_rows  : Optional[int]=None) -> Tuple[List[tuple], Tuple[int, int]]:
    """
    Compute matplotlib-like grid positions for a set of subplots.

    Any missing argument is derived from the ones provided:
    - only ``nb_values``: a near-square grid with ``floor(sqrt(nb_values))``
      columns and as many rows as needed to hold every value;
    - ``nb_values`` plus one dimension: the other dimension is the smallest
      one that holds every value;
    - both dimensions without ``nb_values``: every cell of the grid is used.

    Parameters
    ----------
    nb_values : number of subplots to place.
    nb_cols : number of columns of the grid.
    nb_rows : number of rows of the grid.

    Returns
    -------
    (indices, (nb_cols, nb_rows)) where ``indices`` is a list of
    ``(row_index, column_index)`` tuples of length ``nb_values``, filled one
    row at a time (columns vary fastest).

    Raises
    ------
    ValueError
        If no argument is passed, or a single dimension without ``nb_values``.
    AssertionError
        If the three arguments are passed and the grid is either too small for
        ``nb_values`` or has at least one superfluous row/column.
    """
    def _check_args(nb_values, nb_cols, nb_rows):
        # Only the arguments actually provided by the caller.
        given = {name: value
                 for name, value in (("nb_values", nb_values),
                                     ("nb_cols", nb_cols),
                                     ("nb_rows", nb_rows))
                 if value is not None}
        if not given:
            raise ValueError("No arguments passed")
        if len(given) == 1 and "nb_values" not in given:
            raise ValueError("Only {0} passed, please pass the complementary "
                             "dimension argument, or the nb_values".format(given))
        if len(given) == 3:
            assert(nb_cols * nb_rows >= nb_values), "discrepancies in the passed arguments values"
            assert((max(nb_cols, nb_rows) - 1) * min(nb_cols, nb_rows) < nb_values), "discrepancies in the passed arguments values"

    def _get_complementary_values(nb_values, nb_cols, nb_rows) -> tuple:
        if nb_cols is None and nb_rows is None:
            # Near-square layout. The row count is derived from the column
            # count so that the grid always has room for every value
            # (floor(sqrt(n)) x (floor(sqrt(n)) + 1) is too small for e.g. n = 8).
            nb_cols = max(1, int(np.floor(np.sqrt(nb_values))))
            nb_rows = max(1, int(np.ceil(nb_values / nb_cols)))
        elif nb_values is None:
            nb_values = nb_rows * nb_cols
        elif nb_rows is None:
            nb_rows = np.ceil(nb_values / nb_cols)
        elif nb_cols is None:
            nb_cols = np.ceil(nb_values / nb_rows)
        return int(nb_values), int(nb_cols), int(nb_rows)

    def _get_facet_grid_vec(nb_values, nb_cols, nb_rows):
        # Row-major walk over the grid, truncated to the requested count.
        positions = [(row, col) for row in range(nb_rows) for col in range(nb_cols)]
        return positions[0:nb_values]

    _check_args(nb_values, nb_cols, nb_rows)
    nb_values, nb_cols, nb_rows = _get_complementary_values(nb_values, nb_cols, nb_rows)
    vec_indices = _get_facet_grid_vec(nb_values, nb_cols, nb_rows)
    return vec_indices, (nb_cols, nb_rows)

In [None]:
nb_tested_var = df_pvalues["level_1"].nunique()

In [None]:
grouped_df = df_pvalues.groupby("level_1")

In [None]:
# Bonferroni: corrected threshold and corrected p-values, both based on the
# number of rows of the p-value table.
adjusted_alpha = 0.05 / df_pvalues["pvalues"].shape[0]
df_pvalues["p_adj"] = df_pvalues["pvalues"].mul(df_pvalues["pvalues"].shape[0])

In [None]:
# -log10 of the p-values; a p-value of 0 yields +inf, which is turned into a
# missing value. `np.nan` is used because `np.NaN` no longer exists in NumPy 2.
df_pvalues['log_p'] = -np.log10(df_pvalues['pvalues'])
df_pvalues = df_pvalues.replace({np.inf: np.nan})

In [None]:
# -log10 of the raw and of the adjusted p-values.
df_pvalues["log_p"] = -np.log10(df_pvalues["pvalues"])
df_pvalues["log_p_adj"] = -np.log10(df_pvalues["p_adj"])

# Infinite values (p-value of 0) become missing so they do not break the axis
# limits. `np.nan` is used because `np.NaN` no longer exists in NumPy 2.
df_pvalues = df_pvalues.replace({np.inf: np.nan})

fig = plt.figure()
ax = fig.add_subplot(111)
colors = plt.get_cmap('Set1')
x_labels = []
x_labels_pos = []

y_lims = (0, df_pvalues["log_p"].max(skipna=True) + 50)
# Sixth largest -log10(p): points strictly above it are the top five.
threshold_top_values = df_pvalues["log_p"].sort_values(ascending=False)[0:6].iloc[-1]

# Running x position of every test, then one colour per study ("level_0").
df_pvalues["ind"] = np.arange(1, len(df_pvalues) + 1)
df_grouped = df_pvalues.groupby('level_0')
for num, (name, group) in enumerate(df_grouped):
    group.plot(kind='scatter', x='ind', y='log_p', color=colors.colors[num % len(colors.colors)], ax=ax, s=20)
    x_labels.append(name)
    x_labels_pos.append(
        (group['ind'].iloc[-1] - (group['ind'].iloc[-1] - group['ind'].iloc[0]) / 2))  # Set label in the middle



In [None]:
harmonized_var = df_pvalues["level_1"].unique()


In [None]:
print(harmonized_var[0])

In [None]:
sub_df = df_pvalues.loc[df_pvalues["level_1"] == harmonized_var[0], :].copy()

In [None]:
# Number the tests of the selected harmonized variable, then draw one colour
# per study ("level_0") on the current axes.
sub_df["ind"] = np.arange(len(sub_df)) + 1
df_grouped = sub_df.groupby('level_0')
for num, (name, group) in enumerate(df_grouped):
    point_color = colors.colors[num % len(colors.colors)]
    group.plot(kind='scatter', x='ind', y='log_p', color=point_color, ax=ax, s=20)
    first_ind = group['ind'].iloc[0]
    last_ind = group['ind'].iloc[-1]
    x_labels.append(name)
    # Label position: middle of the x range covered by the study.
    x_labels_pos.append(last_ind - (last_ind - first_ind) / 2)

In [None]:
sub_df.loc[sub_df["pvalues"].notnull(),:]

In [None]:
# One panel per tested harmonized variable ("level_1").
# get_plt_grid_indices returns the list of (row, column) positions AND the
# (nb_cols, nb_rows) shape of the grid, so both are unpacked and the figure is
# created with that shape rather than a fixed 3x3 layout.
indices, (nb_cols, nb_rows) = get_plt_grid_indices(nb_tested_var)
# squeeze=False keeps `axes` two-dimensional even for a single row or column.
f, axes = plt.subplots(nb_rows, nb_cols, sharey=True, squeeze=False)
for plot_indice, (covariate, df_group_1) in enumerate(df_pvalues.groupby("level_1")):
    df_grouped = df_group_1.groupby('level_0')
    row_idx, col_idx = indices[plot_indice]
    ax = axes[row_idx, col_idx]
    for num, (name, group) in enumerate(df_grouped):
        group.plot(kind='scatter', x='ind', y='log_p', color=colors.colors[num % len(colors.colors)], ax=ax, s=20)
        x_labels.append(name)
        x_labels_pos.append(
            (group['ind'].iloc[-1] - (group['ind'].iloc[-1] - group['ind'].iloc[0]) / 2))  # Set label in the middle

# Gender

In [None]:
multiIndex_variablesDict = pd.read_csv("multiIndex_variablesDict.csv", index_col=list(range(0, 13)), low_memory=False)

## P-values

In [None]:
df_pvalues.shape

In [None]:
df_pvalues = pd.read_csv("df_pvalues.csv", index_col=0)

In [None]:
df_pvalues.head()

In [None]:
adjusted_alpha = 0.05/len(df_pvalues["pvalue"])

In [None]:
df_pvalues.loc[df_pvalues["pvalue"] < adjusted_alpha,].shape

## Manhattan plot

In [None]:
def manhattan_plot(df_pvalues,
                   threshold_group_cat=5,
                  title_plot="Statistical Association Between Exposition Status and Phenotypes"):
    """
    Draw a Manhattan-style scatter plot of -log10(p-value) for every tested variable.

    Points are coloured by group, the group of a row being its index value;
    groups with fewer than `threshold_group_cat` rows are merged into "Other".
    The five largest -log10(p) values are annotated with the row's
    "simplified_varName", and a dotted line marks the Bonferroni threshold
    0.05 / number of rows.

    Parameters
    ----------
    df_pvalues : pandas.DataFrame
        Must provide the columns "pvalue", "variable" and "simplified_varName".
        The frame is copied first, so the caller's object is left unchanged.
    threshold_group_cat : int
        Minimum number of rows for a group to keep its own colour and label.
    title_plot : str
        Title of the figure.

    Returns
    -------
    None. The figure is displayed with `plt.show()`.
    """
    # Work on a private copy: columns are added and rows reordered below.
    df_pvalues = df_pvalues.copy()

    nb_tests = len(df_pvalues["pvalue"])
    adjusted_alpha = 0.05 / nb_tests
    # Bonferroni-adjusted p-value: multiplied (not divided) by the number of tests.
    df_pvalues["p_adj"] = df_pvalues["pvalue"] * nb_tests
    # A p-value of 0 gives +inf, which would make the y-axis limits invalid.
    df_pvalues['log_p'] = (-np.log10(df_pvalues['pvalue'])).replace({np.inf: np.nan})

    # Grouping: index value of each row, small groups pooled into "Other".
    df_pvalues["group"] = df_pvalues.index
    group_counts = df_pvalues["group"].value_counts()
    group_to_merge = group_counts[group_counts < threshold_group_cat].index
    mask_group_to_merge = df_pvalues["group"].isin(group_to_merge)
    df_pvalues.loc[mask_group_to_merge, "group"] = "Other"
    df_pvalues = df_pvalues.sort_values(by="group", axis=0)

    # Short labels for the x axis when the groups are full study names.
    dic_renaming = {
        'Genetic Epidemiology of COPD (COPDGene)': 'COPDGene',
        'Genetic Epidemiology Network of Arteriopathy (GENOA)': 'GENOA',
        'NHLBI TOPMed: Genetics of Cardiometabolic Health in the Amish': 'Genetics',
        'Genome-wide Association Study of Adiposity in Samoans': 'GEWAS Samoans',
        'Genetics of Lipid Lowering Drugs and Diet Network (GOLDN) Lipidomics Study': 'GOLDN',
        'Heart and Vascular Health Study (HVH)': 'HVH'
    }
    df_pvalues["group"] = df_pvalues["group"].replace(dic_renaming)

    # Strip digit runs (and the characters matched right after them) from the
    # variable names. regex=True is explicit because the pattern is a regular
    # expression and recent pandas treats patterns as literal text by default.
    # NOTE(review): "[A-z]" also matches "[", "\", "]", "^", "_" and "`" —
    # confirm whether "[A-Za-z]" was intended.
    df_pvalues["variable"] = df_pvalues["variable"].str.replace("[0-9]+[A-z]*", "", regex=True)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    colors = plt.get_cmap('Set1')
    x_labels = []
    x_labels_pos = []

    y_lims = (0, df_pvalues["log_p"].max(skipna=True) + 50)
    # Sixth largest value: rows strictly above it are the top five to annotate.
    threshold_top_values = df_pvalues["log_p"].sort_values(ascending=False).iloc[0:6].iloc[-1]

    # Running x position, following the group-sorted row order.
    df_pvalues["ind"] = np.arange(1, len(df_pvalues)+1)
    df_grouped = df_pvalues.groupby('group')
    for num, (name, group) in enumerate(df_grouped):
        group.plot(kind='scatter', x='ind', y='log_p',color=colors.colors[num % len(colors.colors)], ax=ax, s=20)
        x_labels.append(name)
        x_labels_pos.append((group['ind'].iloc[-1] - (group['ind'].iloc[-1] - group['ind'].iloc[0])/2)) # Set label in the middle

        # Annotate the most significant points of this group.
        top_rows = group.loc[group["log_p"] > threshold_top_values]
        for _, row in top_rows.iterrows():
            ax.text(row['ind'] + 3, row["log_p"] + 0.05, row["simplified_varName"], rotation=0, alpha=1, size=8, color="black")

    ax.set_xticks(x_labels_pos)
    ax.set_xticklabels(x_labels)
    ax.set_xlim([0, len(df_pvalues) +1])
    ax.set_ylim(y_lims)
    ax.set_ylabel('-log(p-values)', style="italic")
    ax.set_xlabel('Phenotypes', fontsize=15)
    ax.axhline(y=-np.log10(adjusted_alpha), linestyle=":", color="black", label="Bonferonni Adjusted Threshold")
    plt.xticks(fontsize = 9,rotation=30)
    plt.yticks(fontsize = 8)
    plt.title(title_plot, 
              loc="left",
              style="oblique", 
              fontsize = 20,
             y=1)
    handles, labels = ax.get_legend_handles_labels()
    plt.legend(handles = handles, labels = labels, loc = "upper left")
    plt.show()

In [None]:
manhattan_plot(df_pvalues, title_plot="Association Between Sex and Phenotypes")

## COPDGene

In [None]:
study_name = 'Genetic Epidemiology of COPD (COPDGene)'

In [None]:
# Rows of the p-value table for the study selected in `study_name`.
study_df = df_pvalues.loc[study_name, :].copy()

# varName -> grouping lookup taken from the variables dictionary: index level 1
# becomes a column, the remaining index levels are dropped, and the single
# column left after indexing by "varName" is named "grouping".
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Attach the grouping of each tested variable and use it as the index.
study_df = (
    study_df.join(study_groups, on="variable")
    .set_index("grouping", drop=True)
)

In [None]:
manhattan_plot(study_df)

## Genetic Epidemiology Network of Arteriopathy (GENOA)

In [None]:
study_name = 'Genetic Epidemiology Network of Arteriopathy (GENOA)'

In [None]:
# P-values restricted to the study named in `study_name`.
study_df = df_pvalues.loc[study_name, :].copy()

# Lookup from variable name to grouping: level 1 of the dictionary index is
# moved into a column, the other levels are discarded, and the column that
# remains once "varName" is the index is called "grouping".
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Add each variable's grouping, then index the table by it.
study_df = (
    study_df.join(study_groups, on="variable")
    .set_index("grouping", drop=True)
)

In [None]:
manhattan_plot(study_df)

## NHLBI TOPMed: Genetics of Cardiometabolic Health in the Amish

In [None]:
study_name = 'NHLBI TOPMed: Genetics of Cardiometabolic Health in the Amish'

In [None]:
# P-values restricted to the study named in `study_name`.
study_df = df_pvalues.loc[study_name, :].copy()

# Lookup from variable name to grouping: level 0 of the dictionary index is
# moved into a column, the other levels are discarded, and the column that
# remains once "varName" is the index is called "grouping".
# NOTE(review): most other study cells use index level 1 here — confirm that
# level 0 is intended for this study.
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=0, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Add each variable's grouping, then index the table by it.
study_df = (
    study_df.join(study_groups, on="variable")
    .set_index("grouping", drop=True)
)

In [None]:
manhattan_plot(study_df, threshold_group_cat=0)

## Genome-wide Association Study of Adiposity in Samoans

In [None]:
study_name = 'Genome-wide Association Study of Adiposity in Samoans'

In [None]:
# Subset of the p-value table for the current `study_name`.
study_df = df_pvalues.loc[study_name, :].copy()

# Grouping of every variable of the study: index level 1 is kept as a column,
# the rest of the index is dropped, "varName" becomes the index and the
# remaining column is labelled "grouping".
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Look up the grouping of each tested variable and index the table by it.
study_df = (
    study_df.join(study_groups, on="variable")
    .set_index("grouping", drop=True)
)

In [None]:
manhattan_plot(study_df)

## Genetics of Lipid Lowering Drugs and Diet Network (GOLDN) Lipidomics Study

In [None]:
study_name = 'Genetics of Lipid Lowering Drugs and Diet Network (GOLDN) Lipidomics Study'

In [None]:
# Subset of the p-value table for the current `study_name`.
study_df = df_pvalues.loc[study_name, :].copy()

# Grouping of every variable of the study: index level 1 is kept as a column,
# the rest of the index is dropped, "varName" becomes the index and the
# remaining column is labelled "grouping".
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Look up the grouping of each tested variable and index the table by it.
study_df = (
    study_df.join(study_groups, on="variable")
    .set_index("grouping", drop=True)
)

In [None]:
manhattan_plot(study_df)

## Heart and Vascular Health Study (HVH)

In [None]:
study_name = 'Heart and Vascular Health Study (HVH)'

In [None]:
# Subset of the p-value table for the current `study_name`.
study_df = df_pvalues.loc[study_name, :].copy()

# Grouping of every variable of the study: index level 0 is kept as a column,
# the rest of the index is dropped, "varName" becomes the index and the
# remaining column is labelled "grouping".
# NOTE(review): most other study cells use index level 1 here — confirm that
# level 0 is intended for this study.
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=0, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Look up the grouping of each tested variable and index the table by it.
study_df = (
    study_df.join(study_groups, on="variable")
    .set_index("grouping", drop=True)
)

In [None]:
manhattan_plot(study_df)

# Smoking status

In [None]:
# P-values indexed by study; the second index level of the file is exposed as
# the "dependent_var" column.
df_pvalues_bis = (
    pd.read_csv("df_pvalues_bis.csv", index_col=[0, 1])
    .rename_axis(["study", "dependent_var"], axis=0)
    .reset_index("dependent_var", drop=False)
)

In [None]:
# Keep the tests whose dependent variable is the harmonized smoking indicator.
mask_smoking = df_pvalues_bis["dependent_var"].eq(
    "\\DCC Harmonized data set\\03 - Baseline common covariates\\Indicates whether subject ever regularly smoked cigarettes.\\"
)
smoking_df = df_pvalues_bis[mask_smoking].copy()

In [None]:
manhattan_plot(smoking_df)

In [None]:
# Reload the p-values indexed by study, with the second index level of the
# file turned into the "dependent_var" column.
df_pvalues_bis = (
    pd.read_csv("df_pvalues_bis.csv", index_col=[0, 1])
    .rename_axis(["study", "dependent_var"], axis=0)
    .reset_index("dependent_var", drop=False)
)

In [None]:
# Select the rows testing the harmonized "ever smoked" indicator.
mask_smoking = df_pvalues_bis["dependent_var"].eq(
    "\\DCC Harmonized data set\\03 - Baseline common covariates\\Indicates whether subject ever regularly smoked cigarettes.\\"
)
smoking_df = df_pvalues_bis[mask_smoking].copy()

## COPDGene

In [None]:
study_name = 'Genetic Epidemiology of COPD (COPDGene)'

In [None]:
# Smoking-status tests of the study selected in `study_name`.
study_df = smoking_df.loc[study_name, :].copy()

# varName -> grouping lookup taken from the variables dictionary: index level 1
# becomes a column, the remaining index levels are dropped, and the single
# column left after indexing by "varName" is named "grouping".
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Attach the grouping of each tested variable and use it as the index.
study_df = (
    study_df.join(study_groups, on="variable")
    .set_index("grouping", drop=True)
)

In [None]:
manhattan_plot(study_df)

## Genetic Epidemiology Network of Arteriopathy (GENOA)

In [None]:
study_name = 'Genetic Epidemiology Network of Arteriopathy (GENOA)'

In [None]:
# Smoking-status tests restricted to the study named in `study_name`.
study_df = smoking_df.loc[study_name, :].copy()

# Lookup from variable name to grouping: level 1 of the dictionary index is
# moved into a column, the other levels are discarded, and the column that
# remains once "varName" is the index is called "grouping".
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Add each variable's grouping, then index the table by it.
study_df = (
    study_df.join(study_groups, on="variable")
    .set_index("grouping", drop=True)
)

In [None]:
manhattan_plot(study_df)

## Genome-wide Association Study of Adiposity in Samoans

In [None]:
study_name = 'Genome-wide Association Study of Adiposity in Samoans'

In [None]:
# Subset of the smoking-status tests for the current `study_name`.
study_df = smoking_df.loc[study_name, :].copy()

# Grouping of every variable of the study: index level 1 is kept as a column,
# the rest of the index is dropped, "varName" becomes the index and the
# remaining column is labelled "grouping".
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Look up the grouping of each tested variable and index the table by it.
study_df = (
    study_df.join(study_groups, on="variable")
    .set_index("grouping", drop=True)
)

In [None]:
manhattan_plot(study_df)

# Antihypertensive status

In [None]:
# Keep the tests whose dependent variable is the harmonized indicator of
# antihypertensive medication use.
mask_aht_medication = df_pvalues_bis["dependent_var"].eq(
    "\\DCC Harmonized data set\\05 - Blood pressure\\Indicator for use of antihypertensive medication at the time of blood pressure measurement.\\"
)
aht_medication = df_pvalues_bis[mask_aht_medication].copy()

In [None]:
manhattan_plot(aht_medication)

## Genetic Epidemiology Network of Arteriopathy (GENOA)

In [None]:
study_name = 'Genetic Epidemiology Network of Arteriopathy (GENOA)'

In [None]:
# Antihypertensive-medication tests of the study selected in `study_name`.
study_df = aht_medication.loc[study_name, :].copy()

# varName -> grouping lookup taken from the variables dictionary: index level 1
# becomes a column, the remaining index levels are dropped, and the single
# column left after indexing by "varName" is named "grouping".
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Attach the grouping of each tested variable and use it as the index.
study_df = (
    study_df.join(study_groups, on="variable")
    .set_index("grouping", drop=True)
)

In [None]:
manhattan_plot(study_df, title_plot="Association with Hypertension Medication")

## Genome-wide Association Study of Adiposity in Samoans

In [None]:
study_name = 'Genome-wide Association Study of Adiposity in Samoans'

In [None]:
# Subset of the antihypertensive-medication tests for the current `study_name`.
study_df = aht_medication.loc[study_name, :].copy()

# Grouping of every variable of the study: index level 1 is kept as a column,
# the rest of the index is dropped, "varName" becomes the index and the
# remaining column is labelled "grouping".
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Look up the grouping of each tested variable and index the table by it.
study_df = (
    study_df.join(study_groups, on="variable")
    .set_index("grouping", drop=True)
)

In [None]:
manhattan_plot(study_df, title_plot="Association with Hypertension Medication")

## To be improved

- Use regular expressions on variable names to derive categories (for example, every variable name containing "stenosis" or "blood pressure"); ontologies could also be leveraged for this
- Odds Ratio
- Defining categories in a better way 
- PheWAS using different variants
- Grouping subcategories across studies
- Integrating all studies
- Better description of individual studies