# Phenome-Wide analysis on TOPMed studies

In [None]:
import json
from pprint import pprint

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Pandas DataFrame display options
# Use the fully-qualified option key: abbreviated patterns such as "max.rows" are
# resolved by regex matching and raise OptionError as soon as more than one
# registered option matches (e.g. "display.max_rows" and "styler.render.max_rows").
pd.set_option("display.max_rows", 435)

# Matplotlib display parameters: 14x8 inch figures, bold 12pt font
fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 14
fig_size[1] = 8
plt.rcParams["figure.figsize"] = fig_size
font = {'weight': 'bold',
        'size': 12}
plt.rc('font', **font)

# Phenome-wide analysis using Harmonized Variables

In [None]:
harmonized = pd.read_csv("studies_stats/harmonized_details_stats.csv", index_col=0)

In [None]:
harmonized.sort_values("unique values", ascending=True)

### Using the following harmonized variables:
- Sex (\\DCC Harmonized data set\\01 - Demographics\\Subject sex  as recorded by the study.\\)
- Regular smoker status (\\DCC Harmonized data set\\03 - Baseline common covariates\\Indicates whether subject ever regularly smoked cigarettes.\\)
- Hypertensive medication usage (\\DCC Harmonized data set\\05 - Blood pressure\\Indicator for use of antihypertensive medication at the time of blood pressure measurement.\\)

# Included studies

- Genetic Epidemiology of COPD (COPDGene)
- Genetic Epidemiology Network of Arteriopathy (GENOA)
- NHLBI TOPMed: Genetics of Cardiometabolic Health in the Amish
- Genome-wide Association Study of Adiposity in Samoans
- Genetics of Lipid Lowering Drugs and Diet Network (GOLDN) Lipidomics Study
- Heart and Vascular Health Study (HVH)

## Variable information

- Quality checking:
    - Discarding the following variables:
        - Identifiers
        - Only unique values / only null values
- Variable type conversion to categorical
- Counting non-null values to select studies

In [None]:
# Per-study summary statistics; the CSV has a two-row column header and the
# study name in its first column. Exact duplicate rows are removed.
studies_info = pd.read_csv(
    "studies_stats/studies_stats.csv",
    index_col=0,
    header=[0, 1],
)
studies_info = studies_info.drop_duplicates()

In [None]:
studies_info

In [None]:
# Full names of the studies included in the analysis; they are used as row
# labels to select from `studies_info`.
study_names = [
    'Genetic Epidemiology of COPD (COPDGene)',
    'Genetic Epidemiology Network of Arteriopathy (GENOA)',
    'NHLBI TOPMed: Genetics of Cardiometabolic Health in the Amish',
    'Genome-wide Association Study of Adiposity in Samoans',
    'Genetics of Lipid Lowering Drugs and Diet Network (GOLDN) Lipidomics Study',
    'Heart and Vascular Health Study (HVH)'
]

In [None]:
# Show only the included studies, ranked from the highest to the lowest
# mean non-null value count per variable.
(
    studies_info.loc[study_names]
    .sort_values(
        by=("Number variables with non-null values", "Mean non-null value count per variable"),
        axis=0,
        ascending=False,
    )
)

# Gender

In [None]:
# Variable dictionary: the first 13 CSV columns form the hierarchical row index.
multiIndex_variablesDict = pd.read_csv(
    "multiIndex_variablesDict.csv",
    index_col=list(range(13)),
    low_memory=False,
)

## P-values

In [None]:
df_pvalues.shape

In [None]:
df_pvalues = pd.read_csv("df_pvalues.csv", index_col=0)

In [None]:
df_pvalues.head()

In [None]:
adjusted_alpha = 0.05/len(df_pvalues["pvalue"])

In [None]:
df_pvalues.loc[df_pvalues["pvalue"] < adjusted_alpha,].shape

## Manhattan plot

In [None]:
def manhattan_plot(df_pvalues,
                   threshold_group_cat=5,
                   title_plot="Statistical Association Between Exposition Status and Phenotypes"):
    """Draw a Manhattan-style plot of -log10(p-value) for every row of ``df_pvalues``.

    Rows are grouped by their index label; each group gets its own colour and
    its own x-axis tick placed in the middle of the group. Points whose
    -log10(p-value) is strictly above the sixth-largest value are annotated
    with their ``simplified_varName``. A dotted horizontal line marks the
    Bonferroni-corrected threshold (0.05 divided by the number of rows).

    The input frame is not modified: helper columns are added to an internal
    copy only.

    Parameters
    ----------
    df_pvalues : pandas.DataFrame
        Must contain a ``pvalue`` column and a ``simplified_varName`` column.
        The index labels define the groups displayed on the x axis.
    threshold_group_cat : int, default 5
        Groups with fewer rows than this value are merged into one group
        labelled "Other". Passing 0 disables the merging.
    title_plot : str
        Title displayed above the plot.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Work on a copy so the helper columns never leak into the caller's frame.
    df = df_pvalues.copy()

    # Bonferroni correction: significance level divided by the number of tests.
    adjusted_alpha = 0.05 / len(df["pvalue"])

    # A p-value that underflowed to exactly 0 would give an infinite -log10 and
    # make the y-axis limits invalid, so clamp to the smallest positive float.
    smallest_positive = np.finfo(float).tiny
    df["log_p"] = -np.log10(df["pvalue"].clip(lower=smallest_positive))

    # The index label is the plotting group; rare groups are pooled into "Other".
    df["group"] = df.index
    group_counts = df["group"].value_counts()
    small_groups = group_counts[group_counts < threshold_group_cat].index
    df.loc[df["group"].isin(small_groups), "group"] = "Other"
    df = df.sort_values(by="group", axis=0)

    # Short labels for the x axis when the groups are full study names.
    dic_renaming = {
        'Genetic Epidemiology of COPD (COPDGene)': 'COPDGene',
        'Genetic Epidemiology Network of Arteriopathy (GENOA)': 'GENOA',
        'NHLBI TOPMed: Genetics of Cardiometabolic Health in the Amish': 'Genetics',
        'Genome-wide Association Study of Adiposity in Samoans': 'GEWAS Samoans',
        'Genetics of Lipid Lowering Drugs and Diet Network (GOLDN) Lipidomics Study': 'GOLDN',
        'Heart and Vascular Health Study (HVH)': 'HVH',
    }
    df["group"] = df["group"].replace(dic_renaming)

    # Leave 50 units of headroom above the highest point.
    y_lims = (0, df["log_p"].max(skipna=True) + 50)
    # Only points strictly above the sixth-largest value get a text label.
    threshold_top_values = df["log_p"].sort_values(ascending=False).iloc[:6].iloc[-1]

    # Consecutive x positions, so that each (sorted) group occupies one contiguous band.
    df["ind"] = np.arange(1, len(df) + 1)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    palette = plt.get_cmap('Set1').colors
    x_labels = []
    x_labels_pos = []

    for num, (name, group) in enumerate(df.groupby("group")):
        # Cycle through the palette when there are more groups than colours.
        group.plot(kind='scatter', x='ind', y='log_p',
                   color=palette[num % len(palette)], ax=ax, s=20)
        x_labels.append(name)
        # Tick in the middle of the band covered by the group.
        first_pos = group['ind'].iloc[0]
        last_pos = group['ind'].iloc[-1]
        x_labels_pos.append(last_pos - (last_pos - first_pos) / 2)

        top_rows = group[group["log_p"] > threshold_top_values]
        for _, row in top_rows.iterrows():
            ax.text(row['ind'] + 3, row["log_p"] + 0.05, row["simplified_varName"],
                    rotation=0, alpha=1, size=8, color="black")

    ax.set_xticks(x_labels_pos)
    ax.set_xticklabels(x_labels)
    ax.set_xlim([0, len(df) + 1])
    ax.set_ylim(y_lims)
    ax.set_ylabel('-log(p-values)', style="italic")
    ax.set_xlabel('Phenotypes', fontsize=15)
    ax.axhline(y=-np.log10(adjusted_alpha), linestyle=":", color="black",
               label="Bonferroni Adjusted Threshold")
    plt.xticks(fontsize=9, rotation=30)
    plt.yticks(fontsize=8)
    plt.title(title_plot,
              loc="left",
              style="oblique",
              fontsize=20,
              y=1)
    # Only the threshold line carries a label, so it is the single legend entry.
    ax.legend(loc="upper left")
    plt.show()

In [None]:
manhattan_plot(df_pvalues, title_plot="Association Between Sex and Phenotypes")

## COPDGene

In [None]:
study_name = 'Genetic Epidemiology of COPD (COPDGene)'

In [None]:
# P-values of the selected study only
study_df = df_pvalues.loc[study_name, :].copy()

# Lookup table variable name -> grouping, where the grouping is taken from
# level 1 of the variable dictionary index (after selecting the study).
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Attach the grouping to every tested variable and make it the index,
# since manhattan_plot derives its groups from the index.
study_df = study_df.join(study_groups, on="variable").set_index("grouping", drop=True)

In [None]:
manhattan_plot(study_df)

## Genetic Epidemiology Network of Arteriopathy (GENOA)

In [None]:
study_name = 'Genetic Epidemiology Network of Arteriopathy (GENOA)'

In [None]:
# P-values of the selected study only
study_df = df_pvalues.loc[study_name, :].copy()

# Lookup table variable name -> grouping, where the grouping is taken from
# level 1 of the variable dictionary index (after selecting the study).
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Attach the grouping to every tested variable and make it the index,
# since manhattan_plot derives its groups from the index.
study_df = study_df.join(study_groups, on="variable").set_index("grouping", drop=True)

In [None]:
manhattan_plot(study_df)

## NHLBI TOPMed: Genetics of Cardiometabolic Health in the Amish

In [None]:
study_name = 'NHLBI TOPMed: Genetics of Cardiometabolic Health in the Amish'

In [None]:
# P-values of the selected study only
study_df = df_pvalues.loc[study_name, :].copy()

# Lookup table variable name -> grouping, where the grouping is taken from
# level 0 of the variable dictionary index (after selecting the study).
# NOTE(review): the other studies use level 1 here; presumably this study's
# variable hierarchy is shallower, confirm that level 0 is intended.
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=0, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Attach the grouping to every tested variable and make it the index,
# since manhattan_plot derives its groups from the index.
study_df = study_df.join(study_groups, on="variable").set_index("grouping", drop=True)

In [None]:
manhattan_plot(study_df, threshold_group_cat=0)

## Genome-wide Association Study of Adiposity in Samoans

In [None]:
study_name = 'Genome-wide Association Study of Adiposity in Samoans'

In [None]:
# P-values of the selected study only
study_df = df_pvalues.loc[study_name, :].copy()

# Lookup table variable name -> grouping, where the grouping is taken from
# level 1 of the variable dictionary index (after selecting the study).
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Attach the grouping to every tested variable and make it the index,
# since manhattan_plot derives its groups from the index.
study_df = study_df.join(study_groups, on="variable").set_index("grouping", drop=True)

In [None]:
manhattan_plot(study_df)

## Genetics of Lipid Lowering Drugs and Diet Network (GOLDN) Lipidomics Study

In [None]:
study_name = 'Genetics of Lipid Lowering Drugs and Diet Network (GOLDN) Lipidomics Study'

In [None]:
# P-values of the selected study only
study_df = df_pvalues.loc[study_name, :].copy()

# Lookup table variable name -> grouping, where the grouping is taken from
# level 1 of the variable dictionary index (after selecting the study).
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Attach the grouping to every tested variable and make it the index,
# since manhattan_plot derives its groups from the index.
study_df = study_df.join(study_groups, on="variable").set_index("grouping", drop=True)

In [None]:
manhattan_plot(study_df)

## Heart and Vascular Health Study (HVH)

In [None]:
study_name = 'Heart and Vascular Health Study (HVH)'

In [None]:
# P-values of the selected study only
study_df = df_pvalues.loc[study_name, :].copy()

# Lookup table variable name -> grouping, where the grouping is taken from
# level 0 of the variable dictionary index (after selecting the study).
# NOTE(review): most other studies use level 1 here; presumably this study's
# variable hierarchy is shallower, confirm that level 0 is intended.
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=0, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Attach the grouping to every tested variable and make it the index,
# since manhattan_plot derives its groups from the index.
study_df = study_df.join(study_groups, on="variable").set_index("grouping", drop=True)

In [None]:
manhattan_plot(study_df)

# Smoking status

In [None]:
# P-values indexed by (study, dependent variable) in the CSV; keep the study as
# the only index level and expose the dependent variable as a regular column.
df_pvalues_bis = (
    pd.read_csv("df_pvalues_bis.csv", index_col=[0, 1])
    .rename_axis(index=["study", "dependent_var"])
    .reset_index(level="dependent_var", drop=False)
)

In [None]:
# Keep only the tests whose dependent variable is the harmonized smoking status.
mask_smoking = df_pvalues_bis["dependent_var"].eq(
    "\\DCC Harmonized data set\\03 - Baseline common covariates\\Indicates whether subject ever regularly smoked cigarettes.\\"
)
smoking_df = df_pvalues_bis[mask_smoking].copy()

In [None]:
manhattan_plot(smoking_df)

In [None]:
# Reload the p-values indexed by (study, dependent variable); keep the study as
# the only index level and expose the dependent variable as a regular column.
df_pvalues_bis = (
    pd.read_csv("df_pvalues_bis.csv", index_col=[0, 1])
    .rename_axis(index=["study", "dependent_var"])
    .reset_index(level="dependent_var", drop=False)
)

In [None]:
# Keep only the tests whose dependent variable is the harmonized smoking status.
mask_smoking = df_pvalues_bis["dependent_var"].eq(
    "\\DCC Harmonized data set\\03 - Baseline common covariates\\Indicates whether subject ever regularly smoked cigarettes.\\"
)
smoking_df = df_pvalues_bis[mask_smoking].copy()

### COPDGene

In [None]:
study_name = 'Genetic Epidemiology of COPD (COPDGene)'

In [None]:
# Smoking-status p-values of the selected study only
study_df = smoking_df.loc[study_name, :].copy()

# Lookup table variable name -> grouping, where the grouping is taken from
# level 1 of the variable dictionary index (after selecting the study).
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Attach the grouping to every tested variable and make it the index,
# since manhattan_plot derives its groups from the index.
study_df = study_df.join(study_groups, on="variable").set_index("grouping", drop=True)

In [None]:
manhattan_plot(study_df)

## Genetic Epidemiology Network of Arteriopathy (GENOA)

In [None]:
study_name = 'Genetic Epidemiology Network of Arteriopathy (GENOA)'

In [None]:
# Smoking-status p-values of the selected study only
study_df = smoking_df.loc[study_name, :].copy()

# Lookup table variable name -> grouping, where the grouping is taken from
# level 1 of the variable dictionary index (after selecting the study).
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Attach the grouping to every tested variable and make it the index,
# since manhattan_plot derives its groups from the index.
study_df = study_df.join(study_groups, on="variable").set_index("grouping", drop=True)

In [None]:
manhattan_plot(study_df)

## Genome-wide Association Study of Adiposity in Samoans

In [None]:
study_name = 'Genome-wide Association Study of Adiposity in Samoans'

In [None]:
# Smoking-status p-values of the selected study only
study_df = smoking_df.loc[study_name, :].copy()

# Lookup table variable name -> grouping, where the grouping is taken from
# level 1 of the variable dictionary index (after selecting the study).
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Attach the grouping to every tested variable and make it the index,
# since manhattan_plot derives its groups from the index.
study_df = study_df.join(study_groups, on="variable").set_index("grouping", drop=True)

In [None]:
manhattan_plot(study_df)

# Antihypertensive status

In [None]:
# Keep only the tests whose dependent variable is the harmonized
# antihypertensive-medication indicator.
mask_aht_medication = df_pvalues_bis["dependent_var"].eq(
    "\\DCC Harmonized data set\\05 - Blood pressure\\Indicator for use of antihypertensive medication at the time of blood pressure measurement.\\"
)
aht_medication = df_pvalues_bis[mask_aht_medication].copy()

In [None]:
manhattan_plot(aht_medication)

## Genetic Epidemiology Network of Arteriopathy (GENOA)

In [None]:
study_name = 'Genetic Epidemiology Network of Arteriopathy (GENOA)'

In [None]:
# Antihypertensive-medication p-values of the selected study only
study_df = aht_medication.loc[study_name, :].copy()

# Lookup table variable name -> grouping, where the grouping is taken from
# level 1 of the variable dictionary index (after selecting the study).
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Attach the grouping to every tested variable and make it the index,
# since manhattan_plot derives its groups from the index.
study_df = study_df.join(study_groups, on="variable").set_index("grouping", drop=True)

In [None]:
manhattan_plot(study_df, title_plot="Association with Hypertension Medication")

## Genome-wide Association Study of Adiposity in Samoans

In [None]:
study_name = 'Genome-wide Association Study of Adiposity in Samoans'

In [None]:
# Antihypertensive-medication p-values of the selected study only
study_df = aht_medication.loc[study_name, :].copy()

# Lookup table variable name -> grouping, where the grouping is taken from
# level 1 of the variable dictionary index (after selecting the study).
study_groups = (
    multiIndex_variablesDict.loc[study_name, "varName"]
    .reset_index(level=1, drop=False)
    .reset_index(drop=True)
    .set_index("varName")
)
study_groups.columns = ["grouping"]

# Attach the grouping to every tested variable and make it the index,
# since manhattan_plot derives its groups from the index.
study_df = study_df.join(study_groups, on="variable").set_index("grouping", drop=True)

In [None]:
manhattan_plot(study_df, title_plot="Association with Hypertension Medication")

## To be improved

- Regex on variable names to derive categories (for example, every variable name containing "stenosis" or "blood pressure"); possibly leverage ontologies
- Odds Ratio
- Defining categories in a better way 
- PheWAS using different variants
- Grouping subcategories across studies
- Integrating all studies
- Better description of individual studies