# Analysis of data after cleaning/normalization

## Setup

In [None]:
import itertools
import math

import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Naming/parsing conventions used by the cells below.
METADATA_FILE = 'gwas_trait_metadata.csv'   # CSV with at least a 'Trait' column
CLEANED_FILE_SUFFIX = '_cleaned.csv'        # appended to a trait name to get its data file
UNKNOWN_GENE = 'UNKNOWN'                    # placeholder value in the 'gene' column
CHILD_TRAIT_DELIMITER = ';'                 # separator inside the 'Child traits' column

# The 'Trait' column lists the traits analysed throughout the notebook.
metadata_df = pd.read_csv(METADATA_FILE)
all_traits = metadata_df.loc[:, 'Trait'].tolist()
print(all_traits)

In [None]:
def trait_to_cleaned_filename(trait):
    """Return the filename of the cleaned CSV for `trait`.

    Every space in the trait name becomes an underscore, and
    CLEANED_FILE_SUFFIX is appended to the result.
    """
    stem = "_".join(trait.split(" "))
    return f"{stem}{CLEANED_FILE_SUFFIX}"


# Load the cleaned CSV of every trait, keyed by the trait name from the metadata.
trait_to_df = {
    name: pd.read_csv(path)
    for name, path in zip(all_traits, map(trait_to_cleaned_filename, all_traits))
}

In [None]:
# Preview the first five rows of one trait's frame to check the loaded columns.
trait_to_df['schizophrenia'].head(n=5)

## Comparing summary stats of data for all traits

In [None]:
# Build one summary record per trait, then assemble them into a single frame.
trait_summaries = []
for trait_name in all_traits:
    frame = trait_to_df[trait_name]

    # Shorten the one long trait name so plot labels stay readable.
    if trait_name == 'attention deficit hyperactivity disorder':
        label = 'ADHD'
    else:
        label = trait_name

    # NOTE: this writes a column into the frame stored in trait_to_df; the
    # later concat/boxplot cells read 'parent_trait' from those same frames.
    frame['parent_trait'] = label

    gene_column = frame['gene']
    p_values = frame['p_value']
    trait_summaries.append({
        'parent_trait': label,
        'num_variants': len(frame),
        # Counts every distinct value in 'gene', including the UNKNOWN placeholder.
        'num_unique_genes': len(gene_column.unique()),
        'num_unknown_genes': int((gene_column == UNKNOWN_GENE).sum()),
        'min_pval': p_values.min(),
        'max_pval': p_values.max(),
    })

summary_df = pd.DataFrame(trait_summaries)
summary_df

In [None]:
# Horizontal bars: traits run along the vertical axis, counts along the horizontal one.
# The axis labels are set on the returned Axes instead of through `xlabel=`,
# because which axis `xlabel` labels in a barh plot differs between pandas versions.
ax = summary_df.plot(kind='barh', title='# entries per trait',
                     x='parent_trait', y='num_variants')
ax.set_xlabel('# entries')
ax.set_ylabel('Trait');

In [None]:
# Horizontal bars: traits run along the vertical axis, counts along the horizontal one.
# The axis labels are set on the returned Axes instead of through `xlabel=`,
# because which axis `xlabel` labels in a barh plot differs between pandas versions.
ax = summary_df.plot(kind='barh', title='# unique genes per trait',
                     x='parent_trait', y='num_unique_genes')
ax.set_xlabel('# unique genes')
ax.set_ylabel('Trait');

In [None]:
# Stack every per-trait frame into one long frame for cross-trait plots.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# frame's own row labels are kept, so the combined index contains duplicates.
# The 'parent_trait' column used downstream is expected to already be present
# on each frame (it is added by the summary cell above).
all_df = pd.concat(list(trait_to_df.values()), ignore_index=True)
all_df.head()

In [None]:
# One box per trait showing the spread of its p-values.
ax = sns.boxplot(data=all_df, x='parent_trait', y='p_value')
# Trait names are long, so turn the tick labels vertical to avoid overlap.
ax.tick_params(axis='x', labelrotation=90)
ax.set(title='P-value ranges per trait')

The IQRs look roughly the same across traits, with two exceptions: personality disorder, and schizophrenia, which has a much lower IQR and many outliers. Filtering out data points with p-values > 6e-6 may remove most of the outliers. Personality disorder may also need to be excluded; based on the previous plots, it appears to have very little coverage.

## Comparing gene & variant overlap

Do a pairwise comparison to see which traits share implicated genes.

In [None]:
# Map each parent trait to the set of known genes implicated in it.
trait_to_genes = {}
for trait in all_traits:
    # dropna() keeps a missing gene value from counting as a shared "gene".
    genes = set(trait_to_df[trait]['gene'].dropna().unique())
    # discard() instead of remove(): a trait without any UNKNOWN rows must not
    # raise KeyError.
    genes.discard(UNKNOWN_GENE)
    trait_to_genes[trait] = genes

# combinations() visits every unordered pair exactly once, so an overlap is
# reported a single time rather than once as (A, B) and again as (B, A).
for trait_a, trait_b in itertools.combinations(trait_to_genes, 2):
    overlapping_genes = trait_to_genes[trait_a] & trait_to_genes[trait_b]
    if overlapping_genes:
        print(f'{trait_a} and {trait_b} have {len(overlapping_genes)} overlapping genes.')

Do a similar check, but between the child traits of each parent trait.

In [None]:
# For every parent trait, compare the gene sets of its child traits pairwise.
for parent_trait in all_traits:
    parent_df = trait_to_df[parent_trait]

    # Pull the 'Child traits' cell for this parent. dropna() comes first so a
    # missing value is skipped instead of being parsed as a child trait
    # literally named "nan".
    child_trait_entries = metadata_df.loc[
        metadata_df['Trait'] == parent_trait, 'Child traits'
    ].dropna()
    if child_trait_entries.empty:
        continue

    # Normalise the delimited list: trim, lower-case, and drop empty pieces
    # left by stray or trailing delimiters.
    raw_child_traits = str(child_trait_entries.iloc[0]).split(CHILD_TRAIT_DELIMITER)
    child_traits = [c_trait.strip().lower() for c_trait in raw_child_traits if c_trait.strip()]

    # A separate name is used here so the parent-level `trait_to_genes`
    # mapping built in the previous cell is not overwritten.
    child_trait_to_genes = {}
    for child_trait in child_traits:
        # Matches on exact equality with the lower-cased name, so this assumes
        # the 'trait' column is already lower-case -- TODO confirm.
        child_trait_df = parent_df.loc[parent_df['trait'] == child_trait]
        child_trait_genes = set(child_trait_df['gene'].unique())
        child_trait_genes.discard(UNKNOWN_GENE)
        child_trait_to_genes[child_trait] = child_trait_genes

    # Each unordered pair of child traits is reported once.
    for child_trait_a, child_trait_b in itertools.combinations(child_trait_to_genes, 2):
        overlapping_genes = child_trait_to_genes[child_trait_a] & child_trait_to_genes[child_trait_b]
        if overlapping_genes:
            print(f'{child_trait_a} and {child_trait_b} have {len(overlapping_genes)} overlapping genes.')

Finally, check whether any variants are implicated in multiple (parent) traits.

In [None]:
# Map each parent trait to the set of distinct 'variant_and_allele' values in its frame.
trait_to_variants = {
    trait: set(trait_to_df[trait]['variant_and_allele'].unique())
    for trait in all_traits
}

# combinations() visits every unordered pair exactly once, so an overlap is
# reported a single time rather than once as (A, B) and again as (B, A).
for trait_a, trait_b in itertools.combinations(trait_to_variants, 2):
    overlapping_variants = trait_to_variants[trait_a] & trait_to_variants[trait_b]
    if overlapping_variants:
        print(f'{trait_a} and {trait_b} have {len(overlapping_variants)} overlapping variants.')

Sanity-check a few of those.

In [None]:
# Spot-check: list the actual variants shared by ADHD and autism spectrum disorder.
adhd_variants = trait_to_variants['attention deficit hyperactivity disorder']
autism_variants = trait_to_variants['autism spectrum disorder']
overlapping_variants = adhd_variants & autism_variants
print(overlapping_variants)

In [None]:
# Spot-check: list the actual variants shared by unipolar depression and anxiety disorder.
depression_variants = trait_to_variants['unipolar depression']
anxiety_variants = trait_to_variants['anxiety disorder']
overlapping_variants = depression_variants & anxiety_variants
print(overlapping_variants)