In [None]:
import pandas as pd
import numpy as np
import math

# Exploratory analysis of CSV data retrieved from GWAS Catalog.
This specific data is for Schizophrenia, but other data in GWAS catalog can be retrieved in the same format.

## Exploring raw data types, range of data, etc.

### Overall

In [None]:
# Load the raw GWAS Catalog CSV export (path is relative to the working
# directory) and preview the first rows.
raw_schizo_df = pd.read_csv('schizophrenia_gwas_catalog_2022.csv')
raw_schizo_df.head()

In [None]:
# Size of the raw table; later cells report their counts against this total.
total_rows = raw_schizo_df.shape[0]
print(total_rows)

### Variants

In [None]:
# Number of distinct variant/risk-allele labels (a missing label counts once).
num_unique_variants = raw_schizo_df['Variant and risk allele'].nunique(dropna=False)
print(f"{num_unique_variants} unique variants out of {total_rows} records.")

# Record count per variant label.
variant_counts = (
    raw_schizo_df['Variant and risk allele']
    .value_counts()
)

In [None]:
# Collect every record whose variant label occurs more than once, then show
# all records for the first such label to see how repeated entries differ.
variant_labels = raw_schizo_df['Variant and risk allele']
is_repeated = variant_labels.notna() & variant_labels.duplicated(keep=False)
duplicates = raw_schizo_df[is_repeated]
one_variant = duplicates['Variant and risk allele'].iloc[0]
duplicates.loc[duplicates['Variant and risk allele'] == one_variant]

### P-values

In [None]:
# Summary of the raw 'P-value' column, before any normalization.
raw_schizo_df['P-value'].describe()

In [None]:
# Number of distinct raw 'Mapped gene' values (a missing value counts once).
raw_schizo_df['Mapped gene'].nunique(dropna=False)

### Genes

In [None]:
def has_multiple_genes(mapped_gene):
  """Return True when a 'Mapped gene' value lists more than one gene.

  Multi-gene values are comma-separated (e.g. "SLAMF1, SETP9"). Non-string
  values, such as the NaN pandas uses for an empty cell, are reported as not
  listing multiple genes instead of raising a TypeError.

  Args:
    mapped_gene: A single cell value from the 'Mapped gene' column.

  Returns:
    True if the value is a string containing a comma, otherwise False.
  """
  return isinstance(mapped_gene, str) and "," in mapped_gene


# Flag the records whose mapped-gene value lists several genes, then count them.
multi_gene_index = raw_schizo_df['Mapped gene'].map(has_multiple_genes)
int(multi_gene_index.sum())

### Reported trait / Trait(s)

In [None]:
# Distinct values of the 'Reported trait' column.
raw_schizo_df['Reported trait'].unique()

In [None]:
# Distinct values of the 'Trait(s)' column.
raw_schizo_df['Trait(s)'].unique()

In [None]:
# Count the records whose trait annotation is exactly 'schizophrenia'.
is_just_schizo = raw_schizo_df['Trait(s)'] == 'schizophrenia'
num_just_schizo = int(is_just_schizo.sum())
print(f"{num_just_schizo} / {total_rows} rows are for the trait schizophrenia only.")

### Initial observations:


*   3849 records total
*   P-values are currently objects/strings
*   A lot of genes - 1427 unique values, although some normalization seems to be required (e.g. to fix "SLAMF1, SETP9"). After normalizing, it may be good to analyze counts per gene - maybe genes only implicated once are less significant than others which appear multiple times.
*   Many records have multiple traits in addition to schizophrenia (e.g. one trait value is "anorexia nervosa, obsessive-compulsive disorder, attention deficit hyperactivity disorder, Tourette syndrome, unipolar depression, autism spectrum disorder, schizophrenia, bipolar disorder"). I assume these studies examined patients with any of the listed conditions, but it's not entirely clear without checking the studies themselves. To make this a scalable approach, it may be best to omit records that are for more than just schizophrenia to avoid any potential biases in the future similarity analysis.
* A fair number of the variants in the dataset appear multiple times (e.g. reported by different studies). It's worth noting this, although at the moment it's unclear what the best way to handle it is. Maybe subsequent analysis should only focus on variants identified multiple times; maybe for each repeated variant, only the lowest p-value should be retained. However, some care should be applied given the above point about traits (we may want the lowest p-value among records for just the trait schizophrenia).



## Cleaning/normalizing data

In [None]:
# Normalized columns go on an independent deep copy, so the raw frame stays
# available unchanged.
schizo_df = raw_schizo_df.copy(deep=True)

### P-values

In [None]:
def pval_to_num(pval):
  """Convert a GWAS Catalog p-value string to a float.

  Args:
    pval: P-value text, either in the "<mantissa> x 10-<exponent>" form
      (e.g. "2 x 10-7") or as a plain number (e.g. "0.05").

  Returns:
    The p-value as a float.

  Raises:
    ValueError: If the text cannot be parsed as a number in either form.
  """
  mantissa, marker, exponent = pval.strip().partition(" x 10-")
  if not marker:
    # No power-of-ten suffix: the text is already a plain number.
    return float(mantissa)
  try:
    # Parse the value in one step (e.g. "2e-7") so the result is the float
    # closest to the written number, rather than the product of two
    # separately rounded floats.
    return float(f"{mantissa.strip()}e-{int(exponent)}")
  except ValueError:
    # Mantissa/exponent text that does not fit the compact form (e.g. a
    # fractional exponent): fall back to explicit multiplication.
    return float(mantissa) * pow(10, -float(exponent))


print(pval_to_num("2 x 10-7"))

In [None]:
# Parse each raw p-value string into a float and store it as a new column.
schizo_df['P-value_norm'] = raw_schizo_df['P-value'].map(pval_to_num)

In [None]:
# Summary statistics of the parsed, numeric p-values.
schizo_df['P-value_norm'].describe()

### Traits

In [None]:
# As noted in the initial observations, it may be best to keep only records
# whose trait annotation is exactly the trait of interest (schizophrenia).
# A few other annotations would probably be fine to include (e.g. treatment
# refractory schizophrenia), but for simplicity and generalizability we assume
# one canonical GWAS catalog trait per condition being analyzed.
canonical_trait = 'schizophrenia'
is_canonical = schizo_df['Trait(s)'].eq(canonical_trait)
filtered_df = schizo_df.loc[is_canonical]
print(f"Filtered from {len(schizo_df)} rows to {len(filtered_df)} rows.")

In [None]:
# The majority of the data is retained, so we'll use just this subset.
# From here on schizo_df refers to the trait-filtered frame.
schizo_df = filtered_df

### Variants

In [None]:
# Before multi-gene associations are split into separate rows, verify that
# every variant appearing more than once reports the same mapped gene value on
# all of its records. If that holds, keeping only the lowest-p-value record per
# variant loses nothing (any record would do, since subsequent analysis only
# uses the variant ID and the implicated genes).
variant_labels = schizo_df['Variant and risk allele']
duplicate_variants = variant_labels[
    variant_labels.notna() & variant_labels.duplicated(keep=False)
].unique()

all_good = True
for variant in duplicate_variants:
  all_mapped_genes = schizo_df.loc[variant_labels == variant, 'Mapped gene'].unique()
  if len(all_mapped_genes) <= 1:
    continue
  print(f"Found variant, {variant}, with differing mapped gene values.")
  all_good = False


if all_good:
  print("No repeated variants with differing mapped gene values.")

In [None]:
# Keep, for each variant, only the record with the lowest p-value.
# This step may be worth revisiting to retain the duplicates instead: focusing
# on associations found in multiple independent studies might lead to better
# results in the subsequent analysis.
pvalues_by_variant = schizo_df.groupby('Variant and risk allele')['P-value_norm']
min_indices = pvalues_by_variant.idxmin()
schizo_df = schizo_df.loc[min_indices]

In [None]:
# After de-duplication the two counts printed below should match.
num_total = schizo_df.shape[0]
num_unique_variants = schizo_df['Variant and risk allele'].nunique(dropna=False)
print(f"{num_unique_variants} unique variants of {num_total} records")

### Genes

In [None]:
# Genes are comma-separated so `explode` can be used to create a new row for
# each gene (with all other columns identical).
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.explode.html
# The gene lists are built from schizo_df's own 'Mapped gene' column rather
# than the unfiltered raw frame, so nothing depends on index alignment between
# the two. `assign` returns a new frame instead of writing a column into a
# frame that came out of earlier filtering.
schizo_df = schizo_df.assign(gene_norm=schizo_df['Mapped gene'].str.split(", "))
exploded_schizo_df = schizo_df.explode('gene_norm')
len(exploded_schizo_df)

In [None]:
# Distribution of genes per entry; used to work out how many rows the exploded
# frame is expected to have.
schizo_df['gene_norm'].map(len).value_counts()

In [None]:
# Each listed gene should have become exactly one exploded row, so the
# exploded length must equal the total number of listed genes. The expectation
# is computed from the data so the check stays valid for other GWAS Catalog
# exports. (For the 2022 schizophrenia export this was 1444 entries with one
# gene + 2 * 377 entries with two + 3 * 1 entries with three.)
expected_rows = int(schizo_df['gene_norm'].map(len).sum())
assert len(exploded_schizo_df) == expected_rows, (
    f"explode produced {len(exploded_schizo_df)} rows, expected {expected_rows}")

In [None]:
# Sanity-check passes so set schizo_df to the exploded version.
# From here on 'gene_norm' holds a single gene string per row, not a list.
schizo_df = exploded_schizo_df

In [None]:
# Number of rows per gene after the split.
schizo_df['gene_norm'].value_counts()

In [None]:
# 491 / 4764 entries have "'-" for their gene; I'm assuming this indicates an 
# unknown/unconfirmed gene association.
UNKNOWN_GENE = "UNKNOWN"

def replace_unknown_gene(gene):
  """Map the "'-" placeholder to UNKNOWN_GENE; return any other value as is."""
  if gene == "'-":
    return UNKNOWN_GENE
  return gene


# Swap the "'-" placeholder for the explicit unknown marker, then re-check the
# per-gene counts.
schizo_df['gene_norm'] = schizo_df['gene_norm'].map(replace_unknown_gene)
schizo_df['gene_norm'].value_counts()

## Output

Finally, write out the normalized version of the data for use
in further analysis.

In [None]:
# Preview the cleaned frame before selecting the output columns.
schizo_df.head()

In [None]:
# Output only the relevant, normalized columns for brevity, renamed to
# snake_case. Add entries to the mapping later if more columns prove useful.
column_remapping = {
    'Variant and risk allele': 'variant_and_allele',
    'P-value_norm': 'p_value',
    'Trait(s)': 'trait',
    'gene_norm': 'gene',
}
out_df = schizo_df[list(column_remapping)].rename(columns=column_remapping)
out_df.head()

In [None]:
# Write the cleaned table to CSV in the working directory.
# NOTE(review): `index` is left at its default, so the frame's index is written
# as an unnamed first column, and its labels repeat for rows created by
# `explode` - confirm downstream readers expect that.
out_df.to_csv('schizophrenia_gwas_catalog_2022_cleaned.csv')