In [2]:
import pandas as pd
import numpy as np
import math

# Exploratory analysis of CSV data retrieved from GWAS Catalog.
This specific data is for Schizophrenia, but other data in GWAS catalog can be retrieved in the same format.

## Exploring raw data types, range of data, etc.

### Overall

In [3]:
# Load the GWAS Catalog export into a DataFrame and preview the first rows.
raw_schizo_df = pd.read_csv(filepath_or_buffer='schizophrenia_gwas_catalog_2022.csv')
raw_schizo_df.head(n=5)

Unnamed: 0,Variant and risk allele,P-value,P-value annotation,RAF,OR,Beta,CI,Mapped gene,Reported trait,Trait(s),Background trait(s),Study accession,Location
0,rs11265461-<b>C</b>,2 x 10-7,,0.41,1.45,'-,[1.26-1.67],"SLAMF1, SETP9",Schizophrenia (treatment resistant),treatment refractory schizophrenia,'-,GCST001458,1:160660353
1,rs230529-<b>T</b>,2 x 10-7,,0.47,1.45,'-,[1.26-1.66],NFKB1,Schizophrenia (treatment resistant),treatment refractory schizophrenia,'-,GCST001458,4:102536261
2,rs2237457-<b>T</b>,6 x 10-7,(Recessive model),0.36,1.74,'-,[NR],GRB10,Schizophrenia (treatment resistant),treatment refractory schizophrenia,'-,GCST002604,7:50658447
3,rs2269372-<b>A</b>,4 x 10-8,,NR,1.313,'-,[NR],RENBP,Schizophrenia,schizophrenia,'-,GCST002190,X:153942092
4,rs7597593-<b>T</b>,9 x 10-11,,NR,1.066,'-,[1.05-1.09],ZNF804A,Schizophrenia,schizophrenia,'-,GCST004946,2:184668853


In [4]:
# Total number of association records (rows) in the raw export.
total_rows = raw_schizo_df.shape[0]
print(total_rows)

3849


### Variants

In [5]:
# Count distinct variant/allele labels (NaN, if present, counts as a value,
# matching len(.unique())).
num_unique_variants = raw_schizo_df['Variant and risk allele'].nunique(dropna=False)
print(f"{num_unique_variants} unique variants out of {total_rows} records.")

# How many records each variant/allele label has, most frequent first.
variant_counts = raw_schizo_df['Variant and risk allele'].value_counts()

2739 unique variants out of 3849 records.


In [6]:
# Look at every record of one repeated variant to see how the entries differ.
variant_col = 'Variant and risk allele'
# Keep only rows whose variant label occurs more than once.
duplicates = raw_schizo_df.groupby(variant_col).filter(lambda group: group.shape[0] > 1)
# Pick the first repeated variant encountered and show all of its records.
one_variant = duplicates[variant_col].iloc[0]
duplicates.loc[duplicates[variant_col] == one_variant]

Unnamed: 0,Variant and risk allele,P-value,P-value annotation,RAF,OR,Beta,CI,Mapped gene,Reported trait,Trait(s),Background trait(s),Study accession,Location
4,rs7597593-<b>T</b>,9 x 10-11,,NR,1.066,'-,[1.05-1.09],ZNF804A,Schizophrenia,schizophrenia,'-,GCST004946,2:184668853
747,rs7597593-<b>T</b>,2 x 10-11,,NR,1.069,'-,[1.05-1.09],ZNF804A,Schizophrenia,schizophrenia,'-,GCST007201,2:184668853
2878,rs7597593-<b>T</b>,3 x 10-12,,0.62,'-,'-,'-,ZNF804A,Broad depression or schizophrenia,"unipolar depression, schizophrenia",'-,GCST007257,2:184668853
3625,rs7597593-<b>T</b>,8 x 10-6,(5 degree of freedom test),NR,1.055,'-,[1.03-1.08],ZNF804A,"Autism spectrum disorder, attention deficit-hy...","attention deficit hyperactivity disorder, unip...",'-,GCST001877,2:184668853


### P-values

In [7]:
# Summary of the raw (string-typed) P-value column.
raw_schizo_df.loc[:, 'P-value'].describe()

count         3849
unique         163
top       2 x 10-8
freq           201
Name: P-value, dtype: object

In [8]:
# Number of distinct raw 'Mapped gene' strings (multi-gene strings count once each).
raw_schizo_df['Mapped gene'].nunique(dropna=False)

1427

### Genes

In [9]:
def has_multiple_genes(mapped_gene):
  """Return True when a 'Mapped gene' value lists more than one gene.

  Multiple genes are written as a comma-separated string
  (e.g. "SLAMF1, SETP9").

  Args:
    mapped_gene: Raw cell value from the 'Mapped gene' column.

  Returns:
    True if the value is a string containing a comma, otherwise False.
    Non-string values (e.g. the float NaN pandas uses for empty cells) are
    treated as "not multiple genes" instead of raising TypeError.
  """
  return isinstance(mapped_gene, str) and "," in mapped_gene


# Boolean mask marking records whose 'Mapped gene' value lists several genes.
multi_gene_index = raw_schizo_df['Mapped gene'].map(has_multiple_genes)
# Number of such records.
len(raw_schizo_df.loc[multi_gene_index])

913

### Reported trait / Trait(s)

In [10]:
# Distinct 'Reported trait' values, in order of first appearance.
raw_schizo_df['Reported trait'].drop_duplicates().to_numpy()

array(['Schizophrenia (treatment resistant)', 'Schizophrenia',
       'Schizophrenia (MTAG)', 'Schizophrenia or bipolar disorder',
       'Schizophrenia (negative symptoms)', 'Methamphetamine dependence',
       'Early-onset schizophrenia',
       'Autism spectrum disorder or schizophrenia',
       'Gray matter volume (schizophrenia interaction)',
       'Schizophrenia (inflammation and infection response interaction)',
       'Broad depression or schizophrenia',
       'Dentate gyrus volume x schizophrenia interaction',
       'Schizophrenia vs type 2 diabetes',
       'Schizophrenia and type 2 diabetes',
       'Autism and schizophrenia (MTAG)',
       'Left superior temporal gyrus thickness (schizophrenia interaction)',
       'Bipolar disorder and schizophrenia',
       'Schizophrenia (cytomegalovirus infection interaction)',
       'Schizophrenia (age at onset)',
       'Schizophrenia or schizoaffective disorder',
       'Schizophrenia vs autism spectrum disorder (ordinary least s

In [11]:
# Distinct 'Trait(s)' values, in order of first appearance.
raw_schizo_df['Trait(s)'].drop_duplicates().to_numpy()

array(['treatment refractory schizophrenia', 'schizophrenia',
       'autism spectrum disorder, schizophrenia',
       'schizophrenia, grey matter volume measurement',
       'schizophrenia, cytomegalovirus seropositivity',
       'schizophrenia, HSV1 seropositivity',
       'schizophrenia, Toxoplasma gondii seropositivity',
       'unipolar depression, schizophrenia',
       'dentate gyrus volume measurement, schizophrenia',
       'schizophrenia, type 2 diabetes mellitus',
       'schizophrenia, bipolar disorder',
       'schizophrenia, left superior temporal gyrus thickness measurement',
       'schizophrenia, cytomegalovirus infection',
       'schizophrenia, age at onset',
       'schizophrenia, schizoaffective disorder',
       'anorexia nervosa, schizophrenia',
       'attention deficit hyperactivity disorder, schizophrenia',
       'Tourette syndrome, schizophrenia',
       'schizophrenia, sex interaction measurement',
       'schizophrenia, bipolar disorder, response to lithiu

In [12]:
# Count records whose 'Trait(s)' value is exactly 'schizophrenia'.
is_just_schizo = raw_schizo_df['Trait(s)'] == 'schizophrenia'
num_just_schizo = int(is_just_schizo.sum())
print(f"{num_just_schizo} / {total_rows} rows are for the trait schizophrenia only.")

2564 / 3849 rows are for the trait schizophrenia only.


### Initial observations:


*   3849 records total
*   P-values are currently objects/strings
*   A lot of genes - 1427 unique values, although some normalization seems to be required (e.g. to split multi-gene values such as "SLAMF1, SETP9"). After normalizing, it may be good to analyze counts per gene - maybe genes only implicated once are less significant than others which appear multiple times.
*   Many records have multiple traits in addition to schizophrenia (e.g. one trait value is "anorexia nervosa, obsessive-compulsive disorder, attention deficit hyperactivity disorder, Tourette syndrome, unipolar depression, autism spectrum disorder, schizophrenia, bipolar disorder"). I assume these studies examined patients with either condition, but it's not entirely clear without checking the studies themselves. To make this a scalable approach, it may be best to omit records that are for more than just schizophrenia to avoid any potential biases in the future similarity analysis.
* A fair number of the variants in the dataset appear multiple times (e.g. reported by different studies). It's worth noting this, although at the moment it's unclear what the best way to handle it is. Maybe subsequent analysis should only focus on variants identified multiple times; maybe for each repeated variant, only the record with the lowest p-value should be retained. However, some care should be applied given the above point about traits (we may want the lowest p-value among records for just the trait schizophrenia).



## Cleaning/normalizing data

In [37]:
# Create a real copy of the DF to hold normalized data and leave the raw DF
# untouched. A plain assignment would only alias the same object, so columns
# added to schizo_df would also appear on raw_schizo_df.
schizo_df = raw_schizo_df.copy()

### P-values

In [38]:
def pval_to_num(pval):
  """Convert a GWAS Catalog p-value string such as "2 x 10-7" to a float.

  The mantissa and exponent are reassembled into scientific notation
  ("2e-7") and parsed by float() in one step. This yields the correctly
  rounded float, whereas multiplying the mantissa by pow(10, -k) is not
  guaranteed to.

  Args:
    pval: String of the form "<mantissa> x 10-<exponent>", where the exponent
      is a non-negative integer.

  Returns:
    The p-value as a float.

  Raises:
    ValueError: If the string does not contain the " x 10-" separator, or
      the mantissa/exponent cannot be parsed.
  """
  mantissa, separator, exponent = pval.partition(" x 10-")
  if not separator:
    raise ValueError(f"Unrecognized p-value format: {pval!r}")
  return float(f"{mantissa.strip()}e-{exponent.strip()}")


print(pval_to_num("2 x 10-7"))

2e-07


In [39]:
# Numeric version of the raw p-value strings, stored alongside the original column.
schizo_df['P-value_norm'] = raw_schizo_df['P-value'].map(pval_to_num)

In [40]:
# Summary statistics of the numeric p-values.
schizo_df.loc[:, 'P-value_norm'].describe()

count    3.849000e+03
mean     1.072234e-06
std      2.255918e-06
min      2.000000e-44
25%      3.000000e-10
50%      2.000000e-08
75%      6.000000e-07
max      1.000000e-05
Name: P-value_norm, dtype: float64

### Traits

In [41]:
# Restrict the data to records reported for exactly one canonical GWAS Catalog
# trait (here: schizophrenia). Closely related traits (e.g. treatment
# refractory schizophrenia) would probably be fine to keep as well, but a
# single canonical trait per condition keeps the approach simple and
# generalizable to other conditions.
canonical_trait = 'schizophrenia'
is_canonical = schizo_df['Trait(s)'] == canonical_trait
filtered_df = schizo_df[is_canonical]
print(f"Filtered from {len(schizo_df)} rows to {len(filtered_df)} rows.")

Filtered from 3849 rows to 2564 rows.


In [42]:
# The majority of the data is retained (2564 of 3849 rows in the run shown
# above), so we'll use just this subset from here on.
# Note: this rebinds schizo_df to the trait-filtered frame.
schizo_df = filtered_df

### Variants

In [19]:
# Before splitting multi-gene associations into separate rows, verify that
# every repeated variant reports the same 'Mapped gene' value on all of its
# records. If so, any one record per variant can be kept (we keep the one with
# the lowest p-value), because later analysis only uses the variant ID and the
# implicated genes.
variant_col = 'Variant and risk allele'
duplicate_variants = (
    schizo_df.groupby(variant_col)
    .filter(lambda group: group.shape[0] > 1)[variant_col]
    .unique()
)
all_good = True
for variant in duplicate_variants:
  rows_for_variant = schizo_df[schizo_df[variant_col] == variant]
  all_mapped_genes = rows_for_variant['Mapped gene'].unique()
  if len(all_mapped_genes) <= 1:
    continue
  print(f"Found variant, {variant}, with differing mapped gene values.")
  all_good = False


if all_good:
  print("No repeated variants with differing mapped gene values.")

No repeated variants with differing mapped gene values.


In [46]:
# Proceed with just choosing the record with the lowest p-value.
# It may later be useful to revisit this step and retain these duplicates -
# maybe only focusing on those associations that have been found in multiple
# independent studies will lead to better results in the subsequent analysis.
#
# Keep the *whole row* of the lowest-p-value record for each variant.
# Aggregating with groupby(...)['P-value_norm'].min() would return only the
# variant and p-value columns, dropping 'Mapped gene' and 'Trait(s)', which
# the following cells need.
schizo_df = (
    schizo_df
    # Stable sort so ties keep their original relative order.
    .sort_values('P-value_norm', kind='mergesort')
    .drop_duplicates(subset='Variant and risk allele', keep='first')
    # Restore the original row order.
    .sort_index()
)

In [47]:
# Sanity-check that de-duplication worked: the number of distinct variants
# should equal the number of rows.
num_total = schizo_df.shape[0]
num_unique_variants = schizo_df['Variant and risk allele'].nunique(dropna=False)
print(f"{num_unique_variants} unique variants of {num_total} records")

1822 unique variants of 1822 records


### Genes

In [24]:
# 'Mapped gene' holds ", "-separated gene names. Turn each value into a list
# and use `explode` to emit one row per gene, with all other columns repeated.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.explode.html
schizo_df['gene_norm'] = schizo_df['Mapped gene'].map(lambda genes: genes.split(", "))
exploded_schizo_df = schizo_df.explode(column='gene_norm')
exploded_schizo_df.shape[0]

2201

In [25]:
# Distribution of how many genes each (pre-explode) row lists; used to check
# the exploded row count.
schizo_df['gene_norm'].map(len).value_counts()

1    1444
2     377
3       1
Name: gene_norm, dtype: int64

In [27]:
# Every pre-explode row should contribute one exploded row per gene it lists.
# In the run shown above: 1444 rows with one gene + 2 * 377 with two
# + 3 * 1 with three = 2201.
# The expected total is derived from the data rather than hard-coded, so the
# check stays valid when the input file changes.
# NOTE: must run before schizo_df is rebound to the exploded frame.
expected_exploded_rows = int(schizo_df['gene_norm'].apply(len).sum())
assert len(exploded_schizo_df) == expected_exploded_rows, (
    f"Expected {expected_exploded_rows} exploded rows, got {len(exploded_schizo_df)}"
)

In [28]:
# Sanity-check passes so set schizo_df to the exploded version.
# From here on schizo_df has one row per (variant, gene) pair, and 'gene_norm'
# holds a single gene name per row instead of a list.
schizo_df = exploded_schizo_df

In [29]:
# Number of rows per gene, most frequent first.
schizo_df.loc[:, 'gene_norm'].value_counts()

'-           251
LINC01470     21
CACNA1C       15
Y_RNA         15
VRK2          11
            ... 
DNAJA3         1
MEMO1P1        1
SETD4          1
HMGB1P36       1
MIR548AE1      1
Name: gene_norm, Length: 1118, dtype: int64

In [30]:
# In the run shown here, 251 / 2201 entries have "'-" for their gene; I'm
# assuming this indicates an unknown/unconfirmed gene association.
UNKNOWN_GENE = "UNKNOWN"


def replace_unknown_gene(gene):
  """Map the "'-" placeholder to UNKNOWN_GENE; return any other value unchanged."""
  if gene == "'-":
    return UNKNOWN_GENE
  return gene


# Replace the "'-" placeholder with the explicit unknown-gene label, then
# re-display the per-gene row counts.
schizo_df['gene_norm'] = schizo_df['gene_norm'].map(replace_unknown_gene)
schizo_df.loc[:, 'gene_norm'].value_counts()

UNKNOWN      251
LINC01470     21
CACNA1C       15
Y_RNA         15
VRK2          11
            ... 
DNAJA3         1
MEMO1P1        1
SETD4          1
HMGB1P36       1
MIR548AE1      1
Name: gene_norm, Length: 1118, dtype: int64

## Output

Finally, write out the normalized version of the data for use
in further analysis.

In [31]:
# Preview the normalized frame before selecting output columns.
schizo_df.head(n=5)

Unnamed: 0,Variant and risk allele,P-value,P-value annotation,RAF,OR,Beta,CI,Mapped gene,Reported trait,Trait(s),Background trait(s),Study accession,Location,P-value_norm,gene_norm
3,rs2269372-<b>A</b>,4 x 10-8,,NR,1.313,'-,[NR],RENBP,Schizophrenia,schizophrenia,'-,GCST002190,X:153942092,4e-08,RENBP
5,rs6846161-<b>A</b>,1 x 10-9,,NR,1.067,'-,[1.05-1.09],GPM6A,Schizophrenia,schizophrenia,'-,GCST004946,4:175945308,1e-09,GPM6A
6,rs3735025-<b>T</b>,4 x 10-12,,NR,1.068,'-,[1.05-1.09],DGKI,Schizophrenia,schizophrenia,'-,GCST004946,7:137390098,4e-12,DGKI
7,rs2057884-<b>T</b>,2 x 10-12,,NR,1.069,'-,[1.05-1.09],SRPK2,Schizophrenia,schizophrenia,'-,GCST004946,7:105289803,2e-12,SRPK2
8,rs215411-<b>A</b>,6 x 10-10,,NR,1.069,'-,[1.05-1.09],RFPL4AP3,Schizophrenia,schizophrenia,'-,GCST004946,4:23421980,6e-10,RFPL4AP3


In [34]:
# Keep only the relevant, normalized columns for brevity and give them
# snake_case names. This can always be updated later to retain more columns if
# there's a use for them.

# Source column -> output column; the key order also defines the output column order.
column_remapping = {
    'Variant and risk allele': 'variant_and_allele',
    'P-value_norm': 'p_value',
    'Trait(s)': 'trait',
    'gene_norm': 'gene',
}
out_df = schizo_df[list(column_remapping)].rename(columns=column_remapping)
out_df.head(n=5)

Unnamed: 0,variant_and_allele,p_value,trait,gene
3,rs2269372-<b>A</b>,4e-08,schizophrenia,RENBP
5,rs6846161-<b>A</b>,1e-09,schizophrenia,GPM6A
6,rs3735025-<b>T</b>,4e-12,schizophrenia,DGKI
7,rs2057884-<b>T</b>,2e-12,schizophrenia,SRPK2
8,rs215411-<b>A</b>,6e-10,schizophrenia,RFPL4AP3


In [36]:
# Write the cleaned data next to the input file. No `index` argument is
# passed, so the DataFrame index is written as well.
out_df.to_csv(path_or_buf='schizophrenia_gwas_catalog_2022_cleaned.csv')