In [None]:
import pandas as pd
from pyspark.sql import DataFrame, SparkSession, Window
import pyspark.sql.functions as F

spark = (SparkSession.builder
         .master('local[*]')
         .config("spark.driver.memory", "15g")
         .appName('spark')
         .getOrCreate())

In [7]:
def drop_fully_null_columns(df, but_keep_these=[]):
    """Drops DataFrame columns that are fully null
    (i.e. the maximum value is null)

    Arguments:
        df {spark DataFrame} -- spark dataframe
        but_keep_these {list} -- list of columns to keep without checking for nulls

    Returns:
        spark DataFrame -- dataframe with fully null columns removed
    """

    # skip checking some columns
    cols_to_check = [col for col in df.columns if col not in but_keep_these]
    if len(cols_to_check) > 0:
        # drop columns for which the max is None
        rows_with_data = df.select(*cols_to_check).groupby().agg(*[F.max(c).alias(c) for c in cols_to_check]).take(1)[0]
        cols_to_drop = [c for c, const in rows_with_data.asDict().items() if const == None]
        new_df = df.drop(*cols_to_drop)

        return new_df
    else:
        return df

## Data from GWAS Catalog

In [8]:
studies = (
    spark.read.csv('data/gwas_catalog/studies_alternative', header=True, inferSchema=True, sep='\t')
    .filter(F.col('LINK').contains('34662886'))
)

assocs = (
    spark.read.csv('data/gwas_catalog/alternative', header=True, inferSchema=True, sep='\t')
    .filter(F.col('LINK').contains('34662886'))
)
assocs = drop_fully_null_columns(assocs)
studies = drop_fully_null_columns(studies)

print(assocs.printSchema())
print(assocs.first())

22/03/22 11:14:52 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

root
 |-- DATE ADDED TO CATALOG: string (nullable = true)
 |-- PUBMEDID: integer (nullable = true)
 |-- FIRST AUTHOR: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- JOURNAL: string (nullable = true)
 |-- LINK: string (nullable = true)
 |-- STUDY: string (nullable = true)
 |-- DISEASE/TRAIT: string (nullable = true)
 |-- INITIAL SAMPLE SIZE: string (nullable = true)
 |-- REPLICATION SAMPLE SIZE: string (nullable = true)
 |-- STRONGEST SNP-RISK ALLELE: string (nullable = true)
 |-- SNPS: string (nullable = true)
 |-- MERGED: integer (nullable = true)
 |-- INTERGENIC: integer (nullable = true)
 |-- RISK ALLELE FREQUENCY: string (nullable = true)
 |-- P-VALUE: double (nullable = true)
 |-- PVALUE_MLOG: double (nullable = true)
 |-- OR or BETA: double (nullable = true)
 |-- 95% CI (TEXT): string (nullable = true)
 |-- PLATFORM [SNPS PASSING QC]: string (nullable = true)
 |-- CNV: string (nullable = true)
 |-- MAPPED_TRAIT: string (nullable = true)
 |-- MAPPED_TRAIT_URI: s

In [44]:
print(studies.printSchema())
print(studies.first())

root
 |-- DATE ADDED TO CATALOG: string (nullable = true)
 |-- PUBMEDID: integer (nullable = true)
 |-- FIRST AUTHOR: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- JOURNAL: string (nullable = true)
 |-- LINK: string (nullable = true)
 |-- STUDY: string (nullable = true)
 |-- DISEASE/TRAIT: string (nullable = true)
 |-- INITIAL SAMPLE SIZE: string (nullable = true)
 |-- REPLICATION SAMPLE SIZE: string (nullable = true)
 |-- PLATFORM [SNPS PASSING QC]: string (nullable = true)
 |-- ASSOCIATION COUNT: integer (nullable = true)
 |-- MAPPED_TRAIT: string (nullable = true)
 |-- MAPPED_TRAIT_URI: string (nullable = true)
 |-- STUDY ACCESSION: string (nullable = true)
 |-- GENOTYPING TECHNOLOGY: string (nullable = true)

None
Row(DATE ADDED TO CATALOG='2022-03-01', PUBMEDID=34662886, FIRST AUTHOR='Backman JD', DATE='2021-10-18', JOURNAL='Nature', LINK='www.ncbi.nlm.nih.gov/pubmed/34662886', STUDY='Exome sequencing and analysis of 454,787 UK Biobank participants.', DISEASE/T

In [49]:
studies.filter(F.col('ASSOCIATION COUNT') != 0).count()

200

In [48]:
studies.select('MAPPED_TRAIT_URI').distinct().count()

1028

## Data from publication

Metrics:
- 8865 variant/gene/trait associations (`pub_data.filter(items=['Gene', 'Trait', 'Marker']).drop_duplicates().shape[0]`)
- 2283 gene/trait pairs (`pub_data.filter(items=['Gene', 'Trait']).drop_duplicates().shape[0]`)
- 18285 records. Breakdown per ancestry:
  - EUR    17544
  - SAS      433
  - AFR      182
  - EAS      126
- 564 genes (`pub_data.Gene.unique().shape[0]`)
- 492 traits (`pub_data.Trait.unique().shape[0]`)
- 611 variants (`pub_data.Marker.unique().shape[0]`)
- 973 GWAS Catalog accessions (`assocs_pub['Study Accession'].unique().shape[0]`). 168 associations without a study.
- 0 GWAS Catalog studies overlap 

In [45]:
# Load raw data
assocs_pub_non_european = pd.read_excel('data/publication/41586_2021_4103_MOESM5_ESM.xlsx', sheet_name='SD3')
assocs_pub_european = pd.read_excel('data/publication/41586_2021_4103_MOESM5_ESM.xlsx', sheet_name='SD2')
summary_stats = (
    pd.read_excel('data/publication/41586_2021_4103_MOESM5_ESM.xlsx', sheet_name='SD4', skiprows=0, header=1).iloc[2:]
    .reset_index(drop=True)
    .filter(items=['Reported trait', 'Study Accession'])
)

# Combine Data 2 with Data 3 and merge with sumstats to get study accession
assocs_pub = (
    pd.concat([assocs_pub_non_european, assocs_pub_european], ignore_index=True)
    .fillna(value={'Ancestry': 'EUR'})
    .drop_duplicates()
)
assocs_pub = assocs_pub.merge(summary_stats, left_on='Trait', right_on='Reported trait', how='left').drop('Reported trait', axis=1)
# Ancestry bug

# Distinguish between variant and collapsing analyses
burden_assocs_pub = assocs_pub.query('`Marker type` == "Burden"').drop_duplicates()
variant_assocs_pub = assocs_pub.query('`Marker type` != "Burden"').drop_duplicates()

variant_assocs_pub.iloc[0]



Ancestry                                                                                                         AFR
Gene                                                                                                           PCSK9
Trait                                                                                             LDL direct (30780)
Trait description                                                  Measured by enzymatic protective selection ana...
Trait type                                                                                                        QT
                                                                                         ...                        
Effect direction, UKB SAS ancestry                                                                               NaN
Effect direction consistent with EUR, UKB SAS ancestry                                                           NaN
N cases with 0|1|2 copies of effect allele, UKB SAS ancestry    

In [42]:
assocs_pub.query('Ancestry != "EUR"').iloc[0]

Ancestry                                                                                                         SAS
Gene                                                                                                            ALPL
Trait                                                                                   Alkaline phosphatase (30610)
Trait description                                                  Measured by AMP(IFCC) analysis on a Beckman Co...
Trait type                                                                                                        QT
                                                                                         ...                        
Effect direction, UKB SAS ancestry                                                                               NaN
Effect direction consistent with EUR, UKB SAS ancestry                                                           NaN
N cases with 0|1|2 copies of effect allele, UKB SAS ancestry    

22/03/22 15:03:04 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1631607 ms exceeds timeout 120000 ms
22/03/22 15:03:04 WARN SparkContext: Killing executors is not supported by current scheduler.


In [34]:
assocs.toPandas().filter(['STUDY ACCESSION']).query('`STUDY ACCESSION` == "GCST90082999"').shape
print(assocs.filter(F.col('STUDY ACCESSION') == 'GCST90079274').count())

                                                                                

3


In [35]:
assocs_pub.filter(['Gene', 'Trait', 'Marker', 'Study Accession', 'P-value', 'Marker type']).query('`Study Accession` == "GCST90079274"')

Unnamed: 0,Gene,Trait,Marker,Study Accession,P-value,Marker type
741,ACAP3,6mm weak meridian left (5097),1:1303203:G:A,GCST90079274,9.25e-12,DelMissense
786,ANGPTL7,6mm weak meridian left (5097),M3.1,GCST90079274,4e-20,Burden
816,ANGPTL7,6mm weak meridian left (5097),1:11193627:G:T,GCST90079274,1.15e-14,DelMissense
4036,PRSS56,6mm weak meridian left (5097),M1.01,GCST90079274,1.02e-11,Burden
4038,PRSS56,6mm weak meridian left (5097),M1.1,GCST90079274,1.02e-11,Burden
4040,PRSS56,6mm weak meridian left (5097),M3.01,GCST90079274,1.02e-11,Burden
4042,PRSS56,6mm weak meridian left (5097),M3.1,GCST90079274,1.02e-11,Burden
6235,THBS4,6mm weak meridian left (5097),M3.01,GCST90079274,1.62e-21,Burden
6255,THBS4,6mm weak meridian left (5097),M3.1,GCST90079274,1.18e-11,Burden
6273,THBS4,6mm weak meridian left (5097),5:80080037:C:T,GCST90079274,1.79e-27,DelMissense


In [None]:
## GWAS Catalog report very few associations as being significant, why is that?
# Traits in GWASCat and Regeneron are not identical, I have to make a conversion:
# Raw: "Red blood cell erythrocyte distribution width (30070)" / GWASCat: "Red blood cell erythrocyte distribution width (UKB data field 30070)"

studies.withColumn('DISEASE/TRAIT', F.regexp_replace('DISEASE/TRAIT', 'UKB data field ', '')).select('DISEASE/TRAIT').show(truncate=False)

In [23]:
missing_assocs = (
    assocs_pub.filter(['Gene', 'Trait', 'Marker', 'Study Accession', 'P-value', 'Marker type'])
    .merge(assocs.toPandas().filter(['STUDY ACCESSION']), left_on='Study Accession', right_on='STUDY ACCESSION', how='outer', indicator=True)
    .query('_merge == "left_only"')
    .drop('_merge', axis=1)
    .drop_duplicates()
)

print(f"{missing_assocs.filter(['Gene', 'Trait', 'Marker']).drop_duplicates().shape[0]} associations are not in GWAS Catalog")
missing_assocs.head()

8973 associations are not in GWAS Catalog


Unnamed: 0,Gene,Trait,Marker,Study Accession,P-value,Marker type,STUDY ACCESSION
710,ALPL,Alkaline phosphatase (30610),M3.0001,GCST90082999,1.14e-13,Burden,
711,ALPL,Alkaline phosphatase (30610),M3.001,GCST90082999,1.14e-13,Burden,
712,ALPL,Alkaline phosphatase (30610),M3.01,GCST90082999,1.14e-13,Burden,
713,ALPL,Alkaline phosphatase (30610),M3.1,GCST90082999,1.14e-13,Burden,
714,ALPL,Alkaline phosphatase (30610),M3.001,GCST90082999,3.09e-13,Burden,


In [26]:
(
    missing_assocs.query('`Study Accession` == "GCST90082999"')
)

Unnamed: 0,Gene,Trait,Marker,Study Accession,P-value,Marker type,STUDY ACCESSION
710,ALPL,Alkaline phosphatase (30610),M3.0001,GCST90082999,1.140000e-13,Burden,
711,ALPL,Alkaline phosphatase (30610),M3.001,GCST90082999,1.140000e-13,Burden,
712,ALPL,Alkaline phosphatase (30610),M3.01,GCST90082999,1.140000e-13,Burden,
713,ALPL,Alkaline phosphatase (30610),M3.1,GCST90082999,1.140000e-13,Burden,
714,ALPL,Alkaline phosphatase (30610),M3.001,GCST90082999,3.090000e-13,Burden,
...,...,...,...,...,...,...,...
847,ASGR1,Alkaline phosphatase (30610),17:7173778:C:T,GCST90082999,1.700000e-16,DelMissense,
848,ASGR1,Alkaline phosphatase (30610),17:7173803:C:T,GCST90082999,2.770000e-19,pLOF,
849,ASGR1,Alkaline phosphatase (30610),17:7174274:C:T,GCST90082999,2.490000e-12,DelMissense,
850,ASGR1,Alkaline phosphatase (30610),17:7174401:A:C,GCST90082999,3.000000e-50,DelMissense,


In [132]:
missing_assocs['Marker type'].value_counts()

Burden         9912
DelMissense    1088
pLOF            564
Name: Marker type, dtype: int64