In [None]:
from pyspark.sql import DataFrame, SparkSession, Window
import pyspark.sql.functions as F

# Create (or reuse) the notebook-wide SparkSession: local master, 15g of
# driver memory, application name 'spark'.
spark = (
    SparkSession.builder
    .appName('spark')
    .master('local[*]')
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

In [13]:
def drop_fully_null_columns(df, but_keep_these=None):
    """Drops DataFrame columns that are fully null
    (i.e. the maximum value is null)

    Arguments:
        df {spark DataFrame} -- spark dataframe
        but_keep_these {list} -- columns to keep without checking for nulls;
            None (the default) means every column is checked

    Returns:
        spark DataFrame -- dataframe with fully null columns removed; the
            input dataframe itself is returned when there is nothing to check
    """
    # An immutable None default replaces the former shared mutable [] default.
    exempt = but_keep_these if but_keep_these is not None else ()

    # skip checking some columns
    cols_to_check = [col for col in df.columns if col not in exempt]
    if not cols_to_check:
        return df

    # One global aggregation computes the max of every checked column in a
    # single pass and yields a single row.
    maxima = df.agg(*[F.max(c).alias(c) for c in cols_to_check]).first()

    # A max of None means the column held no non-null value. Use an identity
    # check: `== None` defers to the value's own __eq__.
    cols_to_drop = [c for c, value in maxima.asDict().items() if value is None]
    return df.drop(*cols_to_drop)

Row(DATE ADDED TO CATALOG='2022-03-01', PUBMEDID=34662886, FIRST AUTHOR='Backman JD', DATE='2021-10-18', JOURNAL='Nature', LINK='www.ncbi.nlm.nih.gov/pubmed/34662886', STUDY='Exome sequencing and analysis of 454,787 UK Biobank participants.', DISEASE/TRAIT='Usual walking pace - Brisk pace (UKB data field 924)', INITIAL SAMPLE SIZE='176,054 European ancestry cases, 252,201 European ancestry controls', REPLICATION SAMPLE SIZE='NA', PLATFORM [SNPS PASSING QC]='Affymetrix, Illumina [546001]', ASSOCIATION COUNT=0, MAPPED_TRAIT='physical activity measurement', MAPPED_TRAIT_URI='http://www.ebi.ac.uk/efo/EFO_0008002', STUDY ACCESSION='GCST90079511', GENOTYPING TECHNOLOGY='Genome-wide genotyping array, Exome-wide sequencing [UK Biobank/UK BiLEVE Axiom Array]')

In [43]:
# Substring that the LINK column must contain for a row to be kept.
PUBMED_ID = '34662886'


def _load_publication_rows(path, pubmed_id=PUBMED_ID):
    """Read a tab-separated export and keep the rows of one publication.

    Arguments:
        path {str} -- location of the tab-separated file(s), with a header row
        pubmed_id {str} -- substring that the LINK column must contain

    Returns:
        spark DataFrame -- matching rows, with fully null columns removed
    """
    publication_rows = (
        spark.read.csv(path, header=True, inferSchema=True, sep='\t')
        .filter(F.col('LINK').contains(pubmed_id))
    )
    return drop_fully_null_columns(publication_rows)


# Each name is assigned once, so re-running the cell gives the same frames.
studies = _load_publication_rows('data/studies_alternative')
assocs = _load_publication_rows('data/alternative')

# printSchema() prints the schema itself and returns None, so it is called
# directly; wrapping it in print() emits an extra "None" line.
assocs.printSchema()
print(assocs.first())

                                                                                

root
 |-- DATE ADDED TO CATALOG: string (nullable = true)
 |-- PUBMEDID: integer (nullable = true)
 |-- FIRST AUTHOR: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- JOURNAL: string (nullable = true)
 |-- LINK: string (nullable = true)
 |-- STUDY: string (nullable = true)
 |-- DISEASE/TRAIT: string (nullable = true)
 |-- INITIAL SAMPLE SIZE: string (nullable = true)
 |-- REPLICATION SAMPLE SIZE: string (nullable = true)
 |-- STRONGEST SNP-RISK ALLELE: string (nullable = true)
 |-- SNPS: string (nullable = true)
 |-- MERGED: integer (nullable = true)
 |-- INTERGENIC: integer (nullable = true)
 |-- RISK ALLELE FREQUENCY: string (nullable = true)
 |-- P-VALUE: double (nullable = true)
 |-- PVALUE_MLOG: double (nullable = true)
 |-- OR or BETA: double (nullable = true)
 |-- 95% CI (TEXT): string (nullable = true)
 |-- PLATFORM [SNPS PASSING QC]: string (nullable = true)
 |-- CNV: string (nullable = true)
 |-- MAPPED_TRAIT: string (nullable = true)
 |-- MAPPED_TRAIT_URI: s

In [44]:
# printSchema() writes the schema to stdout and returns None; call it directly
# so no stray "None" line follows the schema.
studies.printSchema()
print(studies.first())

root
 |-- DATE ADDED TO CATALOG: string (nullable = true)
 |-- PUBMEDID: integer (nullable = true)
 |-- FIRST AUTHOR: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- JOURNAL: string (nullable = true)
 |-- LINK: string (nullable = true)
 |-- STUDY: string (nullable = true)
 |-- DISEASE/TRAIT: string (nullable = true)
 |-- INITIAL SAMPLE SIZE: string (nullable = true)
 |-- REPLICATION SAMPLE SIZE: string (nullable = true)
 |-- PLATFORM [SNPS PASSING QC]: string (nullable = true)
 |-- ASSOCIATION COUNT: integer (nullable = true)
 |-- MAPPED_TRAIT: string (nullable = true)
 |-- MAPPED_TRAIT_URI: string (nullable = true)
 |-- STUDY ACCESSION: string (nullable = true)
 |-- GENOTYPING TECHNOLOGY: string (nullable = true)

None
Row(DATE ADDED TO CATALOG='2022-03-01', PUBMEDID=34662886, FIRST AUTHOR='Backman JD', DATE='2021-10-18', JOURNAL='Nature', LINK='www.ncbi.nlm.nih.gov/pubmed/34662886', STUDY='Exome sequencing and analysis of 454,787 UK Biobank participants.', DISEASE/T

In [49]:
# Count the rows of `studies` whose ASSOCIATION COUNT is non-zero.
nonzero_association_count = F.col('ASSOCIATION COUNT') != 0
studies.filter(nonzero_association_count).count()

200

In [48]:
# Count the distinct MAPPED_TRAIT_URI values present in `studies`.
distinct_trait_uris = studies.select('MAPPED_TRAIT_URI').distinct()
distinct_trait_uris.count()

1028