# Work in progress: PySpark conversion of the fine-mapping pipeline

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import StructType, StructField, FloatType
import subprocess
import re

import hail as hl
from hail.linalg import BlockMatrix

import pandas as pd
import numpy as np

# Build (or reuse) the Spark session used by the cells below.
spark = (
    SparkSession.builder
    .appName("Finemapping Pipeline")
    .config("spark.driver.memory", "16g")
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "16g")
    .getOrCreate()
)

# Locus and study parameters read by the rest of the notebook.
gwas_file_path = "/Users/hn9/Documents/Analysis/FM-comparison/gwas-examples/APOE-LDL/24097068-GCST002222-EFO_0004611.h.tsv.gz"
target = "APOE_LDL"
target_chrom = 19
target_pos = 44908822
# Analysis window: 500 kb on either side of the target position.
start_pos, end_pos = target_pos - 500000, target_pos + 500000
lead_snp_ID = f"{target_chrom}:{target_pos}:C:T"
n_sample = 94595

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/10/10 11:12:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
def lift_and_calculate_ld_gnomad(lead_snp_ID, StudyLocus):
    """Extract the gnomAD LD block around a lead variant and align it with a study locus.

    The lead variant is lifted from GRCh38 to GRCh37, every entry of the gnomAD
    LD variant index within 500 kb of it on the same contig is selected, and the
    corresponding rows/columns are cut out of the gnomAD block matrix.  The
    selected variants are then lifted back to GRCh38 and labelled
    ``contig_position_allele_allele`` so that they can be intersected with the
    study's variant IDs.

    Args:
        lead_snp_ID: Variant string understood by ``hl.parse_variant``; its
            contig/position are interpreted as GRCh38 coordinates.
        StudyLocus: Spark DataFrame with a ``variantID`` column.
            NOTE(review): the original selected ``variantID`` but then read and
            filtered on ``snp_id_38``; ``variantID`` is used throughout here and
            is assumed to hold IDs in the ``contig_position_allele_allele``
            format - confirm.

    Returns:
        Tuple ``(filtered_ld_pyspark_df, filtered_StudyLocus)``: the LD matrix as
        a Spark DataFrame restricted to the columns of shared variants, and the
        study locus restricted to those same variants.
    """
    bm = BlockMatrix.read("gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.nfe.common.adj.ld.bm")
    variant_table = hl.read_table("gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.nfe.common.adj.ld.variant_indices.ht")

    # parse_variant yields a Hail expression; a one-row table is collected to
    # obtain concrete Python values for contig and position.
    # NOTE(review): parse_variant runs with Hail's default reference genome -
    # confirm it accepts the contig naming used in lead_snp_ID.
    locus = hl.parse_variant(lead_snp_ID).locus
    temp_table = hl.Table.parallelize([hl.struct(locus=locus)])
    locus_value = temp_table.select('locus').collect()[0].locus
    contig = locus_value.contig
    position = locus_value.position
    # Assumes liftover chains between GRCh38 and GRCh37 are registered - TODO confirm.
    locus_37 = hl.liftover(hl.locus(contig, position, 'GRCh38'), 'GRCh37')

    window_size = 500000
    locus_variants = variant_table.filter(
        (hl.abs(variant_table.locus.position - locus_37.position) <= window_size) &
        (variant_table.locus.contig == locus_37.contig)
    )

    # Get LD matrix
    indices = locus_variants['idx'].collect()
    sub_bm = bm.filter(indices, indices)
    numpy_array = sub_bm.to_numpy()

    # Get SNP IDs in GRCh38
    locus_variants = locus_variants.annotate(
        locus_38 = hl.liftover(locus_variants.locus, 'GRCh38')
    )
    locus_variants = locus_variants.annotate(
        snp_id_38 = hl.str(locus_variants.locus_38.contig) + "_" +
                    hl.str(locus_variants.locus_38.position) + "_" +
                    locus_variants.alleles[0] + "_" +
                    locus_variants.alleles[1]
    )
    snp_ids_38 = locus_variants['snp_id_38'].collect()

    # Variants present in both the study and the LD reference.
    study_snps = {row["variantID"] for row in StudyLocus.select("variantID").collect()}
    common_snps = study_snps.intersection(snp_ids_38)

    # Filter StudyLocus to only include common SNPs
    filtered_StudyLocus = StudyLocus.filter(F.col("variantID").isin(list(common_snps)))

    # pandas 2.0 removed DataFrame.iteritems, which older PySpark releases still
    # call inside createDataFrame.  Restore the alias on the class (it cannot be
    # set on `ld_df` before that frame exists, as the original attempted).
    if not hasattr(pd.DataFrame, "iteritems"):
        pd.DataFrame.iteritems = pd.DataFrame.items

    # Label the LD columns with the GRCh38 IDs and convert once to Spark.
    ld_df = pd.DataFrame(numpy_array)
    ld_df.columns = snp_ids_38
    ld_pyspark_df = spark.createDataFrame(ld_df)

    # Keep only the columns of shared variants.
    # NOTE(review): rows are not subset, so the result still has one row per
    # variant in the window - confirm whether a square matrix is expected.
    selected_columns = [name for name in ld_pyspark_df.columns if name in common_snps]
    filtered_ld_pyspark_df = ld_pyspark_df.select(selected_columns)

    return filtered_ld_pyspark_df, filtered_StudyLocus


In [3]:
# NOTE(review): lift_and_calculate_ld_gnomad is declared with two parameters
# (lead_snp_ID, StudyLocus); this call supplies only the first.
lift_and_calculate_ld_gnomad("chr8:27610986:C:A")

Initializing Hail with default parameters...


TypeError: 'JavaPackage' object is not callable

In [2]:
# PLINK command (placeholder using PLINK and UKBiobank LD reference panel).
# Exports the variants of the analysis window; later cells read the resulting
# "{target}_locus_UKBB.txt.raw" file.
command = f"plink --bfile /Users/hn9/Documents/Analysis/FM-comparison/ukb_v3_downsampled10k/ukb_v3_chr{target_chrom}.downsampled10k --allow-extra-chr --recode A --chr {target_chrom} --from-bp {start_pos} --to-bp {end_pos} --maf 0.001 --out {target}_locus_UKBB.txt"
# check=True: raise if PLINK exits non-zero instead of letting later cells read
# a missing or stale output file.
subprocess.run(command, shell=True, check=True)

PLINK v1.90b6.21 64-bit (19 Oct 2020)          www.cog-genomics.org/plink/1.9/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to APOE_LDL_locus_UKBB.txt.log.
Options in effect:
  --allow-extra-chr
  --bfile /Users/hn9/Documents/Analysis/FM-comparison/ukb_v3_downsampled10k/ukb_v3_chr19.downsampled10k
  --chr 19
  --from-bp 44408822
  --maf 0.001
  --out APOE_LDL_locus_UKBB.txt
  --recode A
  --to-bp 45408822

16384 MB RAM detected; reserving 8192 MB for main workspace.
6167 out of 364540 variants loaded from .bim file.
10000 people (0 males, 0 females, 10000 ambiguous) loaded from .fam.
Ambiguous sex IDs written to APOE_LDL_locus_UKBB.txt.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 10000 founders and 0 nonfounders present.
Calculating allele frequencies... 1011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818

CompletedProcess(args='plink --bfile /Users/hn9/Documents/Analysis/FM-comparison/ukb_v3_downsampled10k/ukb_v3_chr19.downsampled10k --allow-extra-chr --recode A --chr 19 --from-bp 44408822 --to-bp 45408822 --maf 0.001 --out APOE_LDL_locus_UKBB.txt', returncode=0)

In [8]:
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
import pandas as pd

# Reuse the running session if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Finemapping Pipeline")
    .getOrCreate()
)

def get_ld_matrix_cor():
    """Run Spark's correlation routine on a small random 0/1 table.

    A 5x5 pandas frame of random zeros and ones (columns A-E) is converted to a
    Spark DataFrame, its columns are assembled into a single vector column, and
    the correlation matrix of that vector column is computed.

    Returns:
        Tuple ``(ld_data, ld_matrix)``: the Spark DataFrame holding the random
        table and the correlation matrix returned by ``Correlation.corr``.
    """
    toy_frame = pd.DataFrame(np.random.choice([0, 1], size=(5, 5)), columns=['A', 'B', 'C', 'D', 'E'])
    ld_data = spark.createDataFrame(toy_frame)

    print('Calculating LD correlation matrix...')

    # Correlation.corr works on one vector-typed column, so pack all columns into it.
    assembler = VectorAssembler(inputCols=ld_data.columns, outputCol="features")
    features_only = assembler.transform(ld_data).select("features")
    first_row = Correlation.corr(features_only, "features").collect()[0]
    ld_matrix = first_row[0]

    print('Finished calculating LD correlation matrix')
    return ld_data, ld_matrix

# Run the random-data correlation check; binds the notebook-level ld_data / ld_matrix.
ld_data, ld_matrix = get_ld_matrix_cor()

Py4JError: An error occurred while calling o1112.legacyInferArrayTypeFromFirstElement. Trace:
py4j.Py4JException: Method legacyInferArrayTypeFromFirstElement([]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:274)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)



In [3]:
def get_ld_matrix(raw_path=None):
    """Calculates the LD matrix based on the LD data from PLINK using PySpark.

    Args:
        raw_path: Path of the space-separated PLINK export to read.  Defaults to
            ``f"{target}_locus_UKBB.txt.raw"`` (notebook-level ``target``).

    Returns:
        Tuple ``(ld_data, ld_matrix)``: the genotype columns as a Spark DataFrame
        of floats, and the correlation matrix returned by ``Correlation.corr``.
    """
    # TODO debug matrix correlation in PySpark (the original cell did not finish running).
    if raw_path is None:
        raw_path = f"{target}_locus_UKBB.txt.raw"

    # No inferSchema: every retained column is cast to float below, so an extra
    # pass over the file to guess types is not needed.
    ld_data = spark.read.csv(raw_path, sep=" ", header=True)
    drop_list = ["FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE"]
    ld_data = ld_data.drop(*drop_list)

    # Cast all columns in ONE projection.  Calling withColumn once per column
    # stacks a projection per SNP, which makes the query plan grow with the
    # number of columns (see the DataFrame.withColumn documentation).
    ld_data = ld_data.select(
        [ld_data[name].cast("float").alias(name) for name in ld_data.columns]
    )

    print('Calculating LD correlation matrix...')

    # NOTE(review): VectorAssembler is used with its default handling of invalid
    # values - confirm how missing genotypes should be treated.
    vec_assembler = VectorAssembler(inputCols=ld_data.columns, outputCol="features")
    ld_data_vector = vec_assembler.transform(ld_data).select("features")
    ld_matrix = Correlation.corr(ld_data_vector, "features").collect()[0][0]

    print('Finished calculating LD correlation matrix')
    return ld_data, ld_matrix

# Compute the LD matrix with the Spark implementation; rebinds the notebook-level ld_data / ld_matrix.
ld_data, ld_matrix = get_ld_matrix()

ERROR:root:KeyboardInterrupt while sending command.                             
Traceback (most recent call last):
  File "/Users/hn9/anaconda3/envs/finemapping_env/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/hn9/anaconda3/envs/finemapping_env/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/hn9/anaconda3/envs/finemapping_env/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [6]:
def get_ld_matrix(raw_path=None):
    """Compute the LD correlation matrix with pandas and return Spark copies.

    The PLINK export is read twice: once with pandas, to compute the Pearson
    correlation between genotype columns, and once with Spark, to return the
    genotype table as a Spark DataFrame of floats.

    Args:
        raw_path: Path of the PLINK export to read.  Defaults to
            ``f"{target}_locus_UKBB.txt.raw"`` (notebook-level ``target``).

    Returns:
        Tuple ``(ld_data_spark, ld_matrix_spark)``.  ``ld_matrix_spark`` has one
        float column per SNP; the row labels of the pandas matrix are dropped,
        so rows are identified by order only.
    """
    # TODO debug matrix correlation in PySpark (the original cell did not finish running).
    if raw_path is None:
        raw_path = f"{target}_locus_UKBB.txt.raw"
    sample_columns = ["FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE"]

    # sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True.
    ld_data = pd.read_csv(raw_path, sep=r"\s+")
    ld_data = ld_data.drop(columns=sample_columns)
    ld_matrix = ld_data.corr(method='pearson')

    # No inferSchema: every retained column is cast to float right below.
    ld_data_spark = spark.read.csv(raw_path, sep=" ", header=True)
    ld_data_spark = ld_data_spark.drop(*sample_columns)
    # One projection for all casts; a withColumn call per column stacks one
    # projection per SNP and bloats the query plan.
    ld_data_spark = ld_data_spark.select(
        [ld_data_spark[name].cast("float").alias(name) for name in ld_data_spark.columns]
    )

    # Explicit schema so Spark does not have to infer one from the records.
    column_names = ld_matrix.columns.tolist()
    schema = StructType([StructField(name, FloatType(), True) for name in column_names])
    ld_matrix_dict = ld_matrix.reset_index(drop=True).to_dict('records')
    ld_matrix_spark = spark.createDataFrame(ld_matrix_dict, schema=schema)
    return ld_data_spark, ld_matrix_spark
# Binds the notebook-level Spark genotype table and Spark LD matrix.
ld_data_spark, ld_matrix_spark = get_ld_matrix()

                                                                                

23/10/08 11:47:24 WARN NettyRpcEnv: Ignored failure: java.util.concurrent.TimeoutException: Cannot receive any reply from 192.168.0.76:53025 in 10000 milliseconds
23/10/08 11:47:37 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.rpc.RpcTimeoutException: Futures timed out after [10000 milliseconds]. This timeout is controlled by spark.executor.heartbeatInterval
	at org.apache.spark.rpc.RpcTimeout.org$apache$spark$rpc$RpcTimeout$$createRpcTimeoutException(RpcTimeout.scala:47)
	at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:62)
	at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:58)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:76)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:103)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.s

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/Users/hn9/anaconda3/envs/finemapping_env/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/hn9/anaconda3/envs/finemapping_env/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/hn9/anaconda3/envs/finemapping_env/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [3]:
def get_ld_matrix(raw_path=None):
    """Compute the LD (Pearson correlation) matrix from a PLINK export with pandas.

    Args:
        raw_path: Path of the whitespace-separated PLINK export to read.
            Defaults to ``f"{target}_locus_UKBB.txt.raw"`` (notebook-level
            ``target``), which is what the original always read.

    Returns:
        Tuple ``(ld_data, ld_matrix)``: the genotype columns (the six leading
        sample columns removed) and their pairwise Pearson correlation matrix,
        both as pandas DataFrames.
    """
    if raw_path is None:
        raw_path = f"{target}_locus_UKBB.txt.raw"
    # sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True.
    ld_data = pd.read_csv(raw_path, sep=r"\s+")
    ld_data = ld_data.drop(columns=["FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE"])
    ld_matrix = ld_data.corr(method='pearson')
    return ld_data, ld_matrix
# Binds the notebook-level pandas genotype table and pandas LD matrix.
ld_data, ld_matrix = get_ld_matrix()

In [5]:
# Load the PLINK export into Spark and discard the six leading sample columns.
drop_list = ["FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE"]
ld_data_spark = spark.read.csv(
    f"{target}_locus_UKBB.txt.raw",
    sep=" ",
    header=True,
    inferSchema=True,
).drop(*drop_list)

                                                                                

In [9]:
# Convert the pandas LD matrix to a Spark DataFrame, letting Spark infer the schema.
# NOTE(review): the captured output of this cell ends in a Py4JError
# (legacyInferArrayTypeFromFirstElement does not exist) - presumably a mismatch
# between the Python and JVM sides of Spark; confirm the installed versions.
ld_matrix_spark = spark.createDataFrame(ld_matrix)

  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:


23/10/08 13:31:32 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 165043 ms exceeds timeout 120000 ms
23/10/08 13:31:32 WARN SparkContext: Killing executors is not supported by current scheduler.


Py4JError: An error occurred while calling o101.legacyInferArrayTypeFromFirstElement. Trace:
py4j.Py4JException: Method legacyInferArrayTypeFromFirstElement([]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:274)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)



In [8]:
# Convert the pandas LD matrix to Spark with an explicit all-float schema
# (one nullable FloatType field per matrix column) instead of schema inference.
column_names = ld_matrix.columns.tolist()
schema = StructType(
    list(map(lambda field_name: StructField(field_name, FloatType(), True), column_names))
)
# Row labels are discarded; each record maps column name -> value.
ld_matrix_dict = ld_matrix.reset_index(drop=True).to_dict('records')
ld_matrix_spark = spark.createDataFrame(ld_matrix_dict, schema=schema)

TypeError: 'JavaPackage' object is not callable

In [4]:
def get_sumstats(gwas_file_path, target_chrom, start_pos, end_pos):
    """Load GWAS summary statistics for one locus and normalise the column names.

    Args:
        gwas_file_path: Tab-separated summary-statistics file with a header row.
        target_chrom: Value that ``hm_chrom`` must equal.
        start_pos: Lower bound (inclusive) for ``hm_pos``.
        end_pos: Upper bound (inclusive) for ``hm_pos``.

    Returns:
        Spark DataFrame with the columns ID, rsid, chromosome, position,
        allele1, allele2, maf, p, beta, se and z, where z = beta / standard_error.
    """
    raw_sumstat = spark.read.csv(gwas_file_path, header=True, sep="\t")

    # Restrict to the requested chromosome and position window.
    on_chromosome = col('hm_chrom') == target_chrom
    after_start = col('hm_pos') >= start_pos
    before_end = col('hm_pos') <= end_pos
    locus_sumstat = (
        raw_sumstat
        .filter(on_chromosome & after_start & before_end)
        .dropna(subset=['hm_chrom'])
    )

    # z-score is computed before standard_error is renamed below.
    sumstat = locus_sumstat.withColumn('z', col('beta') / col('standard_error'))

    # 'hm_chrom' is intentionally not renamed: a 'chromosome' column already exists.
    renames = (
        ('hm_variant_id', 'ID'),
        ('hm_rsid', 'rsid'),
        ('hm_pos', 'position'),
        ('hm_other_allele', 'allele1'),
        ('hm_effect_allele', 'allele2'),
        ('hm_effect_allele_frequency', 'maf'),
        ('standard_error', 'se'),
        ('p_value', 'p'),
    )
    for source_name, target_name in renames:
        sumstat = sumstat.withColumnRenamed(source_name, target_name)

    output_columns = ['ID', 'rsid', 'chromosome', 'position', 'allele1', 'allele2', 'maf', 'p', 'beta', 'se', 'z']
    return sumstat.select(output_columns)
# Summary statistics for the analysis window defined in the configuration cell.
sumstat  = get_sumstats(gwas_file_path, target_chrom, start_pos, end_pos)

                                                                                

In [None]:
def match_snps(sumstat, ld_matrix_spark, ld_data_spark):
    """Matches SNPs between summary statistics and LD matrix and filters them accordingly.

    Args:
        sumstat: Spark DataFrame of summary statistics with an ``ID`` column.
        ld_matrix_spark: Spark DataFrame that is filtered on an ``ID`` column.
            NOTE(review): the LD matrices built elsewhere in this notebook have
            one column per SNP and no ``ID`` column - confirm the expected layout.
        ld_data_spark: Spark DataFrame with a ``SNP`` column (one row per
            variant) - assumed layout, TODO confirm.

    Returns:
        Tuple ``(sumstat_filtered, ld_matrix_filtered, concordance_test)`` where
        ``concordance_test`` is the inner join of ``sumstat`` and the annotated
        ``ld_data_spark`` on ``ID``.
    """
    # regexp_extract expects the pattern as a plain string (it is evaluated on
    # the JVM); a compiled re.Pattern object cannot be passed.
    # NOTE(review): the first alternative matches the leading run of digits, so
    # for IDs that start with the chromosome number this captures the
    # chromosome rather than the position - confirm the ID format.
    position_pattern = r"(^\d+)|(?<=:)\d+(?=:|$)"
    ld_data_spark = ld_data_spark.withColumn("position", F.regexp_extract(F.col("SNP"), position_pattern, 0))
    ld_data_spark = ld_data_spark.withColumn("ID", F.regexp_replace(F.col("SNP"), r'[:,_]', '_'))

    # Join sumstat with ld_data on 'ID'
    concordance_test = sumstat.join(ld_data_spark, 'ID', 'inner')

    # Collect the matched IDs once and reuse them for both filters.
    matched_ids = [row["ID"] for row in concordance_test.select("ID").distinct().collect()]
    sumstat_filtered = sumstat.filter(F.col("ID").isin(matched_ids))
    ld_matrix_filtered = ld_matrix_spark.filter(F.col("ID").isin(matched_ids))

    return sumstat_filtered, ld_matrix_filtered, concordance_test

# NOTE(review): the Spark match_snps names its parameters ld_matrix_spark /
# ld_data_spark, while the objects passed here are ld_matrix / ld_data -
# confirm these are Spark DataFrames when this cell runs.
sumstat_filtered, ld_matrix_filtered, concordance_test = match_snps(sumstat, ld_matrix, ld_data)


In [None]:
def match_snps(sumstat, ld_matrix, ld_data):
    """Matches SNPs between summary statistics and LD matrix and filters them accordingly.

    Each genotype column label of ``ld_data`` is turned into an ID by replacing
    every ':', ',' or '_' with '_' and then removing the last '_'-separated
    field.  Those IDs are inner-joined with ``sumstat['ID']``.

    Args:
        sumstat: pandas DataFrame with an ``ID`` column.
        ld_matrix: pandas DataFrame indexed (rows and columns) by the column
            labels of ``ld_data``.
        ld_data: pandas DataFrame whose column labels are the LD variant labels.

    Returns:
        Tuple ``(sumstat_filtered, ld_matrix_filtered)``: the rows of ``sumstat``
        whose ID matched (index reset), and ``ld_matrix`` restricted to the
        matched labels in merge order.
    """
    # Only the column labels are needed, so build the lookup from them directly
    # instead of transposing the whole genotype table.
    snp_labels = pd.DataFrame({'SNP': ld_data.columns.astype(str)})
    # regex=True is required: since pandas 2.0 str.replace treats the pattern as
    # a literal string by default, which would leave the labels unchanged.
    snp_labels['ID'] = (
        snp_labels['SNP']
        .str.replace(r'[:,_]', '_', regex=True)
        .str.replace(r'_[^_]+$', '', regex=True)
    )
    concordance_test = pd.merge(sumstat, snp_labels, on='ID')

    # Filter sumstat and LD matrix for ID matches only
    sumstat_filtered = sumstat[sumstat['ID'].isin(concordance_test['ID'])].reset_index(drop=True)
    ld_matrix_filtered = ld_matrix.loc[concordance_test['SNP'], concordance_test['SNP']]
    return sumstat_filtered, ld_matrix_filtered

In [None]:
def allele_flip_check(snp_ids_38, sumstat_filtered):
    """Flip the z-score of variants whose alleles disagree with the LD reference.

    Args:
        snp_ids_38: Iterable of LD-reference variant IDs.  Each ID is split on
            "_" and fields 2 and 3 are taken as the reference and alternate
            allele of the LD panel.
        sumstat_filtered: Spark DataFrame with ``ID``, ``allele1``, ``allele2``
            and ``z`` columns.

    Returns:
        ``sumstat_filtered`` inner-joined with the LD alleles on ``ID`` (adding
        ``ref_LD`` and ``alt_LD``), with ``z`` negated wherever either allele
        differs from the LD panel.
    """
    # An explicit DDL schema avoids needing StringType and also works for an empty list.
    ld_alleles = spark.createDataFrame([(snp_id,) for snp_id in snp_ids_38], "ID string")

    # Split the 'ID' column to extract the LD panel alleles.
    id_parts = F.split(F.col("ID"), "_")
    ld_alleles = (
        ld_alleles
        .withColumn("ref_LD", id_parts.getItem(2))
        .withColumn("alt_LD", id_parts.getItem(3))
    )

    # Align the summary statistics with the LD alleles.
    joint_df = sumstat_filtered.join(ld_alleles, 'ID', 'inner')

    # Flip z-scores if alleles are discordant.
    # NOTE(review): the original compared `ref` / `alt`, which get_sumstats does
    # not produce; `allele1` / `allele2` (renamed from hm_other_allele /
    # hm_effect_allele) are assumed to be the intended columns - confirm.
    # NOTE(review): the original also split a `SNP` column of an unbound local
    # `concordance_test` into allele1_LD / allele2_LD that were never used; that
    # dead (and failing) step is removed.
    condition = (F.col('allele1') != F.col('ref_LD')) | (F.col('allele2') != F.col('alt_LD'))
    sumstat_filtered = joint_df.withColumn('z', F.when(condition, -F.col('z')).otherwise(F.col('z')))
    return sumstat_filtered


In [None]:
def _read_whitespace_table(path):
    """Read a whitespace-aligned text table (header on its first line) as an all-string Spark DataFrame."""
    lines = spark.read.text(path)
    header_row = lines.first()
    if header_row is None:
        raise ValueError(f"{path} is empty")
    names = header_row['value'].split()
    # Trim first so the leading padding does not produce an empty first field.
    tokens = F.split(F.trim(F.col('value')), r'\s+')
    table = lines.select(*[tokens.getItem(i).alias(name) for i, name in enumerate(names)])
    # The header line is among the text rows; drop it.
    return table.filter(F.col(names[0]) != names[0])


def dentist_calc(sumstat, target, lead_snp_ID, n_sample):
    """Annotate summary statistics with a DENTIST-style statistic relative to the lead variant.

    Args:
        sumstat: NOTE(review): not used - as in the original, it is replaced by
            the table read from ``{target}_locus_sumstat_flip_check.txt.gz``.
            Confirm whether the argument should be used instead.
        target: Locus label used to build the input and output file names.
        lead_snp_ID: ID of the lead variant, in the format used by the LD file's
            ``SNP_A`` / ``SNP_B`` columns.
        n_sample: Sample size entering the ``r`` formula.

    Returns:
        Spark DataFrame with the summary-statistics columns plus ``R2``, ``r``,
        ``t_dentist_s`` and ``dentist_outlier``; it is also written as
        tab-separated text to ``{target}_locus_sumstat_with_dentist.txt.gz``.

    Raises:
        ValueError: if no R2 value is available or the lead variant is absent
            from the joined table.
    """
    # 1. Getting R2 column for sumstats
    # Spark's CSV reader treats `sep` literally (no regular expressions), so the
    # whitespace-aligned LD file is tokenised explicitly.
    ld = _read_whitespace_table(f'/Users/hn9/Documents/GitHub/fine-mapping-inf/susieinf/loci/ld/{target}_subset_for_ld_calculation.ld')
    lead_ld = ld.filter((ld['SNP_A'] == lead_snp_ID) | (ld['SNP_B'] == lead_snp_ID))
    sumstat = spark.read.csv(f'/Users/hn9/Documents/GitHub/fine-mapping-inf/susieinf/loci/{target}_locus_sumstat_flip_check.txt.gz', sep='\t', header=True)
    # Rewrite "..._<pos>_<A>_<B>" as "...:<pos>:<A>:<B>".  Spark uses Java
    # replacement syntax, where groups are written $1, $2, ... (not \1).
    # The pattern only covers single-letter alleles.
    sumstat = sumstat.withColumn('ID', F.regexp_replace('ID', r"_(\d+)_([A-Z])_([A-Z])", ":$1:$2:$3"))
    merged = lead_ld.join(sumstat.select('ID'), lead_ld['SNP_B'] == sumstat['ID'])
    df = merged.select('ID', 'R2')
    df = df.join(sumstat, 'ID', 'left_outer')

    # One aggregation job for both the sum and the count.
    r2_sum, r2_count = df.agg(F.sum('R2'), F.count('R2')).first()
    if not r2_count:
        raise ValueError(f"no R2 values found for lead variant {lead_snp_ID!r}")
    # NOTE(review): n_sample cancels out, so `r` is the mean R2 and is identical
    # for every variant - confirm that a per-variant correlation was not intended.
    r_value = (n_sample * r2_sum) / (n_sample * r2_count)
    df = df.withColumn('r', F.lit(r_value))

    lead_row = df.filter(df['ID'] == lead_snp_ID).first()
    if lead_row is None:
        raise ValueError(f"lead variant {lead_snp_ID!r} not found in the summary statistics")
    # Columns are read without a schema, so the collected values are strings.
    lead_z = float(lead_row['beta']) / float(lead_row['se'])

    # 2. Calculate 't_dentist_s' and 'dentist_outlier'
    df = df.withColumn('t_dentist_s', (df['z'] - df['r'] * lead_z)**2 / (1 - df['r']**2))
    # NOTE(review): the statistic itself is compared with 1e-4 - confirm that a
    # p-value threshold was not intended.
    df = df.withColumn('dentist_outlier', F.when((df['t_dentist_s'] < 1e-4) & (df['R2'] > 0.6), 1).otherwise(0))
    df = df.drop('CHR_A', 'BP_A', 'SNP_A', 'CHR_B', 'BP_B', 'SNP_B')
    # NOTE(review): Spark writes a directory of part files at this path, not a
    # single gzip file, and the path is relative - confirm what the downstream
    # fine-mapping command expects to read.
    df.write.csv(f'{target}_locus_sumstat_with_dentist.txt.gz', sep='\t', header=True)

    return df


In [None]:
# End-to-end run: LD matrix -> summary statistics -> SNP matching -> allele
# check -> lead-variant LD with PLINK -> DENTIST annotation.
ld_data, ld_matrix = get_ld_matrix()
# get_sumstats requires the chromosome and window bounds as well as the path.
sumstat = get_sumstats(gwas_file_path, target_chrom, start_pos, end_pos)
# NOTE(review): three values are unpacked, which matches only the Spark
# definition of match_snps; the pandas definition returns two.
sumstat_filtered, ld_matrix_filtered, concordance_test = match_snps(sumstat, ld_matrix, ld_data)
# NOTE(review): allele_flip_check names its first parameter snp_ids_38, but
# concordance_test is passed here - confirm the intended argument.
sumstat_filtered = allele_flip_check(concordance_test, sumstat_filtered)

# LD (r2) between the lead variant and every variant within the PLINK window.
# `lead_snp_ID` is the name defined in the configuration cell (the original
# referenced an undefined `lead_snp_ID1`).
lead_ld_command = f"""plink --bfile /Users/hn9/Documents/Analysis/FM-comparison/ukb_v3_downsampled10k/ukb_v3_chr{target_chrom}.downsampled10k \
        --allow-extra-chr \
        --r2 \
        --ld-snp {lead_snp_ID} \
        --ld-window-kb 1000 \
        --ld-window 99999 \
        --ld-window-r2 0 \
        --out /Users/hn9/Documents/GitHub/fine-mapping-inf/susieinf/loci/ld/{target}_subset_for_ld_calculation
"""
# check=True: stop here if PLINK fails, since dentist_calc reads its output.
subprocess.run(lead_ld_command, shell=True, check=True)

# dentist_calc requires target, lead_snp_ID and n_sample in addition to the table.
df = dentist_calc(sumstat_filtered, target, lead_snp_ID, n_sample)

In [127]:
# Run SuSiE-inf fine-mapping on the DENTIST-annotated summary statistics and
# the precomputed LD matrix file for this locus.
susieinf_command = f"""python /Users/hn9/Documents/GitHub/fine-mapping-inf/run_fine_mapping.py \
    --sumstats /Users/hn9/Documents/GitHub/fine-mapping-inf/susieinf/loci/{target}_locus_sumstat_with_dentist.txt.gz \
    --beta-col-name beta \
    --se-col-name se \
    --ld-file /Users/hn9/Documents/GitHub/fine-mapping-inf/susieinf/loci/{target}_locus_ukbb_ld.txt.gz \
    --n {n_sample} \
    --method susieinf \
    --save-tsv \
    --eigen-decomp-prefix {target}_locus \
    --output-prefix  {target}_locus """

# check=True: raise if the fine-mapping script exits non-zero rather than
# silently continuing to the decompression step.
subprocess.run(susieinf_command, shell=True, check=True)

Reading summary statistics from file /Users/hn9/Documents/GitHub/fine-mapping-inf/susieinf/loci/APOE_LDL_locus_sumstat_with_dentist.txt.gz
481 SNPs in summary statistics file
Reading in LD matrix from file /Users/hn9/Documents/GitHub/fine-mapping-inf/susieinf/loci/APOE_LDL_locus_ukbb_ld.txt.gz
Reading in LD matrix took 0.05 seconds
Performing eigen decomposition
Eigen decomposition took 0.06 seconds
Running susieinf
*********************************************************************
* SuSiE-inf
* Version 1.3
* (C) Ran Cui, Zhou Fan
*********************************************************************
Iteration 0
Update s^2 for effect 0 to 0.000006
Update s^2 for effect 1 to 0.000006
Update s^2 for effect 2 to 0.000006
Update s^2 for effect 3 to 0.000006
Update s^2 for effect 4 to 0.000006
Update s^2 for effect 5 to 0.000006
Update s^2 for effect 6 to 0.000006
Update s^2 for effect 7 to 0.000006
Update s^2 for effect 8 to 0.000006
Update s^2 for effect 9 to 0.000006
Update (sigma^2,ta

CompletedProcess(args='python /Users/hn9/Documents/GitHub/fine-mapping-inf/run_fine_mapping.py     --sumstats /Users/hn9/Documents/GitHub/fine-mapping-inf/susieinf/loci/APOE_LDL_locus_sumstat_with_dentist.txt.gz     --beta-col-name beta     --se-col-name se     --ld-file /Users/hn9/Documents/GitHub/fine-mapping-inf/susieinf/loci/APOE_LDL_locus_ukbb_ld.txt.gz     --n 94595     --method susieinf     --save-tsv     --eigen-decomp-prefix APOE_LDL_locus     --output-prefix  APOE_LDL_locus ', returncode=0)

In [128]:
# Decompress the fine-mapping output into a plain-text file.
# shell=True is needed because the command relies on the shell's `>` redirection.
zip_comamnd = f"""gunzip -c {target}_locus.susieinf.bgz > {target}_locus.txt"""
# check=True: raise if gunzip fails instead of leaving an empty or partial output file unnoticed.
subprocess.run(zip_comamnd, shell=True, check=True)

CompletedProcess(args='gunzip -c APOE_LDL_locus.susieinf.bgz > APOE_LDL_locus.txt', returncode=0)