In [1]:
import os
import pandas as pd
import glob

# Root directory of this project's results. Later cells read the input
# summary statistics from its "sumstats" subfolder and write the formatted
# tables to its "sumstats_globus" subfolder.
results_path = "/data/workspaces/lag/workspaces/lg-ukbiobank/projects/rest-multimodal/results/"

In [2]:
def snp_id(data):
    """Prepend a ``SNP_ID`` column of the form ``CHR:BP_A2_A1``.

    The identifier is assembled from the ``CHR``, ``BP``, ``A2`` and ``A1``
    columns of ``data``. The new column is inserted in place at position 0
    and the same DataFrame object is returned.

    Raises:
        ValueError: if ``data`` already has a ``SNP_ID`` column
            (raised by ``DataFrame.insert``).
    """
    chromosome = data["CHR"].astype(str)
    position = data["BP"].astype(str)

    # Build the identifier left to right: CHR:BP, then _A2, then _A1.
    locus = chromosome.str.cat(position, sep=":")
    locus_with_a2 = locus.str.cat(data["A2"], sep="_")
    identifier = locus_with_a2.str.cat(data["A1"], sep="_")

    data.insert(loc=0, column="SNP_ID", value=identifier)
    return data

def add_beta_standard_error(data):
    """Add placeholder ``beta`` and ``standard_error`` columns.

    Every row of both columns is filled with the literal string ``"NA"``.
    The DataFrame is modified in place and the same object is returned.
    """
    for placeholder_column in ("beta", "standard_error"):
        data[placeholder_column] = ["NA"] * len(data)
    return data

def load_MOSTEST_sumstats(fn, out_fn=None):
    """Load a gzipped, tab-separated MOSTEST summary-statistics file.

    Processing steps:
      1. Read ``fn`` with the pyarrow CSV parser.
      2. Drop rows whose ``CHR`` value is the literal string ``"CHR"``
         (presumably header lines repeated inside the file -- confirm
         against how the input was produced).
      3. Cast ``BP`` to int64, ``PVAL`` to float64, ``Z`` and ``N`` to float32.
      4. Sort by ``CHR`` then ``BP`` and renumber the index from 0.
      5. Prepend ``SNP_ID`` and add placeholder ``beta`` /
         ``standard_error`` columns.

    Args:
        fn: path to the gzipped summary-statistics file.
        out_fn: optional output path. When given, the processed table is
            written there as a gzipped, tab-separated file.

    Returns:
        The processed DataFrame.
    """
    raw = pd.read_csv(fn, sep="\t", compression="gzip", engine="pyarrow")

    # Every step returns a new frame, so no assignment is made onto the
    # boolean-filtered slice (which would raise SettingWithCopyWarning).
    data = (
        raw[raw["CHR"] != "CHR"]
        .astype({"BP": "int64", "PVAL": "float64", "Z": "float32", "N": "float32"})
        .sort_values(by=["CHR", "BP"])
        .reset_index(drop=True)
    )

    data = snp_id(data)
    data = add_beta_standard_error(data)

    if out_fn is not None:
        # NOTE: to_csv keeps its default index=True here, so the row index
        # is written as an unnamed first column.
        data.to_csv(out_fn, sep="\t", compression="gzip")

    return data

def load_snp_data(fn):
    """Read one tab-separated SNP statistics file using the pyarrow parser.

    Args:
        fn: path of the file to read.

    Returns:
        The file contents as a DataFrame.
    """
    snp_table = pd.read_csv(fn, sep="\t", engine="pyarrow")
    return snp_table

def data_loader_snpstats(fn_all):
    """Load and stack every SNP statistics file matching a glob pattern.

    Matching files are read in sorted path order with ``load_snp_data`` and
    concatenated row-wise, keeping only the columns shared by all files
    (``join="inner"``). Column dtypes are then converted with
    ``DataFrame.convert_dtypes``.

    Args:
        fn_all: glob pattern (str or path-like) selecting the input files.

    Returns:
        A single DataFrame containing the rows of all matched files.

    Raises:
        ValueError: if no file matches ``fn_all``.
    """
    file_list = sorted(glob.glob(os.fspath(fn_all)))
    if not file_list:
        # Fail with the offending pattern instead of pandas' generic
        # "No objects to concatenate" error.
        raise ValueError(f"No files match pattern: {fn_all!r}")

    data = pd.concat(map(load_snp_data, file_list), join="inner", axis=0)

    # convert_dtypes() returns a new frame; without the assignment the
    # conversion is silently discarded.
    data = data.convert_dtypes()

    return data

def format_GWAS_cat(data, out_fn):
    """Rename columns to the submission layout and write a tab-separated file.

    ``data`` is modified in place: columns are renamed according to
    ``rename_dict``, ``n`` is cast to int64, chromosome labels ``"X"`` and
    ``"XY"`` are recoded to 23 and the column cast to int32, and
    ``variant_id`` is rewritten from ``CHR:BP_A2_A1`` to ``CHR_BP_A2_A1``
    with an ``X`` / ``XY`` chromosome prefix replaced by ``23``.

    Args:
        data: DataFrame holding the source columns named in ``rename_dict``
            plus ``beta`` and ``standard_error``.
        out_fn: path of the tab-separated output file (written without the
            row index).

    Returns:
        The same DataFrame object, after the in-place changes.
    """
    rename_dict = {
        "SNP_ID": "variant_id",
        "CHR_x": "chromosome",
        "BP": "base_pair_location",
        "PVAL": "p_value",
        "A1": "effect_allele",
        "A2": "other_allele",
        "minor_allele_frequency.subset.QCtool": "effect_allele_frequency",
        "SNP": "rs_id",
        "info.subset.QCtool": "info",
        "N": "n",
    }
    data.rename(columns=rename_dict, inplace=True)

    data["n"] = data["n"].astype("int64")

    # Cast to object first so the integer code 23 can sit next to the
    # remaining string labels until the final int32 cast.
    data["chromosome"] = data["chromosome"].astype(object)
    data.loc[data["chromosome"].isin(["X", "XY"]), "chromosome"] = 23
    data["chromosome"] = data["chromosome"].astype("int32")

    # Recode only the chromosome prefix. An unanchored replace of "X" would
    # also rewrite any "X" appearing later in the identifier. The vectorised
    # form also leaves missing identifiers missing instead of raising.
    data["variant_id"] = (
        data["variant_id"]
        .str.replace(r"^XY?(?=[:_])", "23", regex=True)
        .str.replace(":", "_", regex=False)
    )

    output_columns = [
        "chromosome",
        "base_pair_location",
        "effect_allele",
        "other_allele",
        "beta",
        "standard_error",
        "effect_allele_frequency",
        "p_value",
        "variant_id",
        "rs_id",
        "info",
        "n",
    ]
    data[output_columns].to_csv(out_fn, sep="\t", index=False)
    return data


In [3]:
# Load the two MOSTEST summary-statistics files from <results_path>/sumstats.
mvgwas_lang = load_MOSTEST_sumstats(
    fn=os.path.join(results_path, "sumstats", "edges.sumstats.gz"),
)
mvgwas_asym = load_MOSTEST_sumstats(
    fn=os.path.join(results_path, "sumstats", "edges_asym.sumstats.gz"),
)

# Load the SNP statistics files selected by a glob pattern
# (the "*" in the file name is expanded by data_loader_snpstats).
fn_all = "/data/clusterfs/lag/users/jitame/SENT_CORE/geno/regenie/gwas/st2_in/filtered/filter_var/subsetting_reg_st2_GWAS_chr*.snpstats_mfi_hrc.snps2keep"
snp_stats = data_loader_snpstats(fn_all=fn_all)

In [4]:
# Attach the SNP statistics to each summary-statistics table. The left join
# on SNP_ID keeps every row of the summary statistics.
mvgwas_lang, mvgwas_asym = (
    sumstats.merge(snp_stats, on="SNP_ID", how="left")
    for sumstats in (mvgwas_lang, mvgwas_asym)
)

In [5]:
# Rename/recode the columns of each table and write the result as a
# tab-separated file under <results_path>/sumstats_globus.
mvgwas_lang = format_GWAS_cat(
    data=mvgwas_lang,
    out_fn=os.path.join(results_path, "sumstats_globus", "language_network_mv_sumstats.tsv"),
)
mvgwas_asym = format_GWAS_cat(
    data=mvgwas_asym,
    out_fn=os.path.join(results_path, "sumstats_globus", "hemispheric_differences_mv_sumstats.tsv"),
)

In [6]:
#mvgwas_lang[["chromosome", "base_pair_location", "effect_allele", "other_allele", "effect_allele_frequency", "p_value", "variant_id", "rs_id", "info", "n"]].head()