## Prep data for exome analysis + GWAS
Input:
- preprocessed PLINK genome data
- uncorrected brain data (edges, nodes and asymmetries from edges)

Pipeline:
- Concatenate nodes to one dataframe
- Format the dataframe to match the regenie input layout (FID and IID as the first two columns)
- Regenie step 1


In [2]:
import pandas as pd
import os
from numpy import savetxt, ceil

# Root directory of the project; the phenotype and subject-list paths below
# are built relative to it.
base_path = "/data/clusterfs/lag/users/jitame/SENT_CORE/"

# Phenotype CSV files to read (located in <base_path>/pheno)
file_exts = ["sent_edges_asym_N29682.csv", "sent_edges_N29682.csv"]
file_names = [os.path.join(base_path, "pheno", x) for x in file_exts]

# Read the final exome subject list: one integer subject ID per line.
# The context manager closes the file handle again, and blank lines are
# skipped so that the last subject is kept even when the file does not end
# with a newline.
exome_sub_list = os.path.join(base_path, "subj_sent_N30652_exome_final_pass_sex.txt")
with open(exome_sub_list) as sub_file:
    exome_subs = [int(line) for line in sub_file if line.strip()]

In [5]:
# Switch for the driver code at the end of this cell: when True, the phenotype
# CSVs are concatenated, reformatted and written to disk.
concat = True
# NOTE(review): not referenced anywhere in the visible code - confirm whether
# this flag is still needed.
unconcat = False

def concat_dfs(fn, df):
    """
    Read one phenotype CSV and append its columns to an existing dataframe.

    The first two underscore-separated tokens of the file name form the
    phenotype name. That name is prefixed to every column of the newly read
    table before the table is joined column-wise (axis=1) onto `df`.

    fn : path of the CSV file; its first column is used as the index
    df : dataframe collected so far (may be empty)

    Returns the combined dataframe.
    """
    # phenotype prefix, e.g. "sent_edges_" for "sent_edges_N29682.csv"
    tokens = os.path.basename(fn).split('_')
    prefix = tokens[0] + '_' + tokens[1] + '_'

    # load the new table and tag its columns with the phenotype name
    incoming = pd.read_csv(fn, index_col=0)
    incoming.columns = [prefix + col for col in incoming.columns]

    # column-wise join of the existing and the new table
    return pd.concat([df, incoming], axis=1)


def format_df_regenie(df, exome_subs=None):
    """
    Bring a subject-indexed dataframe into the layout used for regenie.

    Optionally restricts the rows to `exome_subs`, drops every subject that
    has a NaN in any column, and returns a new dataframe whose first two
    columns, FID and IID, both hold the former index values, followed by
    the data columns.

    df : dataframe indexed by subject ID
    exome_subs : optional list of subject IDs to select (in that order)
    """
    # keep only the subjects that are part of the exome data
    if exome_subs is not None:
        df = df.loc[exome_subs]

    # subjects with at least one missing value are removed
    incomplete = list(df.index[df.isna().any(axis=1)].values)
    df = df.drop(incomplete)

    # data columns to keep; this one column is left out when it is present
    cols = list(df.columns)
    try:
        cols.remove("aicha_nodes_Unnamed: 0.1")
    except ValueError:
        pass

    # turn the index into an ordinary column (it becomes the first column)
    df = df.reset_index()
    id_col = df.columns[0]

    # both ID columns carry the subject ID
    df['FID'] = df[id_col]
    df['IID'] = df[id_col]

    # IDs first, then the data columns
    return df[['FID', 'IID'] + cols]

def save_in_chunks(df, no_chunks, fn):
    """
    Save the data columns of a dataframe in column-wise chunks.

    The non-ID columns are split into at most `no_chunks` groups of (nearly)
    equal size. Each group is written together with the FID and IID columns
    to a gzip-compressed, tab-separated file named
    "<fn without its last 16 characters>exome_<j>_of_<no_chunks>.tsv.gz".

    df : dataframe that contains the columns 'FID' and 'IID'
    no_chunks : requested number of chunks (positive integer)
    fn : file name from which the output prefix is derived

    Raises ValueError if `no_chunks` is smaller than 1 or an ID column is
    missing from `df`.
    """
    if no_chunks < 1:
        raise ValueError("no_chunks must be a positive integer")

    #get column list
    cols = list(df.columns)

    #specify ID vars and remove from other columns
    ids = ['FID', 'IID']
    for i in ids:
        cols.remove(i)

    #calculate column chunk size; at least 1 so that range() below gets a
    #valid step even when there are no data columns
    chunk_size = max(1, int(ceil(len(cols) / no_chunks)))

    #chunk col list
    chunked_list = [cols[j:j + chunk_size] for j in range(0, len(cols), chunk_size)]

    #iterate over the chunks that actually exist: rounding up the chunk size
    #can yield fewer than no_chunks groups
    for j, cols_out in enumerate(chunked_list):
        #every chunk keeps the ID columns so that rows stay identifiable
        df.to_csv(fn[:-16] + "exome_{0}_of_{1}.tsv.gz".format(j, no_chunks),
                  chunksize=1000,  #sets row chunksize to write
                  columns=ids + cols_out,
                  index=False,
                  sep="\t",
                  compression="gzip")
    
if concat:
    print("Process concatenate dfs")
    #start from an empty dataframe; each phenotype file is appended column-wise
    df=pd.DataFrame()

    #read each phenotype CSV and add its name-prefixed columns to df
    for fn in file_names:
        df = concat_dfs(fn, df)

    #optional (disabled): write out the subjects that have a NaN in any column
    #nan_subs = list(df[df.isna().any(axis=1)].index.values)
    #savetxt(os.path.join(base_path, 'nan_subs_sent_vs_aicha.txt'), nan_subs, delimiter="\n", fmt="%s")

    #restrict to the exome subjects and bring into regenie layout (FID, IID, data columns)
    df = format_df_regenie(df, exome_subs=exome_subs)

    print(df.columns)

    #write a tab-separated file, then delete the dataframe from memory to save space
    df.to_csv(os.path.join(base_path, "pheno", "sent_edges_exome.tsv"), index=False, sep="\t")
    del df

In [6]:
import os
import pandas as pd
#read dataframe

def make_covars_regenie_ready(fn_in, fn_out, exome_subs, exome=False, exome_batch_fn=None):
    """
    Write a covariate table in regenie layout (FID, IID, covariates).

    Reads the tab-separated covariate file `fn_in` (first column = subject
    ID), selects the rows of `exome_subs`, optionally adds an "exome_batch"
    column, and writes the result tab-separated to `fn_out` with FID and IID
    (both the subject ID) as the first two columns.

    fn_in : path of the input covariate file
    fn_out : path of the output file
    exome_subs : list of subject IDs to select
    exome : if True, add the exome batch as covariate "exome_batch"
    exome_batch_fn : tab-separated file with the exome batch per subject
                     (first column = subject ID); if None, the batch file on
                     the lg-ukbiobank workspace is used
    """
    covars = pd.read_csv(fn_in, sep="\t", index_col=0)

    #select exome subs
    covars = covars.loc[exome_subs]

    if exome:
        if exome_batch_fn is None:
            exome_batch_fn = "/data/workspaces/lag/workspaces/lg-ukbiobank/derived_data/genetic_data/exome/exome_release_final/exome_batch/exome_batches.txt"
        exome_batch = pd.read_csv(exome_batch_fn, sep="\t", index_col=0)
        #NOTE(review): a dataframe is assigned to a single column here;
        #presumably the batch file has exactly one data column - confirm
        covars["exome_batch"] = exome_batch.loc[exome_subs]

    #remember the covariate columns before the index becomes a column
    cols = list(covars.columns)
    covars = covars.reset_index()
    print(covars.head())

    #after reset_index the former index (subject ID) is the first column,
    #whatever it is called in the input file
    subject_col = covars.columns[0]
    covars['FID'] = covars[subject_col]
    covars['IID'] = covars[subject_col]
    covars = covars[['FID', 'IID'] + cols]

    #save as (uncompressed) tab-separated file
    covars.to_csv(fn_out, index=False, sep="\t")
    
#input covariate files and the regenie-ready files that are derived from them
covars_dir = os.path.join(base_path, "covars")
in_files = [
    os.path.join(covars_dir, "covars_pc10_for_correction_N30660_gwas_batch.txt"),
    os.path.join(covars_dir, "covars_pc10_for_correction_N30660_exome_gwas_batch.txt"),
]
out_files = [
    os.path.join(covars_dir, "covars_pc10_gwas.tsv"),
    os.path.join(covars_dir, "covars_pc10_exome.tsv"),
]

#first pair: GWAS covariates; second pair: exome covariates with the exome batch added
for fn_in, fn_out, is_exome in zip(in_files, out_files, [False, True]):
    make_covars_regenie_ready(fn_in, fn_out, exome_subs, exome=is_exome)

In [3]:
#add sex from covariates to FAM files
def sex_to_fam(fam_file, covars):
    """
    Loads sex from the covariates and adds it to the .FAM file (in place).

    The first FAM column is matched against the index of `covars`. A
    covariate value of 0 is written as 2, every other value is kept as is.
    FAM subjects without a covariate entry get 0 (PLINK's code for unknown
    sex) instead of NaN, so the column is always written as integers.

    fam_file : path of the tab-separated, header-less FAM file; it is
               overwritten with the updated table
    covars : dataframe indexed by subject ID with a column 'sex'
    """
    #load fam file
    fam = pd.read_csv(fam_file, sep="\t", header=None)

    #sex in the order of the FAM file; NaN where a subject has no covariates
    sex = covars['sex'].reindex(list(fam.iloc[:, 0]))

    #recode 0 -> 2, then mark missing subjects as unknown (0); the integer
    #cast keeps values such as "1.0" out of the FAM file
    plink_sex = sex.where(sex != 0, 2).fillna(0).astype(int)

    #set the fifth FAM column to sex
    fam.iloc[:, 4] = plink_sex.to_numpy()

    #save
    fam.to_csv(fam_file, sep="\t", header=False, index=False)

#Covariate table indexed by its first column; passed on to sex_to_fam
covars = pd.read_csv(
    os.path.join(base_path, "covars", "covars_pc10_for_correction_N30660.txt"),
    sep="\t",
    index_col=0,
)

#Chromosomes 1-22 followed by X
chrs = list(range(1, 23)) + ['X']

#Update the FAM file of every chromosome
for i in chrs:
    print("Chromosome {}".format(i))
    fam_path = os.path.join(base_path, "geno", "geno_N30652_chr{}.fam".format(i))
    sex_to_fam(fam_path, covars)

In [1]:
#change path in predictions list
import pandas as pd
from os import rename

#prediction list and the name under which the original version is kept
pred_list = "/data/clusterfs/lag/users/jitame/SENT_CORE/geno/regenie/step_1_sent_all/st1_out/reg_st1_sent_pred.list"
pred_list_old = "/data/clusterfs/lag/users/jitame/SENT_CORE/geno/regenie/step_1_sent_all/st1_out/reg_st1_sent_pred_mpi_local.list"

#move the original list out of the way
rename(pred_list, pred_list_old)

#directory prefix to replace in the second column, and its replacement
old_prefix = "/data/clusterfs/lag/users/jitame/SENT_CORE/geno/regenie/step_1_sent_all/st1_out/"
new_prefix = "/input_files/"

#rewrite the paths and store the result under the original file name
df = pd.read_csv(pred_list_old, delimiter=" ", header=None)
df.iloc[:, 1] = [path.replace(old_prefix, new_prefix) for path in df.iloc[:, 1]]
df.to_csv(pred_list, header=False, index=False, sep=" ")

In [None]:
%%bash
#GZIP predictions: pack all *.loco files of the prediction directory into one tar.gz archive
pred_path=/data/clusterfs/lag/users/jitame/SENT_CORE/geno/regenie/step_1_sent_all/st1_out
out_file=/data/clusterfs/lag/users/jitame/SENT_CORE/geno/regenie/step_1_sent_all.tar.gz
#stop if the directory cannot be entered; otherwise tar would run in the wrong directory
cd "$pred_path" || exit 1
tar cfz "$out_file" *.loco