# Geno prep
1. Separate binary and quantitative covs
2. Residualize and quantile transform phenos

In [1]:
import pandas as pd
import os
import statsmodels.api as sm
from sklearn.preprocessing import quantile_transform
import numpy as np

# Directory the covariate tables are read from and written to.
workspace_path = "/data/workspaces/lag/workspaces/lg-ukbiobank/projects/FLICA_multimodal/"
# Directory holding the phenotype table; residualized outputs, the subject
# list and copies of the covariate tables are written here as well.
cfs_path = "/data/clusterfs/lag/users/jitame/FLICA/pheno"

In [2]:
# Load the covariate table, discard columns that are entirely empty, then keep
# only the subjects that have a value for every remaining covariate.
covs_path = os.path.join(workspace_path, "regenie_covariates_50k.tsv")
covs = (
    pd.read_csv(covs_path, sep="\t")
    .dropna(axis=1, how="all")
    .dropna(axis=0, how="any")
)

binary_covs = ["Genetic_sex", "geno_array_dummy", "site_dummy_11025", "site_dummy_11026", "site_dummy_11027"]
# Only the site dummies (entries from position 2 onwards) are cast to int.
covs = covs.astype({site_cov: int for site_cov in binary_covs[2:]})

# Table with a header row holding every column except the last one.
# NOTE(review): the last column is presumably site_dummy_11028 (it is dropped
# by name below) -- confirm, since this selection is positional.
regenie_out = os.path.join(workspace_path, "regenie_final_covs_32k.tsv")
covs.iloc[:, :-1].to_csv(regenie_out, sep="\t", header=True, index=False)

# Header-less table of FID, IID and the binary covariates.
gcta_binary_out = os.path.join(workspace_path, "gcta_binary_covs_32k.tsv")
covs.loc[:, ["FID", "IID", *binary_covs]].to_csv(gcta_binary_out, sep="\t", header=False, index=False)

# Header-less table of everything that is neither a binary covariate nor
# site_dummy_11028 (so FID and IID stay in).
gcta_qcovs_out = os.path.join(workspace_path, "gcta_qcovs_32k.tsv")
covs.drop(columns=[*binary_covs, "site_dummy_11028"]).to_csv(gcta_qcovs_out, sep="\t", header=False, index=False)

# From here on the covariate frame no longer carries site_dummy_11028.
covs = covs.drop(columns=["site_dummy_11028"])

In [3]:
# (rows, columns) of the covariate table after the cleaning above.
covs.shape

(33363, 31)

In [4]:
# Load the tab-separated table stored under cfs_path that is residualized below.
data = pd.read_csv(os.path.join(cfs_path, "rs_ics_32k_gcta_N32677.tsv"), sep="\t")

In [6]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,FID,IID,5c_c2,10c_c4
0,1000099,1000099,-1.261045,-0.417428
1,1000192,1000192,-0.253847,-0.649029
2,1000256,1000256,1.908976,0.522966
3,1000281,1000281,0.765793,0.949243
4,1000330,1000330,-0.550069,0.518803


In [7]:
def save_df(data, file_name):
    """Write ``data`` as a tab-separated file with leading FID and IID columns.

    FID and IID are both taken from the index of ``data`` (cast to int). The
    remaining columns follow in their original order, missing values are
    written as ``NA`` and a header row is included.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to save; its index must hold the integer subject IDs. The frame
        itself is left unmodified.
    file_name : str
        Output path. ``str.format`` is applied to it with the number of rows,
        so it may contain a ``{}`` placeholder for the sample size.
    """
    ids = data.index.values.astype(int)
    # Work on a copy so the caller's frame does not silently gain FID/IID
    # columns, and skip any pre-existing FID/IID to avoid duplicate columns.
    payload_cols = [col for col in data.columns if col not in ("FID", "IID")]
    out = data[payload_cols].copy()
    # Inserting IID first and FID second at position 0 yields FID, IID, ...
    out.insert(0, "IID", ids)
    out.insert(0, "FID", ids)
    out.to_csv(file_name.format(len(out)), na_rep="NA", sep="\t", index=False, header=True)

def residualize(data, covs, fn_out, add_intercept=True):
    """Regress the covariates out of every phenotype column, then normalise.

    Subjects are matched on ``FID``. For each phenotype column an OLS model of
    that column on the covariates is fitted using only the rows where the
    phenotype is observed; its residuals are stored and rows with a missing
    phenotype stay NaN. The residual table is then mapped to a normal
    distribution with ``quantile_transform``. Both tables are written through
    ``save_df``. The input frames are not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with ``FID`` and ``IID`` columns; every other column is treated
        as a dependent variable.
    covs : pandas.DataFrame
        Table with ``FID`` and ``IID`` columns; every other column enters the
        design matrix.
    fn_out : str
        Path whose extension is stripped to build the two output names
        ``<stem>_resid_N<n>.tsv`` and ``<stem>_resid_norm_N<n>.tsv``.
    add_intercept : bool, default True
        Add a constant column to the design matrix unless one is already
        present. ``sm.OLS`` does not include an intercept on its own; pass
        ``False`` to fit without one.

    Raises
    ------
    ValueError
        If ``data`` and ``covs`` share no FID.
    """
    # set up files: subjects present in both tables, in sorted FID order
    subs = sorted(set(data["FID"]) & set(covs["FID"]))
    print("Number of subs in both files: {}".format(len(subs)))
    if not subs:
        raise ValueError("No FID is shared between data and covs; nothing to residualize.")

    # Non-inplace set_index/drop: rebinding the local names keeps the caller's
    # frames intact, so the function can be re-run and later cells can still
    # read data["FID"] / covs["FID"].
    data = data.set_index("FID").loc[subs].drop(columns=["IID"])
    covs = covs.set_index("FID").loc[subs].drop(columns=["IID"])

    if add_intercept:
        # has_constant="skip" leaves the design untouched when it already
        # contains a constant column.
        covs = sm.add_constant(covs, has_constant="skip")

    print("Residualizing...")
    # define new dataframe
    data_new = pd.DataFrame(index=data.index.values)

    # residualize one dependent variable at a time
    for dep_var in data.columns:
        observed = data[dep_var].notna().to_numpy()
        results = sm.OLS(data.loc[observed, dep_var], exog=covs.loc[observed, :]).fit()
        # Residuals go back to their original row positions; rows where the
        # phenotype is missing stay NaN.
        resid_full = np.full(len(data), np.nan)
        resid_full[observed] = np.asarray(results.resid)
        data_new[dep_var] = resid_full
        print(data_new.shape)

    print("Quantile transform...")
    # quantile transformation of each residual column to a normal distribution
    data_new2 = pd.DataFrame(data=quantile_transform(data_new.to_numpy(), n_quantiles=1000, output_distribution='normal', random_state=0, copy=True),
                             columns=data_new.columns,
                             index=data_new.index.values)

    print("Saving results...")
    # splitext instead of slicing off four characters, so any extension
    # length (or no extension at all) yields the right stem
    stem = os.path.splitext(fn_out)[0]
    save_df(data=data_new,
            file_name=stem + "_resid_N{}.tsv".format(len(data_new)))
    save_df(data=data_new2,
            file_name=stem + "_resid_norm_N{}.tsv".format(len(data_new2)))
    print("Done!")


In [8]:
# The input file path doubles as the output template: the results are written
# next to it with "_resid_N<n>.tsv" and "_resid_norm_N<n>.tsv" appended to the
# file stem.
residualize(data, covs, os.path.join(cfs_path, "rs_ics_32k_gcta_N32677.tsv"))

Number of subs in both files: 32661
Residualizing...
(32661, 1)
(32661, 2)
Quantile transform...
Saving results...
Done!


In [6]:
# Write the FID/IID list of the subjects present in both the phenotype and the
# covariate table (one subject per line, sorted by FID, no header).
# FID can be a regular column or, after an in-place set_index("FID"), the
# index; normalise both cases before matching.
data_ids = data if "FID" in data.columns else data.reset_index()
covs_fids = covs["FID"] if "FID" in covs.columns else covs.index
# Select rows by FID membership rather than by index label, and write the
# FID and IID columns (not FID twice).
shared_ids = (
    data_ids.loc[data_ids["FID"].isin(covs_fids), ["FID", "IID"]]
    .drop_duplicates(subset="FID")
    .sort_values("FID")
)
subs = shared_ids["FID"].tolist()
# N in the file name is the number of subjects actually written.
shared_ids.to_csv(os.path.join(cfs_path, "subs_list_FID_IID_N{}.txt".format(len(subs))), sep="\t", header=False, index=False)

In [11]:
from shutil import copy2

# Duplicate each covariate table from the workspace directory into cfs_path,
# keeping the file name unchanged.
cov_files = [
    "regenie_final_covs_32k.tsv",
    "gcta_binary_covs_32k.tsv",
    "gcta_qcovs_32k.tsv",
]
for fn in cov_files:
    src = os.path.join(workspace_path, fn)
    dst = os.path.join(cfs_path, fn)
    copy2(src, dst)