In [1]:
import pandas as pd
import os

# Get lists of samples with White and Black ancestries

In [5]:
# The input table and both output ID lists live in the same directory.
misc_dir = "/cluster/projects/p33/users/alexeas/ukb/misc"
fname = os.path.join(misc_dir, "ancestry.csv")
outf_white = os.path.join(misc_dir, "id_white.txt")
outf_black = os.path.join(misc_dir, "id_black.txt")

# Values of the "21000-0.0" column that place a sample in each group.
white_codes = ["1", "1001", "1002", "1003"]
black_codes = ["4", "4001", "4002", "4003"]

# Read every value as text so the codes above are compared as strings.
df = pd.read_csv(fname, dtype=str, usecols=["eid", "21000-0.0"])
# The output files carry the sample ID twice: eid plus an identical IID column.
df["IID"] = df.eid

# Boolean row masks, one per group.
ancestry = df["21000-0.0"]
i_white = ancestry.isin(white_codes)
i_black = ancestry.isin(black_codes)

print(f"{len(df)} total samples")
print(f"{i_white.sum()} white samples")
print(f"{i_black.sum()} black samples")

# One headerless, tab-separated, two-column ID file per group.
for mask, outf in ((i_white, outf_white), (i_black, outf_black)):
    df.loc[mask, ["eid", "IID"]].to_csv(outf, header=False, index=False, sep='\t')

502419 total samples
472621 white samples
8058 black samples


# Get variant lists with INFO > 0.8

In [3]:
info_dir = "/cluster/projects/p33/groups/imaging/ukbio/genetics/INFO"
misc_dir = "/cluster/projects/p33/users/alexeas/ukb/misc"
info_threshold=0.8

# File-name tag for the threshold: the text after the last '.' of its string
# form (0.8 -> "8"). It does not depend on the chromosome, so build it once.
threshold_str = str(info_threshold).rpartition('.')[2]

# One input file and one output list per chromosome file chr1 .. chr22.
for i in range(1,23):
    fname = os.path.join(info_dir, f"ukb_mfi_chr{i}_v3.txt")
    # Headerless tab-separated input; only the columns at positions 1 and 7
    # are loaded and labelled SNP and INFO.
    df = pd.read_csv(
        fname,
        sep="\t",
        header=None,
        usecols=[1,7],
        names=["SNP", "INFO"],
    )
    # Keep identifiers whose INFO value is strictly above the threshold.
    passes_info = df["INFO"].gt(info_threshold)
    snps = df.loc[passes_info, "SNP"]
    out_fname = os.path.join(misc_dir, f"snps_info{threshold_str}_chr{i}.txt")
    # Plain list of identifiers: no header row, no index column.
    snps.to_csv(out_fname, header=None, index=None)