# Notebook to Prepare 1000 Genomes Data

In [3]:
import allel
import h5py  # Python Package to do the HDF5.
import numpy as np
import pandas as pd

In [26]:
#path_vcf = "../Data/1000Genomes/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz" # Path of VCF
path_vcf = "../Data/1000Genomes/1000gX1240kEur.vcf"
#h5_path = "../Data/1000Genomes/1000Genomes_X.hdf5"
h5_path = "../Data/1000Genomes/1000gX1240kEur.hdf5"
ind_path = "../Data/1000Genomes/integrated_call_samples_v2.20130502.ALL.ped"  # Family Relationships
pop_path = "../Data/1000Genomes/integrated_call_samples_v3.20130502.ALL.panel" # Population Information

## Do the Conversion to HDF 5 
Comment out

In [30]:
### Do the conversion to hdf5 (if not done already)
# geno = allel.read_vcf(path_vcf) # Load the VCF # Load the VCF # Needs too much Memory for my laptop
allel.vcf_to_hdf5(input=path_vcf, output=h5_path, compression="gzip") # Do the conversion to hdf5. Takes 10 Minutes

In [28]:
## Load HDF5
f = h5py.File(h5_path, "r") # Load for Sanity Check. See below!
print("Loaded %i variants" % np.shape(f["calldata/GT"])[0])
print("Loaded %i individuals" % np.shape(f["calldata/GT"])[1])
print(list(f["calldata"].keys()))
print(list(f["variants"].keys()))
#print(list(f["samples"].keys()))

Loaded 47486 variants
Loaded 503 individuals
['GT']
['ALT', 'CHROM', 'FILTER_PASS', 'ID', 'POS', 'QUAL', 'REF']


In [5]:
### Load the ID of the Individuals
ids = np.array(f["variants/ID"])
df_s_empirical = pd.DataFrame({'Individual ID' : list(f["samples"])})

In [6]:
df_s_empirical["Individual ID"] = df_s_empirical["Individual ID"].str.split("_").str[-1]

# Load and Merge Individual Meta Data

In [7]:
df_i = pd.read_csv(ind_path, sep="\t")
print(f"Loaded {np.shape(df_i)[0]} Individuals")

df_pops = pd.read_csv(pop_path, sep="\t")
print(f"Loaded {np.shape(df_i)[0]} Population Data")

### Merge with IDs in Genotype File
df = pd.merge(df_s_empirical, df_i, on='Individual ID', how='inner')
df = pd.merge(df, df_pops, left_on="Individual ID", right_on="sample", how="inner")
print(f"Merged from {len(df_s_empirical)} to {len(df)} individuals")
assert(len(df_s_empirical) == len(df))  # Sanity Check

Loaded 3691 Individuals
Loaded 3691 Population Data
Merged from 2504 to 2504 individuals


In [8]:
df["Population"].value_counts()

GWD    113
YRI    108
TSI    107
IBS    107
CHS    105
JPT    104
PUR    104
CHB    103
GIH    103
ITU    102
STU    102
FIN     99
KHV     99
CEU     99
ESN     99
LWK     99
ACB     96
PJL     96
CLM     94
CDX     93
GBR     91
BEB     86
MSL     85
PEL     85
MXL     64
ASW     61
Name: Population, dtype: int64

In [36]:
#df[df["Paternal ID"] != "0"]

# Save Individual List
Save table with Individual and Family IDs

In [23]:
save_path = "../Data/1000Genomes/EUR.csv"

eur_inds = df["super_pop"]=="EUR"
print(f"Nr of European Samples: {np.sum(eur_inds)}")

df_save = df[eur_inds]
df_save = df_save[["Family ID", "Individual ID"]] # Extract column of Individual and Family ID
df_save.to_csv(save_path, sep="\t", header=None, index=False)
print(f"Saved to {save_path}. Nr Individuals: {len(df_save)}")


### Do the same, but for families
save_path = "../Data/1000Genomes/EUR_fam.csv"

eur_inds = df["super_pop"]=="EUR"
print(f"Nr of European Samples: {np.sum(eur_inds)}")

df_save = df[eur_inds]
df_save = df_save["Individual ID"] # Extract column of Individual and Family ID
df_save.to_csv(save_path, sep="\t", header=None, index=False)
print(f"Saved to {save_path}. Nr Individuals: {len(df_save)}")

Nr of European Samples: 503
Saved to ../Data/1000Genomes/EUR.csv. Nr Individuals: 503
Nr of European Samples: 503
Saved to ../Data/1000Genomes/EUR_fam.csv. Nr Individuals: 503


# Check against Sardinian X data

In [11]:
path_snp = "../../ancient-sardinia/data/bed/full230.snp" # All SNPs found in the 1240k Ancient Panel

df_snp = pd.read_csv(path_snp, header=None, sep=r"\s*", engine="python")
df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"]
df_snp = df_snp[df_snp["chr"]==23]

print(f"Loaded {len(df_snp)} X SNPs.")

  yield pat.split(line.strip())
  yield pat.split(line.strip())


Loaded 49711 X SNPs.


# Save  interesection with 1240k Marker 
Prepare txt List for Plink filtering

In [12]:
save_path = "../Data/1000Genomes/variants1240k"

found = np.isin(f["variants/POS"], df_snp["pos"])
print(f"Intersection: {np.sum(found)} out of {len(found)} SNPS")
variants = f["variants/ID"][found]

dots = np.where(variants == ".")[0]
print(f"Found {len(dots)} unnamed SNPs")
variants = np.delete(variants, dots)

np.savetxt(save_path, variants, fmt="%s")
print(f"Successfully saved to {save_path}. Length: {len(variants)}")

Intersection: 48298 out of 3468093 SNPS
Found 812 unnamed SNPs
Successfully saved to ../Data/1000Genomes/variants1240k. Length: 47486


In [15]:
samples = list(f["samples"])

# Area 51

In [21]:
df[df["Population"]=="CEU"]

Unnamed: 0,Individual ID,Family ID,Paternal ID,Maternal ID,Gender,Phenotype,Population,Relationship,Siblings,Second Order,...,phase 3 genotypes,related genotypes,omni genotypes,affy_genotypes,sample,pop,super_pop,gender,Unnamed: 4,Unnamed: 5
1656,NA06984,1328,0,0,1,0,CEU,father,0,0,...,1,0,1,1,NA06984,CEU,EUR,male,,
1657,NA06985,1341,0,0,2,0,CEU,mat grandmother,0,0,...,1,0,1,1,NA06985,CEU,EUR,female,,
1658,NA06986,13291,0,0,1,0,CEU,mat grandfather,0,0,...,1,0,1,1,NA06986,CEU,EUR,male,,
1659,NA06989,1328,0,0,2,0,CEU,mother,0,0,...,1,0,1,1,NA06989,CEU,EUR,female,,
1660,NA06994,1340,0,0,1,0,CEU,pat grandfather,0,0,...,1,0,1,1,NA06994,CEU,EUR,male,,
1661,NA07000,1340,0,0,2,0,CEU,pat grandmother,0,0,...,1,0,1,1,NA07000,CEU,EUR,female,,
1662,NA07037,13291,0,0,2,0,CEU,pat grandmother,0,0,...,1,0,1,1,NA07037,CEU,EUR,female,,
1663,NA07048,1341,NA07034,NA07055,1,0,CEU,father,0,0,...,1,0,1,1,NA07048,CEU,EUR,male,,
1664,NA07051,13292,0,0,1,0,CEU,mat grandfather,0,0,...,1,0,1,1,NA07051,CEU,EUR,male,,
1665,NA07056,1340,0,0,2,0,CEU,mat grandmother,0,0,...,1,0,1,1,NA07056,CEU,EUR,female,,
