# Notebook to Prepare 1000 Genomes Data

In [27]:
import allel
import h5py  # Python Package to do the HDF5.
import numpy as np
import pandas as pd

In [57]:
path_vcf = "../Data/1000Genomes/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz" # Path of VCF
h5_path = "../Data/1000Genomes/1000Genomes_X.hdf5"
ind_path = "../Data/1000Genomes/integrated_call_samples_v2.20130502.ALL.ped"  # Family Relationships
pop_path = "../Data/1000Genomes/integrated_call_samples_v3.20130502.ALL.panel" # Population Information

## Do the Conversion to HDF 5 
Comment out

In [8]:
### Do the conversion to hdf5 (if not done already)
# geno = allel.read_vcf(path_vcf) # Load the VCF # Load the VCF # Needs too much Memory for my laptop
# allel.vcf_to_hdf5(input=path_vcf, output=h5_path, compression="gzip") # Do the conversion to hdf5. Takes 10 Minutes

In [19]:
## Load HDF5
f = h5py.File(h5_path, "r") # Load for Sanity Check. See below!
print("Loaded %i variants" % np.shape(f["calldata/GT"])[0])
print("Loaded %i individuals" % np.shape(f["calldata/GT"])[1])
print(list(f["calldata"].keys()))
print(list(f["variants"].keys()))
#print(list(f["samples"].keys()))

Loaded 3468093 variants
Loaded 2504 individuals
['GT']
['ALT', 'CHROM', 'FILTER_PASS', 'ID', 'POS', 'QUAL', 'REF']


AttributeError: 'Dataset' object has no attribute 'keys'

In [25]:
np.shape(f["calldata/GT"])

(3468093, 2504, 2)

In [14]:
### Load the ID of the Individuals
ids = np.array(f["variants/ID"])

In [39]:
df_s_empirical = pd.DataFrame({'Individual ID' : list(f["samples"])})

# Load and Merge Individual Meta Data

In [63]:
df_i = pd.read_csv(ind_path, sep="\t")
print(f"Loaded {np.shape(df_i)[0]} Individuals")

df_pops = pd.read_csv(pop_path, sep="\t")
print(f"Loaded {np.shape(df_i)[0]} Population Data")

### Merge with IDs in Genotype File
df = pd.merge(df_s_empirical, df_i, on='Individual ID', how='inner')
df = pd.merge(df, df_pops, left_on="Individual ID", right_on="sample", how="inner")
print(f"Merged from {len(df_s_empirical)} to {len(df)} individuals")
assert(len(df_s_empirical) == len(df))  # Sanity Check

Loaded 3691 Individuals
Loaded 3691 Population Data
Merged from 2504 to 2504 individuals


In [72]:
df["Population"].value_counts()

GWD    113
YRI    108
TSI    107
IBS    107
CHS    105
PUR    104
JPT    104
CHB    103
GIH    103
STU    102
ITU    102
KHV     99
ESN     99
CEU     99
FIN     99
LWK     99
PJL     96
ACB     96
CLM     94
CDX     93
GBR     91
BEB     86
MSL     85
PEL     85
MXL     64
ASW     61
Name: Population, dtype: int64

In [77]:
eur_inds = df["super_pop"]=="EUR"
print(f"Nr of European Samples: {np.sum(eur_inds)}")

Nr of European Samples: 503


In [68]:
df[df["Paternal ID"] != "0"]

Unnamed: 0,Individual ID,Family ID,Paternal ID,Maternal ID,Gender,Phenotype,Population,Relationship,Siblings,Second Order,...,phase 3 genotypes,related genotypes,omni genotypes,affy_genotypes,sample,pop,super_pop,gender,Unnamed: 4,Unnamed: 5
311,HG00867,CDX1,HG00866,0,2,0,CDX,unrel,0,0,...,1,0,1,1,HG00867,CDX,EAS,female,,
1301,HG03451,SL50,HG03466,0,1,0,MSL,father,HG03468,0,...,1,0,0,1,HG03451,MSL,AFR,male,,
1663,NA07048,1341,NA07034,NA07055,1,0,CEU,father,0,0,...,1,0,1,1,NA07048,CEU,EUR,male,,
1668,NA10847,1334,NA12146,NA12239,2,0,CEU,mother,0,0,...,1,0,1,1,NA10847,CEU,EUR,female,,
1669,NA10851,1344,NA12056,NA12057,1,0,CEU,father,0,0,...,1,0,1,1,NA10851,CEU,EUR,male,,
1752,NA12878,1463,NA12891,NA12892,2,0,CEU,mother; child,0,0,...,1,0,1,1,NA12878,CEU,EUR,female,,
2044,NA19129,Y077,NA19128,NA19127,2,0,YRI,child,0,0,...,1,0,1,1,NA19129,YRI,AFR,female,,
2148,NA19445,LWK004,NA19453,0,2,0,LWK,unrel,0,0,...,1,0,1,1,NA19445,LWK,AFR,female,,
2452,NA20910,GIH003,NA20909,0,2,0,GIH,unrel,0,0,...,1,0,1,1,NA20910,GIH,SAS,female,,


In [80]:
df[eur_inds]["pop"].value_counts()

IBS    107
TSI    107
CEU     99
FIN     99
GBR     91
Name: pop, dtype: int64

# Check against Sardinian X data

In [81]:
path_snp = "../../ancient-sardinia/data/bed/full230.snp" # All SNPs found in the ancients

df_snp = pd.read_csv(path_snp, header=None, sep=r"\s*", engine="python")
df_snp.columns = ["SNP", "chr", "map", "pos", "ref", "alt"]
df_snp = df_snp[df_snp["chr"]==23]

print(f"Loaded {len(df_snp)} X SNPs.")

  yield pat.split(line.strip())
  yield pat.split(line.strip())


Loaded 49711 X SNPs.


In [83]:
df_snp["pos"]

1151240       990180
1151241      1501471
1151242      2700157
1151243      2709331
1151244      2710887
1151245      2711961
1151246      2713211
1151247      2719111
1151248      2729625
1151249      2732096
1151250      2732166
1151251      2743286
1151252      2743627
1151253      2746489
1151254      2746835
1151255      2759615
1151256      2765370
1151257      2765925
1151258      2767020
1151259      2767269
1151260      2767366
1151261      2774700
1151262      2775998
1151263      2777107
1151264      2777560
1151265      2779345
1151266      2783126
1151267      2783555
1151268      2785740
1151269      2786596
             ...    
1200921    155102468
1200922    155105552
1200923    155107315
1200924    155111005
1200925    155111702
1200926    155113242
1200927    155114213
1200928    155119648
1200929    155121526
1200930    155121826
1200931    155123668
1200932    155148889
1200933    155159480
1200934    155168139
1200935    155175965
1200936    155189733
1200937    15

In [86]:
pos = np.array(f["variants/POS"])

In [90]:
snps_found = df_snp["pos"].isin(pos)
print(f"Found {np.sum(snps_found)} out of {len(snps_found)} SNPs")

Found 48013 out of 49711 SNPs


In [91]:
snps_found

1151240     True
1151241     True
1151242     True
1151243     True
1151244     True
1151245     True
1151246     True
1151247     True
1151248    False
1151249     True
1151250     True
1151251     True
1151252     True
1151253     True
1151254     True
1151255     True
1151256     True
1151257     True
1151258     True
1151259     True
1151260     True
1151261     True
1151262     True
1151263     True
1151264     True
1151265    False
1151266     True
1151267     True
1151268     True
1151269     True
           ...  
1200921     True
1200922    False
1200923     True
1200924     True
1200925     True
1200926     True
1200927     True
1200928    False
1200929     True
1200930     True
1200931     True
1200932     True
1200933     True
1200934    False
1200935     True
1200936     True
1200937     True
1200938     True
1200939    False
1200940     True
1200941    False
1200942     True
1200943     True
1200944     True
1200945     True
1200946     True
1200947     True
1200948    Fal