In [None]:
## python ##
"""
processing.genotype.files.py

This script will process the mach.dose genotype files.
In particular, it will remove any subjects that are not
in the phenotype file. It will output the new filtered mach.dose
file as well as a file that contains the order of the subject
IDs in the genotype files. We will then use this information to
reorder the phenotype file.

INPUT: chrom, baseD, keep_ids
- the chromosome to process 
- base directory
- path to phenotype ids list

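OUTPUT: outfile, out_order
- filtered (pruned) genotype file, subjects kept in their original order
- list of the kept subject IDs, in the order they appear in the genotype file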
"""
import gzip, sys, os

#os.chdir("/shared/jmarks/hiv/uhs1234/acquisition_gwas/genotype/imputed/aa/mach")

def prune_dose_file(dose_path, keep_path, pruned_path, order_path):
    """Filter a gzipped mach.dose file down to the subjects in a keep-list.

    The first whitespace-delimited field of each dose line is expected to
    look like ``<something>-><something>``; the text before the first
    ``->`` is treated as the subject ID. Lines whose ID appears in the
    keep-list are written to ``pruned_path`` with that first field
    rewritten to ``<n>-><ID>``, where ``n`` is a 1-based running index over
    the kept lines. The kept IDs are written, one per line and in the same
    order, to ``order_path``.

    Args:
        dose_path: path to the gzip-compressed dose file to read.
        keep_path: path to a text file with one subject ID per line.
        pruned_path: path of the filtered dose file to write.
        order_path: path of the kept-ID order file to write.

    Returns:
        The number of dose lines that were kept.
    """
    # Build the lookup set once; blank lines in the ID list are ignored.
    with open(keep_path) as keep_fh:
        keep_set = set()
        for raw_id in keep_fh:
            subject = raw_id.strip()
            if subject:
                keep_set.add(subject)

    kept = 0
    # "rt" is required: gzip.open defaults to binary mode, and the str
    # operations below ("->" split, " ".join) fail on bytes in Python 3.
    with gzip.open(dose_path, "rt") as dose_fh, \
            open(pruned_path, "w") as pruned_fh, \
            open(order_path, "w") as order_fh:
        for line in dose_fh:
            # Only peel off the first field for the membership test so that
            # dropped subjects never pay for tokenising the whole line.
            head = line.split(None, 1)
            if not head:
                continue  # blank line: nothing to index
            gen_id = head[0].split("->")[0]
            if gen_id not in keep_set:
                continue

            kept += 1
            fields = line.split()
            fields[0] = "{}->{}".format(kept, gen_id)
            pruned_fh.write(" ".join(fields) + "\n")
            order_fh.write(gen_id + "\n")

    return kept


def main(argv):
    """Command-line entry point: ``script <chrom> <baseD> <keep_ids>``.

    Reads ``<baseD>/chr<chrom>.mach.dose.gz`` and writes
    ``<baseD>/chr<chrom>.mach.dose.pruned`` and
    ``<baseD>/chr<chrom>.genotype.id.order``.
    """
    if len(argv) < 4:
        sys.exit("usage: {0} <chrom> <baseD> <keep_ids>".format(argv[0]))

    chrom = argv[1]
    baseD = argv[2]
    # List of ids extracted from the phenotype file, e.g.
    # /shared/jmarks/hiv/uhs1234/acquisition_gwas/phenotype/final/phenotype_ids_aa
    keep_ids = argv[3]

    myfile = "{0}/chr{1}.mach.dose.gz".format(baseD, chrom)
    outfile = "{0}/chr{1}.mach.dose.pruned".format(baseD, chrom)
    out_order = "{0}/chr{1}.genotype.id.order".format(baseD, chrom)

    print("Processing chr{}.".format(chrom))
    prune_dose_file(myfile, keep_ids, outfile, out_order)
    print("chr{0} all done!".format(chrom))


if __name__ == "__main__":
    main(sys.argv)
