# Map genotype ids to phenotype ids
The individual IDs are made up of different portions (e.g. AS00-00347_8002022294_HHG10078_12_H06). To map these genotype IDs to the phenotype IDs is not straight forward. In particular, some of the IDs we might be able to map to the phenotype file by the cell_line portion (e.g. HHG10078) while other times we might have to try to map to the phenotype file using the serum ID (e.g. AS00-00347). The follow script does this searching for us. 

## Proportion of HA subjects with HIV
We need to know what the proportion of HA subjects have HIV when we consider the subjects classified as HA during the STRUCTURE analysis when the standad 25% threshold was used and also when the 8% threshold was used.

We will need a list of those subjects. We created these two lists and copied them to our local machine.

In [None]:
cut -f2 ha_filtered.0.25 | xargs -I{} grep {} genotype.to.phenotype.map2 > genotype.to.phenotype.ha.0.25 &
cut -f2 ha_filtered.0.08 | xargs -I{} grep {} genotype.to.phenotype.map2 > genotype.to.phenotype.ha.0.08 &

# as function

In [4]:
### python ###
import itertools, os

#os.chdir("/Users/jmarks/OneDrive - Research Triangle Institute/Projects/heroin/ngc/uhs4/phenotype")
#print(os.getcwd())

base_dir = "/Users/jmarks/OneDrive - Research Triangle Institute/Projects/heroin/ngc/uhs4/phenotype"
date = "20190320"
ha_percent = "8"
match_list = ["viralload_cperml.y", "viralload_cperml.x", "hiv_status", "gwashiv", "hivstat", "hiv"]
for match_var in match_list:
    
    gen = "{}/processing/master.genotype.ids.n3469".format(base_dir)
    phen = "{}/unprocessed/hiv_all_merged_with_uhs_all_phenotype_data_08282017.csv".format(base_dir)
    out_file = "{}/processing/{}.genotype.to.phenotype.ancestry.{}.ha_{}percent.map".format(base_dir,  date, match_var, ha_percent)
    ha_ids = "ha.ids.{}".format(ha_percent)

    def glance(d):
        return dict(itertools.islice(d.items(), 3))

    def map_fun(gen, phen, match_var):
        with open(gen) as asF, open(phen) as pF:
            phead = pF.readline().split(",")
            serum_index = phead.index("serum")
            cell_line_index = phead.index("cell_line")
            gwas_index = phead.index("gwasserum")
            ancestry_index = phead.index("ancestry_selfreport")
            hiv_index = phead.index(match_var)

            cell_dic = {}
            serum_dic = {}
            gwas_dic = {}
            line = pF.readline()
            while line:
                sl = line.split(",")
                cell_dic[sl[cell_line_index]] = (phead[cell_line_index], sl[ancestry_index], sl[hiv_index])
                serum_dic[sl[serum_index]] = (phead[serum_index], sl[ancestry_index], sl[hiv_index])
                gwas_dic[sl[gwas_index]] = (phead[gwas_index], sl[ancestry_index], sl[hiv_index])
                line = pF.readline()
            print(glance(cell_dic))
            print(glance(gwas_dic))
        #
            keep_list = []
            sline = asF.readline()
            while sline:
                spl = sline.split()
                if spl[1] in cell_dic:
                    tmptup = (spl[2], cell_dic[spl[1]])
                    keep_list.append(tmptup)
                elif spl[0] in serum_dic:
                    tmptup = (spl[2], serum_dic[spl[0]])
                    keep_list.append(tmptup)
                elif spl[0] in gwas_dic:
                    tmptup = (spl[2], gwas_dic[spl[0]])
                    keep_list.append(tmptup)
                else:
                    print(spl[2])
                sline = asF.readline()


        print(len(keep_list))
        mytup = keep_list[1]
        mytup = (mytup[0],) + mytup[1]

        mapped_ids = [(x[0],) + x[1] for x in keep_list]
        print(mapped_ids[:5])


        out_head = "{}\t{}\t{}\t{}".format("genotype_id", "phenotype_column", "ancestry_selfreport", match_var)
        with open(out_file, 'w') as outF:
            outF.write(out_head + "\n")
            for x in mapped_ids:
                line = "\t".join(str(i) for i in x)
                outF.write(line + "\n")
            print("done")
    map_fun(gen, phen, match_var)
    
#    def ha_filter(ha_ids, map_file):
#        out_file2 = "{}.ha_only".format(map_file)
#        with open(ha_ids) as inF, open(map_file) as mF, open(out_file2, "w") as outF:
#            head = mF.readline()
#            outF.write(head)
#            data_dic = {}
#            line = mF.readline()
#            while line:
#                sl = line.split()
#                data_dic[sl[0]] = line
#                line = mF.readline()
#
#            line = inF.readline()
#            while line:
#                sl = line.strip()
#                outF.write(data_dic[sl])
#                line = inF.readline()
#
#    ha_filter(ha_ids, out_file)

ValueError: 'serum' is not in list

## overlap

In [67]:
def ha_filter(ha_ids, map_file):
    out_file2 = "{}.ha_only".format(map_file)
    with open(ha_ids) as inF, open(map_file) as mF, open(out_file2, "w") as outF:
        head = mF.readline()
        outF.write(head)
        data_dic = {}
        line = mF.readline()
        while line:
            sl = line.split()
            data_dic[sl[0]] = line
            line = mF.readline()

        line = inF.readline()
        while line:
            sl = line.strip()
            outF.write(data_dic[sl])
            line = inF.readline()
        
ha_filter(ha_ids, out_file)

'/Users/jmarks/OneDrive - Research Triangle Institute/Projects/heroin/ngc/uhs4/phenotype/ha_data/20190313.genotype.to.phenotype.ancestry.viralload_cperml.x.ha_8percent.map'

## Variable summary
### 25%

In [None]:
# number of HA classified subjects being HIV cases using 25% threshold for the ancestry cutoff
awk '$4==1' 20190313.genotype.to.phenotype.ancestry.hiv.ha_25percent.map.ha_only| ww
"""23"""

# number of HA classified subjects being HIV cases using 25% threshold for the ancestry cutoff
awk '$4==1' 20190313.genotype.to.phenotype.ancestry.hivstat.ha_25percent.map.ha_only| ww
"""23"""

# number of HA classified subjects being HIV cases using 25% threshold for the ancestry cutoff
awk '$4==1' 20190313.genotype.to.phenotype.ancestry.hiv_status.ha_25percent.map.ha_only| ww
"""23"""

# number of HA classified subjects being HIV cases using 25% threshold for the ancestry cutoff
awk '$4==1' 20190313.genotype.to.phenotype.ancestry.gwashiv.ha_25percent.map.ha_only| ww
"""23"""

# number of HA classified subjects with viral load using 25% threshold for the ancestry cutoff
awk '$4!=-9' 20190313.genotype.to.phenotype.ancestry.viralload_cperml.y.ha_25percent.map.ha_only| ww
"""23"""

# number of HA classified subjects with viral load using 25% threshold for the ancestry cutoff
awk '$4!~"NA"' 20190313.genotype.to.phenotype.ancestry.viralload_cperml.x.ha_25percent.map.ha_only | ww
"""23"""

### 8%

In [None]:
# number of HA classified subjects being HIV cases using 8% threshold for the ancestry cutoff
awk '$4==1' 20190313.genotype.to.phenotype.ancestry.hiv.ha_8percent.map.ha_only| ww
#"""71"""

# number of HA classified subjects being HIV cases using 8% threshold for the ancestry cutoff
awk '$4==1' 20190313.genotype.to.phenotype.ancestry.hivstat.ha_8percent.map.ha_only| ww
#"""71"""

# number of HA classified subjects being HIV cases using 8% threshold for the ancestry cutoff
awk '$4==1' 20190313.genotype.to.phenotype.ancestry.hiv_status.ha_8percent.map.ha_only| ww
#"""71"""

# number of HA classified subjects being HIV cases using 8% threshold for the ancestry cutoff
awk '$4~1' 20190313.genotype.to.phenotype.ancestry.gwashiv.ha_8percent.map.ha_only| ww
#"""0"""

# number of HA classified subjects with viral load using 8% threshold for the ancestry cutoff
awk '$4!=-9' 20190313.genotype.to.phenotype.ancestry.viralload_cperml.y.ha_8percent.map.ha_only| ww
#"""66"""

# number of HA classified subjects with viral load using 8% threshold for the ancestry cutoff
awk '$4!~"NA"' 20190313.genotype.to.phenotype.ancestry.viralload_cperml.x.ha_8percent.map.ha_only | ww
#"""70"""

# Sandbox

In [115]:
### python ###
import itertools, os
"""
This function will parse the master phenotype file and map the genotype IDs to the
corresponding phenotype IDs. This was developed because there is no straight forward 
way to map the gentype IDs to the phenotype file. 
"""


base_dir = "/Users/jmarks/OneDrive - Research Triangle Institute/Projects/heroin/ngc/uhs4/phenotype"
date = "20190321"
ha_percent = "8"
match_var = '"viralload_log10.y"' # phenotype variable of interest in the master phenotype file
#match_list = ["viralload_cperml.y", "viralload_cperml.x", "hiv_status", "gwashiv", "hivstat", "hiv"]
#for match_var in match_list:

# create the header for the output file
out_head = "{}\t{}\t{}\t{}\t{}\t{}".format("genotype_id", "phenotype_column", "ancestry_selfreport",
                                           match_var.strip('\"'), "age", "sex_selfreport")
gen = "{}/processing/master.genotype.ids.n3469".format(base_dir)
phen = "{}/unprocessed/hiv_all_merged_with_uhs_all_phenotype_data_08282017.csv".format(base_dir)
out_file = "{}/processing/{}.genotype.to.phenotype.ancestry.{}.ha_{}percent.map".format(base_dir,  
                                                                                        date, match_var.strip('\"'), ha_percent)
ha_ids = "ha.ids.{}".format(ha_percent)

# view head of dictionaries
def glance(d, size):
    return dict(itertools.islice(d.items(), size))
def gen_to_phen(match_var):
    with open(gen) as asF, open(phen) as pF:
        phead = pF.readline()
        phead = phead.split(",")
        
        # note that all the cells in the phenotype file have quotes around the entries
        # which is why we have to use double quotes for the following vars of interest
        serum_index = phead.index('"serum"')
        cell_line_index = phead.index('"cell_line"')
        gwas_index = phead.index('"gwasserum"')
        ancestry_index = phead.index('"ancestry_selfreport"')
        age = phead.index('"age"')
        sex = phead.index('"sex_selfreport"')
        hiv_index = phead.index(match_var) 

        # initialized dictionaries that capture the variables-of-interest information 
        # for all subjects in phenotype file
        cell_dic = {}
        serum_dic = {}
        gwas_dic = {}
        
        line = pF.readline() 
        while line: # parse each line of the master phenotype file
            sl = line.split(",")
            
            # creating three mapping dictionaries because we are ultimately not sure
            # which ID variable we will have to use to map the genotype ID to the corresponding
            # phenotype information. It is actually going to take a combination of all three.
            cell_dic[sl[cell_line_index]] = (phead[cell_line_index], sl[ancestry_index],
                                             sl[hiv_index], sl[age], sl[sex])
            serum_dic[sl[serum_index]] = (phead[serum_index], sl[ancestry_index], 
                                          sl[hiv_index], sl[age], sl[sex])
            gwas_dic[sl[gwas_index]] = (phead[gwas_index], sl[ancestry_index],
                                        sl[hiv_index], sl[age], sl[sex])
            line = pF.readline()
        # view dictionary 
        # print(glance(cell_dic, 3))

        keep_list = []
        head = asF.readline()
        sline = asF.readline()
        while sline:
            spl = sline.split()
            spl = [f'"{word}"' for word in spl]
            if spl[1] in cell_dic:
                tmptup = (spl[2], cell_dic[spl[1]])
                keep_list.append(tmptup)
            elif spl[0] in serum_dic:
                tmptup = (spl[2], serum_dic[spl[0]])
                keep_list.append(tmptup)
            elif spl[0] in gwas_dic:
                tmptup = (spl[2], gwas_dic[spl[0]])
                keep_list.append(tmptup)
            else:
                print(spl[2])
            sline = asF.readline()


        print(len(keep_list))
        mytup = keep_list[1]
        mytup = (mytup[0],) + mytup[1]

        mapped_ids = [(x[0],) + x[1] for x in keep_list]
        print(mapped_ids[:5])


        with open(out_file, 'w') as outF:
            outF.write(out_head + "\n")
            for x in mapped_ids:
                line = "\t".join(str(i).strip('\"') for i in x)
                outF.write(line + "\n")
            print("done")
            print(match_var)
gen_to_phen(match_var)

#    def ha_filter(ha_ids, map_file):
#        out_file2 = "{}.ha_only".format(map_file)
#        with open(ha_ids) as inF, open(map_file) as mF, open(out_file2, "w") as outF:
#            head = mF.readline()
#            outF.write(head)
#            data_dic = {}
#            line = mF.readline()
#            while line:
#                sl = line.split()
#                data_dic[sl[0]] = line
#                line = mF.readline()
#
#            line = inF.readline()
#            while line:
#                sl = line.strip()
#                outF.write(data_dic[sl])
#                line = inF.readline()
#
    #    ha_filter(ha_ids, out_file)

3469
[('"AS00-00347_8002022294_HHG10078_12_H06"', '"cell_line"', '"3"', '"-9"', '"49"', '"1"'), ('"AS00-00351_8002220319_HHG6146_36_C02"', '"cell_line"', '"2"', '"-9"', '"35"', '"2"'), ('"AS00-00437_8002220343_HHG6150_36_D02"', '"cell_line"', '"2"', '"-9"', '"41"', '"1"'), ('"AS00-00458_8002694957_HHG0612_1_D01"', '"cell_line"', '"2"', '"-9"', '"50"', '"1"'), ('"AS00-00459_8002220355_HHG6152_36_E02"', '"cell_line"', '"2"', '"-9"', '"52"', '"2"')]
done
"viralload_log10.y"


In [123]:
### python ###
import itertools, os
"""
This function will parse the master phenotype file and map the genotype IDs to the
corresponding phenotype IDs. This was developed because there is no straight forward 
way to map the gentype IDs to the phenotype file. 
"""


base_dir = "/Users/jmarks/OneDrive - Research Triangle Institute/Projects/heroin/ngc/uhs4/phenotype"
date = "20190320"
ha_percent = "8"
match_var = '"viralload_log10.y"' # phenotype variable of interest in the master phenotype file
#match_list = ["viralload_cperml.y", "viralload_cperml.x", "hiv_status", "gwashiv", "hivstat", "hiv"]
#for match_var in match_list:

# file which contains only the subject genotype IDs of the subjects that were classified
# as HA after the STRUCTURE analysis
ha_ids = "{}/processing/ha.ids.{}".format(base_dir, ha_percent) 

# create the header for the output file
out_head = "{}\t{}\t{}\t{}\t{}\t{}".format("genotype_id", "phenotype_column", "ancestry_selfreport",
                                           match_var.strip('\"'), "age", "sex_selfreport")
gen = "{}/processing/master.genotype.ids.n3469".format(base_dir)
phen = "{}/unprocessed/hiv_all_merged_with_uhs_all_phenotype_data_08282017.csv".format(base_dir)
out_file = "{}/processing/{}.genotype.to.phenotype.ancestry.{}.ha_{}percent.map".format(base_dir,  
                                                                                        date, match_var.strip('\"'), ha_percent)

# view head of dictionaries
def glance(d, size):
    return dict(itertools.islice(d.items(), size))
def gen_to_phen(match_var):
    with open(gen) as asF, open(phen) as pF:
        phead = pF.readline()
        phead = phead.split(",")
        
        # note that all the cells in the phenotype file have quotes around the entries
        # which is why we have to use double quotes for the following vars of interest
        serum_index = phead.index('"serum"')
        cell_line_index = phead.index('"cell_line"')
        gwas_index = phead.index('"gwasserum"')
        ancestry_index = phead.index('"ancestry_selfreport"')
        age = phead.index('"age"')
        sex = phead.index('"sex_selfreport"')
        hiv_index = phead.index(match_var) 

        # initialized dictionaries that capture the variables-of-interest information 
        # for all subjects in phenotype file
        cell_dic = {}
        serum_dic = {}
        gwas_dic = {}
        
        line = pF.readline() 
        while line: # parse each line of the master phenotype file
            sl = line.split(",")
            
            # creating three mapping dictionaries because we are ultimately not sure
            # which ID variable we will have to use to map the genotype ID to the corresponding
            # phenotype information. It is actually going to take a combination of all three.
            cell_dic[sl[cell_line_index]] = (phead[cell_line_index], sl[ancestry_index],
                                             sl[hiv_index], sl[age], sl[sex])
            serum_dic[sl[serum_index]] = (phead[serum_index], sl[ancestry_index], 
                                          sl[hiv_index], sl[age], sl[sex])
            gwas_dic[sl[gwas_index]] = (phead[gwas_index], sl[ancestry_index],
                                        sl[hiv_index], sl[age], sl[sex])
            line = pF.readline()
            # 
        print(glance(cell_dic, 3))

        keep_list = []
        next(asF) # skip header line
        sline = asF.readline()
        while sline:
            spl = sline.split()
            spl = [f'"{word}"' for word in spl]
            if spl[1] in cell_dic:
                tmptup = (spl[2], cell_dic[spl[1]])
                keep_list.append(tmptup)
            elif spl[0] in serum_dic:
                tmptup = (spl[2], serum_dic[spl[0]])
                keep_list.append(tmptup)
            elif spl[0] in gwas_dic:
                tmptup = (spl[2], gwas_dic[spl[0]])
                keep_list.append(tmptup)
            else:
                print(spl[2])
            sline = asF.readline()


        print(len(keep_list))
        mytup = keep_list[1]
        mytup = (mytup[0],) + mytup[1]

        mapped_ids = [(x[0],) + x[1] for x in keep_list]
        print(mapped_ids[:5])


        with open(out_file, 'w') as outF:
            outF.write(out_head + "\n")
            for x in mapped_ids:
                line = "\t".join(str(i).strip('\"') for i in x)
                outF.write(line + "\n")
            print("done")
gen_to_phen(match_var)


# filter the map file that was created above to only the subjects classified
# as HA after the STRUCTURE analysis, as well as .
def ha_filter(ha_ids, map_file):
    out_file2 = "{}.ha_only".format(map_file)
    with open(ha_ids) as inF, open(map_file) as mF, open(out_file2, "w") as outF:
        head = mF.readline()
        outF.write(head)
        data_dic = {}
        line = mF.readline()
        while line:
            sl = line.split()
            data_dic[sl[0]] = line
            line = mF.readline()

        line = inF.readline()
        while line:
            sl = line.strip()
            outF.write(data_dic[sl])
            line = inF.readline()

ha_filter(ha_ids, out_file)

{'"HHG4618"': ('"cell_line"', '"2"', '"-9"', '"43"', '"1"'), '"HHG6025"': ('"cell_line"', '"1"', '"-9"', '"36"', '"1"'), '"HHG0254"': ('"cell_line"', '"2"', '"-9"', '"55"', '"1"')}
3469
[('"AS00-00347_8002022294_HHG10078_12_H06"', '"cell_line"', '"3"', '"-9"', '"49"', '"1"'), ('"AS00-00351_8002220319_HHG6146_36_C02"', '"cell_line"', '"2"', '"-9"', '"35"', '"2"'), ('"AS00-00437_8002220343_HHG6150_36_D02"', '"cell_line"', '"2"', '"-9"', '"41"', '"1"'), ('"AS00-00458_8002694957_HHG0612_1_D01"', '"cell_line"', '"2"', '"-9"', '"50"', '"1"'), ('"AS00-00459_8002220355_HHG6152_36_E02"', '"cell_line"', '"2"', '"-9"', '"52"', '"2"')]
done


In [73]:
## BASH ##
awk '$4!="-9"' 20190320.genotype.to.phenotype.ancestry.viralload_log10.y.ha_8percent.map.ha_only >\
    20190320.genotype.to.phenotype.ancestry.viralload_log10.y.ha_8percent.map.ha_only.complete

'viralload_cperml.y'