# Template

In [None]:
# function parses input arguments
# required arguments: --metal
#                     --probabel
#                     --probabel_labels
#                     --out
def parseArguments(args):
    """Parse the command-line arguments of the metal/probabel merge script.

    Recognised flags (all required):
        --metal            input metal file name (full path)
        --probabel         one or more input probabel file names (full path)
        --probabel_labels  one label per probabel file
        --out              output merged file name (full path)

    Args:
        args: list of argument strings, e.g. ``sys.argv[1:]``.

    Returns:
        Tuple ``(metalFName, outFName, probabelFList, labelsList)``.

    Raises:
        SystemExit: on an unknown argument, a flag without its value, a
            missing required flag, or when the number of probabel files
            differs from the number of labels.
    """
    # None marks "flag not seen yet"; replaces the former locals() probing.
    metalFName = None
    outFName = None
    probabelFList = None
    labelsList = None

    while len(args) > 0:
        flag = args[0]

        # flags that take exactly one value
        if flag == '--metal' or flag == '--out':
            if len(args) < 2:
                # previously an IndexError escaped here
                sys.exit('Missing value for argument: ' + flag)
            if flag == '--metal':
                metalFName = args[1]
            else:
                outFName = args[1]
            args = args[2:]

        # flags that take every following token up to the next '--' flag
        elif flag == '--probabel' or flag == '--probabel_labels':
            args = args[1:]
            collected = []
            while len(args) > 0 and args[0][:2] != '--':
                collected.append(args[0])
                args = args[1:]
            if flag == '--probabel':
                probabelFList = collected
            else:
                labelsList = collected

        # other arguments unused
        else:
            sys.exit('Unused arguments: ' + ' '.join(args))

    # check if missing required arguments
    if (metalFName is None or probabelFList is None
            or labelsList is None or outFName is None):
        sys.exit('Missing at least one required argument. Please specify --metal, --probabel, --probabel_labels, --out')

    # check if length of probabel files equals length of probabel labels
    if len(probabelFList) != len(labelsList):
        sys.exit('Length of probabel files does not match length of probabel labels.')

    return (metalFName, outFName, probabelFList, labelsList)


# parse_args function
Note: the `--a2_name` argument must always be given; pass `--a2_name none` if the input data has no A2 column.


In [2]:
import sys, os

def parse_args(args):
    """Parse the command-line arguments of the GWAS splitting script.

    Every flag takes exactly one value and every flag is required:
        --in_file         input gwas file name (full path)
        --study_name      name of study (e.g. s4s, ftc, gain, adaa, etc)
        --chr_name        name of header column for chromosome
        --a1_name         name of header column for allele1
        --a2_name         name of header column for allele2; pass ``none``
                          when the input data has no a2 column
        --position_name   name of header column for position (BP)
        --ancestry        ancestry label
        --processing_dir  path to processing directory

    Args:
        args: list of argument strings, e.g. ``sys.argv[1:]``.

    Returns:
        Tuple ``(in_name, study, chr_name, a1_name, a2_name, pos_name,
        proc_dir, ancestry)``.

    Raises:
        SystemExit: on an unknown argument, a flag without its value, or a
            missing required flag.
    """
    known_flags = ('--in_file', '--study_name', '--chr_name', '--a1_name',
                   '--a2_name', '--position_name', '--ancestry',
                   '--processing_dir')
    values = {}

    while len(args) > 0:
        flag = args[0]

        # other arguments unused
        if flag not in known_flags:
            sys.exit('Unused arguments: ' + ' '.join(args))
        if len(args) < 2:
            sys.exit('Missing value for argument: ' + flag)

        values[flag] = args[1]
        args = args[2:]

    # check if missing required arguments (--a2_name is reported separately
    # below because it has its own, more specific hint)
    required = [f for f in known_flags if f != '--a2_name']
    if any(f not in values for f in required):
        sys.exit('Missing at least one required argument. Please specify '
                 + ', '.join(required))
    if '--a2_name' not in values:
        sys.exit('Missing the argument <--a2_name>. If input data has no '
                 'a2 column then specify <--a2_name none>')

    return (values['--in_file'], values['--study_name'],
            values['--chr_name'], values['--a1_name'], values['--a2_name'],
            values['--position_name'], values['--processing_dir'],
            values['--ancestry'])

#in_name, study, chr_name, a1_name, a2_name, pos_name, processing_dir, ancestry = parse_args(sys.argv[1:]) 

In [None]:
# Example command line; the flags are the ones handled by parse_args.
# NOTE(review): presumably meth1.py is the script that calls parse_args -- confirm.
python meth1.py \
    --in_file ~/sand/SAS_final_ftnd.summary.gz \
    --study_name s4s \
    --ancestry sas \
    --chr_name CHR \
    --a1_name A1 \
    --a2_name none \
    --position_name BP \
    --processing_dir ~/sand

In [None]:
def split_chr(in_name, study, chr_name, a1_name, a2_name, pos_name, processing_dir, ancestry):
    """Split a whitespace-delimited results file into one file per chromosome.

    Each output row is the input row prefixed with a ``Marker`` field of the
    form ``CHR:POSITION``. When ``a2_name`` is ``'none'`` an ``A2`` column is
    added right after the A1 column, filled with the placeholder ``'.'``.

    Output files are opened in append mode at
    ``<processing_dir>/<ancestry>/processing/chr<CHR>/
    <study>.<ancestry>.1000G.chr<CHR>.CAT_FTND~1df_add.out.txt``;
    the directory is not created here and must already exist. The header is
    written the first time a chromosome is encountered during this call.

    Args:
        in_name: path of the input file; its first line is the header.
        study: study name used in the output file name.
        chr_name: header name of the chromosome column.
        a1_name: header name of the allele1 column.
        a2_name: header name of the allele2 column, or ``'none'``.
        pos_name: header name of the position column.
        processing_dir: base directory for the per-chromosome output.
        ancestry: ancestry label used in the output path and file name.

    Raises:
        ValueError: if one of the named columns is not in the header.
    """
    import os  # local import so the block does not depend on another cell

    with open(in_name, 'r') as inF:
        header = inF.readline().split()

        # Look the columns up on the *raw* header so that the indices line up
        # with the data rows (which never contain the Marker field).
        chr_index = header.index(chr_name)
        pos_index = header.index(pos_name)
        a1_index = header.index(a1_name)

        # if there is no A2 column then insert one after a1
        had_a2 = a2_name != 'none'
        if not had_a2:
            a2_index = a1_index + 1
            header.insert(a2_index, 'A2')

        # the 'Marker' column (CHR:POSITION) goes in front of everything
        header.insert(0, 'Marker')
        header_line = "\t".join(header) + "\n"

        last_chr = None
        seen_chrs = set()  # chromosomes whose header was already written
        outF = None
        try:
            for line in inF:
                split_line = line.split()
                if not split_line:
                    continue  # ignore blank lines

                # Read CHR/POS before the placeholder insert shifts columns.
                current_chr = split_line[chr_index]
                position = split_line[pos_index]

                if not had_a2:
                    split_line.insert(a2_index, '.')  # insert place holder

                if current_chr != last_chr:  # switch to this chr's file
                    print('Processing chr{}'.format(current_chr))
                    if outF is not None:
                        outF.close()

                    fname = '{}.{}.1000G.chr{}.CAT_FTND~1df_add.out.txt'.format(
                        study, ancestry, current_chr)
                    out_path = os.path.join(processing_dir, ancestry,
                                            'processing',
                                            'chr{}'.format(current_chr),
                                            fname)
                    outF = open(out_path, 'a')

                    # first time this chr has been encountered: make a header
                    if current_chr not in seen_chrs:
                        outF.write(header_line)
                        seen_chrs.add(current_chr)
                    last_chr = current_chr

                # Markername = CHR:POSITION in first field
                outF.write("{}:{}\t{}\n".format(current_chr, position,
                                                "\t".join(split_line)))
        finally:
            if outF is not None:
                outF.close()

# Working version

In [None]:
def kp3_dict(chrom, base_dir="/shared/data/ref_panels/1000G/2014.10"):
    """
    Create a dictionary for the 1000G_p3 snps of one chromosome.

    Reads ``<base_dir>/1000GP_Phase3_chr<chrom>.legend.gz``, skips its first
    (header) line, and splits the first whitespace-delimited field of every
    remaining line on ``:``.

    Args:
        chrom: chromosome identifier substituted into the file name.
        base_dir: directory holding the legend files.

    Returns:
        Dict keyed by the first ``:``-separated part of the id (the rsID),
        with value ``[rs_id, position, a1, a2]`` (the first four parts).
        A later line with the same key replaces an earlier one.

    Raises:
        IndexError: if an id has fewer than four ``:``-separated parts.
    """
    import gzip

    in_file = "{}/1000GP_Phase3_chr{}.legend.gz".format(base_dir, chrom)
    thousand_dict = {}
    with gzip.open(in_file, "r") as inF:
        next(inF, None)  # skip the header line (tolerates an empty file)
        for line in inF:
            # gzip yields bytes on Python 3; decode so str operations work.
            if not isinstance(line, str):
                line = line.decode("utf-8")
            fields = line.split()
            if not fields:
                continue  # ignore blank lines
            uniq_id = fields[0].split(":")
            rs_id = uniq_id[0]
            position = uniq_id[1]
            a1 = uniq_id[2]
            a2 = uniq_id[3]
            thousand_dict[rs_id] = [rs_id, position, a1, a2]
    message = "Done creating dictionary for chr{} 1000G_p3 reference panel.".format(chrom)
    print(message)
    return thousand_dict

# NOTE(review): this loads the chr15 dictionary at module level, while
# convert_snp() builds its own dictionary for the chromosome it is given.
thou_dict = kp3_dict(15)

import sys

base_dir = "/home/ec2-user/jmarks/nicotine/spit_science/processed_results/002"
#ancestry_list = ["aa", "amr", "ea", "eas", "sas"]
ancestry_list = ["sas"]
study = "sfs"

import sys
# print() call instead of the Python 2 print statement, which is a syntax
# error on Python 3; the formatted output is the same as before.
print("{} {}".format("This is the name of the script: ", sys.argv[0]))

def convert_snp(base_dir, ancestry, study, chrom):
    """Convert snps to 1000G phase 3 format.

    Reads
    ``<base_dir>/<ancestry>/processing/chr<chrom>/
    <study>.<ancestry>.1000G.chr<chrom>.CAT_FTND~1df_add.out.txt``
    and writes ``...CAT_FTND~1df.phase3ID_add.out.txt`` in the same
    directory. The header line is copied unchanged and must contain the
    columns ``SNP``, ``A1``, ``A2`` and ``Marker``.

    Per data row:
      * SNP id with four or more ``:``-separated parts: the id is copied to
        ``Marker`` and its fourth part to ``A2``.
      * SNP id that is a bare ``rs...`` name: it is looked up in the
        dictionary returned by ``kp3_dict(chrom)``. If the row's A1 equals
        one of the two reference alleles, ``A2`` is set to the other one and
        ``Marker`` to the ``:``-joined reference entry.
      * Anything else (unknown rsID, allele mismatch, other id shapes) is
        dropped from the output.

    Args:
        base_dir: base results directory.
        ancestry: ancestry label used in the path and file names.
        study: study name used in the file names.
        chrom: chromosome identifier.
    """
    thou_dict = kp3_dict(chrom)  # reference panel dictionary of SNPs

    file_dir = "{}/{}/processing/chr{}/".format(base_dir, ancestry, chrom)
    in_name = "{}.{}.1000G.chr{}.CAT_FTND~1df_add.out.txt".format(study, ancestry, chrom)
    out_name = "{}.{}.1000G.chr{}.CAT_FTND~1df.phase3ID_add.out.txt".format(study, ancestry, chrom)

    with open(file_dir + in_name, "r") as inF:
        with open(file_dir + out_name, "w") as outF:
            header = inF.readline()
            outF.write(header)

            columns = header.split()
            snp_index = columns.index("SNP")
            a1_index = columns.index("A1")
            a2_index = columns.index("A2")
            phase3_index = columns.index("Marker")  # updated SNP id

            # Iterating the file guarantees progress on every path; the
            # former while/readline loop never advanced after `continue`.
            for line in inF:
                split_line = line.split()
                if not split_line:
                    continue  # ignore blank lines

                snp = split_line[snp_index]
                snp_split = snp.split(":")
                a1 = split_line[a1_index]
                snp_id = snp_split[0]

                if len(snp_split) >= 4:  # indicates already in 1000G_p3 format
                    split_line[phase3_index] = snp
                    split_line[a2_index] = snp_split[3]  # fill in A2 data

                # if not already in 1000G_p3 format, search for it in the
                # reference panel dictionary
                elif len(snp_split) == 1 and snp_id[:2] == "rs":
                    ref_entry = thou_dict.get(snp_id)
                    if ref_entry is None:  # rsID not in ref panel dictionary
                        continue
                    potential_a1 = ref_entry[2]
                    potential_a2 = ref_entry[3]

                    if a1 == potential_a1:
                        split_line[a2_index] = potential_a2
                    elif a1 == potential_a2:
                        split_line[a2_index] = potential_a1
                    else:
                        continue  # A1 matches neither reference allele
                    split_line[phase3_index] = ":".join(ref_entry)

                else:
                    continue  # id shape not handled; row is dropped

                outF.write("\t".join(split_line) + "\n")
    print("Done")
    
base_dir = "/home/ec2-user/jmarks/nicotine/spit_science/processed_results/002"
ancestry = "sas"
study = "sfs"
chrom = 15

# Echo the script name and up to three command-line arguments. Slicing
# avoids the IndexError that indexing sys.argv[1..3] raised when fewer
# arguments were supplied; the values are not used by the call below.
for arg in sys.argv[:4]:
    print(arg)
convert_snp(base_dir, ancestry, study, chrom)