# Template

# parse_args function

In [2]:
import sys, os

def parse_args(args):
    """Parse the command-line flags of the chromosome-splitting script.

    Flags are given as ``--flag value`` pairs, in any order:

        --in_file         input GWAS file name (full path)
        --study_name      name of study (e.g. s4s, ftc, gain, adaa, etc)
        --ancestry        ancestry label
        --chr_name        name of header column for chromosome
        --a1_name         name of header column for allele1
        --a2_name         name of header column for allele2, or ``none``
                          if the input data has no a2 column
        --position_name   name of header column for position (BP)
        --processing_dir  path to processing directory

    Args:
        args: list of argument strings, typically ``sys.argv[1:]``.

    Returns:
        Tuple ``(in_name, study, chr_name, a1_name, a2_name, pos_name,
        proc_dir, ancestry)``.

    Raises:
        SystemExit: on an unrecognized argument, a flag without a value,
            or a missing required flag.
    """
    # Maps each accepted flag to the key its value is stored under.
    flag_to_key = {
        '--in_file': 'in_name',
        '--study_name': 'study',
        '--ancestry': 'ancestry',
        '--chr_name': 'chr_name',
        '--a1_name': 'a1_name',
        '--a2_name': 'a2_name',
        '--position_name': 'pos_name',
        '--processing_dir': 'proc_dir',
    }
    parsed = {}

    while len(args) > 0:
        flag = args[0]
        # other arguments unused
        if flag not in flag_to_key:
            sys.exit('Unused arguments: ' + ' '.join(args))
        # A flag at the very end of the list has no value to consume.
        if len(args) < 2:
            sys.exit('Missing value for argument: ' + flag)
        parsed[flag_to_key[flag]] = args[1]
        args = args[2:]

    # check if missing required arguments; `study` is part of the return
    # value, so it has to be present as well.
    required = ('in_name', 'study', 'chr_name', 'a1_name', 'pos_name',
                'proc_dir', 'ancestry')
    if any(key not in parsed for key in required):
        sys.exit('Missing at least one required argument. Please specify '
                 '--in_file, --study_name, --chr_name, --a1_name, '
                 '--position_name, --ancestry, --processing_dir')
    # a2 gets its own message because `none` is an accepted value.
    if 'a2_name' not in parsed:
        sys.exit('Missing the argument <--a2_name>. If input data has no '
                 'a2 column then specify <--a2_name none>')

    return (parsed['in_name'], parsed['study'], parsed['chr_name'],
            parsed['a1_name'], parsed['a2_name'], parsed['pos_name'],
            parsed['proc_dir'], parsed['ancestry'])

#in_name, study, chr_name, a1_name, a2_name, pos_name, processing_dir, ancestry = parse_args(sys.argv[1:]) 

In [None]:
# Run the chromosome-splitting script on the SAS summary statistics.
methods_dir=/home/ec2-user/sand

# Flags handed to meth1.py; the ~ paths are left unquoted on purpose so the
# shell still performs tilde expansion on them.
meth1_args=(
    --in_file ~/sand/SAS_final_ftnd.summary.gz
    --study_name s4s
    --ancestry sas
    --chr_name CHR
    --a1_name A1
    --a2_name none
    --position_name BP
    --processing_dir ~/sand
)

python "${methods_dir}/meth1.py" "${meth1_args[@]}"

In [None]:
def split_chr(in_name, study, chr_name, a1_name, a2_name, pos_name, processing_dir, ancestry):
    """Split a whitespace-delimited GWAS table into one file per chromosome.

    Every output row gets a leading ``Marker`` field of the form
    ``CHR:POSITION``. If the input has no allele2 column
    (``a2_name == 'none'``), an ``A2`` column holding the placeholder ``.``
    is inserted directly after the allele1 column.

    Output files are written tab-delimited to
    ``<processing_dir>/<ancestry>/processing/chr<CHR>/`` and are named
    ``<study>.<ancestry>.1000G.chr<CHR>.CAT_FTND~1df_add.out.txt``. They are
    opened in append mode, so rows of a chromosome that shows up in several
    separate runs of the input all land in the same file.

    Args:
        in_name: path of the input file; its first line is the header.
        study: study name used in the output file name.
        chr_name: header name of the chromosome column.
        a1_name: header name of the allele1 column.
        a2_name: header name of the allele2 column, or ``'none'``.
        pos_name: header name of the position column.
        processing_dir: base directory for the per-chromosome output.
        ancestry: ancestry label used in the output path and file name.

    Raises:
        ValueError: if a requested column name is not in the header.
    """
    import os  # local import so the function does not rely on cell-level imports

    out_file = None
    try:
        with open(in_name, 'r') as in_file:
            header = in_file.readline().split()

            # Look the columns up in the untouched header so the indices
            # line up with the untouched data rows.
            chr_index = header.index(chr_name)
            pos_index = header.index(pos_name)
            a1_index = header.index(a1_name)

            # if there is no A2 column then insert one after a1
            had_a2 = a2_name != 'none'
            if not had_a2:
                a2_index = a1_index + 1
                header.insert(a2_index, 'A2')

            # 'Marker' (CHR:POSITION) becomes the first output column.
            header.insert(0, 'Marker')

            last_chr = None
            seen_chrs = set()  # chromosomes whose header was already written

            for line in in_file:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines

                # Read both values before the placeholder insert can shift
                # any column positions.
                current_chr = fields[chr_index]
                position = fields[pos_index]

                if not had_a2:
                    fields.insert(a2_index, '.')  # insert place holder

                if current_chr != last_chr:  # switch to this chromosome's file
                    print('Processing chr{}'.format(current_chr))
                    last_chr = current_chr

                    if out_file is not None:
                        out_file.close()

                    # construct outfile name
                    fname = '{}.{}.1000G.chr{}.CAT_FTND~1df_add.out.txt'.format(
                        study, ancestry, current_chr)
                    out_dir = os.path.join(processing_dir, ancestry, 'processing',
                                           'chr{}'.format(current_chr))
                    if not os.path.isdir(out_dir):
                        os.makedirs(out_dir)
                    out_file = open(os.path.join(out_dir, fname), 'a')

                    # first time this chr has been encountered: write the header
                    if current_chr not in seen_chrs:
                        seen_chrs.add(current_chr)
                        out_file.write("\t".join(header) + "\n")

                # Marker = CHR:POSITION in the first field, then the row
                out_file.write("{}:{}\t{}\n".format(current_chr, position,
                                                    "\t".join(fields)))
    finally:
        if out_file is not None:
            out_file.close()

In [None]:
def convert_snp(base_dir, ancestry, study, chrom):
    """Convert snps to 1000G phase 3 format.

    Reads
    ``<base_dir>/<ancestry>/processing/chr<chrom>/<study>.<ancestry>.1000G.chr<chrom>.CAT_FTND~1df_add.out.txt``
    and writes the converted rows to the sibling file
    ``...CAT_FTND~1df.phase3ID_add.out.txt``. The header is copied as-is and
    must contain the columns ``SNP``, ``A1``, ``A2`` and ``Marker``.

    Row handling, based on the ``SNP`` field split on ``:``:
      * four or more parts: the ID is treated as already being in phase 3
        format; ``Marker`` is set to the ID and ``A2`` to its fourth part.
      * a single part starting with ``rs``: the ID is looked up in the
        reference dictionary. ``Marker`` becomes the ``:``-joined entry and
        ``A2`` becomes whichever of entry[2] / entry[3] is not ``A1``. The
        row is dropped when the rsID is unknown or ``A1`` matches neither.
      * anything else: the row is dropped.

    Args:
        base_dir: base directory holding the per-ancestry folders.
        ancestry: ancestry label used in the path and file names.
        study: study name used in the file names.
        chrom: chromosome identifier used in the path and file names.
    """
    # reference panel dictionary of SNPs, built by a helper defined elsewhere
    # NOTE(review): entries are used as sequences of strings whose items 2
    # and 3 are the two candidate alleles; confirm against kp3_dict.
    thou_dict = kp3_dict(chrom)

    file_dir = "{}/{}/processing/chr{}/".format(base_dir, ancestry, chrom)
    in_name = "{}.{}.1000G.chr{}.CAT_FTND~1df_add.out.txt".format(study, ancestry, chrom)
    out_name = "{}.{}.1000G.chr{}.CAT_FTND~1df.phase3ID_add.out.txt".format(study, ancestry, chrom)

    with open(file_dir + in_name, "r") as inF:
        with open(file_dir + out_name, "w") as outF:
            header = inF.readline()
            outF.write(header)

            columns = header.split()
            snp_index = columns.index("SNP")
            a1_index = columns.index("A1")
            a2_index = columns.index("A2")
            phase3_index = columns.index("Marker")  # updated SNP id

            for line in inF:
                split_line = line.split()
                if not split_line:
                    continue  # tolerate blank lines

                snp = split_line[snp_index]
                snp_split = snp.split(":")
                a1 = split_line[a1_index]

                if len(snp_split) >= 4:  # indicates already in 1000G_p3 format
                    marker_name = snp
                    a2 = snp_split[3]  # fill in A2 data from the ID itself

                # if not already in 1000G_p3 format, search for it in the
                # ref panel dictionary
                elif len(snp_split) == 1 and snp[:2] == "rs":
                    entry = thou_dict.get(snp)
                    if entry is None:  # rsID not in ref panel dictionary
                        continue
                    marker_name = ":".join(entry)
                    if a1 == entry[2]:
                        a2 = entry[3]
                    elif a1 == entry[3]:
                        a2 = entry[2]
                    else:  # A1 matches neither reference allele
                        continue
                else:
                    continue

                split_line[a2_index] = a2
                split_line[phase3_index] = marker_name
                outF.write("\t".join(split_line) + "\n")

    # Report which ancestry/chromosome finished.
    message = "Done with {} chr{}.".format(ancestry, chrom)
    print(message)


# sandbox

In [None]:
def parse_args(args):
    """Parse the command-line flags of the 1000G phase 3 conversion script.

    Flags are given as ``--flag value`` pairs, in any order, and all four
    are required: ``--base_dir``, ``--ancestry``, ``--study``, ``--chrom``.

    Args:
        args: list of argument strings, typically ``sys.argv[1:]``.

    Returns:
        Tuple ``(base_dir, ancestry, study, chrom)``.

    Raises:
        SystemExit: on an unrecognized argument, a flag without a value,
            or a missing required flag.
    """
    import sys

    # Accepted flags, keyed to the name each value is stored under.
    flag_to_key = {
        '--base_dir': 'base_dir',
        '--ancestry': 'ancestry',
        '--study': 'study',
        '--chrom': 'chrom',
    }
    parsed = {}

    while len(args) > 0:
        flag = args[0]
        # other arguments unused
        if flag not in flag_to_key:
            sys.exit('Unused arguments: ' + ' '.join(args))
        # A flag at the very end of the list has no value to consume.
        if len(args) < 2:
            sys.exit('Missing value for argument: ' + flag)
        parsed[flag_to_key[flag]] = args[1]
        args = args[2:]

    # check if missing required arguments
    required = ('base_dir', 'ancestry', 'study', 'chrom')
    if any(key not in parsed for key in required):
        sys.exit('Missing at least one required argument. Please specify '
                 '--base_dir, --ancestry, --study, --chrom')

    return (parsed['base_dir'], parsed['ancestry'], parsed['study'], parsed['chrom'])

#base_dir, ancestry, study, chrom = parse_args(sys.argv[1:]) 

In [None]:
# Submit one python ID-conversion job per chromosome and ancestry.
ancestry_list="afr amr eur eas sas"
study=s4s

base=/home/ec2-user/jmarks/nicotine/spit_science/processed_results/003
for chr in {1..22}; do
    # ancestry_list is a space-separated string; it is left unquoted on
    # purpose so it word-splits into one ancestry per iteration.
    for ancestry in $ancestry_list; do
        outF="$base/$ancestry/processing/chr$chr"
        /shared/bioinformatics/software/scripts/qsub_job.sh \
            --job_name "S4S_${ancestry}_chr${chr}" \
            --script_prefix "$outF/convert_to_1000g_p3" \
            --mem 15 \
            --nslots 3 \
            --priority 0 \
            --program python convert_gwas_results_1000g_p3.py \
                --base_dir "$base" \
                --ancestry "$ancestry" \
                --study "$study" \
                --chrom "$chr"
    done
done


# Submit one perl ID-conversion job per ancestry and chromosome.
# ancestry_list is a space-separated string; it is left unquoted on purpose
# so it word-splits into one ancestry per iteration.
for ancestry in ${ancestry_list}; do
    # Population group for this ancestry; anything unrecognized maps to eur.
    case "$ancestry" in
        afr) group=afr ;;
        amr) group=amr ;;
        eas) group=eas ;;
        sas) group=sas ;;
        *)   group=eur ;;
    esac
    for (( chr=1; chr<23; chr++ )); do
        chr_dir="$base/$ancestry/processing/chr$chr"
        # NOTE(review): --script_prefix is the same path as --file_out;
        # confirm qsub_job.sh only appends suffixes to this prefix.
        # Braces around ${ancestry} are required: "$ancestry_" would expand
        # the (unset) variable named "ancestry_".
        /shared/bioinformatics/software/scripts/qsub_job.sh \
            --job_name "_${ancestry}_${chr}" \
            --script_prefix "$chr_dir/s4s.$ancestry.1000G.chr$chr.CAT_FTND~1df.phase3ID_add.out.txt" \
            --mem 15 \
            --nslots 3 \
            --priority 0 \
            --program perl /shared/bioinformatics/software/perl/id_conversion/convert_to_1000g_p3_ids.pl \
            --file_in "$chr_dir/s4s.$ancestry.1000G.chr$chr.CAT_FTND~1df_add.out.txt" \
            --file_out "$chr_dir/s4s.$ancestry.1000G.chr$chr.CAT_FTND~1df.phase3ID_add.out.txt" \
            --legend "/shared/data/ref_panels/1000G/2014.10/1000GP_Phase3_chr$chr.legend.gz" \
            --file_in_header 1 \
            --file_in_id_col 0 \
            --file_in_chr_col 1 \
            --file_in_pos_col 3 \
            --file_in_a1_col 4 \
            --file_in_a2_col 8 \
            --chr "$chr"
    done
done