In [2]:
import os, itertools, subprocess
import numpy as np
import pandas as pd

from mpire import WorkerPool

# Input/output locations for the analysis. All of them are absolute paths
# on the analysis server, so the notebook only runs where these exist.
RAW_SNP_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/snpDir/"
# runHapCross reads "Haplotype_Score_Info.tsv" from this directory.
RAW_RES_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/Result/"
NEW_SNP_PATH = "/data1/zhoujb/cowpeaSequence/snpDir/"
# "Genotype_by_Hap_four.tsv" and "Samples_Score_Info_four.tsv" are read from here.
NEW_RES_PATH = "/data1/zhoujb/cowpeaSequence/Result/"
# NOTE(review): "refernce" looks like a misspelling of "reference"; it is a
# real path string, so confirm the directory name on disk before changing it.
REF_PATH = "/data1/zhoujb/cowpeaSequence/refernce/"
TEMP_PATH = "/data2/zhoujb/project/cowpea_project/basedXPXLR/templateSNP/"

# Output tree of the pairwise cross analysis.
CROSS_RAW_PATH = "/data1/zhoujb/cowpeaSequence/Cross_Analysis_Results/"
# One sub-directory per first parent (sample_1) is created under CROSS_PATH,
# and runHapCross writes its per-pair tables into it.
CROSS_PATH = "/data1/zhoujb/cowpeaSequence/Cross_Analysis_Results/Cross/"
CROSS_RES_PATH = "/data1/zhoujb/cowpeaSequence/Cross_Analysis_Results/CrossRes/"

In [6]:
# Marker used in the genotype table for a region where a sample has no haplotype call.
_MISSING_HAPLOTYPE = "-"


def _lookup_haplotype_info(hap_score_info, haplotype):
    """Return (class, frequency, score) for one haplotype; all "-" when it is missing."""
    if haplotype == _MISSING_HAPLOTYPE:
        return _MISSING_HAPLOTYPE, _MISSING_HAPLOTYPE, _MISSING_HAPLOTYPE
    return (hap_score_info.loc[haplotype, "Haplotype_Class"],
            hap_score_info.loc[haplotype, "Mean.Total_freq"],
            hap_score_info.loc[haplotype, "Hap_Score.by_Total_pop"])


def _decide_cross(p1_haplotype, p1_freq, p1_score, p2_haplotype, p2_freq, p2_score):
    """Return (optimal_haplotype, delta_score, p1_breeding_operation, breeding_strategy)."""
    # P2 missing (this also covers "both missing"): nothing to gain, keep P1.
    if p2_haplotype == _MISSING_HAPLOTYPE:
        return p1_haplotype, 0, "Keep", "P1"
    # Only P1 missing: P2 is the only haplotype available.
    if p1_haplotype == _MISSING_HAPLOTYPE:
        return p2_haplotype, 0, "Replaced_by_P2", "P2"

    delta_score = p2_score - p1_score
    # The winner is picked by haplotype frequency; the score difference is
    # only reported alongside it.
    if p1_freq == p2_freq:
        return p1_haplotype, delta_score, "Same", "||"
    if p1_freq > p2_freq:
        return p1_haplotype, delta_score, "Keep", "P1"
    if p1_freq < p2_freq:
        return p2_haplotype, delta_score, "Replaced_by_P2", "P2"
    # None of the comparisons held (e.g. a NaN frequency). Fail loudly rather
    # than silently reusing the decision made for the previous region.
    raise ValueError(
        "Cannot compare haplotype frequencies {!r} ({}) and {!r} ({})".format(
            p1_freq, p1_haplotype, p2_freq, p2_haplotype))


def runHapCross(sample_1, sample_2):
    """Build the per-region haplotype comparison table for one ordered parent pair.

    For every region in "Genotype_by_Hap_four.tsv" the haplotypes carried by
    ``sample_1`` (P1) and ``sample_2`` (P2) are looked up in
    "Haplotype_Score_Info.tsv", and the haplotype with the higher
    "Mean.Total_freq" is reported as the optimal one. The table is written,
    tab-separated, to ``CROSS_PATH/<sample_1>/<sample_1>_<sample_2>_Cross_Info.txt``.

    Parameters
    ----------
    sample_1 : str
        Column name of the first parent (P1) in the genotype table.
    sample_2 : str
        Column name of the second parent (P2) in the genotype table.

    Returns
    -------
    None
        The result is written to disk; if the output file already exists the
        pair is skipped.

    Raises
    ------
    ValueError
        If two present haplotypes have frequencies that cannot be ordered.
    """
    out_dir = os.path.join(CROSS_PATH, sample_1)
    out_file_path = os.path.join(out_dir, "{}_{}_Cross_Info.txt".format(sample_1, sample_2))
    # Resume support: the guard must look at the same file that is written below.
    if os.path.exists(out_file_path):
        print("{} {} DONE".format(sample_1, sample_2))
        return

    geno_by_hap_sel = pd.read_table(os.path.join(NEW_RES_PATH, "Genotype_by_Hap_four.tsv"),
                                    usecols=['Region_index', sample_1, sample_2], index_col='Region_index')
    hap_score_info = pd.read_table(os.path.join(RAW_RES_PATH, "Haplotype_Score_Info.tsv"), index_col="Haplotype_ID")

    columns = ['Operating_Unit',
               'P1_Haplotype', 'P1_Haplotype_Class', 'P1_haplotype_freq', 'P1_Haplotype_Score',
               'P2_Haplotype', 'P2_Haplotype_Class', 'P2_haplotype_freq', 'P2_Haplotype_Score',
               'Optimal_Haplotype', 'Delta_Ind_Hap_Score', 'P1_Breeding_Operation', 'Breeding_Strategy']

    # Collect plain rows and build the frame once; appending to a DataFrame
    # row by row re-allocates it on every region.
    records = []
    for region, p1_haplotype, p2_haplotype in zip(geno_by_hap_sel.index,
                                                  geno_by_hap_sel[sample_1],
                                                  geno_by_hap_sel[sample_2]):
        p1_class, p1_freq, p1_score = _lookup_haplotype_info(hap_score_info, p1_haplotype)
        p2_class, p2_freq, p2_score = _lookup_haplotype_info(hap_score_info, p2_haplotype)
        optimal_haplotype, delta_score, p1_breeding_operation, breeding_strategy = _decide_cross(
            p1_haplotype, p1_freq, p1_score, p2_haplotype, p2_freq, p2_score)
        records.append([region,
                        p1_haplotype, p1_class, p1_freq, p1_score,
                        p2_haplotype, p2_class, p2_freq, p2_score,
                        optimal_haplotype, delta_score, p1_breeding_operation, breeding_strategy])

    out_file = pd.DataFrame(records, columns=columns)

    os.makedirs(out_dir, exist_ok=True)
    # Write to a temporary name and rename afterwards, so an interrupted run
    # never leaves a truncated file that the guard above would treat as done.
    tmp_file_path = out_file_path + ".tmp"
    out_file.to_csv(tmp_file_path, index=False, sep="\t")
    os.replace(tmp_file_path, out_file_path)
    return
    
if __name__ == "__main__":
    # Sample names come from the "Sample" column of the per-sample score table.
    samp_score_info = pd.read_table(os.path.join(NEW_RES_PATH, "Samples_Score_Info_four.tsv"))
    sample_list = samp_score_info["Sample"].to_list()
    # Ordered pairs: (A, B) and (B, A) are both processed, because
    # runHapCross treats its first argument as P1 and its second as P2.
    samp_comb_list = list(itertools.permutations(sample_list, 2))

    # One output directory per first parent. exist_ok=True keeps a re-run
    # from failing on directories created by an earlier run.
    for item in sample_list:
        os.makedirs(os.path.join(CROSS_PATH, "{}/".format(item)), exist_ok=True)

    with WorkerPool(n_jobs=12) as pool:
        pool.map(runHapCross, samp_comb_list, progress_bar=True)
    print("ALL DONE!")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:01<00:00, 57.99it/s]

ALL DONE!





['LP001', 'LP012', 'LP024', 'zhijiang618']