In [1]:
import numpy as np
import pandas as pd

from os import mkdir
from Bio import SeqIO
from os.path import isdir

def Load_Fasta_Seqs(seq_path):
    """Read a FASTA file into a ``{record name: sequence string}`` dictionary.

    Parameters
    ----------
    seq_path : str
        Path of the FASTA file to read.

    Returns
    -------
    dict
        Maps each record's ``name`` to its sequence as a plain ``str``.
        If several records share a name, the last one in the file wins.
    """
    # The context manager guarantees the handle is closed, even if parsing
    # fails part-way through the file.
    with open(seq_path) as handle:
        return {s.name: str(s.seq) for s in SeqIO.parse(handle, 'fasta')}

def Get_Allele_Id(Subject):
    """Return the short allele id: the first two '_'-separated fields of Subject.

    When Subject contains no underscore there is only one field, and that
    single field is returned unchanged.
    """
    # Slicing never raises, so a one-field input simply joins to itself.
    leading_fields = Subject.split('_')[:2]
    return "_".join(leading_fields)

In [2]:
# Input spreadsheets, one per allele set.
data_path = (
    '/Users/harihara/Mount-2/hotspring_metagenome/single_cell_analysis_with_Gabe_Birzu/'
    'C_Allele_Relative_Abundance_Sep/Filtered_Alignments/Pident_80/'
)
# FASTA files holding the allele sequences.
seq_path = (
    '/Users/harihara/Mount-2/hotspring_metagenome/single_cell_analysis_with_Gabe_Birzu/'
    'C_Allele_Relative_Abundance_Aug/'
)
# Root of the per-allele output tree written by the cells below.
out_path = (
    '/Users/harihara/Mount-2/hotspring_metagenome/single_cell_analysis_with_Gabe_Birzu/'
    'C_Allele_Relative_Abundance_Sep/BLAST_Comparison/'
)

# Create the output root first ('' suffix), then one folder per allele set.
# mkdir is non-recursive, so a missing parent directory still fails loudly.
for subdir in ('', 'A_Alleles/', 'B_Alleles/', 'C_Alleles/'):
    target = out_path + subdir
    if not isdir(target):
        mkdir(target)


In [3]:
# Load the A, B and C allele tables and build one per-allele abundance summary.
# Each workbook is read with a two-row header, so columns are a two-level
# MultiIndex; the code below relies on a ('Sample', 'Stat') column holding the
# full allele name and on second-level labels named 'Avg_Depth_Coverage'.

# --- A alleles ---
df_A = pd.read_excel(data_path+'A_Allele.xlsx', header = [0,1])
# Missing cells count as zero coverage.
df_A = df_A.fillna(0)
# Drop the first data row.
# NOTE(review): presumably a non-allele row left over from the sheet layout - confirm.
df_A = df_A[1:]
# Short allele id (first two '_' fields of the full name) becomes the row index.
df_A['Allele'] = df_A['Sample']['Stat'].apply(Get_Allele_Id)
df_A = df_A.set_index([('Allele','')])
# Row-wise sum over every column whose second-level label is 'Avg_Depth_Coverage'.
df_A['A_abundances'] = df_A.loc[:, df_A.columns.get_level_values(1)=='Avg_Depth_Coverage'].sum(axis = 1)

# --- B alleles (same steps as A) ---
df_B = pd.read_excel(data_path+'B_Allele.xlsx', header = [0,1])
df_B = df_B.fillna(0)
df_B = df_B[1:]
df_B['Allele'] = df_B['Sample']['Stat'].apply(Get_Allele_Id)
df_B = df_B.set_index([('Allele','')])
df_B['B_abundances'] = df_B.loc[:, df_B.columns.get_level_values(1)=='Avg_Depth_Coverage'].sum(axis = 1)

# --- C alleles (same steps as A) ---
df_C = pd.read_excel(data_path+'C_Allele.xlsx', header = [0,1])
df_C = df_C.fillna(0)
df_C = df_C[1:]
df_C['Allele'] = df_C['Sample']['Stat'].apply(Get_Allele_Id)
df_C = df_C.set_index([('Allele','')])
df_C['C_abundances'] = df_C.loc[:, df_C.columns.get_level_values(1)=='Avg_Depth_Coverage'].sum(axis = 1)

# Combine the three abundance columns on the allele-id index. DataFrame.join
# defaults to a left join, so only allele ids present in df_A are kept.
df_Total = df_A[['A_abundances']].join(df_B[['B_abundances']]).join(df_C[['C_abundances']])
# Alleles absent from B or C get an abundance of zero.
df_Total = df_Total.fillna(0)
# Total is the sum of the A, B and C abundance columns.
df_Total['Total_Abundance'] = df_Total.sum(axis=1)
# Keep only alleles with non-zero abundance in all three sets.
df_Total = df_Total[(df_Total['A_abundances'] > 0) & 
                    (df_Total['B_abundances'] > 0) & 
                    (df_Total['C_abundances'] > 0)]
# Most abundant alleles first.
df_Total = df_Total.sort_values(by = 'Total_Abundance', ascending = False)

In [4]:
# First 30 allele ids of df_Total (sorted by total abundance), plus YSG_1367,
# which is always appended regardless of its rank.
Top_30_Allele = df_Total.index[:30].tolist() + ['YSG_1367']

# Full allele names (the ('Sample', 'Stat') column) for the selected ids,
# looked up separately in each of the three tables.
stat_col = ('Sample', 'Stat')
A_Alleles_Sel = df_A.loc[Top_30_Allele, stat_col].tolist()
B_Alleles_Sel = df_B.loc[Top_30_Allele, stat_col].tolist()
C_Alleles_Sel = df_C.loc[Top_30_Allele, stat_col].tolist()


In [5]:
# Write one single-record FASTA file per selected A allele, each in its own
# folder under out_path/A_Alleles/.
A_Alleles = Load_Fasta_Seqs(seq_path+'UncmicMRedA02D15_FD_alleles.fna')

for A in A_Alleles_Sel:
    print(A)
    # The FASTA header carries only the short allele id (e.g. 'YSG_1487').
    seq_name = Get_Allele_Id(A)
    allele_dir = out_path+'A_Alleles/'+A+'/'
    # An existing folder means this allele was already exported; skip it.
    if not isdir(allele_dir):
        # Look the sequence up BEFORE touching the filesystem: a missing key
        # must not leave behind an empty folder/file that later runs would
        # mistake for a finished export.
        sequence = A_Alleles[A]
        mkdir(allele_dir)
        # 'with' closes the file even if a write fails.
        with open(allele_dir+A+'.fasta','w') as o:
            o.write('>'+seq_name+'\n')
            o.write(sequence+'\n')

YSG_1487_alleles_Ga0393436_012_21819_23870_A_allele
YSG_0548_alleles_Ga0393436_021_29421_30833_A_allele
YSG_1924_alleles_Ga0393436_082_2918_4342_A_allele
YSG_1319_alleles_Ga0393436_031_4952_6442_A_allele
YSG_1325_alleles_Ga0393436_027_15597_16877_A_allele
YSG_0096b_alleles_Ga0393436_014_826_3426_A_allele
YSG_0694b_alleles_Ga0393436_011_429_1055_A_allele
YSG_0108a_alleles_Ga0393436_042_5454_6590_A_allele
YSG_0136a_alleles_Ga0393436_004_51432_52400_A_allele
YSG_1447b_alleles_Ga0393436_028_7797_8984_A_allele
YSG_0898_alleles_Ga0393436_021_194_1480_A_allele
YSG_1299_alleles_Ga0393436_004_48730_51381_A_allele
YSG_1272_alleles_Ga0393436_004_7726_9726_A_allele
YSG_0699_alleles_Ga0393436_034_1225_2541_A_allele
YSG_0168b_alleles_Ga0393436_009_4379_5464_A_allele
YSG_1576_alleles_Ga0393436_002_28841_30244_A_allele
YSG_0340_alleles_Ga0393436_042_10591_12519_A_allele
YSG_1142_alleles_Ga0393436_024_256_1353_A_allele
YSG_0314_alleles_Ga0393436_025_12217_13677_A_allele
YSG_0985_alleles_Ga0393436_040_6

In [6]:
# Write one single-record FASTA file per selected B allele, each in its own
# folder under out_path/B_Alleles/.
B_Alleles = Load_Fasta_Seqs(seq_path+'UncmicOcRedA2H14_FD_alleles.fna')
for B in B_Alleles_Sel:
    print(B)
    # The FASTA header carries only the short allele id (e.g. 'YSG_1487').
    seq_name = Get_Allele_Id(B)
    allele_dir = out_path+'B_Alleles/'+B+'/'
    # An existing folder means this allele was already exported; skip it.
    if not isdir(allele_dir):
        # Look the sequence up BEFORE touching the filesystem: a missing key
        # must not leave behind an empty folder/file that later runs would
        # mistake for a finished export.
        sequence = B_Alleles[B]
        mkdir(allele_dir)
        # 'with' closes the file even if a write fails.
        with open(allele_dir+B+'.fasta','w') as o:
            o.write('>'+seq_name+'\n')
            o.write(sequence+'\n')
        

YSG_1487_alleles_Ga0374750_104_7933_9981_Bp_allele
YSG_0548_alleles_Ga0374750_086_6183_7547_Bp_allele
YSG_1924_alleles_Ga0374750_141_1128_2552_Bp_allele
YSG_1319_alleles_Ga0374750_019_22175_23686_Bp_allele
YSG_1325_alleles_Ga0374750_006_30182_31462_Bp_allele
YSG_0096b_alleles_Ga0374750_122_5302_7902_Bp_allele
YSG_0694b_alleles_Ga0374750_098_5723_6289_Bp_allele
YSG_0108a_alleles_Ga0374750_124_5418_6554_Bp_allele
YSG_0136a_alleles_Ga0374750_042_4937_5908_Bp_allele
YSG_1447b_alleles_Ga0374750_198_271_1566_Bp_allele
YSG_0898_alleles_Ga0374750_138_5715_7001_Bp_allele
YSG_1299_alleles_Ga0374750_030_9403_12057_Bp_allele
YSG_1272_alleles_Ga0374750_082_6505_8640_Bp_allele
YSG_0699_alleles_Ga0374750_009_32071_33387_Bp_allele
YSG_0168b_alleles_Ga0374750_085_6435_7517_Bp_allele
YSG_1576_alleles_Ga0374750_037_18553_19956_Bp_allele
YSG_0340_alleles_Ga0374750_037_12305_14233_Bp_allele
YSG_1142_alleles_Ga0374750_205_59_1054_Bp_allele
YSG_0314_alleles_Ga0374750_009_35066_36490_Bp_allele
YSG_0985_allele

In [7]:
# Write one single-record FASTA file per selected C allele, each in its own
# folder under out_path/C_Alleles/.
C_Alleles = Load_Fasta_Seqs(seq_path+'all_C_alleles.fna')
for C in C_Alleles_Sel:
    print(C)
    # The FASTA header carries only the short allele id (e.g. 'YSG_1487').
    seq_name = Get_Allele_Id(C)
    allele_dir = out_path+'C_Alleles/'+C+'/'
    # An existing folder means this allele was already exported; skip it.
    if not isdir(allele_dir):
        # Look the sequence up BEFORE touching the filesystem: a missing key
        # must not leave behind an empty folder/file that later runs would
        # mistake for a finished export.
        sequence = C_Alleles[C]
        mkdir(allele_dir)
        # 'with' closes the file even if a write fails.
        with open(allele_dir+C+'.fasta','w') as o:
            o.write('>'+seq_name+'\n')
            o.write(sequence+'\n')

YSG_1487_alleles_Ga0374717_18_24502_26553_C_allele1
YSG_0548_alleles_Ga0374717_09_51800_53203_C_allele1
YSG_1924_alleles_Ga0374717_03_44098_45522_C_allele1
YSG_1319_alleles_Ga0374717_06_54004_55494_C_allele1
YSG_1325_alleles_Ga0374717_12_2675_3955_C_allele1
YSG_0096b_alleles_Ga0374717_02_20246_22852_C_allele1
YSG_0694b_alleles_Ga0374717_13_25326_25898_C_allele1
YSG_0108a_alleles_Ga0374717_16_1829_2965_C_allele1
YSG_0136a_alleles_Ga0374717_05_20990_21949_C_allele1
YSG_1447b_alleles_Ga0374717_41_5923_7119_C_allele1
YSG_0898_alleles_Ga0374717_12_5089_6375_C_allele1
YSG_1299_alleles_Ga0374717_05_18266_20941_C_allele1
YSG_1272_alleles_Ga0374717_09_40201_42390_C_allele1
YSG_0699_alleles_Ga0374717_05_49219_50586_C_allele1
YSG_0168b_alleles_Ga0374717_20_21852_22934_C_allele1
YSG_1576_alleles_Ga0374717_11_13904_15307_C_allele1
YSG_0340_alleles_Ga0374717_16_6842_8698_C_allele1
YSG_1142_alleles_Ga0374717_45_3848_4825_C_allele1
YSG_0314_alleles_Ga0374717_10_35546_37078_C_allele1
YSG_0985_alleles_G