In [12]:
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.patches import Rectangle

import resource
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))

from os import listdir
from PyPDF2 import PdfMerger
from os.path import isdir, isfile
from PyCircos_Plots.PyCircos_Utils import *

rcParams = {'font.size': 24, 'font.weight': 'normal', 'font.family': 'sans-serif',
            'axes.unicode_minus':False, 'axes.labelweight':'normal'}

plt.rcParams.update(rcParams)

osa_len = 2932766
osb_len = 3046682

def Make_Circle_Plots(ax, values, color, text, ylim):
    angles = np.linspace(0, 2*np.pi, len(values), endpoint=False)
    ax.plot(angles, values, linewidth=5, color = color)
    ax.grid(False)
    ax.set_ylim([0, ylim])
    ax.set_yticklabels([])
    ax.set_xticks([])
    ax.text(0,0, text, horizontalalignment='center', verticalalignment='center',)
    ax.spines['polar'].set_visible(False)

def Load_Prodigal_GBFF(filepath):
    lines = open(filepath).readlines()
    op = []
    ctr = 1
    for i in range(len(lines)):
        l = lines[i]
        l = l.replace("\n","")
        l = l.strip()
        if l == "//":
            ctr = 1
        else:
            if l.startswith("DEFINITION"):
                l = l.replace("DEFINITION ","")
                splits = l.split(";")
                for s in splits:
                    if s.startswith("seqhdr"):
                        sequence_id = s.replace("seqhdr=","").replace("\"","")
                    if s.startswith("seqlen"):
                        seqlen = int(s.replace("seqlen=","").replace("\"",""))
            if l.startswith("CDS"):
                l = l.replace("CDS","").strip()
                if l.startswith("complement"):
                    orientation = '-'
                else:
                    orientation = '+'
                l = l.replace("complement","").replace("(","").replace(")","").replace("<","").replace(">","")
                start,end = l.split("..")
                i += 1
                record = lines[i]
                record = record.strip()
                if record.startswith("/note"):
                    record = record.replace("/note=","").replace("\"","")
                    splits = record.split(";")
                    d = {'Query':sequence_id, 'Pred':sequence_id+'_'+str(ctr), 'Qlen':seqlen, 
                         'Orientation':orientation, 'Start':start, 'End':end}
                    for s in splits[:-1]:
                        s = s.replace("\n","")
                        key, value = s.split('=')
                        d[key] = value
                    ctr += 1
                    op.append(d)
    df_Prodigal_Hits = pd.DataFrame(op)
    return df_Prodigal_Hits[['Query','Pred','Orientation','Start','End','Qlen','ID','partial','conf']]

In [2]:
novel_contigs = {}
novel_contig_path = '/Users/harihara/Mount-2/hotspring_metagenome/Synechococcus_paper_analysis/\
Ref_Guided_Scaffolding_Clustering_Aug_2022/Missing_Contig_Coords_Aug/'
samples = listdir(novel_contig_path+'OSA/')
for s in samples:
    if(s.startswith("Hot") and s.endswith(".txt")):
        df_osa = pd.read_csv(novel_contig_path+'OSA/'+s, sep = "\t")
        df_osa.loc[(df_osa['Start'] < 0), 'Start'] += osa_len
        df_osa.loc[(df_osa['End'] < 0), 'End'] += osa_len
        df_osa_grp = df_osa.sort_values(by = ['Contig','Start']).groupby(['Contig']).apply(Max_Clique_Interval_Graph)
    
        df_osb = pd.read_csv(novel_contig_path+'OSB/'+s, sep = "\t")
        df_osb.loc[(df_osb['Start'] < 0), 'Start'] += osb_len
        df_osb.loc[(df_osb['End'] < 0), 'End'] += osb_len
        df_osb_grp = df_osb.sort_values(by = ['Contig','Start']).groupby(['Contig']).apply(Max_Clique_Interval_Graph)
    
        novel_contigs[s.replace(".txt","")] = {'OSA':df_osa_grp,'OSB':df_osb_grp}

In [3]:
grp_path = '/Users/harihara/Mount-2/hotspring_metagenome/Synechococcus_paper_analysis/\
Ref_Guided_Scaffolding_Clustering_Aug_2022/BLAST_All_vs_All/Containment_Clusters_Filtered_90_75_1000.txt'
df_novel_filtered = pd.read_csv(grp_path, sep = "\t")
d = df_novel_filtered.groupby('GroupID')['Contig'].apply(list).to_dict()
d_representatives = df_novel_filtered.groupby('GroupID')['RepresentativeContig'].apply(list).to_dict()

out_dir = '/Users/harihara/Research-Activities/Plots/Hot_Spring_Plots/Synechococcus-Paper/Novel_Groups_Circos_Aug_2022/'
if not isdir(out_dir):
    mkdir(out_dir)

In [4]:
prodigal_out = '/Users/harihara/Mount-2/hotspring_metagenome/Synechococcus_paper_analysis/\
Ref_Guided_Scaffolding_Clustering_Aug_2022/BLAST_All_vs_All/Prodigal/Representatives_Prodigal.out'
df_prodigal_hits = Load_Prodigal_GBFF(prodigal_out)
df_prodigal_hits = df_prodigal_hits.set_index('Pred')

eggnog_path = '/Users/harihara/Mount-2/hotspring_metagenome/Synechococcus_paper_analysis/\
Ref_Guided_Scaffolding_Clustering_Aug_2022/BLAST_All_vs_All/EggNOG/Representatives.eggnog.out.emapper.annotations'
df_eggnog = pd.read_csv(eggnog_path, sep = "\t")
df_eggnog = df_eggnog[['#query','seed_ortholog','evalue','score','max_annot_lvl',
                       'COG_category','Description','Preferred_name']]
df_eggnog = df_eggnog.rename(columns = {'#query':'Query'})
df_eggnog = df_eggnog.set_index('Query')

df_prodigal_hits = df_prodigal_hits.join(df_eggnog)
reps = np.unique(df_novel_filtered['RepresentativeContig'].tolist())

df_prodigal_hits = df_prodigal_hits.reset_index()
df_prodigal_hits = df_prodigal_hits.merge(df_novel_filtered[['RepresentativeContig','GroupID']].drop_duplicates(),
                                          left_on = 'Query', right_on='RepresentativeContig', how = 'left')
df_prodigal_hits['COG_category'] = df_prodigal_hits['COG_category'].fillna("-")

colors = {'C' : 'red', 'L':'blue', 'P':'green', '-':'grey', 'M':'orange', 'I':'gold', 
          'S':'purple', 'H':'olive', 'J':'yellow', 'E':'cyan', 'G':'magenta', 'V':'lime',
          'O':'teal', 'F':'black', 'Q':'yellow', 'T':'lightcoral', 'KL':'brown', 'K':'peachpuff', 'FP':'plum', 
          'MU':'peru', 'KLT':'sienna', 'KT':'maroon', 'NU':'indigo', 'U':'darkolivegreen',
          'GM':'crimson', 'NOU':'thistle', 'NPTU':'dodgerblue', 'EGP':'firebrick', 'IQ':'mediumslateblue', 
          'PT':'lightsalmon', 'D':'khaki', 'OU':'darkseagreen', 'ET':'lawngreen', 'CO':'sandybrown',
          'CH':'tan', 'UW':'royalblue', 'HP':'goldenrod', 'EQ':'springgreen', 'HJ':'cadetblue', 'CDZ':'pink', 
          'FG':'slategrey'}


In [5]:
synechococcus_blast = '/Users/harihara/Mount-2/hotspring_metagenome/Synechococcus_paper_analysis/\
Ref_Guided_Scaffolding_Clustering_Aug_2022/BLAST_All_vs_All/Representatives.Syenchococcus.blast'

df_blast = pd.read_csv(synechococcus_blast, sep = "\t",names=['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 
                                                              'gapopen', 'qlen', 'qstart', 'qend', 'slen', 
                                                              'sstart', 'send', 'evalue', 'bitscore'])
df_blast = df_blast.merge(df_novel_filtered[['RepresentativeContig','GroupID']].drop_duplicates(),
                          left_on = 'qseqid', right_on='RepresentativeContig', how = 'left')
df_blast['sseqid'] = df_blast['sseqid'].replace("gi|86604733|ref|NC_007775.1|","OSA")
df_blast['sseqid'] = df_blast['sseqid'].replace("gi|86607503|ref|NC_007776.1|","OSB")


In [7]:
plt.rcParams.update(rcParams)

for g in list(d.keys())[200:225]:    
    print(g)
    contigs = d[g]+[d_representatives[g][0]]
    osa_contig_count, osb_contig_count = 0, 0
    osa_contigs, osb_contigs = [], []
    df_osa, df_osb = pd.DataFrame(), pd.DataFrame()
    
    for c in contigs:
        splits = c.split('_')
        contig = splits[-2]+'_'+splits[-1]
        genome = splits[-3].upper()
        sample = "_".join(splits[:-3])
        
        if genome == "OSA": osa_contig_count += 1
        if genome == "OSB": osb_contig_count += 1
        
        try:
            row = novel_contigs[sample][genome].loc[contig]
            row['Group'] = g
            row['Sample'] = sample
            if genome == 'OSA': df_osa = df_osa.append(row)
            elif genome == 'OSB': df_osb = df_osb.append(row)
        except:
            print("Missing...",sample,genome,contig)
            pass
    
    if len(df_osa) > 0: df_osa = df_osa.reset_index()
    if len(df_osb) > 0: df_osb = df_osb.reset_index()
    
    osa_counts, osb_counts, osa_gbl_counts, osb_gbl_counts = Make_Counts(df_osa, df_osb, osa_len, osb_len)
    osa_gbl_counts += 25
    osb_gbl_counts += 25
    
    ylim = max(np.max(osa_gbl_counts), np.max(osb_gbl_counts))
    
    fig = plt.figure(figsize=(20, 15))
    gs = GridSpec(nrows=2, ncols=2, height_ratios=[2.5, 1.5])
    ax0 = fig.add_subplot(gs[0, 0], projection = 'polar')
    ax1 = fig.add_subplot(gs[0, 1], projection = 'polar')
    ax2 = fig.add_subplot(gs[1, :])
    
    starts = df_prodigal_hits.loc[df_prodigal_hits['GroupID'] == g, 'Start'].tolist()
    ends = df_prodigal_hits.loc[df_prodigal_hits['GroupID'] == g, 'End'].tolist()
    orientations = df_prodigal_hits.loc[df_prodigal_hits['GroupID'] == g, 'Orientation'].tolist()
    cog_categories = df_prodigal_hits.loc[df_prodigal_hits['GroupID'] == g, 'COG_category'].tolist()

    osa_alignments_starts = df_blast.loc[(df_blast['GroupID'] == g) & (df_blast['sseqid'] == 'OSA') ,
                                         'qstart'].tolist()
    osa_alignments_ends = df_blast.loc[(df_blast['GroupID'] == g) & (df_blast['sseqid'] == 'OSA') ,
                                         'qend'].tolist()
    
    osb_alignments_starts = df_blast.loc[(df_blast['GroupID'] == g) & (df_blast['sseqid'] == 'OSB') ,
                                         'qstart'].tolist()
    osb_alignments_ends = df_blast.loc[(df_blast['GroupID'] == g) & (df_blast['sseqid'] == 'OSB') ,
                                         'qend'].tolist()
    
    width = df_prodigal_hits.loc[df_prodigal_hits['GroupID'] == g, 'Qlen'].tolist()[0]
    ax2.plot([4, width],[4,4], linewidth=20, color = 'black')
    
    for i in range(len(osa_alignments_starts)):
        start = min(osa_alignments_starts[i], osa_alignments_ends[i])
        end = max(osa_alignments_starts[i], osa_alignments_ends[i])
        ax2.plot([start, end], [3, 3], color = 'red', linewidth = 2)
        
    for i in range(len(osb_alignments_starts)):
        start = min(osb_alignments_starts[i], osb_alignments_ends[i])
        end = max(osb_alignments_starts[i], osb_alignments_ends[i])
        ax2.plot([start, end], [2, 2], color = 'blue', linewidth = 2)
    
    for i in range(0, len(starts)):
        if orientations[i] == '+':
            ax2.arrow(int(starts[i]),1,int(ends[i])-int(starts[i])+1,0, width = 0.25, alpha = 0.8, 
                      length_includes_head = True, head_length = 20, color = colors[cog_categories[i]])
        else:
            ax2.arrow(int(ends[i]),1,int(starts[i])-int(ends[i])+1,0, width = 0.25, alpha = 0.8, 
                      length_includes_head = True, head_length = 20, color = colors[cog_categories[i]])
    ax2.set_ylim([-1, 5])
    ax2.set_yticks([])
    fig.tight_layout()
    
    label=("Synechococcus Sub.Sp A\n"+"#Contigs (in Group):"+str(osa_contig_count))
    Make_Circle_Plots(ax0, osa_gbl_counts, 'green', label, ylim)
    label=("Synechococcus Sub.Sp B\n"+"#Contigs (in Group):"+str(osb_contig_count))
    Make_Circle_Plots(ax1, osb_gbl_counts, 'orange', label, ylim)
    fig.suptitle(g)
    fig.tight_layout()
    fig.savefig(out_dir+g+'.pdf')
    
    plt.close("all")

Group_595
Group_603
Group_604
Group_605
Group_609
Group_612
Group_614
Group_621
Group_622
Group_631
Group_639
Group_64
Group_645
Group_648
Group_651
Group_659
Group_662
Group_67
Group_699
Group_704
Group_71
Group_718
Group_720
Group_721
Group_73


In [8]:
pdfs = listdir(out_dir)
pdfs.sort()
merger = PdfMerger()
for pdf in pdfs:
    if pdf.startswith('Group'):
        merger.append(out_dir+'/'+pdf)
        print(pdf)
        
merger.write(out_dir+'/Circos.pdf')
merger.close()

Group_0.pdf
Group_1.pdf
Group_10.pdf
Group_100.pdf
Group_101.pdf
Group_108.pdf
Group_110.pdf
Group_112.pdf
Group_115.pdf
Group_118.pdf
Group_119.pdf
Group_120.pdf
Group_121.pdf
Group_122.pdf
Group_123.pdf
Group_127.pdf
Group_129.pdf
Group_130.pdf
Group_131.pdf
Group_132.pdf
Group_135.pdf
Group_138.pdf
Group_139.pdf
Group_14.pdf
Group_140.pdf
Group_15.pdf
Group_152.pdf
Group_155.pdf
Group_160.pdf
Group_162.pdf
Group_17.pdf
Group_173.pdf
Group_174.pdf
Group_178.pdf
Group_179.pdf
Group_18.pdf
Group_182.pdf
Group_189.pdf
Group_191.pdf
Group_193.pdf
Group_197.pdf
Group_198.pdf
Group_199.pdf
Group_201.pdf
Group_204.pdf
Group_205.pdf
Group_208.pdf
Group_21.pdf
Group_212.pdf
Group_222.pdf
Group_223.pdf
Group_226.pdf
Group_227.pdf
Group_229.pdf
Group_235.pdf
Group_237.pdf
Group_244.pdf
Group_249.pdf
Group_254.pdf
Group_256.pdf
Group_257.pdf
Group_258.pdf
Group_259.pdf
Group_266.pdf
Group_267.pdf
Group_268.pdf
Group_269.pdf
Group_27.pdf
Group_270.pdf
Group_273.pdf
Group_279.pdf
Group_282.pdf
Gro