In [1]:
from PyPDF2 import PdfMerger
import pycircos
import warnings
import numpy as np
import pandas as pd
import patchworklib as pw 
import matplotlib.pyplot as plt

from os.path import isdir
from os import listdir, mkdir

warnings.filterwarnings("ignore")

rcParams = {'font.size': 20, 'font.weight': 'normal', 'font.family': 'sans-serif',
            'axes.unicode_minus':False, 'axes.labelweight':'normal'}

plt.rcParams.update(rcParams)

Garc    = pycircos.Garc
Gcircle = pycircos.Gcircle

osa_len = 2932766
osb_len = 3046682
def Make_Counts(df_osa, df_osb, samples):
    osa_indicator = {}
    osb_indicator = {}
    osa_gbl_counts = np.zeros(osa_len)
    osb_gbl_counts = np.zeros(osb_len)
    try:
        osa_samples = set(df_osa['Sample'].tolist())
    except KeyError:
        osa_samples = set({})
    try:
        osb_samples = set(df_osb['Sample'].tolist())
    except KeyError:
        osb_samples = set({})
        
    samples = osa_samples | osb_samples
    
    for g in osa_samples:
        try:
            osa = np.zeros(osa_len)
            temp_osa = df_osa[df_osa['Sample'] == g]
            starts = temp_osa['Start'].tolist()
            ends = temp_osa['End'].tolist()
            lengths = temp_osa['Length'].tolist()
            
            for i in range(len(starts)):
                start, end = int(min(starts[i],ends[i])), int(max(starts[i],ends[i]))
                if abs(start-end) == lengths[i]:    
                    osa[start:end] += 1
                    osa_gbl_counts[start:end] += 1
                else:
                    print("Here", end-start, lengths[i])
                    osa[end:osa_len] += 1
                    osa[0:start] += 1
                    osa_gbl_counts[end:osa_len] += 1
                    osa_gbl_counts[0:start] += 1
            osa[np.isnan(osa)] = 0
            osa_indicator[g] = osa
        except KeyError:
            print('OSA',g)
            osa_indicator[g] = osa
            
    for g in osb_samples:    
        try:
            osb = np.zeros(osb_len)
            temp_osb = df_osb[df_osb['Sample'] == g].reset_index()
            starts = temp_osb['Start'].tolist()
            ends = temp_osb['End'].tolist()
            lengths = temp_osb['Length'].tolist()
            
            for i in range(len(starts)):
                start, end = int(min(starts[i],ends[i])), int(max(starts[i],ends[i]))
                if abs(start-end) == lengths[i]:    
                    osb[start:end] += 1
                    osb_gbl_counts[start:end] += 1
                else:
                    print("Here", end-start, lengths[i])
                    osb[end:osa_len] += 1
                    osb[0:start] += 1
                    osb_gbl_counts[end:osb_len] += 1
                    osb_gbl_counts[0:start] += 1
            osb[np.isnan(osb)] = 0
            osb_indicator[g] = osb
        except KeyError:
            print('OSB',g)
            osb_indicator[g] = osb
    return osa_gbl_counts, osb_gbl_counts

def Max_Clique_Interval_Graph(group):
    length = group.iloc[0]['Contig_Length']
    if len(group) == 1:
        return pd.Series({'Max_Clique':1, 'Start':group['Start'].tolist()[0], 
                          'End': group['End'].tolist()[0], 'Num_Assignments': 1, 
                          'Length':length})
    
    group['Diff'] = group['Start'].shift(-1) - group['End']
    num_assign = len(group)
    max_clique = -1
    max_start, max_end = 0, 0
    clique = -1
    start, end = 0, 0
    
    difference = group['Diff'].tolist()
    starts = group['Start'].tolist()
    ends = group['End'].tolist()
    
    flag = False
    for i in range(0,len(difference)):
        if difference[i] <= 0 and flag == False:
            clique = 2
            start = starts[i]
            end = ends[i]
            flag = True
        elif difference[i] <= 0 and flag == True:
            clique += 1
        elif difference[i] > 0:
            if max_clique < clique:
                max_clique = clique
                max_start = start
                max_end = end
            flag = False
            clique = -1
    
    if max_clique < clique:
        max_clique = clique
        max_start = start
        max_end = end
            
    if max_clique == -1:
        max_clique = 1
        i = np.argmin(difference)
        max_start = starts[i]
        max_end = ends[i]
    
    return pd.Series({'Max_Clique':max_clique, 'Start':max_start, 
                      'End': max_end, 'Num_Assignments':num_assign,
                      'Length':length})         

  import pandas.util.testing as tm


In [2]:
novel_contigs = {}
novel_contig_path = '/Users/harihara/Research-Activities/Data/Hot-Spring/Missing_Contig_Coords_2/'
samples = listdir(novel_contig_path+'OSA/')
for s in samples:
    df_osa = pd.read_csv(novel_contig_path+'OSA/'+s, sep = "\t")
    df_osa.loc[(df_osa['Start'] < 0), 'Start'] += osa_len
    df_osa.loc[(df_osa['End'] < 0), 'End'] += osa_len
    df_osa_grp = df_osa.sort_values(by = ['Contig','Start']).groupby(['Contig']).apply(Max_Clique_Interval_Graph)
    
    df_osb = pd.read_csv(novel_contig_path+'OSB/'+s, sep = "\t")
    df_osb.loc[(df_osb['Start'] < 0), 'Start'] += osb_len
    df_osb.loc[(df_osb['End'] < 0), 'End'] += osb_len
    df_osb_grp = df_osb.sort_values(by = ['Contig','Start']).groupby(['Contig']).apply(Max_Clique_Interval_Graph)
    
    novel_contigs[s.replace(".txt","")] = {'OSA':df_osa_grp,'OSB':df_osb_grp}
    

In [3]:
grp_path = '/Users/harihara/Downloads/contig_containment_groups_subset.txt'
df_novel_filtered = pd.read_csv(grp_path, sep = "\t")
d = df_novel_filtered.groupby('GroupID')['Contig'].apply(list).to_dict()

out_dir = '/Users/harihara/Research-Activities/Plots/Hot_Spring_Plots/Synechococcus-Paper/Novel_Groups_Circos_June_2022/'
if not isdir(out_dir):
    mkdir(out_dir)


In [4]:
i = 0
for g in list(d.keys()):
    contigs = d[g]
    osa_contig_count, osb_contig_count = 0, 0
    osa_contigs, osb_contigs = [], []
    df_osa, df_osb = pd.DataFrame(), pd.DataFrame()
    
    for c in contigs:
        splits = c.split('_')
        
        if len(splits) == 5:
            sample = splits[0]+'_'+splits[1]
            genome = splits[2].upper()
            contig = splits[3]+'_'+splits[4]
        if len(splits) == 4:
            sample = splits[0]
            genome = splits[1].upper()
            contig = splits[2]+'_'+splits[3]
        if genome == "OSA":
            osa_contig_count += 1
        if genome == "OSB":
            osb_contig_count += 1
        
        try:
            row = novel_contigs[sample][genome].loc[contig]
            row['Group'] = g
            row['Sample'] = sample
            if genome == 'OSA':
                df_osa = df_osa.append(row)
            elif genome == 'OSB':
                df_osb = df_osb.append(row)
        except:
            print("Missing...",sample,genome,contig)
            pass
    
    if len(df_osa) > 0:
        df_osa = df_osa.reset_index()
    if len(df_osb) > 0:
        df_osb = df_osb.reset_index()
    
    osa_counts, osb_counts = Make_Counts(df_osa, df_osb, list(novel_contigs.keys()))
    
    garc_osa = Garc(arc_id="OSA", interspace=0, linewidth=0, facecolor="#FFFFFF00", raxis_range=(0,10),
                    label=("Synechococcus Sub.Sp A\n"+"#Contigs (in Group):"+str(osa_contig_count)+
                           "\n#Contigs (in Graph):"+str(len(df_osa))), labelsize = 24,label_visible=True)
    garc_osb = Garc(arc_id="OSB", interspace=0, linewidth=0, facecolor="#FFFFFF00", raxis_range=(0,10),
                    label=("Synechococcus Sub.Sp B\n"+"#Contigs (in Group):"+str(osb_contig_count)+
                           "\n#Contigs (in Graph):"+str(len(df_osb))), labelsize = 24,label_visible=True)
    
    gcircle_osa = Gcircle(fig=pw.Brick._figure)
    gcircle_osa.add_garc(garc_osa)
    gcircle_osa.set_garcs()
    ylim = int(max(np.max(osa_counts), np.max(osb_counts)))+0.000001
    print(ylim)
    
    gcircle_osa.lineplot('OSA',osa_counts+0.000001,  raxis_range=(800,1000),
                         rlim = (0, ylim), linewidth = 4, linecolor = 'green')
    
    gcircle_osb = Gcircle(fig=pw.Brick._figure)
    gcircle_osb.add_garc(garc_osb)
    gcircle_osb.set_garcs()
    gcircle_osb.lineplot('OSB',osb_counts+0.000001,  raxis_range=(800,1000),
                         rlim = (0, ylim), linewidth = 4, linecolor = 'orange')
    
    pw.param["margin"] = 0.0001
    
    circos12 = pw.cBrick(ax=gcircle_osa.ax, figsize = (16,10)) | pw.cBrick(ax=gcircle_osb.ax, figsize = (16,10))
    circos12.set_suptitle(g, size = 60)
    circos12.savefig(out_dir+g+'.pdf')
    
    print(i, g)
    print(osa_contig_count, len(df_osa), np.max(osa_counts))
    print(osb_contig_count, len(df_osb), np.max(osb_counts))
    
    i+=1
    plt.close("all")

11.000001
0 Group_1
20 20 11.0
13 13 10.0
10.000001
1 Group_10
23 23 8.0
17 17 10.0
3.000001
2 Group_100
3 3 2.0
4 4 3.0
14.000001
3 Group_11
0 0 0.0
27 27 14.0
3.000001
4 Group_12
9 9 3.0
0 0 0.0
12.000001
5 Group_13
0 0 0.0
19 19 12.0
5.000001
6 Group_14
0 0 0.0
10 10 5.0
8.000001
7 Group_15
29 29 8.0
4 4 2.0
7.000001
8 Group_16
15 15 2.0
12 12 7.0
4.000001
9 Group_17
2 2 1.0
4 4 4.0
5.000001
10 Group_18
13 13 5.0
7 7 5.0
4.000001
11 Group_19
11 11 4.0
16 16 4.0
9.000001
12 Group_2
15 15 5.0
12 12 9.0
3.000001
13 Group_20
6 6 3.0
4 4 3.0
1.000001
14 Group_21
9 9 1.0
1 1 1.0
6.000001
15 Group_22
0 0 0.0
13 13 6.0
15.000001
16 Group_23
2 2 1.0
24 24 15.0
3.000001
17 Group_24
6 6 3.0
4 4 2.0
2.000001
18 Group_25
0 0 0.0
7 7 2.0
4.000001
19 Group_26
12 12 4.0
1 1 1.0
5.000001
20 Group_27
0 0 0.0
18 18 5.0
6.000001
21 Group_28
7 7 4.0
11 11 6.0
13.000001
22 Group_29
16 16 8.0
23 23 13.0
5.000001
23 Group_3
11 11 4.0
13 13 5.0
2.000001
24 Group_30
9 9 2.0
2 2 2.0
11.000001
25 Group_31
20 2

In [5]:
pdfs = listdir(out_dir)
pdfs.sort()
merger = PdfMerger()
for pdf in pdfs:
    if pdf.startswith('Group'):
        merger.append(out_dir+'/'+pdf)
        print(pdf)

Group_1.pdf
Group_10.pdf
Group_100.pdf
Group_11.pdf
Group_12.pdf
Group_13.pdf
Group_14.pdf
Group_15.pdf
Group_16.pdf
Group_17.pdf
Group_18.pdf
Group_19.pdf
Group_2.pdf
Group_20.pdf
Group_21.pdf
Group_22.pdf
Group_23.pdf
Group_24.pdf
Group_25.pdf
Group_26.pdf
Group_27.pdf
Group_28.pdf
Group_29.pdf
Group_3.pdf
Group_30.pdf
Group_31.pdf
Group_32.pdf
Group_33.pdf
Group_34.pdf
Group_35.pdf
Group_36.pdf
Group_37.pdf
Group_38.pdf
Group_39.pdf
Group_4.pdf
Group_40.pdf
Group_41.pdf
Group_42.pdf
Group_43.pdf
Group_44.pdf
Group_45.pdf
Group_46.pdf
Group_47.pdf
Group_48.pdf
Group_49.pdf
Group_5.pdf
Group_50.pdf
Group_51.pdf
Group_52.pdf
Group_53.pdf
Group_54.pdf
Group_55.pdf
Group_56.pdf
Group_57.pdf
Group_58.pdf
Group_59.pdf
Group_6.pdf
Group_60.pdf
Group_61.pdf
Group_62.pdf
Group_63.pdf
Group_64.pdf
Group_65.pdf
Group_66.pdf
Group_67.pdf
Group_68.pdf
Group_69.pdf
Group_7.pdf
Group_70.pdf
Group_71.pdf
Group_72.pdf
Group_73.pdf
Group_74.pdf
Group_75.pdf
Group_76.pdf
Group_77.pdf
Group_78.pdf
Group

In [6]:
merger.write(out_dir+'/Circos.pdf')
merger.close()