In [1]:
from PyPDF2 import PdfMerger
import pycircos
import warnings
import numpy as np
import pandas as pd
import patchworklib as pw 
import matplotlib.pyplot as plt

from os.path import isdir
from os import listdir, mkdir

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Matplotlib defaults applied to every figure produced by this notebook.
rcParams = {
    'font.size': 20,
    'font.weight': 'normal',
    'font.family': 'sans-serif',
    'axes.unicode_minus': False,
    'axes.labelweight': 'normal',
}
plt.rcParams.update(rcParams)

# Short aliases for the two pycircos classes used by the plotting cell.
Garc, Gcircle = pycircos.Garc, pycircos.Gcircle

# Sizes of the per-position count arrays; also added to negative coordinates
# when the coordinate tables are loaded.
# NOTE(review): presumably the OSA / OSB reference genome lengths in bp - confirm.
osa_len = 2932766
osb_len = 3046682
def _accumulate_coverage(sample_rows, genome_len, gbl_counts):
    """Add the intervals of one sample to ``gbl_counts`` in place.

    Each row contributes +1 to every position it covers. A row whose
    ``max(Start, End) - min(Start, End)`` differs from its ``Length`` is
    treated as wrapping around the end of the array: it covers
    ``[max:genome_len)`` plus ``[0:min)`` instead of ``[min:max)``.

    Raises:
        KeyError: if ``Start``, ``End`` or ``Length`` is missing. The columns
            are read before any counting, so ``gbl_counts`` is untouched then.
    """
    starts = sample_rows['Start'].tolist()
    ends = sample_rows['End'].tolist()
    lengths = sample_rows['Length'].tolist()

    for first, second, length in zip(starts, ends, lengths):
        start, end = int(min(first, second)), int(max(first, second))
        if end - start == length:
            gbl_counts[start:end] += 1
        else:
            # Span and reported length disagree: count the complementary
            # (wrap-around) region and log the mismatch.
            print("Here", end - start, length)
            gbl_counts[end:genome_len] += 1
            gbl_counts[0:start] += 1


def Make_Counts(df_osa, df_osb, samples, osa_length=None, osb_length=None):
    """Build per-position interval counts for the OSA and OSB arrays.

    Args:
        df_osa, df_osb: DataFrames with ``Sample``, ``Start``, ``End`` and
            ``Length`` columns. A frame without a ``Sample`` column (e.g. an
            empty ``pd.DataFrame()``) contributes nothing.
        samples: unused; kept so existing callers keep working. The samples
            processed are the ones present in each frame's ``Sample`` column.
        osa_length, osb_length: sizes of the returned arrays. ``None`` (the
            default) falls back to the module-level ``osa_len`` / ``osb_len``.

    Returns:
        ``(osa_gbl_counts, osb_gbl_counts)``: two float arrays holding, for
        each position, the number of intervals over all samples that cover it.

    No per-sample arrays are built: nothing consumed them, and allocating one
    full-length array per sample was pure overhead.
    """
    if osa_length is None:
        osa_length = osa_len
    if osb_length is None:
        osb_length = osb_len

    osa_gbl_counts = np.zeros(osa_length)
    osb_gbl_counts = np.zeros(osb_length)

    # One shared code path per genome, so each array is always sliced with
    # its own length.
    per_genome = (
        ('OSA', df_osa, osa_length, osa_gbl_counts),
        ('OSB', df_osb, osb_length, osb_gbl_counts),
    )
    for label, frame, genome_len, gbl_counts in per_genome:
        try:
            sample_names = set(frame['Sample'].tolist())
        except KeyError:
            sample_names = set()

        for g in sample_names:
            try:
                _accumulate_coverage(frame[frame['Sample'] == g], genome_len, gbl_counts)
            except KeyError:
                # A coordinate column is missing: report the sample and move on.
                print(label, g)

    return osa_gbl_counts, osb_gbl_counts

def Max_Clique_Interval_Graph(group):
    """Summarise the alignment rows of one contig.

    Walks the rows in order and finds the longest run of consecutive rows in
    which each next ``Start`` is <= the current ``End``. The reported
    ``Max_Clique`` is the number of rows in that run, and ``Start``/``End``
    are the coordinates of the run's first row. Rows are expected to be
    sorted by ``Start`` (the loading cell sorts before grouping).

    Args:
        group: DataFrame with ``Start``, ``End`` and ``Contig_Length``
            columns and at least one row. It is not modified.

    Returns:
        pd.Series with keys ``Max_Clique``, ``Start``, ``End``,
        ``Num_Assignments`` (row count) and ``Length`` (``Contig_Length`` of
        the first row).
    """
    length = group.iloc[0]['Contig_Length']
    starts = group['Start'].tolist()
    ends = group['End'].tolist()
    num_assign = len(group)

    if num_assign == 1:
        return pd.Series({'Max_Clique':1, 'Start':starts[0],
                          'End': ends[0], 'Num_Assignments': 1,
                          'Length':length})

    # Gap between each row's End and the next row's Start; the last entry is
    # NaN. Computed on the side so the caller's frame gets no 'Diff' column.
    difference = (group['Start'].shift(-1) - group['End']).tolist()

    max_clique, max_start, max_end = -1, 0, 0
    clique, start, end = -1, 0, 0
    in_run = False

    for i, gap in enumerate(difference):
        if gap <= 0:
            if in_run:
                clique += 1
            else:
                # A new overlapping run starts; remember its first row.
                clique, start, end = 2, starts[i], ends[i]
                in_run = True
        elif gap > 0:
            # The run (if any) ends here; keep it when it is the longest so far.
            if max_clique < clique:
                max_clique, max_start, max_end = clique, start, end
            in_run = False
            clique = -1
        # A NaN gap (last row) matches neither branch and leaves the state as is.

    # A run that reaches the last row is only closed here.
    if max_clique < clique:
        max_clique, max_start, max_end = clique, start, end

    if max_clique == -1:
        # No overlapping pair anywhere: report a single row.
        # NOTE(review): `difference` ends with NaN, which np.argmin appears to
        # select, so this picks the last row - confirm that is intended.
        max_clique = 1
        i = np.argmin(difference)
        max_start = starts[i]
        max_end = ends[i]

    return pd.Series({'Max_Clique':max_clique, 'Start':max_start,
                      'End': max_end, 'Num_Assignments':num_assign,
                      'Length':length})

  import pandas.util.testing as tm


In [2]:
def _load_contig_cliques(path, genome_len):
    """Read one tab-separated coordinate table and summarise it per contig.

    Negative ``Start``/``End`` values are shifted by ``genome_len`` so all
    coordinates are non-negative. Rows are then sorted by contig and start,
    and reduced to one row per contig by ``Max_Clique_Interval_Graph``.
    """
    coords = pd.read_csv(path, sep = "\t")
    coords.loc[(coords['Start'] < 0), 'Start'] += genome_len
    coords.loc[(coords['End'] < 0), 'End'] += genome_len
    return coords.sort_values(by = ['Contig','Start']).groupby(['Contig']).apply(Max_Clique_Interval_Graph)


# sample name -> {'OSA': per-contig summary, 'OSB': per-contig summary}
novel_contigs = {}
novel_contig_path = '/Users/harihara/Research-Activities/Data/Hot-Spring/Missing_Contig_Coords_2/'

# listdir() also returns hidden entries (e.g. '.DS_Store'); those are not
# coordinate tables and would break read_csv, so they are skipped.
samples = [s for s in listdir(novel_contig_path+'OSA/') if not s.startswith('.')]

for s in samples:
    # The OSB directory is expected to hold a file with the same name.
    df_osa_grp = _load_contig_cliques(novel_contig_path+'OSA/'+s, osa_len)
    df_osb_grp = _load_contig_cliques(novel_contig_path+'OSB/'+s, osb_len)

    novel_contigs[s.replace(".txt","")] = {'OSA':df_osa_grp,'OSB':df_osb_grp}
    

In [3]:
# Table of contig containment groups (one row per contig, tab-separated).
grp_path = '/Users/harihara/Research-Activities/Data/Hot-Spring/contig_containment_groups_subset_filtered.txt'
df_novel_filtered = pd.read_csv(grp_path, sep = "\t")

# GroupID -> list of the contig names belonging to that group.
d = {
    group_id: list(members)
    for group_id, members in df_novel_filtered.groupby('GroupID')['Contig']
}

# Directory that receives one circos PDF per group; created on first run.
out_dir = '/Users/harihara/Research-Activities/Plots/Hot_Spring_Plots/Synechococcus-Paper/Novel_Groups_Circos_2/'
if not isdir(out_dir):
    mkdir(out_dir)


In [4]:
def _parse_contig_name(name):
    """Split '<sample>_<genome>_<prefix>_<number>' into (sample, GENOME, contig).

    The sample part may itself contain underscores (e.g. 'HotsprottomLayer_2'),
    so the name is split from the right. Returns None when the name has fewer
    than four underscore-separated fields.
    """
    parts = name.rsplit('_', 3)
    if len(parts) != 4:
        return None
    sample, genome, prefix, number = parts
    return sample, genome.upper(), prefix+'_'+number


def _genome_arc(arc_id, subspecies, n_in_group, n_in_graph):
    """Build the labelled, invisible-faced arc for one genome."""
    label = ("Synechococcus Sub.Sp "+subspecies+"\n"+"#Contigs (in Group):"+str(n_in_group)+
             "\n#Contigs (in Graph):"+str(n_in_graph))
    return Garc(arc_id=arc_id, interspace=0, linewidth=0, facecolor="#FFFFFF00", raxis_range=(0,10),
                label=label, labelsize = 24,label_visible=True)


def _count_circle(garc, arc_id, counts, ylim, linecolor):
    """Create a circle holding `garc` and draw `counts` on it as a line plot."""
    gcircle = Gcircle(fig=pw.Brick._figure)
    gcircle.add_garc(garc)
    gcircle.set_garcs()
    # The small offset is added to every count before plotting, as is done for ylim.
    gcircle.lineplot(arc_id, counts+0.000001, raxis_range=(800,1000),
                     rlim = (0, ylim), linewidth = 4, linecolor = linecolor)
    return gcircle


i = 0
for g in list(d.keys()):
    contigs = d[g]
    osa_contig_count, osb_contig_count = 0, 0
    # Rows are collected in lists and turned into frames once per group;
    # growing a DataFrame row by row is quadratic and DataFrame.append no
    # longer exists in pandas 2.x.
    osa_rows, osb_rows = [], []

    for c in contigs:
        parsed = _parse_contig_name(c)
        if parsed is None:
            # Never fall through with the previous contig's sample/genome.
            print("Malformed contig name, skipping...", c)
            continue
        sample, genome, contig = parsed

        if genome == "OSA":
            osa_contig_count += 1
        if genome == "OSB":
            osb_contig_count += 1

        try:
            # KeyError covers an unknown sample, an unknown genome and a
            # contig without an entry in the per-contig summary.
            row = novel_contigs[sample][genome].loc[contig].copy()
        except KeyError:
            print("Missing...",sample,genome,contig)
            continue

        row['Group'] = g
        row['Sample'] = sample
        if genome == 'OSA':
            osa_rows.append(row)
        elif genome == 'OSB':
            osb_rows.append(row)

    df_osa, df_osb = pd.DataFrame(osa_rows), pd.DataFrame(osb_rows)
    if len(df_osa) > 0:
        df_osa = df_osa.reset_index()
    if len(df_osb) > 0:
        df_osb = df_osb.reset_index()

    osa_counts, osb_counts = Make_Counts(df_osa, df_osb, list(novel_contigs.keys()))

    garc_osa = _genome_arc("OSA", "A", osa_contig_count, len(df_osa))
    garc_osb = _genome_arc("OSB", "B", osb_contig_count, len(df_osb))

    # Both circles share one radial scale so the two genomes are comparable.
    ylim = int(max(np.max(osa_counts), np.max(osb_counts)))+0.000001
    print(ylim)

    gcircle_osa = _count_circle(garc_osa, 'OSA', osa_counts, ylim, 'green')
    gcircle_osb = _count_circle(garc_osb, 'OSB', osb_counts, ylim, 'orange')

    pw.param["margin"] = 0.0001

    # Place the two circles side by side and save one PDF per group.
    circos12 = pw.cBrick(ax=gcircle_osa.ax, figsize = (16,10)) | pw.cBrick(ax=gcircle_osb.ax, figsize = (16,10))
    circos12.set_suptitle(g, size = 60)
    circos12.savefig(out_dir+g+'.pdf')

    print(i, g)
    print(osa_contig_count, len(df_osa), np.max(osa_counts))
    print(osb_contig_count, len(df_osb), np.max(osb_counts))

    i+=1
    # Release the figures created in this iteration.
    plt.close("all")

Missing... HotsprottomLayer OSA k141_1518
5.000001
0 Group_1003
9 8 5.0
0 0 0.0
Missing... Hotspr2Sample148 OSB k141_11354
5.000001
1 Group_1008
6 6 5.0
9 8 4.0
6.000001
2 Group_1012
13 13 2.0
14 14 6.0
Missing... HotsprottomLayer OSA k141_4998
7.000001
3 Group_1013
24 23 2.0
27 27 7.0
2.000001
4 Group_1015
12 12 2.0
6 6 2.0
3.000001
5 Group_1016
0 0 0.0
8 8 3.0
15.000001
6 Group_1021
0 0 0.0
25 25 15.0
Missing... Hotspr2Sample148 OSA k141_14015
Missing... HotsprottomLayer_2 OSA k141_9551
Missing... HotsprSampleMS55 OSA k141_5140
7.000001
7 Group_1025
21 18 7.0
1 1 1.0
Missing... Hotspr2Sample148 OSB k141_32465
Missing... Hotspr2Sampleee2 OSB k141_1426
Missing... Hotspr2Sampleme2 OSB k141_74679
Missing... Hotspr2SamplePe2 OSB k141_544
Missing... HotsprSampleMS60 OSB k141_23227
1.000001
8 Group_1030
1 1 1.0
5 0 0.0
Missing... Hotspr20SampleT8 OSB k141_28821
Missing... Hotspr2Sample148 OSA k141_13686
Missing... Hotspr2Sample148 OSB k141_27481
Missing... Hotspr2Sample149 OSA k141_13694
Mi

34 Group_187
0 0 0.0
19 1 1.0
Missing... Hotspr20Samplem2 OSA k141_766
Missing... HotsprottomLayer_2 OSA k141_1936
Missing... HotsprottomLayer_2 OSA k141_2626
Missing... HotsprSampOS1260 OSA k141_11097
4.000001
35 Group_211
25 21 4.0
6 6 2.0
Missing... Hotspr2SamplePe2 OSB k141_215
Missing... HotsprSampleR4cd OSB k141_31309
10.000001
36 Group_218
23 23 8.0
20 18 10.0
Missing... Hotspr20Samplem2 OSB k141_17753
Missing... HotsprSampleMS13 OSB k141_26733
Missing... HotsprSampleMS55 OSB k141_25740
15.000001
37 Group_223
0 0 0.0
31 28 15.0
Missing... Hotspr2Sample148 OSB k141_11393
Missing... Hotspr2SamplePe2 OSB k141_25784
Missing... HotsprSampleMS55 OSB k141_25439
Missing... HotsprSampleMSe2 OSB k141_53556
Missing... HotsprSampleOSM2 OSB k141_8528
Missing... HotsprSampleR4cd OSA k141_39201
13.000001
38 Group_23
22 21 11.0
22 17 13.0
Missing... Hotspr20Samplem2 OSB k141_13432
Missing... Hotspr20Samplet1 OSB k141_6315
Missing... Hotspr2Sample149 OSB k141_8857
Missing... Hotspr2Sampleee2 OSA

3.000001
53 Group_365
47 3 2.0
44 5 3.0
Missing... Hotspr20SampleT8 OSA k141_1164
Missing... Hotspr20SampleT8 OSA k141_27387
Missing... Hotspr20SampleT9 OSA k141_15092
Missing... Hotspr2Sample148 OSA k141_4703
Missing... Hotspr2Sampleee2 OSA k141_23429
Missing... Hotspr2Sampleme2 OSA k141_7998
Missing... Hotspr2SamplePe2 OSA k141_6679
Missing... HotsprSampleMS13 OSA k141_1960
Missing... HotsprSampleMS50 OSA k141_783
Missing... HotsprSampleMS55 OSA k141_11405
Missing... HotsprSampleMS60 OSA k141_16190
Missing... HotsprSampleMSe1 OSA k141_17111
Missing... HotsprSampleMSe2 OSA k141_13662
Missing... HotsprSampleMSe2 OSA k141_15508
Missing... HotsprSampleMSe3 OSA k141_17073
Missing... HotsprSampleMSe3 OSA k141_17533
Missing... HotsprSampleMSe4 OSA k141_14925
Missing... HotsprSampleOS65 OSA k141_1215
Missing... HotsprSampleOSM3 OSA k141_25617
Missing... HotsprSampleOSM4 OSA k141_22485
Missing... HotsprSampleR4cd OSA k141_14412
Missing... HotsprSampleR4cd OSA k141_22624
Missing... HotsprSampl

81 Group_627
10 8 4.0
14 6 2.0
13.000001
82 Group_629
0 0 0.0
18 18 13.0
Missing... Hotspr20Samplet1 OSA k141_20341
Missing... Hotspr20Samplet1 OSB k141_8248
Missing... Hotspr20SampleT9 OSA k141_7937
Missing... Hotspr2Sample148 OSA k141_10140
Missing... Hotspr2Sample149 OSB k141_30404
Missing... Hotspr2SamplePe2 OSA k141_6260
Missing... Hotspr2SamplePe2 OSB k141_35266
Missing... HotsprOSTMatCore OSA k141_390
Missing... HotsprSampleMS50 OSA k141_10778
Missing... HotsprSampleMS65 OSA k141_1741
Missing... HotsprSampleOS60 OSB k141_12914
Missing... HotsprSampleOS65 OSA k141_2432
Missing... HotsprSampleOSM3 OSA k141_7355
Missing... HotsprSampleOSM3 OSB k141_44387
11.000001
83 Group_634
23 14 6.0
23 18 11.0
6.000001
84 Group_636
11 11 6.0
0 0 0.0
Missing... Hotspr2Sampleee2 OSB k141_10994
Missing... HotsprSampleMS50 OSB k141_3044
Missing... HotsprSampleMSe1 OSB k141_57471
Missing... HotsprSampleOS65 OSB k141_40811
16.000001
85 Group_644
24 24 16.0
23 19 8.0
Missing... Hotspr20Samplem2 OSB k1

112 Group_835
22 0 0.0
25 1 1.0
2.000001
113 Group_844
12 12 2.0
4 4 2.0
Missing... Hotspr2Sample149 OSB k141_10427
Missing... HotsprSampleMS50 OSB k141_1386
Missing... HotsprSampleMS55 OSA k141_8595
13.000001
114 Group_851
22 21 13.0
21 19 9.0
Missing... Hotspr20Samplem2 OSA k141_19248
Missing... HotsprottomLayer OSA k141_1937
Missing... HotsprottomLayer OSA k141_5496
Missing... HotsprSampleOSM1 OSB k141_13587
Missing... HotsprSampleOSM2 OSB k141_954
Missing... HotsprSampleOSM4 OSA k141_10938
Missing... HotsprSampOS1260 OSB k141_2244
3.000001
115 Group_854
19 15 3.0
15 12 3.0
Missing... HotsprOSTMatCore OSA k141_3782
Missing... HotsprSampleOSM2 OSB k141_13212
1.000001
116 Group_856
6 5 1.0
2 1 1.0
Missing... HotsprSampleOS65 OSA k141_1846
5.000001
117 Group_857
23 22 5.0
9 9 3.0
Missing... Hotspr20SampleP4 OSA k141_28254
Missing... Hotspr20SampleT8 OSA k141_13398
Missing... Hotspr20SampleT9 OSA k141_8296
Missing... Hotspr2Sample148 OSA k141_13923
Missing... Hotspr2Sample149 OSA k141_1

In [5]:
# Queue every per-group PDF in the output directory, in lexicographic
# filename order, and echo each name as it is added.
pdfs = sorted(listdir(out_dir))
merger = PdfMerger()
for pdf in pdfs:
    if not pdf.startswith('Group'):
        continue
    merger.append(out_dir+'/'+pdf)
    print(pdf)

Group_1003.pdf
Group_1008.pdf
Group_1012.pdf
Group_1013.pdf
Group_1015.pdf
Group_1016.pdf
Group_1021.pdf
Group_1025.pdf
Group_1030.pdf
Group_1034.pdf
Group_1040.pdf
Group_1042.pdf
Group_1044.pdf
Group_105.pdf
Group_1051.pdf
Group_1058.pdf
Group_1066.pdf
Group_1075.pdf
Group_1079.pdf
Group_1098.pdf
Group_1106.pdf
Group_1153.pdf
Group_1156.pdf
Group_1174.pdf
Group_1178.pdf
Group_1184.pdf
Group_1189.pdf
Group_125.pdf
Group_128.pdf
Group_130.pdf
Group_175.pdf
Group_179.pdf
Group_180.pdf
Group_182.pdf
Group_187.pdf
Group_211.pdf
Group_218.pdf
Group_223.pdf
Group_23.pdf
Group_249.pdf
Group_258.pdf
Group_284.pdf
Group_286.pdf
Group_290.pdf
Group_306.pdf
Group_312.pdf
Group_321.pdf
Group_322.pdf
Group_323.pdf
Group_333.pdf
Group_335.pdf
Group_353.pdf
Group_364.pdf
Group_365.pdf
Group_382.pdf
Group_40.pdf
Group_406.pdf
Group_416.pdf
Group_419.pdf
Group_423.pdf
Group_437.pdf
Group_464.pdf
Group_476.pdf
Group_482.pdf
Group_485.pdf
Group_495.pdf
Group_498.pdf
Group_501.pdf
Group_508.pdf
Group_519.

In [6]:
# Write the queued pages to a single document inside the output directory,
# then release the merger.
merged_pdf_path = out_dir+'/Circos.pdf'
merger.write(merged_pdf_path)
merger.close()

In [7]:
# Display the list of contig names recorded for Group_1021.
d['Group_1021']

['Hotspr20Samplem2_osb_k141_19542',
 'Hotspr20SampleP4_osb_k141_16222',
 'Hotspr20Samplet1_osb_k141_10031',
 'Hotspr20SampleT8_osb_k141_63099',
 'Hotspr20SampleT9_osb_k141_40430',
 'Hotspr2Sample148_osb_k141_16155',
 'Hotspr2Sample149_osb_k141_5083',
 'Hotspr2Sampleee2_osb_k141_63386',
 'Hotspr2Sampleme2_osb_k141_47137',
 'Hotspr2SamplePe2_osb_k141_21553',
 'HotsprottomLayer_2_osb_k141_7141',
 'HotsprSampleMS13_osb_k141_67751',
 'HotsprSampleMS50_osb_k141_19140',
 'HotsprSampleMS55_osb_k141_25062',
 'HotsprSampleMS60_osb_k141_37150',
 'HotsprSampleMSe1_osb_k141_42547',
 'HotsprSampleMSe3_osb_k141_34467',
 'HotsprSampleMSe4_osb_k141_39229',
 'HotsprSampleOS50_osb_k141_25763',
 'HotsprSampleOS65_osb_k141_21149',
 'HotsprSampleOSM2_osb_k141_7141',
 'HotsprSampleOSM3_osb_k141_33276',
 'HotsprSampleOSM4_osb_k141_64529',
 'HotsprSampleR4cd_osb_k141_13282',
 'HotsprSampOS1260_osb_k141_14707']

In [11]:
    # Show every row of the group table that belongs to Group_1042.
    df_novel_filtered.loc[df_novel_filtered['GroupID'] == 'Group_1042']

Unnamed: 0,Contig,RepresentativeContig,GroupID,RepresentativeContigLength,ContigLength,RepresentativeContigSynHits,ContigSynHits,MatchesToRef
77,Hotspr20Samplem2_osb_k141_20839,Hotspr2Sampleee2_osb_k141_26365,Group_1042,1067,512,yes,yes,yes
231,Hotspr20SampleP4_osb_k141_41900,Hotspr2Sampleee2_osb_k141_26365,Group_1042,1067,984,yes,yes,yes
520,Hotspr20SampleT8_osb_k141_65461,Hotspr2Sampleee2_osb_k141_26365,Group_1042,1067,1061,yes,yes,yes
625,Hotspr20SampleT9_osb_k141_26849,Hotspr2Sampleee2_osb_k141_26365,Group_1042,1067,855,yes,yes,yes
728,Hotspr2Sample148_osb_k141_13890,Hotspr2Sampleee2_osb_k141_26365,Group_1042,1067,821,yes,yes,yes
965,Hotspr2Sampleee2_osb_k141_26365,Hotspr2Sampleee2_osb_k141_26365,Group_1042,1067,1067,yes,yes,yes
1127,Hotspr2Sampleme2_osb_k141_58968,Hotspr2Sampleee2_osb_k141_26365,Group_1042,1067,893,yes,yes,yes
1564,HotsprSampleMS13_osb_k141_67992,Hotspr2Sampleee2_osb_k141_26365,Group_1042,1067,983,yes,yes,yes
1631,HotsprSampleMS50_osb_k141_16957,Hotspr2Sampleee2_osb_k141_26365,Group_1042,1067,932,yes,yes,yes
1866,HotsprSampleMS60_osb_k141_1596,Hotspr2Sampleee2_osb_k141_26365,Group_1042,1067,817,yes,yes,yes
