In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
import datetime as dt

rcParams = {'font.size': 20, 
            'font.weight': 'normal', 
            'font.family': 'sans-serif',
            'axes.unicode_minus':False, 
            'axes.labelweight':'normal', 
            'xtick.labelsize' : 16,
            'ytick.labelsize' : 16}

def Load_CSV(filepath):
    df = pd.DataFrame()
    iters = pd.read_csv(filepath,sep = "\t",chunksize=100000, engine='c')
    for i in iters:
        df = df.append(i, ignore_index = True)
    return df

In [2]:
data_dir = '/Users/harihara/Mount-2/hotspring_metagenome/Synechococcus_paper_analysis/'

In [3]:
OSA_averages = pd.DataFrame()
OSB_averages = pd.DataFrame()
for f in listdir(data_dir+'Differential_Read_Counting/'):
    if f.startswith("Hot"):
        sample = f.replace("_Diff_Read_Count", "")
        df_OSA = Load_CSV(data_dir+'Differential_Read_Counting/'+f+'/Genome.OSA.Coverage.gz')
        df_OSB = Load_CSV(data_dir+'Differential_Read_Counting/'+f+'/Genome.OSB.Coverage.gz')
        
        OSA_averages[sample] = df_OSA['Fold_Cov'].tolist()
        OSB_averages[sample] = df_OSB['Fold_Cov'].tolist()
        
        del df_OSA, df_OSB
        print(dt.datetime.now(), sample)

2022-06-22 17:13:22.983274 Hotspr20Samplet1
2022-06-22 17:13:49.187026 Hotspr2Sample149
2022-06-22 17:14:21.348617 HotsprSampleR4cd
2022-06-22 17:14:50.069719 HotsprSampOS1260
2022-06-22 17:15:14.982885 HotsprSampleOS50
2022-06-22 17:15:41.318142 HotsprSampleMS50
2022-06-22 17:16:16.112668 HotsprSampleOS60
2022-06-22 17:16:47.000874 HotsprSampleMSe3
2022-06-22 17:17:19.771431 Hotspr20SampleT9
2022-06-22 17:17:49.747463 HotsprSampleOS55
2022-06-22 17:18:18.933037 HotsprSamplt10cd
2022-06-22 17:18:50.554343 HotsprSampleMSe4
2022-06-22 17:19:19.041995 HotsprSampleOSM1
2022-06-22 17:19:48.565451 HotsprSampleMS65
2022-06-22 17:20:14.675286 HotsprSampleOS65
2022-06-22 17:20:41.537893 HotsprSampOS1265
2022-06-22 17:21:14.720806 HotsprSampleOSM4
2022-06-22 17:21:41.924257 HotsprSampleMS55
2022-06-22 17:22:11.964409 HotsprSampleOSM3
2022-06-22 17:22:31.013069 HotsprottomLayer
2022-06-22 17:23:02.229554 Hotspr20SampleT8
2022-06-22 17:23:36.193310 Hotspr20SampleP4
2022-06-22 17:23:58.251985 Hotsp

In [4]:
d = {'OSA':OSA_averages, 'OSB':OSB_averages}

genome = data_dir+'/Non_Synechococcus_Alignments/'
genomes = listdir(genome)
genomes.sort()

for g in genomes:
    if g[0] == '.':
        continue
    df_averages = pd.DataFrame()
    hotsprings = listdir(genome+g+'/')
    for h in hotsprings:
        if h.startswith("Hot"):
            try:
                sample = h.replace("_FD", "")
                df = Load_CSV(genome+g+'/'+h+'/Genome.'+g+'.Coverage.gz')
                df_averages[sample] = df['Fold_Cov'].tolist()
            except FileNotFoundError:
                pass
            print(dt.datetime.now(), g, sample)
    d[g] = df_averages

2022-06-22 17:32:19.784279 Candidatus_Thermochlorobacteriaceae HotsprottomLayer
2022-06-22 17:32:27.426712 Candidatus_Thermochlorobacteriaceae HotsprSampOS1260
2022-06-22 17:32:35.507673 Candidatus_Thermochlorobacteriaceae HotsprSampleMS65
2022-06-22 17:32:43.324080 Candidatus_Thermochlorobacteriaceae Hotspr20Samplem2
2022-06-22 17:32:51.259520 Candidatus_Thermochlorobacteriaceae HotsprSampleOSM1
2022-06-22 17:32:58.778398 Candidatus_Thermochlorobacteriaceae HotsprSampleMS60
2022-06-22 17:33:06.348611 Candidatus_Thermochlorobacteriaceae HotsprSampleOSM3
2022-06-22 17:33:14.381711 Candidatus_Thermochlorobacteriaceae HotsprSampleMSe1
2022-06-22 17:33:21.732619 Candidatus_Thermochlorobacteriaceae HotsprSampleOSM2
2022-06-22 17:33:29.122638 Candidatus_Thermochlorobacteriaceae HotsprSampleMS55
2022-06-22 17:33:36.621779 Candidatus_Thermochlorobacteriaceae HotsprSampleOS65
2022-06-22 17:33:44.402998 Candidatus_Thermochlorobacteriaceae Hotspr20Samplet1
2022-06-22 17:33:52.644856 Candidatus_Th

2022-06-22 17:55:21.266314 Gloeomargarita Hotspr2Sampleee2
2022-06-22 17:55:28.378194 Gloeomargarita Hotspr2Sample149
2022-06-22 17:55:35.795023 Gloeomargarita HotsprSampleOS50
2022-06-22 17:55:42.907541 Gloeomargarita Hotspr2Sample148
2022-06-22 17:55:49.992208 Gloeomargarita HotsprOSTMatCore
2022-06-22 17:55:57.223300 Gloeomargarita HotsprottomLayer_2
2022-06-22 17:56:04.635342 Gloeomargarita HotsprSampleOSM4
2022-06-22 17:56:11.876472 Gloeomargarita HotsprSampleMS13
2022-06-22 17:56:19.027680 Gloeomargarita HotsprSampleMSe4
2022-06-22 17:56:26.144803 Gloeomargarita Hotspr20SampleP4
2022-06-22 17:56:33.255629 Gloeomargarita HotsprSampleOS60
2022-06-22 17:56:40.365667 Gloeomargarita HotsprSampleMS50
2022-06-22 17:56:47.545737 Gloeomargarita HotsprSampleMSe2
2022-06-22 17:56:54.674441 Gloeomargarita Hotspr20SampleT8
2022-06-22 17:57:02.024442 Gloeomargarita Hotspr2SamplePe2
2022-06-22 17:57:09.212010 Gloeomargarita HotsprSampleMSe3
2022-06-22 17:57:16.357428 Gloeomargarita HotsprSample

In [5]:
plt.rcParams.update(rcParams)

In [6]:
w = 10000
keys = ['OSA','OSB']+genomes
colors = ['teal','#FC7A57','purple','#B7B6C2','#3A506B','blue','#20BF55']


ctr = 0

for i in range(len(keys)):
    if keys[i][0] == ".":
        continue
    print(keys[i])
    mat = d[keys[i]].rolling(w).mean().values.T
    fig,ax = plt.subplots(1,1,figsize = (16,4))
    for j in range(0, len(mat)):
        ax.plot(mat[j], color = colors[ctr], alpha = 0.1, linewidth = 2)   
    ax.plot(0,0, color = colors[ctr], label = keys[i])
    ax.legend()  
    ax.set_xlabel('Sliding Window of 10000 Base-Pairs')
    
    ctr += 1
    
    #fig.supylabel('Average Fold Coverage')
    fig.tight_layout()
    fig.savefig('/Users/harihara/Research-Activities/Plots/Hot_Spring_Plots/Synechococcus-Paper/'+keys[i]+'_Rolling-Coverage-Mean(Non-Normalized).pdf')
    plt.close('all')
    

OSA
OSB
Candidatus_Thermochlorobacteriaceae
Chloracidobacterium_thermophilum_B
Chloroflexus
Gloeomargarita
Roseiflexus
