In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib as mpl
import gffpandas.gffpandas as gffpd

In [2]:

# Locations of the pipeline outputs read by this notebook and of the figures it writes.
results_dir = Path("/nfs/chisholmlab001/kve/2021_dark_adapted_transcriptome/results/")

experiment_dir = results_dir / "experiments"
plotting_dir = results_dir / 'gene_expression_plots'

# rlog-transformed count table, one row per `long_ID`.
log_count_table = experiment_dir / "experiment_all" / "DEseq2out" / "experiment_all_NATL2A_rlog.tsv"
rlog_df = pd.read_csv(log_count_table, sep='\t', index_col="long_ID")

# Relabel the sample columns as (treatment, time, replicate).
# NOTE(review): this assumes the file's columns are ordered treatment -> time -> replicate
# (Control first, replicate varying fastest) -- confirm against the TSV header.
rlog_df.columns = pd.MultiIndex.from_product([['Control', 'Pheno'], [0, 4, 8, 13, 16, 20, 24], [1, 2, 3]], names=['treatment', 'time', 'replicate'])

# Keep only the first row for each long_ID.
rlog_df = rlog_df[~rlog_df.index.duplicated()]

# Mean / standard deviation across replicates for each (treatment, time).
# `groupby(..., axis='columns')` is deprecated in recent pandas, so group the
# transposed frame by its row levels and transpose the result back.
rlog_by_condition = rlog_df.T.groupby(level=['treatment', 'time'])

rlog_mean_df = rlog_by_condition.mean().T

rlog_std_df = rlog_by_condition.std().T


In [3]:
# Collect log2 fold change and adjusted p-value for every time point into one
# wide table whose columns are (significance, time).
dfs = []
for time in [0, 4, 8, 13, 16, 20, 24]:
    result_path = experiment_dir / f"experiment_{time}" / "DGE_tables" / f"experiment_{time}_NATL2A_DGE_all.tsv"
    dge_df = pd.read_csv(result_path, sep="\t", index_col="long_ID")

    # Copy so that relabelling the columns never touches a view of `dge_df`.
    significance_data = dge_df[["log2FoldChange", "padj"]].copy()
    significance_data.columns = pd.MultiIndex.from_product([[time], significance_data.columns], names=['time', 'significance'])

    # Keep the first row per long_ID. De-duplicating on the index (not on the
    # values, as `drop_duplicates()` would) keeps distinct genes that happen to
    # share the same (log2FoldChange, padj) pair, e.g. rows where both are NaN.
    significance_data = significance_data[~significance_data.index.duplicated()]

    dfs.append(significance_data)

result_df = pd.concat(dfs, axis=1)

# Put 'significance' on the outer column level so result_df.loc[gene]["padj"]
# yields a Series indexed by time.
result_df = result_df.swaplevel("time", "significance", axis=1)

# conversion_df = pd.read_csv("gene_label_conversion_table.tsv", sep='\t', index_col='gene')

# annotation_data = annotation_data.join(conversion_df, how='left')


In [24]:
# Genome annotation (GFF3) for the culture genome.
gff_path = Path("/nfs/chisholmlab001/kve/2021_dark_adapted_transcriptome/input_data/culture_genome_annotations/NATL2A.gff")
annotation = gffpd.read_gff3(gff_path)

# ID conversion table; its 'NCBI ID_2' column is used as the join key
# ("locus_tag"), and rows without a key are discarded before joining.
conversion_df = pd.read_csv("natl2a_convertion_table2.tsv", sep='\t')
conversion_df = conversion_df.assign(locus_tag=conversion_df['NCBI ID_2'])
conversion_df = conversion_df.dropna(subset=["locus_tag"])

# Expand the GFF attributes into columns, attach the conversion columns,
# index by feature ID and keep only sRNA / CDS features.
annotation_data = (
    annotation.attributes_to_columns()
    .merge(conversion_df, how='left', on="locus_tag")
    .set_index("ID")
    .loc[lambda features: features['type'].isin(['sRNA', 'CDS'])]
    .drop_duplicates()
)

annotation_data

Unnamed: 0_level_0,seq_id,source,type,start,end,score,strand,phase,attributes,Dbxref,...,start_range,strain,transl_table,NCBI ID,NCBI ID_2,NCBI ID_3,Gene Name,Genbank Annotation,RAST annotation,Response of Synechococcus elongatus PCC7942 homolog to darkness
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cds-WP_011293961.1,NC_007335.2,Protein Homology,CDS,189,1349,.,+,0,ID=cds-WP_011293961.1;Parent=gene-PMN2A_RS0001...,Genbank:WP_011293961.1,...,,,11,NATL2_00001,PMN2A_RS00015,PMN2A_1328,dnaN,DNA polymerase III subunit beta,DNA polymerase III beta subunit (EC 2.7.7.7),
cds-WP_011293962.1,NC_007335.2,Protein Homology,CDS,1352,2122,.,+,0,ID=cds-WP_011293962.1;Parent=gene-PMN2A_RS0002...,Genbank:WP_011293962.1,...,,,11,NATL2_00011,PMN2A_RS00020,PMN2A_1329,,hypothetical protein,RNA metabolism-related protein,
cds-WP_011293963.1,NC_007335.2,Protein Homology,CDS,2126,4537,.,+,0,ID=cds-WP_011293963.1;Parent=gene-PMN2A_RS0002...,Genbank:WP_011293963.1,...,,,11,NATL2_00021,PMN2A_RS00025,PMN2A_1330,purL,phosphoribosylformylglycinamidine synthase sub...,"Phosphoribosylformylglycinamidine synthase, sy...",
cds-WP_011293964.1,NC_007335.2,Protein Homology,CDS,4599,6056,.,+,0,ID=cds-WP_011293964.1;Parent=gene-PMN2A_RS0003...,Genbank:WP_011293964.1,...,,,11,NATL2_00031,PMN2A_RS00030,PMN2A_1331,purF,amidophosphoribosyltransferase,Amidophosphoribosyltransferase (EC 2.4.2.14),
cds-WP_011293965.1,NC_007335.2,Protein Homology,CDS,6053,8536,.,-,0,ID=cds-WP_011293965.1;Parent=gene-PMN2A_RS0003...,Genbank:WP_011293965.1,...,,,11,NATL2_00041,PMN2A_RS00035,PMN2A_1332,gyrA,topoisomerase IV,DNA gyrase subunit A (EC 5.99.1.3),
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yfr22_2,NC_007335.2,source,sRNA,1083668,1083721,.,-,0,ID=Yfr22_2;product=Yfr22_2,,...,,,,,,,,,,
Yfr1,NC_007335.2,source,sRNA,1191112,1191170,.,+,0,ID=Yfr1;product=Yfr1,,...,,,,,,,,,,
Yfr106,NC_007335.2,source,sRNA,1217566,1217642,.,-,0,ID=Yfr106;product=Yfr106,,...,,,,,,,,,,
Yfr23,NC_007335.2,source,sRNA,1084235,1084301,.,+,0,ID=Yfr23;product=Yfr23,,...,,,,,,,,,,


In [25]:
@mpl.rc_context({
    'lines.linewidth': 6, 
    'lines.marker':'o', 
    'lines.markersize':18, 
    'legend.fontsize': 'x-large',
    'axes.labelsize': 'x-large',
    'axes.titlesize':'x-large',
    'xtick.labelsize':'x-large',
    'ytick.labelsize':'x-large'})
def plot_gene(ax, gene_ID, rlog_mean_df, rlog_std_df, results_df, annotation_data, night_periods, night_color, attr_dict):
    """Draw the expression time course of one gene onto ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    gene_ID : row label of the gene in ``rlog_mean_df``, ``rlog_std_df``,
        ``results_df`` and ``annotation_data``.
    rlog_mean_df, rlog_std_df : per-gene mean / standard deviation; each row
        must be selectable by treatment ('Control', 'Pheno') to give a Series
        indexed by time.
    results_df : per-gene significance table; ``results_df.loc[gene_ID]["padj"]``
        must give the adjusted p-values indexed by time.
    annotation_data : per-gene annotation with 'NCBI ID_3' and
        'Genbank Annotation' entries (used for the title).
    night_periods : iterable of (start, end) x-ranges to shade.
    night_color : colour of the shaded ranges.
    attr_dict : mapping treatment -> {'color': ...}; only 'color' is read here.

    Raises
    ------
    KeyError
        If ``gene_ID`` is missing from any of the tables.
    """
    ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))
    mean_series = rlog_mean_df.loc[gene_ID]
    std_series = rlog_std_df.loc[gene_ID]

    # Shade the night periods once (not once per treatment), so a translucent
    # night_color is not painted twice.
    for (night_start, night_end) in night_periods:
        ax.axvspan(night_start, night_end, color=night_color)

    for treatment in ['Control', 'Pheno']:
        ax.errorbar(x=mean_series[treatment].index, y=mean_series[treatment], yerr=std_series[treatment], capsize=6, capthick=3, label=treatment, color=attr_dict[treatment]['color'])
    
    # Axes limits: extend the autoscaled y-range downwards by 10% to make room
    # for the significance markers.
    bottom, top = ax.get_ylim()
    y_range = top - bottom
    ax.set_ylim(top-(y_range*1.1), top)
    ax.set_xlim(-1, 25)
    
    # Significance markers: read from the `results_df` argument (previously a
    # notebook-level global was used and the argument was silently ignored).
    padj_by_time = results_df.loc[gene_ID]["padj"]

    for x, padj in padj_by_time.items():
        # NaN padj compares False, so untested time points get no marker.
        if padj < 0.05:
            ax.plot(x, bottom - y_range*0.05, color='k', marker=(8, 2, 0), markersize=15, label="Differentially expressed at 5% FDR")

    gene_annotation = annotation_data.loc[gene_ID]

    # Title: "<feature ID>/<NCBI ID_3>" on the first line, Genbank annotation below.
    ax.set_title(f"{gene_annotation.name}/{gene_annotation['NCBI ID_3']}\n{gene_annotation['Genbank Annotation']}")

@mpl.rc_context({
    'lines.linewidth': 6, 
    'lines.marker':'o', 
    'lines.markersize':18, 
    'legend.fontsize': 'x-large',
    'axes.labelsize': 'xx-large',
    'axes.titlesize':'xx-large',
    'xtick.labelsize':'xx-large',
    'figure.titlesize': 'xx-large',
    'ytick.labelsize':'xx-large'})
def plot_gene_table(gene_df_subset, out_path, rlog_mean_df, rlog_std_df, results_df, annotation_data,
                num_cols = 3,
                night_periods = [(-11, 0), (13, 24)], 
                night_color="#dfdfdf",
                attr_dict={'Control':{'color':'salmon', 'label':r'Parental $\it{Prochlorococcus}$'}, 'Pheno':{'color':'lightseagreen', 'label':r'Dark-tolerant $\it{Prochlorococcus}$'}}):
    """Plot one panel per gene in a grid and save the figure to ``out_path``.

    Parameters
    ----------
    gene_df_subset : DataFrame whose index lists the genes to plot, in order.
    out_path : destination passed to ``Figure.savefig``.
    rlog_mean_df, rlog_std_df, results_df, annotation_data : forwarded
        unchanged to ``plot_gene`` for every panel.
    num_cols : maximum number of panels per row; fewer genes than this give a
        single row with one column per gene.
    night_periods, night_color : shaded x-ranges and their colour, forwarded
        to ``plot_gene``.
    attr_dict : mapping treatment -> {'color': ..., 'label': ...}; used for the
        shared figure legend and forwarded to ``plot_gene``.

    The default ``night_periods`` / ``attr_dict`` objects are only read, never
    modified.

    Raises
    ------
    ValueError
        If ``gene_df_subset`` is empty or ``num_cols`` is smaller than 1.
    """
    # original from elaina:
    # color_dict={'Control':'#e97e72', 'Pheno':'#52bcc2'}

    gene_ids = list(gene_df_subset.index)
    if not gene_ids:
        raise ValueError("gene_df_subset is empty: no genes to plot")
    if num_cols < 1:
        raise ValueError(f"num_cols must be at least 1, got {num_cols}")

    # Grid shape: never more columns than genes, and exactly as many rows as
    # needed (no extra empty row when the gene count is a multiple of num_cols).
    n_cols = min(num_cols, len(gene_ids))
    n_rows = -(-len(gene_ids) // n_cols)  # ceiling division

    y_height = 5
    x_width = 5

    heights = [y_height]*n_rows
    widths = [x_width]*n_cols

    fig = plt.figure(figsize=(sum(widths), sum(heights)), constrained_layout=True)
    try:
        gs = fig.add_gridspec(ncols=len(widths), nrows=len(heights), height_ratios=heights, width_ratios=widths)

        # Fill the grid row by row; trailing cells of the last row stay empty.
        for position, gene_id in enumerate(gene_ids):
            i, j = divmod(position, n_cols)
            ax = fig.add_subplot(gs[i,j])
            plot_gene(ax, gene_id, rlog_mean_df, rlog_std_df, results_df, annotation_data, night_periods, night_color, attr_dict)

        # One shared legend, placed just outside the right edge of the figure.
        legend_elements = [mpl.lines.Line2D([0], [0], color=d['color'], label=d['label']) for d in attr_dict.values()]
        legend = fig.legend(handles=legend_elements, loc='center left', bbox_to_anchor= (1.01, 0.5))

        xlab = fig.supxlabel("Time (hours)")
        ylab = fig.supylabel("Relative transcript abundance")
        # The extra artists sit outside the axes; list them so the tight
        # bounding box includes them.
        fig.savefig(out_path, bbox_extra_artists=[legend, xlab, ylab], bbox_inches='tight')
    finally:
        # Always release the figure, even if a panel fails to plot.
        plt.close(fig)



In [11]:
# Clock proteins: KaiB, KaiC, SasA, RpaA, LdpA
# (presumably in the same order as the 'NCBI ID_3' tags below -- TODO confirm).
# The gene-name list used to be assigned to `clock_proteins` and was then
# immediately overwritten; only the locus tags are used for the lookup.
clock_proteins = ["PMN2A_0914", "PMN2A_0913", "PMN2A_0674", "PMN2A_1494", "PMN2A_1131"]

genes = annotation_data[annotation_data["NCBI ID_3"].isin(clock_proteins)]

out_path = plotting_dir / 'clock_proteins.jpg'
plot_gene_table(genes, out_path, rlog_mean_df, rlog_std_df, result_df, annotation_data)


In [12]:
# Sigma Factors
sigma_factors = ["PMN2A_RS02610",
                 "PMN2A_RS03005",
                 "PMN2A_RS07470",
                 "PMN2A_RS09305",
                 "PMN2A_RS09820"]

# These are PMN2A_RS... tags, which is the format held in the "locus_tag"
# column (the key the conversion table's 'NCBI ID_2' is joined on), not in
# 'NCBI ID_3'. Select on `sigma_factors` -- this cell previously re-used
# `clock_proteins` and overwrote clock_proteins.jpg.
genes = annotation_data[annotation_data["locus_tag"].isin(sigma_factors)]

out_path = plotting_dir / 'sigma_factors.jpg'
plot_gene_table(genes, out_path, rlog_mean_df, rlog_std_df, result_df, annotation_data)


In [13]:
# Stringent response genes, identified by their 'NCBI ID_3' tags.
stringent_response = [
    "PMN2A_1459", "PMN2A_1467", "PMN2A_1474", "PMN2A_1490", "PMN2A_1506", "PMN2A_1515", "PMN2A_1541",
    "PMN2A_1580", "PMN2A_1599", "PMN2A_1607", "PMN2A_1646", "PMN2A_1661", "PMN2A_1721", "PMN2A_1735",
    "PMN2A_1738", "PMN2A_1776", "PMN2A_1777", "PMN2A_1778", "PMN2A_1779", "PMN2A_1879", "PMN2A_1880",
    "PMN2A_0016", "PMN2A_0066", "PMN2A_0183", "PMN2A_0437", "PMN2A_0621", "PMN2A_0735", "PMN2A_0758",
    "PMN2A_0830", "PMN2A_0847", "PMN2A_0975", "PMN2A_0995", "PMN2A_1005", "PMN2A_1075", "PMN2A_1130",
    "PMN2A_1144", "PMN2A_1150", "PMN2A_1190", "PMN2A_1311", "PMN2A_1326", "PMN2A_0077", "PMN2A_1217",
]

out_path = plotting_dir / 'stringent_response.jpg'
genes = annotation_data.loc[annotation_data["NCBI ID_3"].isin(stringent_response)]
plot_gene_table(
    gene_df_subset=genes,
    out_path=out_path,
    rlog_mean_df=rlog_mean_df,
    rlog_std_df=rlog_std_df,
    results_df=result_df,
    annotation_data=annotation_data,
    num_cols=7,
)


In [14]:
# Histidine kinases and response regulators, identified by 'NCBI ID_3' tags.
histidine_kinase_w_response_regulators = [
    "PMN2A_0437", "PMN2A_0674", "PMN2A_0912", "PMN2A_1151",
    "PMN2A_1635", "PMN2A_0436", "PMN2A_0637", "PMN2A_1196",
    "PMN2A_1494", "PMN2A_1500", "PMN2A_1537",
]

out_path = plotting_dir / 'histidine_kinase_w_response_regulators.jpg'
genes = annotation_data.loc[annotation_data["NCBI ID_3"].isin(histidine_kinase_w_response_regulators)]
plot_gene_table(
    gene_df_subset=genes,
    out_path=out_path,
    rlog_mean_df=rlog_mean_df,
    rlog_std_df=rlog_std_df,
    results_df=result_df,
    annotation_data=annotation_data,
    num_cols=4,
)


In [15]:
# atpT (PMN2A_2168) - cyanobacterial ATP synthase inhibitory factor
# NOTE(review): unlike the other cells this one matches on "locus_tag", whose
# values elsewhere in this notebook look like PMN2A_RS... -- confirm that
# "PMN2A_2168" really occurs in that column, otherwise `genes` is empty.
out_path = plotting_dir / 'atpT.jpg'
genes = annotation_data.loc[annotation_data["locus_tag"].isin(["PMN2A_2168"])]
plot_gene_table(
    gene_df_subset=genes,
    out_path=out_path,
    rlog_mean_df=rlog_mean_df,
    rlog_std_df=rlog_std_df,
    results_df=result_df,
    annotation_data=annotation_data,
    num_cols=4,
)


In [16]:
# ATP Synthase Gene Cluster -- Product contains "ATP synthase" -- should be 9 genes

# na=False: features without a product are treated as non-matching instead of
# producing NaN in the mask (which boolean indexing rejects).
# regex=False: "ATP synthase" is matched as a literal substring.
genes = annotation_data[annotation_data["product"].str.contains("ATP synthase", na=False, regex=False)]
out_path = plotting_dir / 'ATPsynthaseSubunits.jpg'
plot_gene_table(genes, out_path, rlog_mean_df, rlog_std_df, result_df, annotation_data, num_cols=4)


In [17]:
# Transcriptional repressor LexA (PMN2A_0828), looked up by its 'NCBI ID_3' tag.
out_path = plotting_dir / 'LexA.jpg'
genes = annotation_data.loc[annotation_data["NCBI ID_3"].isin(["PMN2A_0828"])]
plot_gene_table(
    gene_df_subset=genes,
    out_path=out_path,
    rlog_mean_df=rlog_mean_df,
    rlog_std_df=rlog_std_df,
    results_df=result_df,
    annotation_data=annotation_data,
)

In [18]:
# Pcb gene family cluster (list taken from Allison's most recent email).
# 'NCBI ID_3' tag -> gene name:
#   PMN2A_0066  pcbA
#   PMN2A_0719  pcbB1
#   PMN2A_0723  pcbB2 (which is what we are missing)
#   PMN2A_0215  pcbC
#   PMN2A_0718  pcbD
#   PMN2A_0722  pcbE
#   PMN2A_0717  pcbH
pcb = [
    "PMN2A_0215",
    "PMN2A_0066",
    "PMN2A_0717",
    "PMN2A_0718",
    "PMN2A_0719",
    "PMN2A_0723",
    "PMN2A_0722",
]

out_path = plotting_dir / 'pcb.jpg'
genes = annotation_data.loc[annotation_data["NCBI ID_3"].isin(pcb)]
plot_gene_table(
    gene_df_subset=genes,
    out_path=out_path,
    rlog_mean_df=rlog_mean_df,
    rlog_std_df=rlog_std_df,
    results_df=result_df,
    annotation_data=annotation_data,
    num_cols=3,
)

In [19]:
# Transcriptional regulator AbrB (PMN2A_1774), looked up by its 'NCBI ID_3' tag.
out_path = plotting_dir / 'AbrB.jpg'
genes = annotation_data.loc[annotation_data["NCBI ID_3"].isin(["PMN2A_1774"])]
plot_gene_table(
    gene_df_subset=genes,
    out_path=out_path,
    rlog_mean_df=rlog_mean_df,
    rlog_std_df=rlog_std_df,
    results_df=result_df,
    annotation_data=annotation_data,
)


In [20]:
# Two-component transcriptional regulator (PMN2A_0637/PMN2A_RS06350),
# looked up by its 'NCBI ID_3' tag.
out_path = plotting_dir / 'response_regulator_transcription_factor.jpg'
genes = annotation_data.loc[annotation_data["NCBI ID_3"].isin(["PMN2A_0637"])]
plot_gene_table(
    gene_df_subset=genes,
    out_path=out_path,
    rlog_mean_df=rlog_mean_df,
    rlog_std_df=rlog_std_df,
    results_df=result_df,
    annotation_data=annotation_data,
)


In [21]:
# Transcriptional regulator, GntR family (PMN2A_1216), looked up by its 'NCBI ID_3' tag.
out_path = plotting_dir / 'GntR_family_transcriptional_regulator.jpg'
genes = annotation_data.loc[annotation_data["NCBI ID_3"].isin(["PMN2A_1216"])]
plot_gene_table(
    gene_df_subset=genes,
    out_path=out_path,
    rlog_mean_df=rlog_mean_df,
    rlog_std_df=rlog_std_df,
    results_df=result_df,
    annotation_data=annotation_data,
)


In [26]:
# Yfr sRNAs: first every feature whose ID contains "Yfr", then Yfr103 on its own.

# All Yfr features (non-string / missing IDs count as non-matching).
index_series = annotation_data.index.to_series()
yfr_list = index_series[index_series.str.contains("Yfr", na=False)].to_list()

out_path = plotting_dir / 'yfrs.jpg'
genes = annotation_data.loc[yfr_list]
plot_gene_table(
    gene_df_subset=genes,
    out_path=out_path,
    rlog_mean_df=rlog_mean_df,
    rlog_std_df=rlog_std_df,
    results_df=result_df,
    annotation_data=annotation_data,
    num_cols=5,
)

# Yfr103 specifically (both copies).
out_path = plotting_dir / 'yfr103.jpg'
genes = annotation_data.loc[['Yfr103_1', 'Yfr103_2']]
plot_gene_table(
    gene_df_subset=genes,
    out_path=out_path,
    rlog_mean_df=rlog_mean_df,
    rlog_std_df=rlog_std_df,
    results_df=result_df,
    annotation_data=annotation_data,
)



In [23]:
# Sigma 70 – rpoD (PMN2A_1829), looked up by its 'NCBI ID_3' tag.
out_path = plotting_dir / 'sigma70.jpg'
genes = annotation_data.loc[annotation_data["NCBI ID_3"].isin(["PMN2A_1829"])]
plot_gene_table(
    gene_df_subset=genes,
    out_path=out_path,
    rlog_mean_df=rlog_mean_df,
    rlog_std_df=rlog_std_df,
    results_df=result_df,
    annotation_data=annotation_data,
)