In [18]:
import pandas as pd
# Show up to 100 columns when a DataFrame is rendered, so wide frames are not elided.
pd.options.display.max_columns = 100

In [2]:
# The data scraped from breseq reports can come in different character encodings, so normalize it up front.
from unicodedata import normalize


# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
mut_df = pd.read_pickle('./data/4_10_with_uniq_midpts.pkl')

# NFKC-normalize the two free-text columns with the vectorized string accessor
# rather than a row-wise DataFrame.apply: it only touches the column being
# rewritten and leaves missing values as NaN instead of raising.
mut_df["Sequence Change"] = mut_df["Sequence Change"].str.normalize('NFKC')
mut_df["Details"] = mut_df["Details"].str.normalize('NFKC')

# Show the row count and a preview of the loaded frame.
display(len(mut_df), mut_df.head())

3921

Unnamed: 0,index,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,...,temperature,carbon-source,supplement,strain-description,taxonomy-id,base-media,nitrogen-source,phosphorous-source,sulfur-source,calcium-source
0,9,42C,1,124,1,1,1.0,702352,DEL,Δ21 bp,...,42 celsius,glucose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1)
1,10,42C,1,124,1,1,1.0,1308318,SNP,G→C,...,42 celsius,glucose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1)
2,11,42C,1,124,1,1,1.0,3815859,DEL,Δ82 bp,...,42 celsius,glucose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1)
3,12,42C,1,124,1,1,1.0,4187550,SNP,C→T,...,42 celsius,glucose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1)
4,13,42C,1,124,1,1,1.0,4400313,SNP,A→C,...,42 celsius,glucose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1)


In [3]:
# Gene table from the regulondb10 data directory: tab-separated, no header row,
# '#'-prefixed lines are skipped as comments. Column names are assigned by position.
regdb_gene_df = pd.read_csv(
    "./data/regulondb10/gene.txt",
    header=None,
    sep="\t",
    comment='#',
)
regdb_gene_df.columns = [
    "GENE_ID", "GENE_NAME", "GENE_POSLEFT", "GENE_POSRIGHT",
    "GENE_STRAND", "GENE_SEQUENCE", "GC_CONTENT", "CRI_SCORE",
    "GENE_NOTE", "GENE_INTERNAL_COMMENT", "KEY_ID_ORG", "GENE_TYPE",
]

# Synonym table (object id -> alternative names), read the same way.
# quoting=3 is csv.QUOTE_NONE, i.e. quote characters are treated as plain text.
gene_synonym_df = pd.read_csv(
    "./data/regulondb10/object_synonym.txt",
    header=None,
    sep="\t",
    comment='#',
    quoting=3,
)
gene_synonym_df.columns = [
    "OBJECT_ID",
    "OBJECT_SYNONYM_NAME",
    "OS_INTERNAL_COMMENT",
    "KEY_ID_ORG",
]
gene_synonym_df.head()

Unnamed: 0,OBJECT_ID,OBJECT_SYNONYM_NAME,OS_INTERNAL_COMMENT,KEY_ID_ORG
0,ECK120000001,EG10001,,ECK12
1,ECK120000001,ECK4045,,ECK12
2,ECK120000001,b4053,,ECK12
3,ECK120000001,alr5,,ECK12
4,ECK120000002,b0764,,ECK12


In [4]:
# Give the mutation-target annotation column the shorter name "Gene" used by the cells below.
mut_df = mut_df.rename({"mutation target annotation": "Gene"}, axis="columns")

In [5]:
import os
import sys

# Put the parent directory on sys.path (only once) so the aledbmutil import below resolves.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from aledbmutil.mut import get_clean_mut_gene_list, is_genetic_mut

# Flag each mutation from its "Details" text.
# NOTE(review): presumably True when the mutation lies within a gene -- confirm in aledbmutil.mut.
mut_df["genetic"] = mut_df["Details"].apply(is_genetic_mut)

# Set of cleaned gene names per mutation (gene names here, not b-numbers);
# rows not flagged as genetic get an empty set.
mut_df["clean genes"] = mut_df.apply(
    lambda row: set(get_clean_mut_gene_list(row["Gene"])) if row["genetic"] else set(),
    axis=1,
)
mut_df.head()

Unnamed: 0,index,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,...,carbon-source,supplement,strain-description,taxonomy-id,base-media,nitrogen-source,phosphorous-source,sulfur-source,calcium-source,clean genes
0,9,42C,1,124,1,1,1.0,702352,DEL,Δ21 bp,...,glucose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1),{nagA}
1,10,42C,1,124,1,1,1.0,1308318,SNP,G→C,...,glucose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1),{clsA}
2,11,42C,1,124,1,1,1.0,3815859,DEL,Δ82 bp,...,glucose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1),{rph}
3,12,42C,1,124,1,1,1.0,4187550,SNP,C→T,...,glucose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1),{rpoC}
4,13,42C,1,124,1,1,1.0,4400313,SNP,A→C,...,glucose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1),{hfq}


In [6]:
def _get_gene_bnum(gene_name):
    """Return the b-number synonym for a gene name, or "" if none is found.

    The gene is looked up by exact match on ``GENE_NAME`` in ``regdb_gene_df``;
    the ``GENE_ID`` of the first matching row is then used to select its
    synonyms from ``gene_synonym_df``. The first synonym that starts with
    ``b`` followed by four digits is returned.

    Parameters
    ----------
    gene_name : str
        Gene name to resolve (e.g. "rpoB").

    Returns
    -------
    str
        The first matching b-number synonym, or an empty string when the gene
        is unknown or has no such synonym.
    """
    gene_rows = regdb_gene_df[regdb_gene_df["GENE_NAME"] == gene_name]
    if gene_rows.empty:
        return ""

    regdb_id = gene_rows.iloc[0]["GENE_ID"]
    synonyms = gene_synonym_df.loc[
        gene_synonym_df["OBJECT_ID"] == regdb_id, "OBJECT_SYNONYM_NAME"
    ]
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    # na=False: a missing synonym name counts as "no match" instead of yielding
    # a NaN in the mask, which cannot be used for boolean indexing.
    bnum_synonyms = synonyms[synonyms.str.contains(r'^b\d{4}', na=False)]
    if bnum_synonyms.empty:
        return ""
    return bnum_synonyms.iloc[0]

# Sanity check of the lookup against the loaded tables: rpoB must resolve to b3987.
assert _get_gene_bnum("rpoB") == "b3987"

In [7]:
from tqdm import tqdm

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# pandarallel's parallel_apply was tried here, but its progress bars froze
# (possibly because it crashed), so the tqdm-backed progress_apply is used instead.
# Each row's gene names are mapped to b-numbers; _get_gene_bnum returns "" for
# names it cannot resolve, so a set may contain the empty string.
mut_df["bnums"] = mut_df.progress_apply(
    lambda row: set(map(_get_gene_bnum, row["clean genes"])),
    axis=1,
)
mut_df.head()

100%|██████████| 3921/3921 [00:08<00:00, 465.56it/s]


Unnamed: 0,index,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,...,supplement,strain-description,taxonomy-id,base-media,nitrogen-source,phosphorous-source,sulfur-source,calcium-source,clean genes,bnums
0,9,42C,1,124,1,1,1.0,702352,DEL,Δ21 bp,...,NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1),{nagA},{b0677}
1,10,42C,1,124,1,1,1.0,1308318,SNP,G→C,...,NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1),{clsA},{b1249}
2,11,42C,1,124,1,1,1.0,3815859,DEL,Δ82 bp,...,NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1),{rph},{b3643}
3,12,42C,1,124,1,1,1.0,4187550,SNP,C→T,...,NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1),{rpoC},{b3988}
4,13,42C,1,124,1,1,1.0,4400313,SNP,A→C,...,NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1),{hfq},{b4172}


In [8]:
# Key genes from Patrick's Ind Chem Tol GDV study that are commonly truncated.
# Written as gene name -> b-number so the pairing is explicit; only the
# b-numbers are kept, as a set.
genes_of_interest = set(
    {
        'rho': 'b3783',
        'metJ': 'b3938',
        'rpoA': 'b3295',
        'mreB': 'b3251',
        'spoT': 'b3650',
    }.values()
)

In [15]:
from collections import Counter


# Per mutation, keep only the b-numbers that belong to the panel of interest.
# A column-wise map replaces the row-wise DataFrame.apply: only the "bnums"
# column is needed, and the result is always a Series (also for an empty frame).
mut_df["genes of interest"] = mut_df["bnums"].map(
    lambda bnums: bnums & genes_of_interest
)

# Keep rows whose intersection is non-empty. Testing truthiness of each set is
# explicit, instead of relying on how pandas compares a Series against `set()`.
has_gene_of_interest = mut_df["genes of interest"].map(bool).astype(bool)
mut_genes_of_interest_df = mut_df[has_gene_of_interest].copy()

# Number of retained mutations per value of the "Gene" column.
display(Counter(mut_genes_of_interest_df["Gene"]))

Counter({'rho': 16, 'mreB': 8, 'rpoA': 46, 'metJ': 10, 'spoT': 18})

In [16]:
# Keep only SNPs, since we specifically want amino-acid substitutions.
snp_genes_of_interest_df = mut_genes_of_interest_df.loc[
    lambda frame: frame["Mutation Type"] == "SNP"
].copy()
display(Counter(snp_genes_of_interest_df["Gene"]))

Counter({'rho': 15, 'mreB': 8, 'rpoA': 40, 'metJ': 9, 'spoT': 18})

In [36]:
# Reshape the SNP rows into the export table:
#   gene | substitution | variant source | conditions
# (An earlier sketch of the target schema also listed an 'AA sequence' column;
# it is not produced here.)
#
# NOTE: this rebinds snp_genes_of_interest_df to the 4-column table, so the cell
# cannot be re-run on its own afterwards ("Details" is gone) -- re-run the SNP
# filter cell first.

# Experimental-condition columns that are bundled into one dict per row.
conditions = [
    'temperature', 'carbon-source', 'supplement',
    'strain-description', 'base-media', 'nitrogen-source',
    'phosphorous-source', 'sulfur-source', 'calcium-source']

# Non-mutating chain instead of inplace rename + column-by-column assignment.
snp_genes_of_interest_df = (
    snp_genes_of_interest_df
    .rename(columns={"Gene": "gene"})
    .assign(**{
        # The substitution is the first space-separated token of "Details".
        'substitution': lambda d: d["Details"].str.split(' ').str[0],
        'variant source': 'ALEdb',
        'conditions': lambda d: d.apply(
            lambda r: {c: r[c] for c in conditions},
            axis=1),
    })
    .loc[:, ['gene', 'substitution', 'variant source', 'conditions']]
)
snp_genes_of_interest_df.head()

Unnamed: 0,gene,substitution,variant source,conditions
12,rho,G61E,ALEdb,"{'temperature': '42 celsius', 'carbon-source':..."
176,mreB,E346A,ALEdb,"{'temperature': '37 celsius', 'carbon-source':..."
636,rho,K417I,ALEdb,"{'temperature': '37 celsius', 'carbon-source':..."
653,rho,I382F,ALEdb,"{'temperature': '37 celsius', 'carbon-source':..."
1160,rpoA,T285I,ALEdb,"{'temperature': '37 celsius', 'carbon-source':..."


In [37]:
# Write the export table to CSV without the DataFrame index column.
snp_genes_of_interest_df.to_csv(
    './aledb_snp_df.csv',
    index=False,
)