In [21]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from generate_libraries import generate_libraries
from generate_phenotypes import generate_phenotypes
from pheno_to_growth import pheno_to_growth
from transform_and_mix import transform_and_mix
from initialize_population import initialize_population
from growth_with_selection import growth_with_selection
from sequence_and_collate import sequence_and_collate

import random
import copy

import csv
from collections import defaultdict


In [22]:
# -----------------------------------------------------------------------------
# Library definition information

# Wildtype protein sequence: one uppercase single-letter amino acid code per
# residue. The literal starts and ends with a newline.
aa_sequence = """
VKPVTLYDVAEYAGVSYQTVSRVVNQASHVSAKTREKVEAAMAELNYIPNRVAQQLAGK
QSLLIGVATSSLALHAPSQIVAAIKSRADQLGASVVVSMVERSGVEACKAAVHNLLAQR
VSGLIINYPLDDQDAIAVEAACTNVPALFLDVSDQTPINSIIFSHEDGTRLGVEHLVAL
GHQQIALLAGPLSSVSARLRLAGWHKYLTRNQIQPIAEREGDWSAMSGFQQTMQMLNEG
IVPTAMLVANDQMALGAMRAITESGLRVGADISVVGYDDTEDSSCYIPPLTTIKQDFRL
LGQTSVDRLLQLSQGQAVKGNQLLPVSLVKRKTTLAPNTQTASP
"""

# Map of the mutated sites, laid out like aa_sequence. Every position holds
# either the wildtype amino acid or a non-uppercase character; non-uppercase
# characters are interpreted as sub-library identifiers.
mutated_sites = """
VKPVTLYDVAEYAGVSYQTVSRVVNQASH111111111111111111111111111111
QS2222222222222222222222222222222222222VERSGVEACKAAVHNLLAQR
VSGLIINYPLDDQDAIAVEAACTNVPALFLDVSDQTPINSIIFSHEDGTRLGVEHLVAL
GHQQIALLAGPLSSVSARLRLAGWHKYLTRNQIQPIAEREGDWSAMSGFQQTMQMLNEG
IVPTAMLVANDQMALGAMRAITESGLRVGADISVVGYDDTEDSSCYIPPLTTIKQDFRL
LGQTSVDRLLQLSQGQAVKGNQLLPVSLVKRKTTLAPNTQTASP
"""

# Alternative map kept for reference (disabled): a single site in
# sub-library 1 and a single site in sub-library 2.
# """
# VKPVTLYDVAEYAGVSYQTVSRVVNQASH1SAKTREKVEAAMAELNYIPNRVAQQLAGK
# QSLLIGVATSSLALHAPSQIVAAIKSRADQLGASVVVS2VERSGVEACKAAVHNLLAQR
# VSGLIINYPLDDQDAIAVEAACTNVPALFLDVSDQTPINSIIFSHEDGTRLGVEHLVAL
# GHQQIALLAGPLSSVSARLRLAGWHKYLTRNQIQPIAEREGDWSAMSGFQQTMQMLNEG
# IVPTAMLVANDQMALGAMRAITESGLRVGADISVVGYDDTEDSSCYIPPLTTIKQDFRL
# LGQTSVDRLLQLSQGQAVKGNQLLPVSLVKRKTTLAPNTQTASP
# """

# Residue number of the first letter of aa_sequence. Should match the
# numbering used in ddG_spreadsheet.
seq_starts_at = 1

# Whether to build combinations inside a single sub-library (within O1,
# within O2, etc.)
internal_doubles = False

# Highest order of combination to build (1 -- alone, 2 -- pairs,
# 3 -- triples, etc.)
max_num_combos = 2

# Codon placed at each mutated site
degen_codon = "nnt"


# -----------------------------------------------------------------------------
# Transformation and library composition

# The two dictionaries below are keyed by tuples of sub-library identifier
# strings: ('1',) means O1, ('2',) means O2 and ('1','2') means O1/O2. If
# internal_doubles is set to True above, ('1','1') and ('2','2') can be added
# to get the doubles within each pool.

# transform_sizes: how many clones you will get from the smallest
# transformation of each library.
transform_sizes = {
    ('1',): 1e5,
    ('2',): 1e5,
    ('1', '2'): 1e6,
}

# library_mixture: the mixing ratio of the libraries, which works together
# with transform_sizes. For the example values, the final mixed library would
# have 1e5 * 1 O1, 1e5 * 1 O2, and 1e6 * 10 O1/O2. Only the **ratio** matters,
# not the absolute value, so keep the numbers as small as possible while
# maintaining the ratios. Recommended: set the lowest-count library to 1 and
# scale the rest accordingly.
library_mixture = {
    ('1',): 1,
    ('2',): 1,
    ('1', '2'): 10,
}

# The number of plasmids per cell is drawn from a Poisson distribution with
# this parameter. If 0 or None, each cell gets exactly one plasmid instead.
lambda_value = 0

# Upper limit on plasmids per cell. Lower values make the analysis faster, but
# a value that is too small relative to lambda_value distorts the Poisson
# distribution by cutting off its long right tail.
max_num_plasmids = 1  # Should be 10

# -----------------------------------------------------------------------------
# Pre selection culture set up

# How many cells come out of the glycerol stock.
# Original note: "NIH says 40%" (what the 40% refers to is not recorded here).
# Values tried previously: 1e7, 1e3.
num_thawed_colonies = 1e7

# Volume of the overnight culture, in mL
overnight_volume_in_mL = 10

# Density the overnight culture saturates at, in cfu/mL
saturation_cfu_mL = 1e9

# Dilution factor applied the morning of the experiment
morning_dilution = 1/70

# Cells are grown to this cfu/mL before being diluted into IPTG
# (an OD600 of 0.35 on the spec is 89370229 cfu/mL)
pre_iptg_cfu_mL = 90_000_000

# Dilution factor applied when adding cells to the IPTG tubes
iptg_dilution_factor = 0.2/10.2

# How long to grow out before moving into selection conditions
iptg_out_growth_time = 30

# -----------------------------------------------------------------------------
# Data collection parameters

# Times after treatment (minutes) at which the population is sampled.
# Open question from the author: how important is it that these times are
# realistic?
sample_times = [30, 60, 90]

# Number of sequencing reads per condition (time, iptg, selection).
# Rough estimate from the original note: 1 billion reads divided across about
# 20 samples (16 IPTG + selection * 3, + 2 IPTG 0 or 1 * 3, + 2 initial
# libraries).
# Values tried previously: 1e6, 3.5e7, 3e7, 50e6, 5e3.
# Related arithmetic left by the author: 2e9/((16*3) + 4)/1e6
num_reads_per_condition = 38e6


In [23]:
# Build the libraries from the library definition settings above.
# (Author's reminder: save the library.)
libraries = generate_libraries(
    aa_sequence=aa_sequence,
    mutated_sites=mutated_sites,
    seq_starts_at=seq_starts_at,
    max_num_combos=max_num_combos,
    internal_doubles=internal_doubles,
    degen_codon=degen_codon,
)

 

In [24]:

# -----------------------------------------------------------------------------
# Ensemble observables to growth
#
# Each selection is described by a dictionary with these entries:
#
# sel_name
#     Name of the selection. Free-form; it only identifies the selection later.
#
# pheno_0_growth, pheno_1_growth
#     Growth rate when the phenotype has a value of 0 and of 1, respectively.
#     Each can be a single value (assigned to all clones) or a pair of values
#     (the mean and standard deviation of the distribution sampled for each
#     clone).
#
# ln_base_growth_rate, ln_base_growth_rate_std
#     Natural log of the base growth rate, assumed to be the wildtype growth
#     rate. For mutants, a value is sampled from a normal distribution with
#     mean ln_base_growth_rate and standard deviation ln_base_growth_rate_std;
#     the actual growth rate is then exp(ln_growth_rate). These are assigned
#     by the first call to pheno_to_growth and ignored by later calls.
#
# pheno_name
#     The phenotype to correlate to growth. One of:
#         'obs': fx_folded*fx_occupied (the quantity of interest)
#         'fx_folded': fraction of molecules in the folded state
#         'fx_occupied': fraction of molecules occupying 'observable' states
#         'ddG': Boltzmann-weighted free energy of the state
#
# scale_pheno_by
#     NOTE(review): not described in the original notes; its meaning should be
#     confirmed in pheno_to_growth and documented here.

kan_selection = dict(
    sel_name="kan",
    pheno_0_growth=0.00001,        # kept non-zero to avoid a division problem
    pheno_1_growth=0.8,
    ln_base_growth_rate=-5,        # author's note: "e to this times the 0 pheno growth"
    ln_base_growth_rate_std=0.1,
    pheno_name="obs",
    scale_pheno_by=12,
)

phes_selection = dict(
    sel_name="pheS",
    pheno_0_growth=0.8,
    pheno_1_growth=0.00001,        # author's note: "e to this times the 1 kan growth"
    ln_base_growth_rate=-5,
    ln_base_growth_rate_std=0.1,   # set to zero for w calc
    pheno_name="obs",
    scale_pheno_by=12,
)
# -----------------------------------------------------------------------------
# Ensemble information

# IPTG concentrations in mM (the listed values are multiplied by 1e3)
iptg_concs = 1e3 * np.array([
    1e-6,
    1e-5,
    1e-4,
    3e-4,
    1e-3,
    3e-3,
    1e-2,
    3e-2,
])

# Spreadsheet defining the ensemble and its interactions with IPTG.
# Author's note: right now this asserts 3 states -- DNA bound, IPTG bound,
# unfolded.
ensemble_spreadsheet = "input/ensemble_dG.xlsx"

# Spreadsheet with the effects of all mutations on each state in the ensemble.
# Author's note: output of ThermoMPNN, with the effect of mutations on the
# unfolded state always 0.
ddG_spreadsheet = "input/ddG_no_monomer.xlsx"

# Temperature (K) and gas constant (kcal/mol/K)
T = 310
R = 0.001987


In [25]:
# -----------------------------------------------------------------------------
# Calculate phenotypes, and run screen
# (Author's reminder: the library phenotypes need to be saved.)

# Phenotype of every clone in each library, given the ensemble, at each IPTG
# concentration in iptg_concs.
lib_phenotypes = generate_phenotypes(
    libraries=libraries,
    ensemble_spreadsheet=ensemble_spreadsheet,
    ddG_spreadsheet=ddG_spreadsheet,
    concs_mM=iptg_concs,
    T=T,
    R=R,
)



calculating phenotypes for library ('1',)


  0%|          | 0/480 [00:00<?, ?it/s]

calculating phenotypes for library ('2',)


  0%|          | 0/592 [00:00<?, ?it/s]

calculating phenotypes for library ('1', '2')


  0%|          | 0/284160 [00:00<?, ?it/s]

In [27]:
# Convert phenotypes to growth rates under each selection.
#
# lib_phenotypes is the input to both calls and is the object the later cells
# consume. Each value returned with return_df=True is kept under its own
# descriptive name, so the kan result is not overwritten by the pheS result
# and neither can be confused with lib_phenotypes.
# NOTE(review): presumably pheno_to_growth records the growth rates on
# lib_phenotypes in place and returns a DataFrame when return_df=True --
# confirm in pheno_to_growth.py.

# Growth rate under kanamycin selection; the table is also written to save_path
kan_growth_df = pheno_to_growth(lib_phenotypes,
                                **kan_selection,
                                return_df=True,
                                save_path="kan_lib_file.csv")

# Growth rate under pheS/4CP selection; the table is also written to save_path
phes_growth_df = pheno_to_growth(lib_phenotypes,
                                 **phes_selection,
                                 return_df=True,
                                 save_path="pheS_lib_file.csv")

Calculating growth rates for library ('1',) with kan


  0%|          | 0/480 [00:00<?, ?it/s]

Calculating growth rates for library ('2',) with kan


  0%|          | 0/592 [00:00<?, ?it/s]

Calculating growth rates for library ('1', '2') with kan


  0%|          | 0/284160 [00:00<?, ?it/s]

Saved combined DataFrame to: kan_lib_file.csv
Calculating growth rates for library ('1',) with pheS


  0%|          | 0/480 [00:00<?, ?it/s]

Calculating growth rates for library ('2',) with pheS


  0%|          | 0/592 [00:00<?, ?it/s]

Calculating growth rates for library ('1', '2') with pheS


  0%|          | 0/284160 [00:00<?, ?it/s]

Saved combined DataFrame to: pheS_lib_file.csv


In [28]:
# Draw the transformed, mixed input library from the phenotyped libraries.
# NOTE(review): no random seed is set in the visible cells; if these helpers
# sample randomly, results will differ between runs -- confirm.
input_library, all_genotypes = transform_and_mix(
    lib_phenotypes=lib_phenotypes,
    transform_sizes=transform_sizes,
    library_mixture=library_mixture,
    max_num_plasmids=max_num_plasmids,
    lambda_value=lambda_value,
)

# Set up the starting populations and the growth rates of each cell under all
# relevant conditions, using the pre-selection culture settings.
init_output = initialize_population(
    input_library,
    all_genotypes,
    num_thawed_colonies=num_thawed_colonies,
    overnight_volume_in_mL=overnight_volume_in_mL,
    saturation_cfu_mL=saturation_cfu_mL,
    morning_dilution=morning_dilution,
    pre_iptg_cfu_mL=pre_iptg_cfu_mL,
    iptg_dilution_factor=iptg_dilution_factor,
    iptg_out_growth_time=iptg_out_growth_time,
)

# init_output unpacks into four values; input_library is rebound here.
input_library, ln_pop_array, base_growth_rates, growth_rates = init_output




getting growth rates of each bacterium


  0%|          | 0/10000000 [00:00<?, ?it/s]

In [None]:
# Population over time for each selection and IPTG condition, evaluated at
# the sampling times.
pops_vs_time = growth_with_selection(
    ln_pop_array,
    growth_rates,
    time_points=sample_times,
)

# Sequence the samples and summarize the results into three tables.
df, genotype_df, cfu_df = sequence_and_collate(
    pops_vs_time=pops_vs_time,
    iptg_concs=iptg_concs,
    sample_times=sample_times,
    input_library=input_library,
    all_genotypes=all_genotypes,
    num_reads_per_condition=num_reads_per_condition,
)


simulating growth under all conditions


  0%|          | 0/6 [00:00<?, ?it/s]

sequencing samples of each condition


  0%|          | 0/48 [00:00<?, ?it/s]

extracting sequence counts


  0%|          | 0/48 [00:00<?, ?it/s]

In [None]:
# Write the three result tables to CSV files under output/.
# DataFrame.to_csv does not create missing parent directories, so create the
# folder first; otherwise a fresh checkout would fail here, after the whole
# simulation has already run.
import os

os.makedirs("output", exist_ok=True)

df.to_csv("output/obs_many-state_molar_slower.csv")
genotype_df.to_csv("output/genotype_many-state_molar_slower.csv")
cfu_df.to_csv("output/cfu_many-state_molar_slower.csv")