In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import tfscreen

from tfscreen import generate_libraries
from tfscreen import generate_phenotypes
from tfscreen import build_condition_dataframe


from tfscreen import pheno_to_growth
from tfscreen import transform_and_mix
from tfscreen import initialize_population
from tfscreen import growth_with_selection
from tfscreen import sequence_and_collate

import random
import copy


In [7]:
# -----------------------------------------------------------------------------
# Ensemble information

# Spreadsheet defining the ensemble and its interactions with IPTG
ensemble_spreadsheet = "input/ensemble.xlsx"

# Spreadsheet with the effects of all mutations on each state in the ensemble
ddG_spreadsheet = "input/ddG.xlsx"

# Temperature and gas constant; both are passed to generate_phenotypes below
T = 310      # K
R = 0.001987 # kcal/mol/K

# -----------------------------------------------------------------------------
# Library definition information

# Wildtype amino acid sequence (single letter, uppercase). The literal starts
# and ends with a newline and is wrapped across lines.
# NOTE(review): presumably generate_libraries strips the whitespace -- confirm.
aa_sequence = \
"""
VKPVTLYDVAEYAGVSYQTVSRVVNQASHVSAKTREKVEAAMAELNYIPNRVAQQLAGK
QSLLIGVATSSLALHAPSQIVAAIKSRADQLGASVVVSMVERSGVEACKAAVHNLLAQR
VSGLIINYPLDDQDAIAVEAACTNVPALFLDVSDQTPINSIIFSHEDGTRLGVEHLVAL
GHQQIALLAGPLSSVSARLRLAGWHKYLTRNQIQPIAEREGDWSAMSGFQQTMQMLNEG
IVPTAMLVANDQMALGAMRAITESGLRVGADISVVGYDDTEDSSCYIPPLTTIKQDFRL
LGQTSVDRLLQLSQGQAVKGNQLLPVSLVKRKTTLAPNTQTASP
"""

# Sites that are mutated, aligned position-by-position with aa_sequence. Each
# position should be either the wildtype amino acid or a non-uppercase
# character. Non-uppercase characters are interpreted as sub-library
# identifiers (here '1' and '2', matching the keys used in transform_sizes
# and library_mixture below).
mutated_sites = \
"""
VKPVTLYDVAEYAGVSYQTVSRVVNQASH111111111111111111111111111111
QS2222222222222222222222222222222222222VERSGVEACKAAVHNLLAQR
VSGLIINYPLDDQDAIAVEAACTNVPALFLDVSDQTPINSIIFSHEDGTRLGVEHLVAL
GHQQIALLAGPLSSVSARLRLAGWHKYLTRNQIQPIAEREGDWSAMSGFQQTMQMLNEG
IVPTAMLVANDQMALGAMRAITESGLRVGADISVVGYDDTEDSSCYIPPLTTIKQDFRL
LGQTSVDRLLQLSQGQAVKGNQLLPVSLVKRKTTLAPNTQTASP
"""

# Number of the first residue in the amino acid sequence. Should match the
# numbering in the ddG spreadsheet.
seq_starts_at = 1

# Should we make internal combinations (within O1, O2, etc.)?
internal_doubles = False 

# Highest order combination to make (1 -- alone, 2 -- pairs,
# 3 -- triples, etc.)
max_num_combos = 2

# What codon to place at mutated sites
degen_codon = "nnt"

# -----------------------------------------------------------------------------
# Transformation and library composition

# Define how many transformants to include for each sub-library. Keys are
# tuples of strings. ('1',) means O1; ('2',) means O2; ('1','2') means O1/O2.
# If you set internal_doubles to True above, you can then add ('1','1') and
# ('2','2') to get the doubles within each pool.

# sizes: how many clones you will get for the smallest transformation of this
# library
transform_sizes = {('1',):1e5,
                   ('2',):1e5,
                   ('1','2'):1e6}

# mixture: the mixture ratio of the library. This works together with
# sizes. For the example values, the final mixed library would have 1e5 * 1
# O1, 1e5 * 1 O2, and 1e6 * 10 O1/O2. Note that the **ratio** matters,
# not the absolute value. You do not need to have all 1E10 clones... in fact,
# keep it as small as possible while maintaining the ratios. I'd recommend
# setting the lowest-count library to 1, then scaling the rest appropriately.
library_mixture = {('1',):1,
                   ('2',):1,
                   ('1','2'):10}

# The number of plasmids per cell is set by sampling from a Poisson
# distribution. If 0 or None, each cell gets exactly one plasmid. Otherwise,
# do Poisson sampling.
lambda_value = 3

# Maximum number of plasmids per cell. The lower this number, the faster the
# analysis; however, if you make it too small relative to lambda_value, you'll
# distort the Poisson distribution by cutting off the long right tail. 10 should
# be safe for lambda ~ 2-3.
max_num_plasmids = 10 


# -----------------------------------------------------------------------------
# Pre-selection culture setup

# Number of cells that come out of the glycerol stock
# (NIH says 40%)
num_thawed_colonies = 1e7

# Overnight cell culture volume
overnight_volume_in_mL = 10

# Grow cells to this cfu/mL before diluting into IPTG
# (OD600 of 0.35 on spec is 89370229 cfu/mL)
pre_iptg_cfu_mL = 90000000

# Grow out this long before putting into selection conditions
# NOTE(review): units are not stated here; presumably minutes, like
# sample_times below -- confirm.
iptg_out_growth_time = 30

# Dilute by this factor when adding to IPTG tubes
# NOTE(review): written as the ratio 0.2/10.2; presumably 0.2 mL of culture
# added to 10 mL of medium -- confirm.
post_iptg_dilution_factor = 0.2/10.2



# -----------------------------------------------------------------------------
# Selection conditions

# condition_blocks is a list of dictionaries defining different growth
# conditions. Each dictionary should have a "marker" key with a value of either
# "kanR" or "pheS", a "select" key with a value of either 0 or 1, and an "iptg"
# key with a list of IPTG concentrations (in mM).

# IPTG titration used for the two blocks with "select" set to 1; the two
# blocks with "select" set to 0 only use 0 and 1 mM.
iptg_concs = [0, 0.0001, 0.001, 0.003, 0.01, 0.03, 0.1, 1.0]
condition_blocks = [{"iptg":[0,1],
                     "marker":"kanR",
                     "select":0},
                    {"iptg":[0,1],
                     "marker":"pheS",
                     "select":0},
                    {"iptg":iptg_concs,
                     "marker":"kanR",
                     "select":1},
                    {"iptg":iptg_concs,
                     "marker":"pheS",
                     "select":1}]
                     
# -----------------------------------------------------------------------------
# Data collection parameters

# Sample the population at the following times post treatment (minutes)
sample_times = [95,110,125]

# NOTE(review): total_num_reads is not referenced by any later cell visible in
# this notebook; the sequencing cell passes `num_reads_per_condition` instead,
# which is never defined here -- confirm which name is intended.
total_num_reads = 1e9




In [8]:
# -----------------------------------------------------------------------------
# Build library

# Build the sub-libraries and the genotype dataframe from the sequence settings
libraries, genotype_df = generate_libraries(
    aa_sequence=aa_sequence,
    mutated_sites=mutated_sites,
    seq_starts_at=seq_starts_at,
    max_num_combos=max_num_combos,
    internal_doubles=internal_doubles,
    degen_codon=degen_codon,
)

# Accumulate the condition blocks into a single dataframe. The first call is
# handed current_df=None; every later call is handed the previous result.
condition_df = None
for block in condition_blocks:
    condition_df = build_condition_dataframe(**block,current_df=condition_df)

# -----------------------------------------------------------------------------
# Calculate phenotypes

# Calculate phenotype. Each row is indexed by a unique genotype
phenotype_df, genotype_df = generate_phenotypes(
    genotype_df=genotype_df,
    condition_df=condition_df,
    ensemble_spreadsheet=ensemble_spreadsheet,
    ddG_spreadsheet=ddG_spreadsheet,
    scale_obs_by=12,
    mut_growth_rate_std=1,
    T=T,
    R=R,
)

# -----------------------------------------------------------------------------
# Sample from library and grow out

# Draw the transformed, mixed input library from the sub-libraries
input_library = transform_and_mix(
    libraries=libraries,
    transform_sizes=transform_sizes,
    library_mixture=library_mixture,
    max_num_plasmids=max_num_plasmids,
    lambda_value=lambda_value,
)

# Set up the starting populations and the growth rates of each cell under all
# relevant conditions
init_output = initialize_population(
    input_library=input_library,
    phenotype_df=phenotype_df,
    genotype_df=genotype_df,
    condition_df=condition_df,
    num_thawed_colonies=num_thawed_colonies,
    overnight_volume_in_mL=overnight_volume_in_mL,
    pre_iptg_cfu_mL=pre_iptg_cfu_mL,
    iptg_out_growth_time=iptg_out_growth_time,
    post_iptg_dilution_factor=post_iptg_dilution_factor,
)

# The result unpacks into exactly three items
bacteria, ln_pop_array, bact_condition_k = init_output


calculating phenotypes


  0%|          | 0/224890 [00:00<?, ?it/s]

storing phenotypes
Generating initial populations


  0%|          | 0/10200000 [00:00<?, ?it/s]

Getting bacterial growth rates


  0%|          | 0/1102349 [00:00<?, ?it/s]

In [12]:
bact_condition_k.shape

(1102349, 20)

In [None]:


# Get the population over time for each selection and IPTG condition.
# NOTE(review): `growth_rates` is not assigned in any cell visible above; the
# initialization cell unpacks its third output as `bact_condition_k` --
# confirm which name (and shape) growth_with_selection expects.
pops_vs_time = growth_with_selection(ln_pop_array,
                                     growth_rates,
                                     time_points=sample_times)

# -----------------------------------------------------------------------------
# Sequence samples and summarize results
# NOTE(review): `num_reads_per_condition` is not defined in the visible cells
# (the configuration cell defines `total_num_reads`), and `all_genotypes` is
# only built in a later cell -- this cell will not run top to bottom on a
# fresh kernel until those are resolved.
df, genotype_df, condition_df = sequence_and_collate(pops_vs_time=pops_vs_time,
                                                     iptg_concs=iptg_concs,
                                                     sample_times=sample_times,
                                                     input_library=input_library,
                                                     all_genotypes=all_genotypes,
                                                     num_reads_per_condition=num_reads_per_condition)



In [None]:
# Scratch check: stack 20 copies of ln_pop_array along a new leading axis and
# show the shape of the first copy (i.e. the shape of ln_pop_array itself).
# NOTE(review): 20 matches the second dimension of bact_condition_k printed
# above; presumably the number of conditions -- confirm.
np.stack([ln_pop_array]*20,axis=0)[0].shape

In [None]:

# Thaw the glycerol stock and collapse identical cells into unique "bacteria",
# each with a single log population.
#
# Inputs read from the notebook namespace: input_library (numpy array).
# Outputs written: bacteria (unique cell identities) and ln_pop_array (log
# population of each unique cell).
print("Generating initial populations",flush=True)

if input_library.ndim == 1:

    # One entry per cell: thaw the library entries directly.
    to_thaw = input_library
    reverse_indexer = None

else:

    # Several entries per cell. Cells that carry the same set of entries
    # (ignoring the "-" padding and the order within the row) are the same
    # bacterium, so represent each cell by the row index of the first cell seen
    # with that set.
    indexer = {}
    to_thaw = np.empty(input_library.shape[0],dtype=int)
    for i in tqdm(range(input_library.shape[0])):

        row = input_library[i]
        tmp_genos = tuple(sorted(row[row != "-"]))

        # setdefault records `i` only the first time this tuple is seen
        to_thaw[i] = indexer.setdefault(tmp_genos,i)

    # Map each representative row index back to its genotype tuple
    reverse_indexer = {row_idx: genos for genos, row_idx in indexer.items()}

bacteria, ln_pop_array = tfscreen.thaw_glycerol_stock(to_thaw)
bacteria, counts = np.unique(bacteria,return_counts=True)

# Every thawed cell is taken to start at ln_pop_array[0]; a bacterium seen
# `counts` times therefore has ln(exp(ln_pop_array[0])*counts). Adding the
# logs gives the same value without exp() overflowing or underflowing.
ln_pop_array = ln_pop_array[0] + np.log(counts)

if reverse_indexer is not None:

    # Fill a pre-allocated 1D object array element by element. Calling
    # np.array(list_of_tuples, dtype=object) would silently produce a 2D array
    # whenever every tuple happens to have the same length.
    labelled = np.empty(len(bacteria),dtype=object)
    for j, b in enumerate(bacteria):
        labelled[j] = reverse_indexer[b]
    bacteria = labelled
    

In [None]:
# Histogram of the log populations in ln_pop_array (bins of width 0.1 spanning
# -6 to 6). Axes are labelled so the figure stands alone, and the trailing
# semicolon keeps the return value of the last call out of the cell output.
fig, ax = plt.subplots()
ax.hist(ln_pop_array,bins=np.arange(-6,6,0.1))
ax.set_xlabel("ln(population)")
ax.set_ylabel("count");


In [None]:
# Scratch version of the thaw/collapse step: thaw, keep the unique bacteria,
# and scale the first log population by how often each bacterium was seen.
# NOTE(review): `out` is not defined in any cell visible in this notebook --
# this line raises NameError on a fresh kernel; confirm the intended input.
bacteria, population = tfscreen.thaw_glycerol_stock(out)
bacteria, counts = np.unique(bacteria,return_counts=True)
population = np.log(np.exp(population[0])*counts)







# Position of every genotype label within genotype_df
genotype_to_idx = {label: position
                   for position, label in enumerate(genotype_df.index)}

num_bacteria = len(input_library)
num_conditions = condition_df.shape[0]

# Growth rates reshaped to one row per genotype and one column per condition.
# NOTE(review): assumes phenotype_df rows are ordered genotype-major with
# num_conditions consecutive rows per genotype -- confirm.
condition_k = np.array(phenotype_df["growth_rate"])
condition_k = np.reshape(condition_k,
                         (len(condition_k)//num_conditions,num_conditions))

# Per-genotype growth rate effect, in genotype_df row order
base_delta_k = np.array(genotype_df["growth_rate_effect"])

if input_library.ndim != 1:

    # Several entries per cell: average over the entries that are not the "-"
    # padding value.
    clone_genotypes = []
    clone_base_delta_k = np.zeros(num_bacteria)
    clone_condition_k = np.zeros((num_bacteria,num_conditions),dtype=float)

    for cell in tqdm(range(len(input_library))):

        present = input_library[cell] != "-"
        cell_genotypes = input_library[cell][present]
        rows = np.array([genotype_to_idx[g] for g in cell_genotypes])

        clone_genotypes.append(cell_genotypes)
        clone_base_delta_k[cell] = np.mean(base_delta_k[rows])
        clone_condition_k[cell] = np.mean(condition_k[rows,:],axis=0)

else:

    # One entry per cell: a direct lookup by genotype position
    rows = np.array([genotype_to_idx[g] for g in input_library])

    clone_genotypes = input_library
    clone_condition_k = condition_k[rows]
    clone_base_delta_k = base_delta_k[rows]


In [None]:
# Tally how often each distinct entry occurs in input_library (x: unique
# values, y: their counts), then show the library's shape.
# NOTE(review): an unfinished `input_library = [` statement used to open this
# cell and made it a SyntaxError; it is dropped here because its intended
# right-hand side is not recoverable from the notebook.
x, y = np.unique(input_library,return_counts=True)
input_library.shape


    

In [None]:
# Scratch peek at the first element of base_k.
# NOTE(review): `base_k` is not defined in any cell visible in this notebook;
# the closest visible names are base_delta_k and bact_condition_k -- confirm.
base_k[0]

In [None]:
# Write the results to CSV files in the current working directory.
# NOTE(review): `cfu_df` is not created in any cell visible in this notebook;
# if it is missing, the first two files are written before the NameError.
df.to_csv("obs.csv")
genotype_df.to_csv("genotype.csv")
cfu_df.to_csv("cfu.csv")

In [None]:
# libraries --> dictionary keying each library to a list of mutations in each clone
# 
# lib_phenotypes
# [{'clone': ['V30N'],
#    'ddG': array([2.0162, 1.8317, 1.8317, 2.0162, 2.0162, 1.8317, 1.8317, 2.0162,
#           1.8317, 1.8312, 1.8312, 1.8312, 1.831 , 1.8309, 1.8309, 1.8309,
#           0.    ]),
#    'fx_occupied': array([0.0007608 , 0.00103465, 0.00426162, 0.01198636, 0.02997511,
#           0.04687013, 0.05698534, 0.06055376]),
#    'fx_folded': array([1.        , 1.        , 0.99999996, 0.99999979, 0.99999928,
#           0.99999875, 0.99999842, 0.9999983 ]),
#    'obs': array([0.0007608 , 0.00103465, 0.00426162, 0.01198636, 0.02997509,
#           0.04687007, 0.05698525, 0.06055365]),
#    'base_growth_rate': np.float64(0.005445439477620463),
#    'sel_kan': array([3.98258294e-05, 5.41413358e-05, 2.22833015e-04, 6.26648000e-04,
#           1.56701933e-03, 2.45021391e-03, 2.97899038e-03, 3.16553093e-03]),
#    'kan_p0g': np.float64(5.4454394776204634e-08),
#    'kan_p1g': np.float64(0.004356351582096371),
#    'sel_pheS': array([0.00431658, 0.00430226, 0.00413357, 0.00372976, 0.00278939,
#           0.00190619, 0.00137742, 0.00119088]),
#    'pheS_p0g': np.float64(0.004356351582096371),
#    'pheS_p1g': np.float64(5.4454394776204634e-08)},

# input_library and all_genotypes
# input_library -> array([214305,   8702, 169508, ...,  81919, 281807,  70355],shape=(10000000,))

# all_genotypes -> list with full genotype information (like lib_phenotypes) indexed by input library.
# ln_pop_array : log population of each clone in initial library before selection (indexed by input_library)
# base_growth_rates : base growth rate of each clone (indexed by input_library)
# growth_rates : dictionary keyed to selection. values are N x num_iptg arrays where N is indexed by input_library


In [None]:

# Flatten every sub-library into one list of "/"-joined genotype strings,
# visiting the sub-libraries in the dictionary's own order
all_genotypes = ["/".join(members)
                 for lib_key in libraries
                 for members in libraries[lib_key]]

# NOTE(review): build_genotype_df is not among the names imported in the first
# cell -- confirm where it should come from.
genotype_df = build_genotype_df(all_genotypes)


In [None]:
# Inspect the sub-library stored under the key ('1',).
# NOTE(review): this displays the whole entry; consider showing only a slice
# if the sub-library is large.
libraries[('1',)]