In [2]:
import tfscreen
from tfscreen.plot import heatmap
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

## Basic manipulations

This section shows how to read in data and manipulate it using tfscreen and the categorical genotype datatype.

In [3]:
# File holding the theta estimates that the rest of this notebook works on.
theta_file = "theta.csv"

# read_dataframe is tfscreen's general-purpose loader. Per the original note
# here, it drops leading index entries, chooses a CSV or Excel reader
# automatically, and leaves an already-loaded dataframe alone.
# NOTE(review): the original comment named the readers as "read_csv,
# read_xlsx"; pandas' Excel reader is read_excel -- confirm what
# read_dataframe actually dispatches to.
df = tfscreen.util.read_dataframe(theta_file)

In [4]:
# Straight after loading, "genotype" is a plain string column (dtype object),
# so sorting on it is purely alphabetical: A32C comes first and wt ends up at
# the bottom of the table.
print(df.dtypes["genotype"])
df.sort_values(by="genotype")

object


Unnamed: 0,genotype,titrant_name,titrant_conc,theta_est,theta_std,class,name,guess,transform,scale_mu,scale_sigma,idx,lower_bounds,upper_bounds,censored,fixed
351,A32C,iptg,1.0000,0.552133,0.133079,theta,theta_A32C_iptg_1.0,0.000251,dummy,0.0,1.0,25,-inf,inf,False,False
349,A32C,iptg,0.0300,0.842998,0.138525,theta,theta_A32C_iptg_0.03,0.218308,dummy,0.0,1.0,23,-inf,inf,False,False
348,A32C,iptg,0.0100,1.340297,0.146708,theta,theta_A32C_iptg_0.01,0.715383,dummy,0.0,1.0,22,-inf,inf,False,False
347,A32C,iptg,0.0030,1.534203,0.142718,theta,theta_A32C_iptg_0.003,0.965431,dummy,0.0,1.0,21,-inf,inf,False,False
346,A32C,iptg,0.0010,1.613716,0.146837,theta,theta_A32C_iptg_0.001,0.996037,dummy,0.0,1.0,20,-inf,inf,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,wt,iptg,0.0100,1.359748,0.149432,theta,theta_wt_iptg_0.01,0.715383,dummy,0.0,1.0,14,-inf,inf,False,False
3,wt,iptg,0.0030,1.526672,0.145191,theta,theta_wt_iptg_0.003,0.965431,dummy,0.0,1.0,13,-inf,inf,False,False
2,wt,iptg,0.0010,1.649798,0.153619,theta,theta_wt_iptg_0.001,0.996037,dummy,0.0,1.0,12,-inf,inf,False,False
1,wt,iptg,0.0001,1.752083,0.147329,theta,theta_wt_iptg_0.0001,0.999960,dummy,0.0,1.0,11,-inf,inf,False,False


In [5]:
# Convert "genotype" into a categorical datatype whose category order is the
# right genotype sort order (wt first in the output below) and sort the rows.
# standardize=True additionally normalizes how genotypes are written, e.g.
# M98I/H29A -> H29A/M98I.
# NOTE(review): the original comment also listed "H29A -> wt" as a
# standardization example; that looks like a typo for a self-mutation such as
# "H29H -> wt" -- confirm against set_categorical_genotype.
df = tfscreen.genetics.set_categorical_genotype(df,standardize=True,sort=True)
df

Unnamed: 0,genotype,titrant_name,titrant_conc,theta_est,theta_std,class,name,guess,transform,scale_mu,scale_sigma,idx,lower_bounds,upper_bounds,censored,fixed
0,wt,iptg,0.0000,1.823504,0.166039,theta,theta_wt_iptg_0.0,1.000000,dummy,0.0,1.0,10,-inf,inf,False,False
1,wt,iptg,0.0001,1.752083,0.147329,theta,theta_wt_iptg_0.0001,0.999960,dummy,0.0,1.0,11,-inf,inf,False,False
2,wt,iptg,0.0010,1.649798,0.153619,theta,theta_wt_iptg_0.001,0.996037,dummy,0.0,1.0,12,-inf,inf,False,False
3,wt,iptg,0.0030,1.526672,0.145191,theta,theta_wt_iptg_0.003,0.965431,dummy,0.0,1.0,13,-inf,inf,False,False
4,wt,iptg,0.0100,1.359748,0.149432,theta,theta_wt_iptg_0.01,0.715383,dummy,0.0,1.0,14,-inf,inf,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1687490,M42I/H74A/K84L,iptg,0.0010,0.810740,0.155545,theta,theta_M42I/H74A/K84L_iptg_0.001,0.996037,dummy,0.0,1.0,20,-inf,inf,False,False
1687491,M42I/H74A/K84L,iptg,0.0030,0.759494,0.147623,theta,theta_M42I/H74A/K84L_iptg_0.003,0.965431,dummy,0.0,1.0,21,-inf,inf,False,False
1687492,M42I/H74A/K84L,iptg,0.0100,0.727106,0.152426,theta,theta_M42I/H74A/K84L_iptg_0.01,0.715383,dummy,0.0,1.0,22,-inf,inf,False,False
1687493,M42I/H74A/K84L,iptg,0.0300,0.876118,0.149091,theta,theta_M42I/H74A/K84L_iptg_0.03,0.218308,dummy,0.0,1.0,23,-inf,inf,False,False


In [6]:
# Now that "genotype" is categorical, sorting on it follows the category
# order (wt first) instead of alphabetical order.
df.sort_values(by="genotype")

Unnamed: 0,genotype,titrant_name,titrant_conc,theta_est,theta_std,class,name,guess,transform,scale_mu,scale_sigma,idx,lower_bounds,upper_bounds,censored,fixed
0,wt,iptg,0.0000,1.823504,0.166039,theta,theta_wt_iptg_0.0,1.000000,dummy,0.0,1.0,10,-inf,inf,False,False
1,wt,iptg,0.0001,1.752083,0.147329,theta,theta_wt_iptg_0.0001,0.999960,dummy,0.0,1.0,11,-inf,inf,False,False
2,wt,iptg,0.0010,1.649798,0.153619,theta,theta_wt_iptg_0.001,0.996037,dummy,0.0,1.0,12,-inf,inf,False,False
3,wt,iptg,0.0030,1.526672,0.145191,theta,theta_wt_iptg_0.003,0.965431,dummy,0.0,1.0,13,-inf,inf,False,False
4,wt,iptg,0.0100,1.359748,0.149432,theta,theta_wt_iptg_0.01,0.715383,dummy,0.0,1.0,14,-inf,inf,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1687489,M42I/H74A/K84L,iptg,0.0001,0.920603,0.150240,theta,theta_M42I/H74A/K84L_iptg_0.0001,0.999960,dummy,0.0,1.0,19,-inf,inf,False,False
1687490,M42I/H74A/K84L,iptg,0.0010,0.810740,0.155545,theta,theta_M42I/H74A/K84L_iptg_0.001,0.996037,dummy,0.0,1.0,20,-inf,inf,False,False
1687491,M42I/H74A/K84L,iptg,0.0030,0.759494,0.147623,theta,theta_M42I/H74A/K84L_iptg_0.003,0.965431,dummy,0.0,1.0,21,-inf,inf,False,False
1687492,M42I/H74A/K84L,iptg,0.0100,0.727106,0.152426,theta,theta_M42I/H74A/K84L_iptg_0.01,0.715383,dummy,0.0,1.0,22,-inf,inf,False,False


In [7]:
# Pull out every row for one genotype by selecting with a boolean mask on the
# genotype column.
out_df = df.loc[df["genotype"] == "H74A"]
out_df

Unnamed: 0,genotype,titrant_name,titrant_conc,theta_est,theta_std,class,name,guess,transform,scale_mu,scale_sigma,idx,lower_bounds,upper_bounds,censored,fixed
4863,H74A,iptg,1.0,0.61753,0.132919,theta,theta_H74A_iptg_1.0,0.000251,dummy,0.0,1.0,25,-inf,inf,False,False
4862,H74A,iptg,0.1,0.474287,0.137036,theta,theta_H74A_iptg_0.1,0.024519,dummy,0.0,1.0,24,-inf,inf,False,False
4861,H74A,iptg,0.03,0.533357,0.138806,theta,theta_H74A_iptg_0.03,0.218308,dummy,0.0,1.0,23,-inf,inf,False,False
4860,H74A,iptg,0.01,0.42888,0.142753,theta,theta_H74A_iptg_0.01,0.715383,dummy,0.0,1.0,22,-inf,inf,False,False
4857,H74A,iptg,0.0001,0.784549,0.139311,theta,theta_H74A_iptg_0.0001,0.99996,dummy,0.0,1.0,19,-inf,inf,False,False
4858,H74A,iptg,0.001,0.665968,0.144305,theta,theta_H74A_iptg_0.001,0.996037,dummy,0.0,1.0,20,-inf,inf,False,False
4856,H74A,iptg,0.0,1.011603,0.154134,theta,theta_H74A_iptg_0.0,1.0,dummy,0.0,1.0,18,-inf,inf,False,False
4859,H74A,iptg,0.003,0.527075,0.136995,theta,theta_H74A_iptg_0.003,0.965431,dummy,0.0,1.0,21,-inf,inf,False,False


In [8]:
# Expand the genotype column into per-site columns: wt_aa_N, resid_N and
# mut_aa_N for each mutation slot N (see the output header below). Wildtype
# rows are NaN in every one of these columns.
# NOTE(review): genotypes with fewer mutations than slots presumably leave
# the unused slots NaN as well -- confirm against expand_genotype_columns.
out_df = tfscreen.genetics.expand_genotype_columns(df)
out_df

Unnamed: 0,genotype,wt_aa_1,resid_1,mut_aa_1,wt_aa_2,resid_2,mut_aa_2,wt_aa_3,resid_3,mut_aa_3,...,name,guess,transform,scale_mu,scale_sigma,idx,lower_bounds,upper_bounds,censored,fixed
0,wt,,,,,,,,,,...,theta_wt_iptg_0.0,1.000000,dummy,0.0,1.0,10,-inf,inf,False,False
1,wt,,,,,,,,,,...,theta_wt_iptg_0.0001,0.999960,dummy,0.0,1.0,11,-inf,inf,False,False
2,wt,,,,,,,,,,...,theta_wt_iptg_0.001,0.996037,dummy,0.0,1.0,12,-inf,inf,False,False
3,wt,,,,,,,,,,...,theta_wt_iptg_0.003,0.965431,dummy,0.0,1.0,13,-inf,inf,False,False
4,wt,,,,,,,,,,...,theta_wt_iptg_0.01,0.715383,dummy,0.0,1.0,14,-inf,inf,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1687490,M42I/H74A/K84L,M,42,I,H,74,A,K,84,L,...,theta_M42I/H74A/K84L_iptg_0.001,0.996037,dummy,0.0,1.0,20,-inf,inf,False,False
1687491,M42I/H74A/K84L,M,42,I,H,74,A,K,84,L,...,theta_M42I/H74A/K84L_iptg_0.003,0.965431,dummy,0.0,1.0,21,-inf,inf,False,False
1687492,M42I/H74A/K84L,M,42,I,H,74,A,K,84,L,...,theta_M42I/H74A/K84L_iptg_0.01,0.715383,dummy,0.0,1.0,22,-inf,inf,False,False
1687493,M42I/H74A/K84L,M,42,I,H,74,A,K,84,L,...,theta_M42I/H74A/K84L_iptg_0.03,0.218308,dummy,0.0,1.0,23,-inf,inf,False,False
