In [1]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import tfscreen
from tfscreen.plot import heatmap

## Basic manipulations

How to read in data and manipulate it using tfscreen and the categorical genotype datatype. 

In [2]:
# Path to the CSV of theta estimates used throughout this notebook. The
# default is the original author's local copy; set the TFSCREEN_THETA_FILE
# environment variable to point at your own copy instead of editing this cell.
theta_file = os.environ.get(
    "TFSCREEN_THETA_FILE",
    "/Users/harmsm/Desktop/keep/theta_df.csv",
)

# This reads in the dataframe in a clean way. Per the original author's note,
# read_dataframe drops leading index entries, picks the appropriate reader for
# the file type (CSV or Excel), or just leaves the dataframe alone if you
# already loaded it.
# NOTE(review): the reader dispatch is not visible in this notebook -- confirm
# against the tfscreen.util.read_dataframe docstring.
df = tfscreen.util.read_dataframe(theta_file)

In [3]:
# Right after loading, the genotype column has dtype object (plain strings),
# so sorting on it gives alphabetical order (e.g. A32C rows come before wt
# rows in the output) rather than a genetically meaningful order.
print(df.dtypes["genotype"])
df.sort_values(by="genotype")

object


Unnamed: 0,genotype,titrant_name,titrant_conc,theta_est,theta_std,class,name,guess,transform,scale_mu,scale_sigma,idx,lower_bounds,upper_bounds,censored,fixed
344,A32C,iptg,0.0000,1.832580,0.198045,theta,theta_A32C_iptg_0.0,1.000000,theta,0.0,1.0,36,-inf,inf,False,False
345,A32C,iptg,0.0001,1.738420,0.192495,theta,theta_A32C_iptg_0.0001,0.999960,theta,0.0,1.0,37,-inf,inf,False,False
346,A32C,iptg,0.0010,1.753172,0.194352,theta,theta_A32C_iptg_0.001,0.996037,theta,0.0,1.0,38,-inf,inf,False,False
347,A32C,iptg,0.0030,1.672375,0.190422,theta,theta_A32C_iptg_0.003,0.965431,theta,0.0,1.0,39,-inf,inf,False,False
348,A32C,iptg,0.0100,1.430563,0.200783,theta,theta_A32C_iptg_0.01,0.715383,theta,0.0,1.0,40,-inf,inf,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,wt,iptg,0.0100,1.490344,0.196287,theta,theta_wt_iptg_0.01,0.715383,theta,0.0,1.0,16,-inf,inf,False,False
3,wt,iptg,0.0030,1.690168,0.184302,theta,theta_wt_iptg_0.003,0.965431,theta,0.0,1.0,15,-inf,inf,False,False
2,wt,iptg,0.0010,1.758324,0.188953,theta,theta_wt_iptg_0.001,0.996037,theta,0.0,1.0,14,-inf,inf,False,False
1,wt,iptg,0.0001,1.721745,0.186540,theta,theta_wt_iptg_0.0001,0.999960,theta,0.0,1.0,13,-inf,inf,False,False


In [4]:
# Convert the genotype column to a categorical datatype with the right sort
# order (in the displayed result, wt rows come first).
# standardize=True makes double sure genotype strings are in canonical form,
# e.g. M98I/H29A -> H29A/M98I.
# NOTE(review): the original comment also listed "H29A -> wt" as an example;
# that looks like a typo for a self-mutation such as H29H -> wt -- confirm
# against the tfscreen documentation.
df = tfscreen.genetics.set_categorical_genotype(
    df,
    standardize=True,
    sort=True,
)
df

Unnamed: 0,genotype,titrant_name,titrant_conc,theta_est,theta_std,class,name,guess,transform,scale_mu,scale_sigma,idx,lower_bounds,upper_bounds,censored,fixed
0,wt,iptg,0.0000,1.841333,0.193819,theta,theta_wt_iptg_0.0,1.000000,theta,0.0,1.0,12,-inf,inf,False,False
1,wt,iptg,0.0001,1.721745,0.186540,theta,theta_wt_iptg_0.0001,0.999960,theta,0.0,1.0,13,-inf,inf,False,False
2,wt,iptg,0.0010,1.758324,0.188953,theta,theta_wt_iptg_0.001,0.996037,theta,0.0,1.0,14,-inf,inf,False,False
3,wt,iptg,0.0030,1.690168,0.184302,theta,theta_wt_iptg_0.003,0.965431,theta,0.0,1.0,15,-inf,inf,False,False
4,wt,iptg,0.0100,1.490344,0.196287,theta,theta_wt_iptg_0.01,0.715383,theta,0.0,1.0,16,-inf,inf,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1694554,K59Y/S93C,iptg,0.0010,1.506344,0.333097,theta,theta_K59Y/S93C_iptg_0.001,0.996037,theta,0.0,1.0,56,-inf,inf,False,False
1694555,K59Y/S93C,iptg,0.0030,1.471617,0.338239,theta,theta_K59Y/S93C_iptg_0.003,0.965431,theta,0.0,1.0,57,-inf,inf,False,False
1694556,K59Y/S93C,iptg,0.0100,0.998207,0.339033,theta,theta_K59Y/S93C_iptg_0.01,0.715383,theta,0.0,1.0,58,-inf,inf,False,False
1694557,K59Y/S93C,iptg,0.0300,1.425698,0.303875,theta,theta_K59Y/S93C_iptg_0.03,0.218308,theta,0.0,1.0,59,-inf,inf,False,False


In [5]:
# With genotype now categorical, sorting on it follows the category order
# instead of alphabetical order: wt rows come first in the output.
df.sort_values(by="genotype")

Unnamed: 0,genotype,titrant_name,titrant_conc,theta_est,theta_std,class,name,guess,transform,scale_mu,scale_sigma,idx,lower_bounds,upper_bounds,censored,fixed
0,wt,iptg,0.0000,1.841333,0.193819,theta,theta_wt_iptg_0.0,1.000000,theta,0.0,1.0,12,-inf,inf,False,False
1,wt,iptg,0.0001,1.721745,0.186540,theta,theta_wt_iptg_0.0001,0.999960,theta,0.0,1.0,13,-inf,inf,False,False
2,wt,iptg,0.0010,1.758324,0.188953,theta,theta_wt_iptg_0.001,0.996037,theta,0.0,1.0,14,-inf,inf,False,False
3,wt,iptg,0.0030,1.690168,0.184302,theta,theta_wt_iptg_0.003,0.965431,theta,0.0,1.0,15,-inf,inf,False,False
4,wt,iptg,0.0100,1.490344,0.196287,theta,theta_wt_iptg_0.01,0.715383,theta,0.0,1.0,16,-inf,inf,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1694553,K59Y/S93C,iptg,0.0001,1.386238,0.317756,theta,theta_K59Y/S93C_iptg_0.0001,0.999960,theta,0.0,1.0,55,-inf,inf,False,False
1694554,K59Y/S93C,iptg,0.0010,1.506344,0.333097,theta,theta_K59Y/S93C_iptg_0.001,0.996037,theta,0.0,1.0,56,-inf,inf,False,False
1694555,K59Y/S93C,iptg,0.0030,1.471617,0.338239,theta,theta_K59Y/S93C_iptg_0.003,0.965431,theta,0.0,1.0,57,-inf,inf,False,False
1694556,K59Y/S93C,iptg,0.0100,0.998207,0.339033,theta,theta_K59Y/S93C_iptg_0.01,0.715383,theta,0.0,1.0,58,-inf,inf,False,False


In [6]:
# Pull out every row belonging to one genotype (here H74A) using a boolean
# mask on the genotype column.
out_df = df.loc[df["genotype"] == "H74A"]
out_df

Unnamed: 0,genotype,titrant_name,titrant_conc,theta_est,theta_std,class,name,guess,transform,scale_mu,scale_sigma,idx,lower_bounds,upper_bounds,censored,fixed
4869,H74A,iptg,0.03,0.508422,0.187179,theta,theta_H74A_iptg_0.03,0.218308,theta,0.0,1.0,17,-inf,inf,False,False
4871,H74A,iptg,1.0,0.663432,0.178436,theta,theta_H74A_iptg_1.0,0.000251,theta,0.0,1.0,19,-inf,inf,False,False
4870,H74A,iptg,0.1,0.571315,0.183166,theta,theta_H74A_iptg_0.1,0.024519,theta,0.0,1.0,18,-inf,inf,False,False
4868,H74A,iptg,0.01,0.453398,0.196925,theta,theta_H74A_iptg_0.01,0.715383,theta,0.0,1.0,16,-inf,inf,False,False
4864,H74A,iptg,0.0,1.003997,0.193973,theta,theta_H74A_iptg_0.0,1.0,theta,0.0,1.0,12,-inf,inf,False,False
4866,H74A,iptg,0.001,0.769961,0.188886,theta,theta_H74A_iptg_0.001,0.996037,theta,0.0,1.0,14,-inf,inf,False,False
4867,H74A,iptg,0.003,0.612956,0.184721,theta,theta_H74A_iptg_0.003,0.965431,theta,0.0,1.0,15,-inf,inf,False,False
4865,H74A,iptg,0.0001,0.788357,0.18656,theta,theta_H74A_iptg_0.0001,0.99996,theta,0.0,1.0,13,-inf,inf,False,False


In [7]:
# This function expands the genotype column into per-site columns
# (wt_aa_N, resid_N, mut_aa_N for each mutated site, plus num_muts, as seen
# in the output). All of the NaN values are for wildtype: wt rows have no
# mutated sites, so their site columns are empty and num_muts is 0.
out_df = tfscreen.genetics.expand_genotype_columns(df)
out_df

Unnamed: 0,genotype,wt_aa_1,resid_1,mut_aa_1,wt_aa_2,resid_2,mut_aa_2,num_muts,titrant_name,titrant_conc,...,name,guess,transform,scale_mu,scale_sigma,idx,lower_bounds,upper_bounds,censored,fixed
0,wt,,,,,,,0,iptg,0.0000,...,theta_wt_iptg_0.0,1.000000,theta,0.0,1.0,12,-inf,inf,False,False
1,wt,,,,,,,0,iptg,0.0001,...,theta_wt_iptg_0.0001,0.999960,theta,0.0,1.0,13,-inf,inf,False,False
2,wt,,,,,,,0,iptg,0.0010,...,theta_wt_iptg_0.001,0.996037,theta,0.0,1.0,14,-inf,inf,False,False
3,wt,,,,,,,0,iptg,0.0030,...,theta_wt_iptg_0.003,0.965431,theta,0.0,1.0,15,-inf,inf,False,False
4,wt,,,,,,,0,iptg,0.0100,...,theta_wt_iptg_0.01,0.715383,theta,0.0,1.0,16,-inf,inf,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1694554,K59Y/S93C,K,59,Y,S,93,C,2,iptg,0.0010,...,theta_K59Y/S93C_iptg_0.001,0.996037,theta,0.0,1.0,56,-inf,inf,False,False
1694555,K59Y/S93C,K,59,Y,S,93,C,2,iptg,0.0030,...,theta_K59Y/S93C_iptg_0.003,0.965431,theta,0.0,1.0,57,-inf,inf,False,False
1694556,K59Y/S93C,K,59,Y,S,93,C,2,iptg,0.0100,...,theta_K59Y/S93C_iptg_0.01,0.715383,theta,0.0,1.0,58,-inf,inf,False,False
1694557,K59Y/S93C,K,59,Y,S,93,C,2,iptg,0.0300,...,theta_K59Y/S93C_iptg_0.03,0.218308,theta,0.0,1.0,59,-inf,inf,False,False
