In [3]:
# Main imports
import numpy as np
import pandas as pd

# Import matplotlib and such
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
plt.ion()

from helper_functions import \
    thermodynamic_model, \
    plot_manifold_model, \
    plot_manifold_measurements, \
    get_measurement_subset_df
    

In [4]:
# Set global styles
textwidth=5.5
halfcolwidth=3
fontsize=8
titlesize=10
panel_label_size=12
legend_fontsize=7
markersize=5
mpl.rcParams['font.size'] = fontsize
mpl.rcParams['text.usetex'] = False

# Get default colors
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
color10 = colors[1]
color35 = np.array([.99, .50, .16])*.7

In [6]:
# Load clonal measurements 
data_df = pd.read_excel('../data/results.xlsx', sheet_name='measurements_summary').set_index('location')
data_df.head()

Unnamed: 0_level_0,name,log_t+,dlog_t+,log_t-,dlog_t-,num_t+,num_t-,outlier,spacing,sequence
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
b3C9,60c-r18,-1.564814,0.302713,-1.797453,0.122828,5,6,0.0,,CGCAATGAATCACTCCATTGAGTGTTTTGAGGGTCCCAGGCTTTAC...
b5E2,61c-oc-2,-5.308959,0.114674,-3.535456,0.081788,3,6,,-2.5,CGCAATGAATCACTCCATTGAGTGTTTTGAGGGTCCCCAGGCTTTA...
b5E1,61c-oc-2,-7.081135,0.270409,-3.405931,0.205113,3,6,,-2.5,CGCAATGAATCACTCCATTGAGTGTTTTGAGGGTCCCCAGGCTTTA...
b5E4,61c-oc0,-1.691993,0.146085,1.357366,0.088154,6,6,0.0,0.5,CGCAATGAATCACTCCATTGAGTGTTTTGAGGGTCCCCAGGCTTTA...
b5E6,61c-oc1,-6.037679,0.137889,-4.26183,0.591296,3,5,,1.5,CGCAATGAATCACTCCATTGAGTGTTTTGAGGGTCCCCAGGCTTTA...


In [20]:
all_locations = set(data_df.index)
len(all_locations)

663

In [29]:
# Compile a list of promoter outliers
outlier_indices = (data_df['outlier'] == True)
outlier_locations = list(data_df.index[outlier_indices])
outlier_locations

['b2E2',
 'b2E4',
 'b1B7',
 'b9D5',
 'b10G3',
 'b2F4',
 'b15A4',
 'b15C4',
 'b5G5',
 'b5G7']

In [48]:
# CRP null promoters
patterns = [
    'r17\.35L..\.10cons',
    'c-r17',
    '61c-r18'
]
names = []


# Complile a list of 61c- promoters
import re
for pattern in patterns:
    names.extend([name for name in data_df['name'] if re.match(pattern,name)])

tmp_df = data_df.copy()
tmp_df.reset_index(inplace=True, drop=False)
tmp_df.set_index('name', inplace=True)
tmp_df.head()

crp_null_locations = list(tmp_df['location'][names])
crp_null_locations
                 

['b18C9',
 'b18D2',
 'b18D3',
 'b18D5',
 'b18D6',
 'b18E1',
 'b18E2',
 'b18E3',
 'b18E4',
 'b18E5',
 'b18E6',
 'b18E7',
 'b18E8',
 'b18E9',
 'b18F1',
 'b1A7',
 'b1A8',
 'b1A7',
 'b1A8',
 'b1E3',
 'b1E4',
 'b1E5',
 'b1E7',
 'b1E8',
 'b1F1',
 'b1F2',
 'b1F3',
 'b1F4',
 'b1F5',
 'b1F6',
 'b1F7',
 'b1F8',
 'b1F9',
 'b1G1',
 'b1G2',
 'b1G3',
 'b1G4',
 'b1G5',
 'b1G6',
 'b1G7',
 'b1G8',
 'b3A5',
 'b3A7']

In [30]:
# Complile a list of c40 promoters
c40_indices = ['c40' in name for name in data_df['name'] ]
c40_locations = list(data_df.index[c40_indices])
c40_locations

['b14E3',
 'b14E4',
 'b14E5',
 'b14E6',
 'b14E8',
 'b14E9',
 'b14F1',
 'b14F3',
 'b14F4',
 'b14G7',
 'b14G8',
 'b14G9',
 'b14H1',
 'b14H2',
 'b14H3',
 'b14H4',
 'b14H5',
 'b14H6',
 'b14H7',
 'b14H8',
 'b15C6',
 'b15C7',
 'b15C8',
 'b15C9',
 'b15D1',
 'b15D2',
 'b15D3',
 'b15D4',
 'b15D5',
 'b15D6',
 'b12I1',
 'b12I2',
 'b12I3']

In [26]:
# Compile a list of all promoters used for model fitting
xl = pd.ExcelFile('../data/results.xlsx',)
resamp_sheets = [name for name in xl.sheet_names if 'resamp' in name] 
resamp_sheets

# Load each sheet, then parse locations from names
resamp_locations = []
for sheet in resamp_sheets:
    resamp_df = pd.read_excel('../data/results.xlsx', sheet_name=sheet)
    locs = [col.split('_')[0] for col in resamp_df.columns if '_log_P' in col]
    print('In sheet %s: %d locations'%(sheet,len(locs)))
    resamp_locations.extend(locs)
    
# Make unique
resamp_locations = list(set(resamp_locations))
print('Total resamp locations: %d'%len(resamp_locations))

In sheet occlusion_resamp: 41 locations
In sheet c61_resamp: 44 locations
In sheet conjoined_resamp: 284 locations
In sheet occlusion_resamp_2.5uM: 8 locations
In sheet occlusion_resamp_5.0uM: 8 locations
In sheet occlusion_resamp_10.0uM: 8 locations
In sheet occlusion_resamp_25.0uM: 8 locations
In sheet occlusion_resamp_50.0uM: 7 locations
In sheet occlusion_resamp_125.0uM: 6 locations
In sheet occlusion_resamp_250.0uM: 41 locations
In sheet c60_beta_resamp: 21 locations
In sheet c61_beta_resamp: 62 locations
In sheet c62_beta_resamp: 23 locations
In sheet c63_beta_resamp: 20 locations
In sheet c64_beta_resamp: 17 locations
In sheet c65_beta_resamp: 17 locations
In sheet c71_beta_resamp: 35 locations
In sheet c72_beta_resamp: 20 locations
In sheet c81_beta_resamp: 32 locations
In sheet c82_beta_resamp: 20 locations
In sheet c41_beta_resamp: 21 locations
Total resamp locations: 364


In [49]:
# Gather all locations
locations_used = list(set(outlier_locations + crp_null_locations + c40_locations + resamp_locations))
print('Total locations used: %d'%len(locations_used))

Total locations used: 448


In [56]:
pruned_df = data_df.copy()
pruned_df = pruned_df.loc[locations_used,:]
pruned_df.sort_values(by='name', inplace=True)
pruned_df.head()

Unnamed: 0_level_0,name,log_t+,dlog_t+,log_t-,dlog_t-,num_t+,num_t-,outlier,spacing,sequence
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
b5E4,61c-oc0,-1.691993,0.146085,1.357366,0.088154,6,6,0.0,0.5,CGCAATGAATCACTCCATTGAGTGTTTTGAGGGTCCCCAGGCTTTA...
b5A8,61c-ocl,-4.955585,0.477209,-2.652523,0.185727,12,6,0.0,4.5,CGCAATGAATCACTCCATTGAGTGTTTTGAGGGTCCCCAGGCTTTA...
b5B2,61c-ocl.35L01,-5.426847,1.395136,-3.139291,0.053276,15,9,0.0,4.5,CGCAATGAATCACTCCATTGAGTGTTTTGAGGGTCCCCAGGGTTTA...
b5B3,61c-ocl.35L02,-5.057494,1.232833,-2.840256,0.373761,15,9,0.0,4.5,CGCAATGAATCACTCCATTGAGTGTTTTGAGGGTCCCCAGGCTTTA...
b5B5,61c-ocl.35L04,-4.600446,0.550925,-4.516905,0.056885,14,6,0.0,4.5,CGCAATGAATCACTCCATTGAGTGTTTTGAGGGTCCCCACCCTTAA...


In [60]:
len(pruned_df)

448

In [62]:
# Save pruned measurements
writer = pd.ExcelWriter('../data/pruned_results.xlsx')
pruned_df.to_excel(writer,'pruned_measurements_summary')
writer.save()