In [1]:
# Standard imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

# Insert mavenn at beginning of path
# Both paths are relative, so they resolve against the current working
# directory; inserting at index 0 makes these entries the first ones searched
# when the imports below run.
import sys
path_to_mavenn_local = '../../../../'
sys.path.insert(0,path_to_mavenn_local)
path_to_suftware_local = '../../../../../suftware/'
sys.path.insert(0,path_to_suftware_local)

# Load mavenn and check path
# (printing __path__ shows which copy of the package was actually imported)
import mavenn
print(mavenn.__path__)

# Load suftware and check path
# (printing __file__ shows which copy of the module was actually imported)
import suftware
print(suftware.__file__)

# MAVE-NN utilities
from mavenn.src.entropy import entropy_continuous

# Useful constants
# NOTE(review): neither constant is referenced in the cells visible here.
pi = np.pi
e = np.exp(1)

['../../../../mavenn']
../../../../../suftware/suftware.py


In [2]:
# Fetch the 'gb1' example dataset through mavenn's loader
data_df = mavenn.load_example_dataset('gb1')

# Record the number of rows, report it, and show the first few rows
N = data_df.shape[0]
print(f'N: {N}')
data_df.head()

N: 530737


Unnamed: 0,set,input_ct,selected_ct,y,x
0,training,73.0,62.0,-1.021847,QYKLILNGKTLKGETTTEAHDAATAEKVFKQYANDNGVDGEWTYDD...
1,training,122.0,0.0,-7.732188,QYKLILNGKTLKGETTTEAVDAATAEKVFPQYANDNGVDGEWTYDD...
2,training,794.0,598.0,-1.198072,QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANKNGVDGEWTLDD...
3,training,1115.0,595.0,-1.694626,QYKLILNIKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDS...
4,validation,97.0,2.0,-5.819421,QYKLINNGKTLKGETTTEAVDAATAEKVFKIYANDNGVDGEWTYDD...


In [3]:
# Select N_sub sequences to estimate intrinsic information on.
# The rows are drawn without replacement from a locally seeded generator so
# that the same subsample (and therefore the same downstream estimates) is
# obtained on every Restart & Run All. Change SUBSAMPLE_SEED to draw a
# different subsample.
N_sub = 10000
SUBSAMPLE_SEED = 0
subsample_rng = np.random.default_rng(SUBSAMPLE_SEED)
ix = subsample_rng.choice(N, size=N_sub, replace=False)

# Copy the selected rows and renumber them 0..N_sub-1
sub_df = data_df.iloc[ix].copy().reset_index(drop=True)
sub_df.head()

Unnamed: 0,set,input_ct,selected_ct,y,x
0,training,158.0,541.0,0.979593,QYKLILNGWTLKGETTTEAVDAATAEKVFKQYANDNGVDGMWTYDD...
1,training,95.0,0.0,-7.374636,QYKLILCGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEGTYDD...
2,training,629.0,863.0,-0.333994,CYKLILNGKTLKGETTTEAVDSATAEKVFKQYANDNGVDGEWTYDD...
3,training,226.0,1283.0,1.710208,QYKLILNGKTLKGETTAEAVDAPTAEKVFKQYANDNGVDGEWTYDD...
4,validation,37032.0,583.0,-6.776373,QYKLILNGKTLKGETTTEAVDAATLEKVFKQYANDNGVDGSWTYDD...


In [4]:
# Pull the input and selected count columns out of the subsample as arrays
i_n, o_n = (sub_df[col].values for col in ('input_ct', 'selected_ct'))

# Ratio of counts with +1 added to numerator and denominator, then its log2
r_n = (o_n + 1) / (i_n + 1)
y_n = np.log2(r_n)

In [25]:
# Resample counts and compute list of ys.
# For each of the N_sub sequences, draw K Poisson replicates of the input and
# selected counts, using the observed counts as the Poisson means, and
# recompute y for every replicate. Result arrays have shape (N_sub, K).
K = 1000

# A locally seeded generator keeps the resampling reproducible across runs.
RESAMPLE_SEED = 1
resample_rng = np.random.default_rng(RESAMPLE_SEED)

# The Poisson means are the observed counts repeated along the K axis.
# broadcast_to yields read-only (N_sub, K) views instead of materializing two
# dense float arrays; the reshape still fails loudly if i_n / o_n do not have
# exactly N_sub entries.
mu_i_nk = np.broadcast_to(i_n.reshape([N_sub, 1]), (N_sub, K))
mu_o_nk = np.broadcast_to(o_n.reshape([N_sub, 1]), (N_sub, K))
i_nk = resample_rng.poisson(lam=mu_i_nk)
o_nk = resample_rng.poisson(lam=mu_o_nk)

# Same +1-regularized log2 ratio as used for the observed counts
r_nk = (o_nk+1)/(i_nk+1)
y_nk = np.log2(r_nk)

In [27]:
# Per-sequence kNN entropy estimate over that sequence's resampled y values
H_n_knn = np.zeros(N_sub)
for n, resampled_row in enumerate(y_nk):
    y_k = resampled_row.copy()
    H_n_knn[n] = entropy_continuous(y_k, knn=5, uncertainty=False, resolution=.1)

# Average over sequences; the error bar is std / sqrt(N_sub)
H_ygx_knn = H_n_knn.mean()
dH_ygx_knn = H_n_knn.std() / np.sqrt(N_sub)
print(f'H[y|x] (knn): {H_ygx_knn:.4f} +- {dH_ygx_knn:.4f} bits')

H[y|x] (knn): -0.1406 +- 0.0110 bits


In [28]:
# kNN entropy estimate of the observed y values across the subsample,
# requested together with an uncertainty estimate
y = y_n.copy()
H_y_knn, dH_y_knn = entropy_continuous(
    y,
    knn=5,
    uncertainty=True,
    num_subsamples=100,
)
print(f'H[y] (knn): {H_y_knn:.4f} +- {dH_y_knn:.4f} bits')

H[y] (knn): 3.4219 +- 0.0090 bits


In [29]:
# Intrinsic information is H[y] minus H[y|x]; print one line per H[y|x] estimate
H_ygx_estimates = [('knn', H_ygx_knn, dH_ygx_knn)]
for name, H_ygx, dH_ygx in H_ygx_estimates:
    I_y_x = H_y_knn - H_ygx
    # The two entropy uncertainties are combined in quadrature
    dI_y_x = np.sqrt(dH_y_knn**2 + dH_ygx**2)
    print(f'I_intr ({name}): {I_y_x:.4f} +- {dI_y_x:.4f} bits')

# TODO: Would be nice to see a plot of this

I_intr (knn): 3.5624 +- 0.0143 bits
