In [1]:
# Standard imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

# Load mavenn and check path
import mavenn
print(mavenn.__path__)

# MAVE-NN utilities
from mavenn.src.entropy import entropy_continuous

# Mathematical constants (pi and Euler's number) used in the Gaussian
# entropy formula and the log2(e) unit conversion in later cells
pi, e = np.pi, np.exp(1)

['/Users/jkinney/github/mavenn/mavenn']


In [2]:
# Load the 'gb1' example dataset through mavenn
data_df = mavenn.load_example_dataset('gb1')

# Record the number of rows, report it, and display the first few rows
N = data_df.shape[0]
print(f'N: {N}')
data_df.head()

N: 530737


Unnamed: 0,set,input_ct,selected_ct,y,x
0,training,73.0,62.0,-1.021847,QYKLILNGKTLKGETTTEAHDAATAEKVFKQYANDNGVDGEWTYDD...
1,training,122.0,0.0,-7.732188,QYKLILNGKTLKGETTTEAVDAATAEKVFPQYANDNGVDGEWTYDD...
2,training,794.0,598.0,-1.198072,QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANKNGVDGEWTLDD...
3,training,1115.0,595.0,-1.694626,QYKLILNIKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDS...
4,validation,97.0,2.0,-5.819421,QYKLINNGKTLKGETTTEAVDAATAEKVFKIYANDNGVDGEWTYDD...


In [3]:
# Select N_sub sequences to estimate intrinsic information on.
# The row indices are drawn (without replacement) from a seeded generator so
# that the same rows are selected every time the notebook is re-run; the
# unseeded global np.random state would give a different subsample each run.
N_sub = 10000
SEED = 0  # arbitrary fixed seed; change it to draw a different subsample
rng = np.random.default_rng(SEED)
ix = rng.choice(N, size=N_sub, replace=False)

# Copy the selected rows and renumber them 0..N_sub-1
sub_df = data_df.iloc[ix].copy().reset_index(drop=True)
sub_df.head()

Unnamed: 0,set,input_ct,selected_ct,y,x
0,training,316.0,47.0,-3.51305,QYKLILNDKTLKGETTTEAVDAATAEKVFKQYANWNGVDGEWTYDD...
1,validation,709.0,237.0,-2.366531,QYKLILNGKTLKLETTTNAVDAATAEKVFKQYANDNGVDGEWTYDD...
2,test,35.0,0.0,-5.959598,QYKLALNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEYTYDD...
3,training,580.0,6.0,-7.164713,QYKLILNGKTLKGETTTEAVDAATAEKVFFQYQNDNGVDGEWTYDD...
4,validation,2510.0,13.0,-8.276365,QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...


In [4]:
# Pull the input and selected count columns out of the subsample as arrays
in_ct_col, out_ct_col = 'input_ct', 'selected_ct'
i_n, o_n = (sub_df[col].values for col in (in_ct_col, out_ct_col))

# Ratio of selected to input counts, with 1 added to each count so the
# ratio is defined (and its log finite) when a count is zero
r_n = np.divide(o_n + 1, i_n + 1)

# log2 of that ratio, one value per sequence
y_n = np.log2(r_n)

In [5]:
# Naive per-sequence variance of y_n: the sum of reciprocal (count + 1) terms,
# scaled by log2(e)^2 to convert from natural-log to log2 units
log2e_sq = np.log2(e)**2
inv_count_sum = 1./(o_n+1.) + 1./(i_n+1)
dy2_naive_n = log2e_sq*inv_count_sum

# Entropy (bits) of a Gaussian with that variance, one value per sequence
two_pi_e = 2*pi*e
H_n_naive = 0.5*np.log2(two_pi_e*dy2_naive_n)

# Average over sequences, with the standard error of that average
H_ygx_naive = H_n_naive.mean()
dH_ygx_naive = H_n_naive.std()/np.sqrt(N_sub)
print(f'H[y|x] (naive): {H_ygx_naive:.4f} +- {dH_ygx_naive:.4f} bits')

H[y|x] (naive): 0.2125 +- 0.0142 bits


In [6]:
# Estimate the entropy of the y_n values with mavenn's k-nearest-neighbour
# estimator, requesting an uncertainty estimate as well.
# NOTE(review): the result is printed as bits; presumably entropy_continuous
# returns bits -- confirm against the mavenn documentation.
y = y_n.copy()  # hand the estimator a copy rather than y_n itself
knn_kwargs = dict(knn=5, uncertainty=True, num_subsamples=100)
H_y_knn, dH_y_knn = entropy_continuous(y, **knn_kwargs)
print(f'H[y] (knn): {H_y_knn:.4f} +- {dH_y_knn:.4f} bits')

H[y] (knn): 3.4252 +- 0.0090 bits


In [7]:
# Intrinsic information estimate: I = H[y] - H[y|x], using the knn estimate
# of H[y] and the naive estimate of H[y|x]
I_y_x = H_y_knn - H_ygx_naive

# Combine the two uncertainties in quadrature
dI_y_x = np.sqrt(sum(err**2 for err in (dH_y_knn, dH_ygx_naive)))
print(f'I_intr (naive): {I_y_x:.4f} +- {dI_y_x:.4f} bits')

I_intr (naive): 3.2127 +- 0.0168 bits
