### Imports

In [1]:
import pandas as pd
import numpy as np

### Datasets

In [2]:
# Load the sample metadata, keeping only the 'sample-id' and 'group' columns
metadata = pd.read_csv(
    'data/metadata.csv',
    usecols=['sample-id', 'group'],
)
metadata

Unnamed: 0,sample-id,group
0,ERR1072630,OTHER
1,ERR1072633,OTHER
2,ERR1072638,OTHER
3,ERR1072639,OTHER
4,ERR1072646,NORMAL
...,...,...
2822,ERR2092593,NORMAL
2823,ERR2092610,DEVIANT
2824,ERR2092680,NORMAL
2825,ERR2092729,NORMAL


In [3]:
# How many samples fall into each group
metadata.group.value_counts()

NORMAL     1414
OTHER      1271
DEVIANT     142
Name: group, dtype: int64

In [4]:
# Get NIM data (the first CSV column is used as the index of each frame)
nim_aminoacids = pd.read_csv('data/nim-aminoacids_400.csv', index_col=0)
# Keep only the first seven columns of the amino-acid NIM. An explicit
# positional slice replaces the previous drop() call, which passed a DataFrame
# as the label list and only worked because iterating a frame yields its
# column names.
nim_aminoacids = nim_aminoacids.iloc[:, :7].copy()
nim_aminoacidsD = pd.read_csv('data/nim-aminoacidsD_400.csv', index_col=0)
nim_sugars = pd.read_csv('data/nim-sugars_400.csv', index_col=0)
nim_vitamins = pd.read_csv('data/nim-vitamins_400.csv', index_col=0)

# Report the (rows, columns) of every NIM as a quick sanity check
print(f"NIM Amino Acids Shape: {nim_aminoacids.shape}")
print(f"NIM Amino Acids D Shape: {nim_aminoacidsD.shape}")
print(f"NIM Sugars Shape: {nim_sugars.shape}")
print(f"NIM Vitamins Shape: {nim_vitamins.shape}")

NIM Amino Acids Shape: (400, 7)
NIM Amino Acids D Shape: (400, 6)
NIM Sugars Shape: (400, 56)
NIM Vitamins Shape: (400, 11)


In [5]:
# Combine the four NIM frames into one, inner-joining on 'taxonomy'
# in the order: amino acids, amino acids D, sugars, vitamins
nim = nim_aminoacids
for nim_part in (nim_aminoacidsD, nim_sugars, nim_vitamins):
    nim = nim.merge(nim_part, how='inner', on='taxonomy')
nim

Unnamed: 0_level_0,Trp,His,Pro,Leu,Arg,Ile_Val,Tyr,Thr_D,Trp_D,His_D,...,B2,B3,B5,B6,B7,B9,B12,Q,Lipoate,K
taxonomy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Faecalibacterium prausnitzii,0.9377,0.4771,0.0000,0.0,0.0623,0.0,0.0,0.0000,0.0,0.0623,...,0.4771,1.0,1.0,1.0000,1.0000,0.9377,0.0623,1.0000,1.0000,1.0000
Phocaeicola vulgatus,0.0000,0.0000,0.1089,0.0,0.0000,0.0,0.0,0.8911,0.0,0.8911,...,0.0000,0.0,0.0,0.1089,0.1089,0.0000,0.1089,0.0000,0.1089,0.0000
Prevotella copri,0.0074,0.0000,0.0077,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,...,0.0000,0.0,0.0,0.0077,1.0000,0.0077,1.0000,0.3099,1.0000,0.0074
Bacteroides uniformis,0.0000,0.0000,0.0000,0.0,0.0000,0.0,0.0,1.0000,1.0,1.0000,...,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000
[Eubacterium] rectale,0.0000,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,...,0.0000,0.0,0.0,0.0000,1.0000,1.0000,0.0000,1.0000,1.0000,1.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bacteroides caccae/Bacteroides intestinalis/Alloprevotella rava,1.0000,1.0000,1.0000,1.0,1.0000,1.0,0.0,1.0000,0.0,1.0000,...,0.0000,1.0,0.0,0.0000,1.0000,0.0000,1.0000,0.0000,1.0000,0.0000
Ruthenibacterium lactatiformans/Fournierella massiliensis,1.0000,0.2000,0.0000,0.0,0.6000,0.0,0.0,0.2000,0.0,0.0000,...,1.0000,0.8,1.0,1.0000,1.0000,1.0000,0.0000,1.0000,1.0000,1.0000
Stenotrophomonas geniculata/maltophilia/pavanii,0.0000,0.0000,0.0000,0.0,0.0000,0.0,0.0,1.0000,1.0,1.0000,...,0.0000,0.0,0.0,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,1.0000
Tepidibaculum saccharolyticum/Ruminococcus albus,0.0000,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,...,0.5000,0.0,0.5,0.5000,1.0000,1.0000,1.0000,0.5000,1.0000,1.0000


In [6]:
# Load the taxonomy table, strip the '%' signs from the cells
# and convert everything to float64
taxonomy = (
    pd.read_csv('data/taxonomy_400.csv', index_col=0)
    .replace('%', '', regex=True)
    .astype(np.float64)
)
taxonomy

Unnamed: 0_level_0,ERR2032802,ERR1845748,ERR2092355,ERR1845937,ERR1249738,ERR1090583,ERR1250049,ERR1459183,ERR1075840,ERR1458892,...,ERR2057080,ERR1678465,ERR1074540,ERR1077998,ERR2033465,ERR1389801,ERR1845840,ERR1842195,ERR1075183,ERR1077294
taxonomy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Faecalibacterium prausnitzii,4.4,12.3,2.0,3.1,11.6,5.8,2.5,9.8,6.8,3.3,...,1.8,10.3,2.7,0.9,2.6,0.7,3.6,2.9,0.2,4.7
Phocaeicola vulgatus,23.4,11.0,19.4,10.9,4.4,5.7,1.7,16.8,5.2,16.1,...,0.5,0.3,0.1,0.0,8.5,5.3,0.5,0.1,0.1,0.0
Prevotella copri,0.1,0.0,0.1,32.4,48.0,71.8,63.9,0.0,0.0,0.0,...,0.0,0.9,7.6,0.1,0.0,0.0,0.7,0.0,0.2,0.5
Bacteroides uniformis,4.1,5.5,4.0,0.0,0.2,0.0,0.7,4.8,7.2,1.1,...,0.5,0.3,0.0,2.3,2.0,0.2,1.0,0.0,0.0,0.1
[Eubacterium] rectale,4.2,22.7,1.1,0.0,0.3,0.0,0.2,6.6,2.9,0.0,...,0.0,0.2,0.1,0.0,0.6,0.1,0.6,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bacteroides caccae/Bacteroides intestinalis/Alloprevotella rava,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ruthenibacterium lactatiformans/Fournierella massiliensis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Stenotrophomonas geniculata/maltophilia/pavanii,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Tepidibaculum saccharolyticum/Ruminococcus albus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Data Cleaning

In [7]:
# Split the metadata by group and keep only the sample-id column;
# to_numpy() on the one-column frame yields an (n, 1) array of ids
normal = metadata.loc[metadata['group'] == 'NORMAL'].drop(columns=['group'])
normal_ids = normal.to_numpy()

deviant = metadata.loc[metadata['group'] == 'DEVIANT'].drop(columns=['group'])
deviant_ids = deviant.to_numpy()

In [8]:
# Select the taxonomy columns belonging to the normal / deviant samples
# (the id arrays are flattened to 1-D first) and convert straight to numpy arrays
normal_taxonomy = taxonomy[normal_ids.ravel()].to_numpy()
deviant_taxonomy = taxonomy[deviant_ids.ravel()].to_numpy()

### Computation

In [9]:
# Percentile check for a single microbe:
# 10th / 90th percentile of row 0 (Faecalibacterium prausnitzii) across the normal samples
vlow_test, vhigh_test = np.percentile(normal_taxonomy[0], [10, 90])

print("Faecalibacterium prausnitzii")
print(f"v_low : {vlow_test}\nv_high: {vhigh_test}")

Faecalibacterium prausnitzii
v_low : 5.4
v_high: 18.3


In [10]:
# Per-row 10th / 90th percentiles across the normal samples,
# widened by an allowance of 5 percentage points on each end
vlow, vhigh = np.percentile(normal_taxonomy, [10, 90], axis=1)
vlow = vlow - 5
vhigh = vhigh + 5

print("Faecalibacterium prausnitzii")
print(f"v_low : {vlow[0]}\nv_high: {vhigh[0]}")

Faecalibacterium prausnitzii
v_low : 0.40000000000000036
v_high: 23.3


In [11]:
def violations(u, low=None, high=None):
    """
    Count, per sample, the ASVs whose relative abundance is out of range.

    Every row of `u` is rescaled so that it sums to 100, then each entry is
    compared with its bounds. An entry is a violation when it is strictly
    below its lower bound or strictly above its upper bound.

    Parameters
    ----------
    u : array-like, shape (n_asvs,) or (n_samples, n_asvs)
        Abundances, one sample per row. A 1-D input is treated as one sample.
    low, high : array-like of shape (n_asvs,), optional
        Per-ASV lower / upper bounds. When omitted, the notebook-level
        `vlow` / `vhigh` arrays are used, as before.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Number of violations for each sample. A row whose sum is zero turns
        into NaN when rescaled; NaN compares False against both bounds, so
        such a row reports 0 violations.
    """
    u = np.asarray(u)
    # Fall back to the notebook globals only when no bounds were supplied
    low = np.asarray(vlow if low is None else low)
    high = np.asarray(vhigh if high is None else high)

    if u.ndim == 1:
        u = u[np.newaxis, :]

    assert u.shape[1] == len(low) == len(high), (
        f"expected {len(low)} ASVs per sample, got {u.shape[1]}"
    )

    # Zero-sum rows divide 0 by 0; silence that locally instead of relying on
    # callers changing NumPy's global error state with np.seterr.
    with np.errstate(invalid='ignore'):
        u = u / u.sum(axis=1, keepdims=True) * 100
        vio_low = u < low[np.newaxis, :]
        vio_high = u > high[np.newaxis, :]

    vio = (vio_low | vio_high).astype(np.int32).sum(axis=1)

    return vio

In [12]:
# violations() expects one sample per row, so transpose the matrices first
normal_taxonomy_violoation = violations(normal_taxonomy.T)
deviant_taxonomy_violoation = violations(deviant_taxonomy.T)

# Per-sample violation counts and their mean, for each group
print(f"Normal Taxonomy \n{normal_taxonomy_violoation}\nMean Violoations: {normal_taxonomy_violoation.mean()}\n")
print(f"Deviant Taxonomy \n{deviant_taxonomy_violoation}\nMean Violoations: {deviant_taxonomy_violoation.mean()}\n")

Normal Taxonomy 
[0 1 0 ... 3 0 1]
Mean Violoations: 1.1364922206506365

Deviant Taxonomy 
[ 5  8  2  2  5  1  5  3  4  6  3  5  4  4  2  4  4  4  6  3  2  4  4  3
  7  2  2  5  3  3  7  3  4  2  6  1  3  2  6  4  2  4  3  2  3  6  5  4
  3  4  4  5  6  5 11  6  7  8  6  7  5  7  6  8  8  7  5  8  8  6  7  5
  8  6  5  8  7  6  8  8  7  5 10  6  5  8  7  8  9  6  2  3  2  3  2  2
  3  5  3  1  1  7  1  1  2  3  3  4  1  1  2  4  3  4  1  7  3  4  5  6
  3  1  2  8  2  1  3  3  1  2  4  4  4  2  5  1  4  3  1  1  7  5]
Mean Violoations: 4.345070422535211



In [13]:
# Transpose Deviant Taxonomy to be compatible with NIM (one sample per row)
deviant_taxonomy_T = np.transpose(deviant_taxonomy)

# Sample usage: weight the 'His' column of the NIM by the first deviant sample
working = nim['His'].mul(deviant_taxonomy_T[0])
print(working)

taxonomy
Faecalibacterium prausnitzii                                       2.3855
Phocaeicola vulgatus                                               0.0000
Prevotella copri                                                   0.0000
Bacteroides uniformis                                              0.0000
[Eubacterium] rectale                                              0.0000
                                                                    ...  
Bacteroides caccae/Bacteroides intestinalis/Alloprevotella rava    0.0000
Ruthenibacterium lactatiformans/Fournierella massiliensis          0.0000
Stenotrophomonas geniculata/maltophilia/pavanii                    0.0000
Tepidibaculum saccharolyticum/Ruminococcus albus                   0.0000
Sellimonas intestinalis/Drancourtella massiliensis                 0.0000
Name: His, Length: 400, dtype: float64


In [14]:
# Impact on the first deviant sample: one weighted Series per NIM column
arr = []
for n in nim.columns:
    working = nim[n].mul(deviant_taxonomy_T[0])
    arr.append(working)
# TODO: use vhigh / vlow to filter these results.

### Impact

In [15]:
### Deviant Taxonomy

In [16]:
# Transpose Deviant Taxonomy to be compatible with NIM (one sample per row)
deviant_taxonomy_T = deviant_taxonomy.T

# NIM as a plain numpy array
nim_np = nim.to_numpy()

# First deviant sample, turned into a column vector so that it
# broadcasts across the NIM columns
deviant_sample_0 = deviant_taxonomy_T[0][:, np.newaxis]

# Check shapes
print(f"Shape of NIM: {nim_np.shape}")
print(f"Shape of Test Deviant Sample: {deviant_sample_0.shape}")

# Nutrient impact on a single deviant sample (element-wise, broadcast product)
NIM_deviant_0 = nim_np * deviant_sample_0
NIM_deviant_0

Shape of NIM: (400, 80)
Shape of Test Deviant Sample: (400, 1)


array([[4.6885 , 2.3855 , 0.     , ..., 5.     , 5.     , 5.     ],
       [0.     , 0.     , 0.03267, ..., 0.     , 0.03267, 0.     ],
       [0.     , 0.     , 0.     , ..., 0.     , 0.     , 0.     ],
       ...,
       [0.     , 0.     , 0.     , ..., 0.     , 0.     , 0.     ],
       [0.     , 0.     , 0.     , ..., 0.     , 0.     , 0.     ],
       [0.1    , 0.     , 0.1    , ..., 0.1    , 0.1    , 0.1    ]])

In [17]:
# Testing.
# violations() divides each row by its sum, which is 0/0 if a row sums to
# zero; tell NumPy to stay quiet about those invalid values.
np.seterr(invalid='ignore')
# Violations of this sample after the i-th nutrient is supplied
# (one row per nutrient after the transpose).
NIM_dev0_violation = violations(NIM_deviant_0.T)

# Change in number of violations after the i-th nutrient intervention
NIM_dev0_violation - deviant_taxonomy_violoation[0]

array([ 0,  0,  0, -1, -1,  0, -1, -2,  0, -3, -2, -1, -2,  2,  1,  3,  1,
        1,  0,  1,  1,  0, -2,  0,  2,  0,  3,  2, -2,  1,  0,  1, -1, -2,
       -2, -3, -1,  2, -1,  0,  1,  2,  1,  2, -1,  1, -1,  3, -3,  2, -3,
       -2,  0, -1,  1, -1,  2, -2, -3, -3, -2, -1,  0, -1, -1, -1, -3, -5,
       -2, -1,  1, -1,  1,  0,  0, -3, -2,  1,  1,  1])

In [34]:
# Quick (inefficient) aggregation that repeats the single-sample steps
# above for every deviant sample.

violation_change_deviant = []
for i, deviant_sample in enumerate(deviant_taxonomy_T):
    # Weight the NIM by this sample, then count violations per nutrient
    NIM_deviant = nim_np * deviant_sample[:, np.newaxis]
    NIM_dev_violation = violations(NIM_deviant.T)

    # Change relative to the sample's violation count before any intervention
    violation_change_deviant.append(NIM_dev_violation - deviant_taxonomy_violoation[i])

# Stack into one array: a row per deviant sample, a column per nutrient
violation_change_deviant = np.array(violation_change_deviant)
violation_change_deviant

array([[ 0,  0,  0, ...,  1,  1,  1],
       [-3, -3, -4, ..., -4, -2, -2],
       [ 6,  3,  2, ...,  4,  5,  5],
       ...,
       [ 6,  4,  4, ...,  4,  3,  3],
       [-1, -2, -3, ..., -1,  1,  1],
       [-3, -1, -2, ..., -3,  0, -1]])

In [19]:
### Normal Taxonomy

In [35]:
# Transpose Normal Taxonomy to be compatible with NIM (one sample per row)
normal_taxonomy_T = normal_taxonomy.T

# NIM into numpy
nim_np = nim.to_numpy()

# Get a single normal sample for testing, as a column vector so that it
# broadcasts across the NIM columns
normal_sample_0 = normal_taxonomy_T[0]
normal_sample_0 = normal_sample_0[:, np.newaxis]

# Check shapes (the label previously said "Deviant" although this is the
# normal sample)
print(f"Shape of NIM: {nim_np.shape}")
print(f"Shape of Test Normal Sample: {normal_sample_0.shape}")

# Nutrient impact on a single normal sample
NIM_normal_0 = np.multiply(nim_np, normal_sample_0)
NIM_normal_0

Shape of NIM: (400, 80)
Shape of Test Deviant Sample: (400, 1)


array([[12.47141,  6.34543,  0.     , ..., 13.3    , 13.3    , 13.3    ],
       [ 0.     ,  0.     ,  1.37214, ...,  0.     ,  1.37214,  0.     ],
       [ 0.     ,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ],
       ...,
       [ 0.     ,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ],
       [ 0.     ,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ],
       [ 0.     ,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ]])

In [36]:
# Testing.
# violations() divides each row by its sum, which is 0/0 if a row sums to
# zero; tell NumPy to stay quiet about those invalid values.
np.seterr(invalid='ignore')
# Violations of this sample after the i-th nutrient is supplied
# (one row per nutrient after the transpose).
NIM_norm0_violation = violations(NIM_normal_0.T)

# Change in number of violations after the i-th nutrient intervention
NIM_norm0_violation - normal_taxonomy_violoation[0]

array([4, 2, 6, 7, 5, 7, 5, 5, 4, 3, 4, 5, 6, 1, 1, 2, 1, 2, 2, 4, 3, 6,
       4, 4, 5, 4, 1, 5, 2, 4, 6, 4, 4, 3, 2, 4, 5, 5, 5, 5, 2, 5, 9, 6,
       4, 2, 2, 5, 3, 6, 5, 5, 8, 1, 4, 6, 6, 7, 1, 1, 3, 5, 2, 4, 3, 2,
       3, 3, 3, 3, 5, 1, 3, 3, 3, 2, 6, 3, 1, 1])

In [37]:
# Same aggregation as for the deviant samples, applied to every normal sample
violation_change_normal = []

for i, normal_sample in enumerate(normal_taxonomy_T):
    # Weight the NIM by this sample, then count violations per nutrient
    NIM_normal = nim_np * normal_sample[:, np.newaxis]
    NIM_norm_violation = violations(NIM_normal.T)

    # Change relative to the sample's violation count before any intervention
    violation_change_normal.append(NIM_norm_violation - normal_taxonomy_violoation[i])

# Stack into one array: a row per normal sample, a column per nutrient
violation_change_normal = np.array(violation_change_normal)
violation_change_normal

array([[ 4,  2,  6, ...,  3,  1,  1],
       [ 1,  2,  3, ...,  2,  2,  1],
       [ 3,  4,  7, ...,  1,  2,  3],
       ...,
       [ 1,  1,  2, ...,  0, -1, -1],
       [ 4,  4,  4, ...,  2,  0,  2],
       [ 3,  3,  4, ...,  1,  0, -1]])

In [None]:
### Find the top nutrients that reduce the number of violations the most
### in violation_change_deviant

In [70]:
# Show the per-sample, per-nutrient change matrix together with its shape
(violation_change_deviant, violation_change_deviant.shape)

(array([[ 0,  0,  0, ...,  1,  1,  1],
        [-3, -3, -4, ..., -4, -2, -2],
        [ 6,  3,  2, ...,  4,  5,  5],
        ...,
        [ 6,  4,  4, ...,  4,  3,  3],
        [-1, -2, -3, ..., -1,  1,  1],
        [-3, -1, -2, ..., -3,  0, -1]]),
 (142, 80))

In [71]:
# Total change in the number of violations after the i-th nutrient
# intervention, summed over all deviant samples (one entry per nutrient)
violation_change_sum = np.sum(violation_change_deviant, axis=0)
violation_change_sum

array([ -20,   25,   46,   84,   31,   78,   75,  -70,  -14,  -94, -102,
        -83, -123,   -5,    2,  -44,  -16,  -66,  -76,  -58,   55,   49,
        -89,   60, -103,  -67,  -86, -103, -139, -239,  -85, -128,   17,
       -228, -125, -272, -260,   12, -171,    4, -183,  -79,   74,  -82,
       -131,  -24, -180,  -22, -142,  -62, -179, -153,   56,  -75, -138,
        -44,  -96, -142, -205, -233, -246, -231, -301, -168, -279, -399,
       -402, -466, -395,   18,   46,   35,   44,  -37,   -2,  -88,   12,
        -43,   -6,    3])

In [80]:
# Indices of the 10 nutrients with the lowest summed change,
# i.e. those that reduced the number of violations the most
smallest_indices = violation_change_sum.argsort()[:10]
smallest_indices

array([67, 66, 65, 68, 62, 64, 35, 36, 60, 29], dtype=int64)