In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import scipy.io
import random
from collections import defaultdict
import math
import re
import matplotlib.pyplot as plt
from sklearn import preprocessing, linear_model
from sklearn.metrics import precision_score, recall_score
import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# Load the feature matrix (X) and the regression targets (y) saved as .npy files.
co_occ_X, co_occ_y = (
    np.load(npy_path)
    for npy_path in (
        'Generated_Data/co-occ/co_occ_X.npy',
        'Generated_Data/co-occ/co_occ_y.npy',
    )
)

In [28]:
# Fixed seed so that Restart Kernel -> Run All reproduces the same split
# (and therefore the same downstream metrics). A private Random instance is
# used so the global `random` state is left untouched.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)

total_size = len(co_occ_X)
train_size = int(total_size*0.8)
val_size = int(total_size*0.1)

# Shuffle the row indices once and slice: 80% train, 10% validation, and the
# remainder as test. The three index lists are disjoint by construction.
shuffled_indices = list(range(total_size))
split_rng.shuffle(shuffled_indices)

tr = sorted(shuffled_indices[:train_size])
val = sorted(shuffled_indices[train_size:train_size + val_size])
te = sorted(shuffled_indices[train_size + val_size:])

# Sanity check: every row ends up in exactly one partition.
assert len(tr) + len(val) + len(te) == total_size
assert len(set(tr) | set(val) | set(te)) == total_size

### Baseline 1: get one-hot vectors of reactions for two bacteria, and feed into linear regression
- cannot generalize to unseen reactions
- misses on average by 3.5 points

In [29]:
def _binarize(targets):
    """Return a list with 0 where a target is <= 0 and 1 otherwise."""
    return [0 if value <= 0 else 1 for value in targets]


# Select the rows of each partition using the index lists from the split cell.
train_x, train_y = co_occ_X[tr], co_occ_y[tr]
val_x, val_y = co_occ_X[val], co_occ_y[val]
test_x, test_y = co_occ_X[te], co_occ_y[te]

# Binary versions of the targets, used by the classification baselines.
train_y_binary = _binarize(train_y)
val_y_binary = _binarize(val_y)
test_y_binary = _binarize(test_y)

In [30]:
# Ordinary least squares on the raw (non-binarized) targets.
lr = linear_model.LinearRegression()
lr.fit(train_x, train_y)

train_mse = sklearn.metrics.mean_squared_error(lr.predict(train_x), train_y)
print('Linear Regression Training MSE: ', train_mse, sep='')

Linear Regression Training MSE: 1.7317621415448224e-26


In [31]:
# Mean and standard deviation of the fitted coefficients, one per line.
coefficients = lr.coef_
print(np.mean(coefficients), np.std(coefficients), sep='\n')

-0.0022031301061383947
0.11055990267786438


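No wild overfitting as far as the weights are concerned: the coefficients are centred near zero with a small spread. Note, however, that the near-zero training MSE shows the model fits the training set essentially exactly, so generalization has to be judged on the test split below.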

In [32]:
# Continuous predictions for both partitions.
train_preds = lr.predict(train_x)
test_preds = lr.predict(test_x)

# Threshold at zero (<= 0 -> 0, otherwise 1) so the regressor can also be
# scored as a binary classifier.
train_preds_binary = [int(not (pred <= 0)) for pred in train_preds]
test_preds_binary = [int(not (pred <= 0)) for pred in test_preds]

test_mse = sklearn.metrics.mean_squared_error(test_preds, test_y)
print('Linear Regression Test MSE: ', test_mse, sep='')

train_precision = precision_score(train_y_binary, train_preds_binary)
train_recall = recall_score(train_y_binary, train_preds_binary)
print('Linear Regression (Semi)Binary Train Performance: Precision: ', train_precision,
      ', Recall: ', train_recall, sep='')

test_precision = precision_score(test_y_binary, test_preds_binary)
test_recall = recall_score(test_y_binary, test_preds_binary)
print('Linear Regression (Semi)Binary Test Performance: Precision: ', test_precision,
      ', Recall: ', test_recall, sep='')

Linear Regression Test MSE: 9.39087770089048
Linear Regression (Semi)Binary Train Performance: Precision: 1.0, Recall: 1.0
Linear Regression (Semi)Binary Test Performance: Precision: 0.6666666666666666, Recall: 0.46153846153846156


### LASSO

In [33]:
# L1-regularized linear regression; `lr` is rebound to the LASSO model here.
lr = linear_model.Lasso(alpha=0.1)
lr.fit(train_x, train_y)

train_mse = sklearn.metrics.mean_squared_error(lr.predict(train_x), train_y)
print('LASSO Regression Training MSE: ', train_mse, sep='')

LASSO Regression Training MSE: 5.119837779220445


In [34]:
# Mean, then standard deviation, of the LASSO coefficients.
for summary_stat in (np.mean, np.std):
    print(summary_stat(lr.coef_))

-0.0009054004811609957
0.01983854173589269


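No wild overfitting as far as the weights are concerned: the LASSO coefficients are centred near zero, with an even smaller spread than for plain linear regression.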

In [35]:
# Continuous predictions for both partitions.
train_preds = lr.predict(train_x)
test_preds = lr.predict(test_x)

# Threshold at zero (<= 0 -> 0, otherwise 1) to obtain binary predictions.
train_preds_binary = [int(not (pred <= 0)) for pred in train_preds]
test_preds_binary = [int(not (pred <= 0)) for pred in test_preds]

test_mse = sklearn.metrics.mean_squared_error(test_preds, test_y)
print('LASSO Test MSE: ', test_mse, sep='')

train_precision = precision_score(train_y_binary, train_preds_binary)
train_recall = recall_score(train_y_binary, train_preds_binary)
print('LASSO (Semi)Binary Train Performance: Precision: ', train_precision,
      ', Recall: ', train_recall, sep='')

test_precision = precision_score(test_y_binary, test_preds_binary)
test_recall = recall_score(test_y_binary, test_preds_binary)
print('LASSO (Semi)Binary Test Performance: Precision: ', test_precision,
      ', Recall: ', test_recall, sep='')

LASSO Test MSE: 6.213209246087393
LASSO (Semi)Binary Train Performance: Precision: 0.8068965517241379, Recall: 0.6685714285714286
LASSO (Semi)Binary Test Performance: Precision: 0.6842105263157895, Recall: 0.5


### Baseline 2: get one-hot vectors of reactions for two bacteria, and feed into logistic regression (with binary target variable)

In [40]:
class LogisticRegression(nn.Module):
    """Logistic-regression model expressed as a single linear layer.

    `forward` returns the raw logit (one value per input row); no sigmoid is
    applied inside the module. In this notebook the model is trained with
    `nn.BCEWithLogitsLoss`, which takes logits directly.

    Args:
        input_dim: number of input features.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return the un-squashed linear output for `x`."""
        return self.linear(x)

n_features = len(co_occ_X[0])
LogReg = LogisticRegression(n_features)

# Weight the positive class by (negative fraction / positive fraction) of the
# training labels to compensate for class imbalance.
positive_fraction = np.mean(train_y_binary)
criterion = nn.BCEWithLogitsLoss(
    pos_weight=torch.Tensor([(1 - positive_fraction) / positive_fraction])
)
optimizer = torch.optim.Adam(LogReg.parameters())

epochs = 50

In [41]:
# Convert the numpy splits to tensors once, instead of rebuilding a tensor on
# every optimisation step.
train_x_t = torch.Tensor(train_x)
val_x_t = torch.Tensor(val_x)
# Shape (N, 1): row k has shape (1,), matching the model output for one sample.
train_y_t = torch.Tensor(train_y_binary).unsqueeze(1)

for i in range(epochs):
    curr_loss = 0
    # One optimizer step per training example (batch size 1).
    for k in range(len(train_x_t)):
        optimizer.zero_grad()

        output = LogReg(train_x_t[k])
        loss = criterion(output, train_y_t[k])
        loss.backward()
        optimizer.step()

        curr_loss += loss.item()

    # Only report metrics every fifth epoch.
    if i%5!=0:
        continue

    print('Epoch '+str(i)+', loss: '+ str(curr_loss/len(train_x)))

    with torch.no_grad():
        # The model emits logits; logit >= 0 is the positive-class decision.
        # Index the single output column with tensor ops and hand sklearn a
        # plain numpy array rather than passing a tensor through np.transpose.
        train_output = (LogReg(train_x_t)[:, 0] >= 0).numpy()
        val_output = (LogReg(val_x_t)[:, 0] >= 0).numpy()
        print('Train Performance: Precision: '+
            str(precision_score(train_y_binary,train_output))+', Recall: '+
            str(recall_score(train_y_binary,train_output)))
        # These metrics are computed on the validation split, so they are
        # labelled 'Val' (previously mislabelled as 'Test').
        print('Val Performance: Precision: '+
            str(precision_score(val_y_binary,val_output))+', Recall: '+
            str(recall_score(val_y_binary,val_output)))

Epoch 0, loss: 1.2105110714927088
Train Performance: Precision: 0.6170212765957447, Recall: 0.6628571428571428
Test Performance: Precision: 0.5555555555555556, Recall: 0.5
Epoch 5, loss: 0.841094520574519
Train Performance: Precision: 0.7348484848484849, Recall: 0.5542857142857143
Test Performance: Precision: 0.8333333333333334, Recall: 0.5
Epoch 10, loss: 0.7020623241050998
Train Performance: Precision: 0.78125, Recall: 0.5714285714285714
Test Performance: Precision: 0.75, Recall: 0.45
Epoch 15, loss: 0.5779075294503855
Train Performance: Precision: 0.795774647887324, Recall: 0.6457142857142857
Test Performance: Precision: 0.75, Recall: 0.45
Epoch 20, loss: 0.49995148042092874
Train Performance: Precision: 0.8068965517241379, Recall: 0.6685714285714286
Test Performance: Precision: 0.7142857142857143, Recall: 0.5
Epoch 25, loss: 0.43037945421670804
Train Performance: Precision: 0.8391608391608392, Recall: 0.6857142857142857
Test Performance: Precision: 0.6923076923076923, Recall: 0.45


In [42]:
# Final evaluation of the logistic-regression baseline on the held-out test split.
with torch.no_grad():
    # Logit >= 0 is the positive-class decision. Use tensor indexing and an
    # explicit .numpy() instead of routing a torch.Tensor through np.transpose.
    test_output = (LogReg(torch.Tensor(test_x))[:, 0] >= 0).numpy()
    print('Test Performance: Precision: '+
        str(precision_score(test_y_binary,test_output))+', Recall: '+
        str(recall_score(test_y_binary,test_output)))

Test Performance: Precision: 0.5925925925925926, Recall: 0.6153846153846154


### Baseline 3: get indicator vectors of reactions for two bacteria, and feed into simple neural network

In [61]:
# Note: an earlier hand-written `NeuralNet(nn.Module)` variant of this baseline
# (hidden sizes 256 -> 512 -> 128, a sigmoid on the output, and a dropout layer
# that was defined but never applied in `forward`) has been superseded by the
# nn.Sequential model below, which outputs raw logits.
n_features = len(co_occ_X[0])

BasicNN = nn.Sequential(
    nn.Linear(n_features, 512),
    nn.ReLU(),
    nn.Linear(512, 128),
    nn.ReLU(),
    nn.Linear(128, 1),
)

# Unweighted loss on logits; the pos_weight re-balancing used for the
# logistic-regression baseline is not applied here.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(BasicNN.parameters(), lr=1e-4, weight_decay=0)

epochs = 30

In [62]:
# Convert the numpy splits to tensors once, instead of rebuilding a tensor on
# every optimisation step.
train_x_t = torch.Tensor(train_x)
val_x_t = torch.Tensor(val_x)
# Shape (N, 1): row k has shape (1,), matching the model output for one sample.
train_y_t = torch.Tensor(train_y_binary).unsqueeze(1)

for i in range(epochs):
    curr_loss = 0
    # One optimizer step per training example (batch size 1).
    for k in range(len(train_x_t)):
        optimizer.zero_grad()

        output = BasicNN(train_x_t[k])
        loss = criterion(output, train_y_t[k])
        loss.backward()
        optimizer.step()

        curr_loss += loss.item()

    # Only report metrics every fifth epoch.
    if i%5!=0:
        continue

    print('Epoch '+str(i)+', loss: '+ str(curr_loss/len(train_x)))

    with torch.no_grad():
        # The network emits logits; logit >= 0 is the positive-class decision.
        # Index the single output column with tensor ops and hand sklearn a
        # plain numpy array rather than passing a tensor through np.transpose.
        train_output = (BasicNN(train_x_t)[:, 0] >= 0).numpy()
        val_output = (BasicNN(val_x_t)[:, 0] >= 0).numpy()
        print('Train Performance: Precision: '+
            str(precision_score(train_y_binary,train_output))+', Recall: '+
            str(recall_score(train_y_binary,train_output)))
        print('Val Performance: Precision: '+
            str(precision_score(val_y_binary,val_output))+', Recall: '+
            str(recall_score(val_y_binary,val_output)))

Epoch 0, loss: 0.694880732419816
Train Performance: Precision: 0.4971590909090909, Recall: 1.0
Val Performance: Precision: 0.45454545454545453, Recall: 1.0
Epoch 5, loss: 0.6036328817929395
Train Performance: Precision: 0.5324675324675324, Recall: 0.9371428571428572
Val Performance: Precision: 0.47368421052631576, Recall: 0.9
Epoch 10, loss: 0.4797525833022188
Train Performance: Precision: 0.6374501992031872, Recall: 0.9142857142857143
Val Performance: Precision: 0.53125, Recall: 0.85
Epoch 15, loss: 0.35314291393065517
Train Performance: Precision: 0.7417840375586855, Recall: 0.9028571428571428
Val Performance: Precision: 0.6538461538461539, Recall: 0.85
Epoch 20, loss: 0.2311702287237164
Train Performance: Precision: 0.8097560975609757, Recall: 0.9485714285714286
Val Performance: Precision: 0.6538461538461539, Recall: 0.85
Epoch 25, loss: 0.13537563177314382
Train Performance: Precision: 0.8434343434343434, Recall: 0.9542857142857143
Val Performance: Precision: 0.6538461538461539, Re

In [63]:
# Final evaluation of the neural-network baseline on the held-out test split.
with torch.no_grad():
    # Logit >= 0 is the positive-class decision. Use tensor indexing and an
    # explicit .numpy() instead of routing a torch.Tensor through np.transpose.
    test_output = (BasicNN(torch.Tensor(test_x))[:, 0] >= 0).numpy()
    print('Test Performance: Precision: '+
        str(precision_score(test_y_binary,test_output))+', Recall: '+
        str(recall_score(test_y_binary,test_output)))

Test Performance: Precision: 0.7307692307692307, Recall: 0.7307692307692307
