In [6]:
from plants_sm.data_structures.dataset.single_input_dataset import SingleInputDataset

train_dataset = SingleInputDataset.from_csv("train_test.csv", instances_ids_field="accession", representation_field="sequence", 
                                labels_field=slice(8, 2779), features_fields=slice(2780, -1))

In [9]:
train_dataset.X.shape

(100, 320)

In [10]:
train_dataset.y.shape

(100, 2771)

In [11]:
from plants_sm.data_structures.dataset.single_input_dataset import SingleInputDataset

validation_set = SingleInputDataset.from_csv("validation_test.csv", instances_ids_field="accession", representation_field="sequence",
                                labels_field=slice(8, 2779), features_fields=slice(2780, -1))

In [12]:
test_set = SingleInputDataset.from_csv("test_test.csv", instances_ids_field="accession", representation_field="sequence",
                                labels_field=slice(8, 2779), features_fields=slice(2780, -1))

In [13]:
validation_set.X.shape

(100, 320)

In [14]:
validation_set.y.shape

(100, 2771)

In [15]:
#get the ratio between negative labels and positive labels in a numpy array
import numpy as np


def get_ratio(y):
    ratio = np.zeros(y.shape[1])
    for i in range(y.shape[1]):
        ratio[i] = np.sum(y[:,i]==0)/np.sum(y[:,i]==1)
    return ratio

ratio = get_ratio(train_dataset.y)
ratio

  ratio[i] = np.sum(y[:,i]==0)/np.sum(y[:,i]==1)


array([5.25      , 1.63157895, 2.57142857, ...,        inf,        inf,
              inf])

In [16]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
from plants_sm.models.cnn.cnn import CNN1D
from plants_sm.models.pytorch_model import PyTorchModel
from torch import nn


cnn_model = CNN1D([5120, 3000], [160], [2], 2771, False)

optimizer = torch.optim.Adam(params = cnn_model.parameters(), lr=0.001)

pos_weight = torch.tensor(ratio).to("cuda:1") 

model = PyTorchModel(batch_size=200, epochs=1, 
                     loss_function=nn.BCEWithLogitsLoss(), optimizer=optimizer, model=cnn_model,
                     device="cuda:1", logger_path="./logs.log",
                     progress=200)
model.fit(validation_set)

INFO:plants_sm.models.pytorch_model:starting to fit the data...
INFO:plants_sm.models.pytorch_model:[1/1, 0/1] loss: 0.69369113


CNN1D(
  (conv0): Conv1d(1, 160, kernel_size=(2,), stride=(1,))
  (pool0): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (relu_conv0): ReLU()
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc_initial): Linear(in_features=25440, out_features=5120, bias=True)
  (relu_initial): ReLU()
  (fc1): Linear(in_features=5120, out_features=3000, bias=True)
  (relu1): ReLU()
  (fc_final): Linear(in_features=3000, out_features=2771, bias=True)
  (sigmoid): Sigmoid()
)

In [19]:
model.model.last_sigmoid = True

In [23]:
model.history["loss"].to_csv("model_history.csv", index=False)

In [34]:
proba = model.predict(test_set)

In [3]:
import pandas as pd

pd.read_csv("./ec_number_prediction/metrics/metrics.csv")

Unnamed: 0,model,metric,train,validation,test
0,CNN1D,hamming_loss,0.002115,0.002097,0.002057
1,CNN1D,accuracy_score,0.0,0.0,0.0
2,CNN1D,precision_score,0.0,0.0,0.0
3,CNN1D,recall_score,0.0,0.0,0.0
4,CNN1D,f1_score,0.0,0.0,0.0
5,CNN1D_esm2_8M,hamming_loss,0.001754,0.001739,0.001696
6,CNN1D_esm2_8M,accuracy_score,0.0,0.0,0.0
7,CNN1D_esm2_8M,precision_score,0.0,0.0,0.0
8,CNN1D_esm2_8M,recall_score,0.0,0.0,0.0
9,CNN1D_esm2_8M,f1_score,0.0,0.0,0.0


In [38]:
from sklearn.metrics import hamming_loss,accuracy_score, precision_score, recall_score, f1_score


hamming_loss = hamming_loss(test_set.y, proba)
accuracy_score = accuracy_score(test_set.y, proba)
precision_score = precision_score(test_set.y, proba, average="samples")
recall_score = recall_score(test_set.y, proba, average="samples")
f1_score = f1_score(test_set.y, proba, average="samples")


In [24]:
model.save("./models_test")

In [27]:
from plants_sm.models.model import Model

model = PyTorchModel.load("./models_test")

In [36]:
model.predict_proba(test_set)

array([[4.70292753e-06, 1.90663137e-04, 8.06279539e-04, ...,
        5.63477943e-06, 1.08162703e-05, 9.12189716e-05],
       [4.18362242e-06, 1.75807756e-04, 7.52965279e-04, ...,
        5.03280626e-06, 9.72344060e-06, 8.38898923e-05],
       [4.57015631e-06, 1.87572485e-04, 7.94192543e-04, ...,
        5.49285141e-06, 1.05671552e-05, 8.92098033e-05],
       ...,
       [5.18859406e-06, 2.04967830e-04, 8.58951302e-04, ...,
        6.24662880e-06, 1.18265707e-05, 9.84770231e-05],
       [5.13705390e-06, 2.03634030e-04, 8.53844685e-04, ...,
        6.18431977e-06, 1.17124218e-05, 9.77273303e-05],
       [5.00267834e-06, 1.99618327e-04, 8.40610941e-04, ...,
        6.01726242e-06, 1.14123923e-05, 9.58416786e-05]])

In [37]:
test_set.y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [None]:
from sklearn.metrics import hamming_loss
from plants_sm.models.pytorch_model import PyTorchModel


model = DenseNet(320, [640, 1280, 2560, 5120], 2771)

optimizer = torch.optim.Adam(params = model.parameters(), lr=0.01)

model = PyTorchModel(batch_size=240, epochs=10, 
                     loss_function=nn.BCEWithLogitsLoss(), optimizer=optimizer, model=cnn_model,
                     device="cuda:2", validation_metric=hamming_loss, logger_path="./logs.log",
                     progress=200)
model.fit(train_dataset, validation_set)

In [1]:
import os


base_directory = "/scratch/jribeiro/ec_number_prediction/esm2_data/"
esm2_data_folders = os.listdir(base_directory)
esm2_data_folders


['esm2_t12_35M', 'esm2_t6_8M', 'esm2_t33_650M', 'esm2_t30_150M']