In [1]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import torch

In [2]:
# os.chdir('C:/Users/jaehw/OneDrive/Desktop/protein_classifier_project/')
os.chdir('..')

def reader(partition, data_path= 'data/raw/random_split'):
    data = []
    for file_name in os.listdir(os.path.join(data_path, partition)):
        with open(os.path.join(data_path, partition, file_name)) as file:
            data.append(
                pd.read_csv(
                    file, index_col=None, usecols=["sequence", "family_accession"]
                )
            )

    all_data = pd.concat(data)
    return all_data

In [35]:
df_train = reader('train')
df_test = reader('test')
df_dev = reader('dev')

In [37]:
len(df_train), len(df_test), len(df_dev)

(1086741, 126171, 126171)

In [None]:
len(df_dev)*10

In [31]:
max_length = 150

In [5]:
def build_vocab(data):
    # Build the vocabulary
    voc = set()
    rare_AAs = {'X', 'U', 'B', 'O', 'Z'}
    for sequence in data:
        voc.update(sequence)

    unique_AAs = sorted(voc - rare_AAs)
    
    return {w: i for i, w in enumerate(unique_AAs, start=1)}

In [6]:
word2id = build_vocab(df_train["sequence"])
print(word2id)

{'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20}


In [8]:
### Create a tokenizer
label_encoder = LabelEncoder()

label_encoder = label_encoder.fit(df_train['family_accession'])

In [32]:
from torch.utils.data import Dataset, DataLoader
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np

class SequenceDataset(Dataset):
    def __init__(self, df_data, max_sequence, label_encoder, word2id):
        
        integer_encode = []
        for row in df_data["sequence"]:
            encoded_row = []
            for aa in row:
                encoded_row.append(word2id.get(aa, 0))
            integer_encode.append(np.array(encoded_row, dtype="uint8"))
            
        self.sequences = pad_sequences(integer_encode, maxlen=max_length, padding='post', truncating='post')

        
        encoded_labels = label_encoder.transform(df_data['family_accession'])
        labels = to_categorical(encoded_labels, dtype="uint16")
        
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

In [36]:
train_data = SequenceDataset(df_train, max_length, label_encoder, word2id)
del df_train
test_data = SequenceDataset(df_test, max_length, label_encoder, word2id)
del df_test
dev_data = SequenceDataset(df_dev, max_length, label_encoder, word2id)
del df_dev

MemoryError: Unable to allocate 36.3 GiB for an array with shape (1086741, 17929) and data type uint16

In [30]:
dev_data.sequence.shape

(126171, 200)

In [17]:
labels = set(df_train["family_accession"].to_list())

In [17]:
num_classes = len(labels)
print(num_classes)

17929


In [22]:
def to_category(y, num_classes):
    """ One-hot encodes a tensor of labels.

    Args:
        y (Tensor): A tensor of integer labels of shape [batch_size,].
        num_classes (int): The number of classes.

    Returns:
        Tensor: The one-hot encoded labels as a tensor.
    """
    return torch.eye(num_classes)[y]

from tensorflow.keras.utils import to_categorical

Collecting tensorflow
  Downloading tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tensorflow-estimator<2.16,>=2.15.0
  Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl (441 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m442.0/442.0 KB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Collecting termcolor>=1.1.0
  Downloading termcolor-2.3.0-py3-none-any.whl (6.9 kB)
Collecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 KB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting h5py>=2.9.0
  Downloading h5py-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m2

2023-11-20 15:36:01.993159: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-20 15:36:01.994811: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-20 15:36:02.246923: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-20 15:36:02.711989: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
train_labels = to_categorical(train_encoder, num_classes)