In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from Bio.Seq import Seq
from gensim.models import Word2Vec

from common.csv_sequence_windowing import window_sequences_parallel
from common.env_config import config

In [2]:
# Load the training and validation splits from the CSV paths held by the project config.
train_df = pd.read_csv(config.TRAIN_DATA_CSV_FILE)
val_df = pd.read_csv(config.VAL_DATA_CSV_FILE)

# Report (rows, columns) for each split.
# NOTE(review): the second label says "Test", but the frame being printed is the
# validation split (val_df) — consider renaming the label.
print("Train DataFrame shape:", train_df.shape)
print("Test DataFrame shape:", val_df.shape)

Train DataFrame shape: (1733, 2)
Test DataFrame shape: (434, 2)


In [3]:
# Drop every row that contains at least one missing value. The names are rebound,
# so the unfiltered frames are no longer reachable after this cell runs.
train_df = train_df.dropna()
val_df = val_df.dropna()
# Shapes after filtering, to show how many rows were removed.
# NOTE(review): "Test" in the second label refers to the validation split (val_df).
print("Train DataFrame shape after dropping NaN values:", train_df.shape)
print("Test DataFrame shape after dropping NaN values:", val_df.shape)

Train DataFrame shape after dropping NaN values: (1733, 2)
Test DataFrame shape after dropping NaN values: (433, 2)


In [4]:
# Cut the sequences of each split into windows with the project helper.
# NOTE(review): overlap_percent=50 presumably means consecutive windows share half
# their length, and the result presumably holds one row per window — confirm
# against common.csv_sequence_windowing.
windowed_train_df = window_sequences_parallel(train_df, overlap_percent=50)
windowed_val_df = window_sequences_parallel(val_df, overlap_percent=50)

# Drop the references to the un-windowed frames so their memory can be reclaimed.
del train_df
del val_df

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.18926339120779512s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.038996219635009766s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 162 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 200 tasks      | ela

In [5]:
# Pull the "sequence" and "target" columns out of the windowed frames as NumPy arrays.
X_train = windowed_train_df["sequence"].values
y_train = windowed_train_df["target"].values
X_val = windowed_val_df["sequence"].values
y_val = windowed_val_df["target"].values

# The frames are not needed once the arrays have been extracted.
del windowed_train_df
del windowed_val_df

# The Vietnamese labels read "number of phages in the training set" and
# "number of phages in the test set".
# NOTE(review): the values printed are the row counts of the windowed frames, and
# the second one is the validation split — the labels may be misleading.
print(f"Số phage trong tập huấn luyện: {len(X_train)}")
print(f"Số phage trong tập kiểm tra: {len(X_val)}")

Số phage trong tập huấn luyện: 497186
Số phage trong tập kiểm tra: 127237


In [6]:
def reverse_complement_augmentation(sequences, labels):
    """Double a dataset by pairing each DNA sequence with its reverse complement.

    Args:
        sequences: Iterable of DNA sequence strings.
        labels: Iterable of labels aligned with ``sequences``.

    Returns:
        Tuple ``(augmented_sequences, augmented_labels)`` of NumPy arrays. Each
        input sequence is immediately followed by its reverse complement, and
        both entries carry the label of the original sequence.
    """
    doubled_sequences = []
    doubled_labels = []

    for original, target in zip(sequences, labels):
        # Biopython computes the reverse complement; convert back to a plain string.
        mirrored = str(Seq(original).reverse_complement())

        # Original first, reverse complement second, both with the same label.
        doubled_sequences.extend((original, mirrored))
        doubled_labels.extend((target, target))

    return np.array(doubled_sequences), np.array(doubled_labels)


# Apply reverse-complement augmentation to the training arrays (the validation
# arrays are not passed through it).
X_train_aug, y_train_aug = reverse_complement_augmentation(X_train, y_train)

# The un-augmented training arrays are superseded by the augmented ones.
del X_train
del y_train

# The Vietnamese label reads "number of phages in the training set after augmentation".
print(f"Số phage trong tập huấn luyện sau augmentation: {len(X_train_aug)}")

Số phage trong tập huấn luyện sau augmentation: 994372


In [7]:
def generate_kmers(sequence, k=6):
    """Return every length-``k`` substring of ``sequence`` using a stride-1 sliding window.

    Args:
        sequence: Sequence to slice (a DNA string in this notebook).
        k: Length of each k-mer.

    Returns:
        List of k-mers in left-to-right order; empty when ``sequence`` is
        shorter than ``k``.
    """
    window_count = len(sequence) - k + 1

    kmers = []
    for start in range(window_count):
        kmers.append(sequence[start:start + k])
    return kmers


def prepare_sequences_for_word2vec(sequences, k=6):
    """Build a Word2Vec corpus with one list of k-mer "words" per input sequence.

    K-mers containing any character other than upper-case A, C, G or T are
    dropped. A sequence with no valid k-mer still contributes an empty list,
    so the corpus keeps the length and order of ``sequences``.

    Args:
        sequences: Iterable of DNA sequence strings.
        k: K-mer length.

    Returns:
        List of lists of k-mer strings.
    """
    def is_unambiguous(kmer):
        # True only when every character is one of the four canonical bases.
        return all(base in "ACGT" for base in kmer)

    return [
        list(filter(is_unambiguous, generate_kmers(seq, k)))
        for seq in sequences
    ]


# Load the saved Skip-gram Word2Vec model when it exists; otherwise train it on
# the augmented training windows and save it. This keeps the notebook runnable
# from a clean checkout while making repeated runs cheap.
WORD2VEC_MODEL_PATH = Path("phage_word2vec_model.bin")

if WORD2VEC_MODEL_PATH.exists():
    # Word2Vec.load unpickles the file, so only load model files you produced yourself.
    word2vec_model = Word2Vec.load(str(WORD2VEC_MODEL_PATH))
else:
    # Tokenise every training window into a "sentence" of 6-mers.
    corpus = prepare_sequences_for_word2vec(X_train_aug, k=6)

    # NOTE: with workers > 1 the seed alone does not make training bit-for-bit
    # reproducible (see the gensim Word2Vec documentation).
    word2vec_model = Word2Vec(
        sentences=corpus,
        vector_size=300,
        window=5,
        min_count=1,
        sample=1e-3,
        sg=1,  # Skip-gram model
        hs=0,  # negative sampling instead of hierarchical softmax
        epochs=20,
        negative=5,
        workers=4,
        seed=42,
    )

    # Save the trained model so later runs take the fast path above.
    word2vec_model.save(str(WORD2VEC_MODEL_PATH))

    # The tokenised corpus is not needed once the model has been trained.
    del corpus


# Represent each sequence as the mean of the Word2Vec vectors of its k-mers.
def sequence_to_vector(sequence, word2vec_model, k=6):
    """Convert a DNA sequence into a fixed-length feature vector with Word2Vec.

    The sequence is split into overlapping k-mers, k-mers absent from the model
    vocabulary are ignored, and the embeddings of the remaining k-mers are
    averaged.

    Args:
        sequence: DNA sequence string.
        word2vec_model: Trained gensim ``Word2Vec`` model; its ``wv`` and
            ``vector_size`` attributes are used.
        k: K-mer length; should match the length used to train the model.

    Returns:
        1-D NumPy array of length ``word2vec_model.vector_size``. When no k-mer
        of the sequence is in the vocabulary, an all-zero vector with the same
        dtype as the model's embeddings is returned.
    """
    keyed_vectors = word2vec_model.wv
    known_kmers = [
        kmer
        for kmer in generate_kmers(sequence, k)
        if kmer in keyed_vectors.key_to_index
    ]

    if not known_kmers:
        # Use the embedding dtype for the fallback: a default float64 zero vector
        # would upcast the whole feature matrix when the vectors are stacked.
        return np.zeros(word2vec_model.vector_size, dtype=keyed_vectors.vectors.dtype)

    # Average the embeddings of all in-vocabulary k-mers.
    return np.mean([keyed_vectors[kmer] for kmer in known_kmers], axis=0)


# Convert every augmented training sequence and every validation sequence into a
# feature vector, and stack the per-sequence vectors into one array per split.
X_train_vectors = np.array([sequence_to_vector(seq, word2vec_model) for seq in X_train_aug])
X_val_vectors = np.array([sequence_to_vector(seq, word2vec_model) for seq in X_val])

# Print the shapes so the feature arrays can be compared with their label arrays.
print(f"X_train_vectors shape: {X_train_vectors.shape}")
print(f"y_train shape: {y_train_aug.shape}")
print(f"X_val_vectors shape: {X_val_vectors.shape}")
print(f"y_val shape: {y_val.shape}")

X_train_vectors shape: (994372, 300)
y_train shape: (994372,)
X_val_vectors shape: (127237, 300)
y_val shape: (127237,)


In [8]:
# Persist the feature arrays and their labels as .npy files; the paths are
# relative, so the files land in the current working directory.
np.save(file="word2vec_train_vector.npy", arr=X_train_vectors)
np.save(file="y_train.npy", arr=y_train_aug)
np.save(file="word2vec_val_vector.npy", arr=X_val_vectors)
np.save(file="y_val.npy", arr=y_val)