In [141]:
from collections import Counter

import numpy as np
import pandas as pd
from Bio import SeqIO
from sklearn.model_selection import train_test_split

In [75]:
# Read a FASTA file
def read_fasta_file(file_path):
    """Parse a FASTA file and derive an accession number from each record ID.

    Two ID layouts are recognised, tried in this order:

    * pipe-delimited: the accession is the 4th ``|``-separated field;
    * underscore-delimited: the accession is the 3rd and 4th
      ``_``-separated fields joined with ``_``.

    Args:
        file_path: Location of the FASTA file, passed straight to
            ``SeqIO.parse``.

    Returns:
        A ``(records, accession_number, error_records)`` tuple.
        ``records`` and ``accession_number`` are index-aligned and cover
        every record whose ID matched one of the layouts above.
        ``error_records`` holds the records whose ID matched neither
        layout (previously such an ID raised an uncaught ``IndexError``
        and this list was always empty).
    """
    records = []
    accession_number = []
    error_records = []
    for record in SeqIO.parse(file_path, "fasta"):
        pipe_fields = record.id.split("|")
        if len(pipe_fields) > 3:
            accession = pipe_fields[3]
        else:
            underscore_fields = record.id.split("_")
            if len(underscore_fields) > 3:
                accession = f"{underscore_fields[2]}_{underscore_fields[3]}"
            else:
                # Neither layout fits: report the record instead of crashing.
                error_records.append(record)
                continue
        records.append(record)
        accession_number.append(accession)
    return records, accession_number, error_records

In [80]:
# DeePhage data: one FASTA file per (dataset, lifecycle) pair
ds1_temp = "../data/deephage_data/Dataset-1_temperate.fasta"
ds1_viru = "../data/deephage_data/Dataset-1_virulent.fasta"
ds2_temp = "../data/deephage_data/Dataset-2_temperate.fasta"
ds2_viru = "../data/deephage_data/Dataset-2_virulent.fasta"

# Parse each file into (records, accession numbers, unparseable records).
ds1_temp_records, ds1_temp_accession_numbers, ds1_temp_error_records = read_fasta_file(ds1_temp)
ds1_viru_records, ds1_viru_accession_numbers, ds1_viru_error_records = read_fasta_file(ds1_viru)
ds2_temp_records, ds2_temp_accession_numbers, ds2_temp_error_records = read_fasta_file(ds2_temp)
ds2_viru_records, ds2_viru_accession_numbers, ds2_viru_error_records = read_fasta_file(ds2_viru)

# Flat list of every DeePhage accession number, in file order.
deephage_accession_numbers = [
    *ds1_temp_accession_numbers,
    *ds1_viru_accession_numbers,
    *ds2_temp_accession_numbers,
    *ds2_viru_accession_numbers,
]

# Per-file record counts.
for heading, parsed_records in (
    ("Dataset 1 - Temperate:", ds1_temp_records),
    ("Dataset 1 - Virulent:", ds1_viru_records),
    ("Dataset 2 - Temperate:", ds2_temp_records),
    ("Dataset 2 - Virulent:", ds2_viru_records),
):
    print(heading, len(parsed_records))

# Totals per lifecycle across both datasets.
temperate_total = len(ds1_temp_records) + len(ds2_temp_records)
virulent_total = len(ds1_viru_records) + len(ds2_viru_records)
print("Total temperate: ", temperate_total)
print("Total virulent: ", virulent_total)

Dataset 1 - Temperate: 148
Dataset 1 - Virulent: 77
Dataset 2 - Temperate: 429
Dataset 2 - Virulent: 1211
Total temperate:  577
Total virulent:  1288


In [116]:
def _deephage_frame(accession_numbers, lifecycle, dataset_tag):
    """Build a three-column frame tagging every accession with one lifecycle and one dataset note."""
    rows = [(accession, lifecycle, dataset_tag) for accession in accession_numbers]
    return pd.DataFrame(rows, columns=["accession_number", "lifecycle", "note"])


# Temperate phages are labelled "Lysogenic", virulent phages "Lytic".
deephage_ds1_temp_df = _deephage_frame(ds1_temp_accession_numbers, "Lysogenic", "dataset_1")
deephage_ds1_viru_df = _deephage_frame(ds1_viru_accession_numbers, "Lytic", "dataset_1")
deephage_ds2_temp_df = _deephage_frame(ds2_temp_accession_numbers, "Lysogenic", "dataset_2")
deephage_ds2_viru_df = _deephage_frame(ds2_viru_accession_numbers, "Lytic", "dataset_2")

# Stack the four frames under a fresh index and record where the rows came from.
deephage_ds_df = pd.concat(
    [deephage_ds1_temp_df, deephage_ds1_viru_df, deephage_ds2_temp_df, deephage_ds2_viru_df],
    ignore_index=True,
).assign(source='deephage')
deephage_ds_df.shape

(1865, 4)

In [117]:
# Preview the first rows of the combined DeePhage table.
deephage_ds_df.head()

Unnamed: 0,accession_number,lifecycle,note,source
0,NC_013055,Lysogenic,dataset_1,deephage
1,NC_011976,Lysogenic,dataset_1,deephage
2,NC_011613,Lysogenic,dataset_1,deephage
3,NC_011611,Lysogenic,dataset_1,deephage
4,NC_011357,Lysogenic,dataset_1,deephage


In [118]:
# DeepPL ships its training and test splits as two sheets of one workbook.
deep_pl_dataset = "../data/deep_pl_data/deep_pl_dataset.xlsx"
train_df, test_df = (
    pd.read_excel(deep_pl_dataset, sheet_name=sheet)
    for sheet in ("Training dataset", "Test dataset")
)

# Shape and lifecycle class balance of each split.
for banner, split_frame in (("DeepPL Train:", train_df), ("DeepPL Test:", test_df)):
    print(banner, split_frame.shape)
    print(Counter(split_frame['Lifecycle']))

# Every DeepPL accession number: training rows first, then test rows.
deep_pl_accession_numbers = [
    *train_df['Accession number'].tolist(),
    *test_df['Accession number'].tolist(),
]

DeepPL Train: (1806, 4)
Counter({'Lytic': 1227, 'Lysogenic': 579})
DeepPL Test: (374, 3)
Counter({'Lytic': 245, 'Lysogenic': 129})


In [119]:
# Map DeepPL's spreadsheet headers onto the column names used by the DeePhage table.
_deep_pl_column_names = {
    'Accession number': 'accession_number',
    'Lifecycle': 'lifecycle',
    'Usage': 'note',
}

deep_pl_train_df = (
    train_df[list(_deep_pl_column_names)]
    .rename(columns=_deep_pl_column_names)
    .assign(source='deep_pl')
)

# The variable keeps its existing `deep_pd_` spelling so other cells that use it still work.
deep_pd_test_df = (
    test_df[list(_deep_pl_column_names)]
    .rename(columns=_deep_pl_column_names)
    .assign(source='deep_pl')
)

# Training rows first, then test rows, under a fresh index.
deep_pl_df = pd.concat([deep_pl_train_df, deep_pd_test_df], ignore_index=True)

In [120]:
# Show five randomly chosen rows of the normalised DeepPL table (no seed, so rows vary per run).
deep_pl_df.sample(5)

Unnamed: 0,accession_number,lifecycle,note,source
1334,NC_028817,Lytic,Training,deep_pl
2122,MW217515,Lytic,Test,deep_pl
19,NC_020488,Lysogenic,Training,deep_pl
1847,KY658674,Lysogenic,Test,deep_pl
1411,NC_019925,Lytic,Training,deep_pl


In [134]:
# Outer-join the two sources on accession number. The `_merge` indicator column
# records whether a row came from DeePhage only, DeepPL only, or both.
combined_df = deephage_ds_df.merge(
    deep_pl_df,
    on='accession_number',
    suffixes=('_deephage', '_deep_pl'),
    how='outer',
    indicator=True,
)

deephage_lifecycle = combined_df['lifecycle_deephage']
deep_pl_lifecycle = combined_df['lifecycle_deep_pl']

# A row is 'invalid' only when both sources provide a lifecycle and the two disagree;
# a lifecycle missing on either side counts as 'valid'.
lifecycles_agree = (
    deephage_lifecycle.eq(deep_pl_lifecycle)
    | deephage_lifecycle.isna()
    | deep_pl_lifecycle.isna()
)
combined_df['is_valid'] = lifecycles_agree.map({True: 'valid', False: 'invalid'})

# Final label: take the DeePhage lifecycle when it is present, otherwise fall back
# to the DeepPL lifecycle; the label stays NaN when both are missing.
combined_df['label'] = deephage_lifecycle.combine_first(deep_pl_lifecycle)
combined_df.sample(5)

Unnamed: 0,accession_number,lifecycle_deephage,note_deephage,source_deephage,lifecycle_deep_pl,note_deep_pl,source_deep_pl,_merge,is_valid,label
581,NC_007924,Lysogenic,dataset_2,deephage,Lysogenic,Training,deep_pl,both,valid,Lysogenic
213,MG676466,,,,Lytic,Test,deep_pl,right_only,valid,Lytic
2179,NC_042354,Lytic,dataset_2,deephage,Lytic,Training,deep_pl,both,valid,Lytic
283,MT227924,,,,Lysogenic,Test,deep_pl,right_only,valid,Lysogenic
1422,NC_024785,Lytic,dataset_2,deephage,Lytic,Training,deep_pl,both,valid,Lytic


In [135]:
# Show ten randomly chosen rows of the merged table (no seed, so rows vary per run).
combined_df.sample(10)

Unnamed: 0,accession_number,lifecycle_deephage,note_deephage,source_deephage,lifecycle_deep_pl,note_deep_pl,source_deep_pl,_merge,is_valid,label
1235,NC_021867,Lytic,dataset_2,deephage,Lysogenic,Training,deep_pl,both,invalid,Lytic
581,NC_007924,Lysogenic,dataset_2,deephage,Lysogenic,Training,deep_pl,both,valid,Lysogenic
2168,NC_042338,Lysogenic,dataset_2,deephage,Lysogenic,Training,deep_pl,both,valid,Lysogenic
167,KX581095,,,,Lysogenic,Test,deep_pl,right_only,valid,Lysogenic
297,MW073017,,,,Lytic,Test,deep_pl,right_only,valid,Lytic
1296,NC_023007,Lysogenic,dataset_2,deephage,Lysogenic,Training,deep_pl,both,valid,Lysogenic
191,KY883641,,,,Lytic,Test,deep_pl,right_only,valid,Lytic
486,NC_005356,Lysogenic,dataset_1,deephage,Lysogenic,Training,deep_pl,both,valid,Lysogenic
1451,NC_025434,Lysogenic,dataset_2,deephage,Lysogenic,Training,deep_pl,both,valid,Lysogenic
1653,NC_028743,Lysogenic,dataset_2,deephage,Lysogenic,Training,deep_pl,both,valid,Lysogenic


In [130]:
# List the columns of the merged table.
# NOTE(review): the output saved with this cell does not include 'label', which the
# merge cell adds to combined_df — re-run this cell to refresh the listing.
combined_df.columns

Index(['accession_number', 'lifecycle_deephage', 'note_deephage',
       'source_deephage', 'lifecycle_deep_pl', 'note_deep_pl',
       'source_deep_pl', '_merge', 'is_valid'],
      dtype='object')

In [139]:
# Persist the full merged table, then only the rows flagged as 'valid'.
combined_df.to_csv("../data/my_data/aggregated_deephage_deeppl_data/combined_ds.csv", index=False)

is_valid_row = combined_df['is_valid'].eq('valid')
combined_valid_df = combined_df.loc[is_valid_row]
combined_valid_df.to_csv("../data/my_data/aggregated_deephage_deeppl_data/valid_ds.csv", index=False)

In [142]:
# Stratified 80/20 split of the valid rows on the final label, with a fixed seed.
# NOTE(review): this rebinds `train_df` / `test_df`, which earlier held the raw DeepPL
# sheets; re-running the DeepPL cells afterwards requires reloading the workbook first.
split_options = dict(test_size=0.2, stratify=combined_valid_df['label'], random_state=42)
train_df, test_df = train_test_split(combined_valid_df, **split_options)

# Write each split to its own CSV without the index column.
for split_frame, csv_path in (
    (train_df, "../data/my_data/aggregated_deephage_deeppl_data/train.csv"),
    (test_df, "../data/my_data/aggregated_deephage_deeppl_data/test.csv"),
):
    split_frame.to_csv(csv_path, index=False)