In [1]:
import glob
import os

import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction as GC

In [2]:
# Directory scanned by process_genbank_files(): every entry directly under it
# is searched for "*.gb" files. The path is relative, so it is resolved against
# the current working directory of the kernel.
GENBANK_DIR = "../../data/ncbi_data/gen_bank/train/Lytic"

In [3]:
def analyze_genbank_file(file_path):
    """Parse one GenBank file and return a flat summary of its record.

    Args:
        file_path: Path of the GenBank file to read.

    Returns:
        A dict with the keys ``file_name``, ``accession``, ``name``,
        ``description``, ``length``, ``gc_content``, ``num_features`` and
        ``sequence`` (the full sequence as a string), or ``None`` when
        anything goes wrong while reading or summarizing the file.

    Any exception is caught, reported on stdout and turned into ``None`` so
    that one bad file does not abort a batch run.
    """
    try:
        # NOTE(review): Bio.SeqIO.read is documented to require exactly one
        # record per file; other files end up in the except branch below.
        gb_record = SeqIO.read(file_path, "genbank")
        genome = gb_record.seq

        # Key order is kept stable: it becomes the DataFrame column order.
        return {
            "file_name": os.path.basename(file_path),
            "accession": gb_record.id,
            "name": gb_record.name,
            "description": gb_record.description,
            "length": len(genome),
            "gc_content": GC(genome),
            "num_features": len(gb_record.features),
            "sequence": str(genome),
        }
    except Exception as exc:
        print(f"Lỗi khi xử lý file {file_path}: {str(exc)}")
        return None


def process_genbank_files(genbank_dir=None):
    """Analyze every ``*.gb`` file found one level below a directory.

    Each entry directly under ``genbank_dir`` is treated as a group folder and
    searched for ``*.gb`` files; every file found is passed to
    ``analyze_genbank_file``.

    Args:
        genbank_dir: Directory to scan. Defaults to the module-level
            ``GENBANK_DIR`` when omitted or ``None``.

    Returns:
        A list with one summary dict per file that was analyzed successfully,
        in sorted (group, then file path) order. ``None`` when the directory
        does not exist or contains no ``*.gb`` files.
    """
    if genbank_dir is None:
        genbank_dir = GENBANK_DIR

    gb_files = []
    # A missing directory is reported like an empty one instead of letting
    # os.listdir raise FileNotFoundError.
    if os.path.isdir(genbank_dir):
        # os.listdir and glob.glob give no ordering guarantee; sorting makes
        # the result (and anything saved from it) reproducible across runs.
        for group in sorted(os.listdir(genbank_dir)):
            gb_files.extend(
                sorted(glob.glob(os.path.join(genbank_dir, group, "*.gb")))
            )

    if not gb_files:
        print(f"Không tìm thấy file GenBank trong thư mục {genbank_dir}")
        return None

    total = len(gb_files)
    print(f"Tìm thấy {total} file GenBank. Đang xử lý...")

    all_results = []
    for position, file_path in enumerate(gb_files, start=1):
        print(f"Đang xử lý file {position}/{total}: {os.path.basename(file_path)}")
        result = analyze_genbank_file(file_path)
        # analyze_genbank_file signals failure with None; skip those files.
        if result is not None:
            all_results.append(result)

    print(f"Đã xử lý xong {len(all_results)}/{total} file GenBank.")
    return all_results


def run(all_results):
    """Turn the per-file summary dicts into a single pandas DataFrame.

    Args:
        all_results: Sequence of summary dicts, one per analyzed file.

    Returns:
        A DataFrame with one row per dict, or ``None`` (after printing a
        notice) when ``all_results`` is empty or ``None``.
    """
    if all_results:
        # Copy into a fresh list so the frame is built from its own container.
        records = list(all_results)
        return pd.DataFrame(records)

    print("Không có dữ liệu để phân tích.")
    return None


# Parse every GenBank file under GENBANK_DIR.
all_results = process_genbank_files()

# Build the summary table only when at least one file was parsed; otherwise
# data_frames is left undefined and the cells that use it cannot run.
if not all_results:
    print("Không có kết quả để phân tích.")
else:
    data_frames = run(all_results)

Tìm thấy 1227 file GenBank. Đang xử lý...
Đang xử lý file 1/1227: train_AM084414_Lytic_Group1.gb
Đang xử lý file 2/1227: train_NC_001271_Lytic_Group1.gb
Đang xử lý file 3/1227: train_NC_001629_Lytic_Group1.gb
Đang xử lý file 4/1227: train_NC_001909_Lytic_Group1.gb
Đang xử lý file 5/1227: train_NC_001956_Lytic_Group1.gb
Đang xử lý file 6/1227: train_NC_002014_Lytic_Group1.gb
Đang xử lý file 7/1227: train_NC_002194_Lytic_Group1.gb
Đang xử lý file 8/1227: train_NC_004165_Lytic_Group1.gb
Đang xử lý file 9/1227: train_NC_004333_Lytic_Group1.gb
Đang xử lý file 10/1227: train_NC_004814_Lytic_Group1.gb
Đang xử lý file 11/1227: train_NC_004831_Lytic_Group1.gb
Đang xử lý file 12/1227: train_NC_004902_Lytic_Group1.gb
Đang xử lý file 13/1227: train_NC_005091_Lytic_Group1.gb
Đang xử lý file 14/1227: train_NC_005948_Lytic_Group1.gb
Đang xử lý file 15/1227: train_NC_006883_Lytic_Group1.gb
Đang xử lý file 16/1227: train_NC_007019_Lytic_Group1.gb
Đang xử lý file 17/1227: train_NC_007022_Lytic_Group1.gb

In [4]:
# Preview the first rows of the per-file summary DataFrame returned by run().
data_frames.head()

Unnamed: 0,file_name,accession,name,description,length,gc_content,num_features,sequence
0,train_AM084414_Lytic_Group1.gb,AM084414.1,AM084414,"Enterobacteria phage K1F, complete genome",39699,0.497771,123,TCTCACAGTTCAAGAACCTCAAGTCTCCCCATAGGCCCTCTTTAAG...
1,train_NC_001271_Lytic_Group1.gb,NC_001271.1,NC_001271,"Yersinia phage phiYeO3-12, complete genome",39600,0.506313,157,TCTCATAGTTCAAGAACCCAAAGTACCCCCCATAGCCCTCTTAAAG...
2,train_NC_001629_Lytic_Group1.gb,NC_001629.1,NC_001629,"Lactococcus phage bIL67, complete genome",22195,0.359901,75,GAGTTAGGCTTGATAGAAAACCCACCCCCTTTATATCACACCCCCT...
3,train_NC_001909_Lytic_Group1.gb,NC_001909.1,NC_001909,"Lactococcus phage bIL170, complete genome",31754,0.343453,166,CACAAAGGACTCCCGGCTGCGAATCCCCCAAAAAATCAAAAAGAAA...
4,train_NC_001956_Lytic_Group1.gb,NC_001956.1,NC_001956,"Vibrio phage fs2, complete genome",8651,0.445266,28,AGTACTGGGATAAGGTAAGGAGCACCAGTCTTACTCACCCCTTCAG...


In [6]:
# (rows, columns): one row per successfully analyzed file, one column per key
# of the dict returned by analyze_genbank_file().
data_frames.shape

(1227, 8)

In [5]:
# Save the summary table as lytic.csv in the current working directory.
# index=False leaves the DataFrame's row index out of the file. Each row
# carries the full sequence string, so the file can be large.
data_frames.to_csv("lytic.csv", index=False)