In [4]:
import pandas as pd
from collections import defaultdict
import math
from Bio import SeqIO

In [6]:
# data = pd.read_csv(r'D:\jupyter_lab\transformer_xai_peptide\input\full_feature_input\amp_ncbi.csv')
# data

In [174]:
# Name of the output CSV file; it is joined with the output directory when the
# feature table is saved further down in the notebook.
file_name = "nAMP_notGenBank_part1.csv"

In [176]:
# Path to the input FASTA file
fasta_file = r"C:\Users\hp\anaconda3\envs\TH_TTNT\src\transformer_xai_peptide\126_dac_trung\sequences_part1.fasta"

In [178]:
def fasta_to_dataframe(fasta_file, description):
    """
    Read a FASTA file into a DataFrame with one row per record.

    Parameters:
    - fasta_file: Path or handle passed straight to SeqIO.parse with format "fasta".
    - description (str): Label stored in the "Label" column for every record.
      When it is the empty string, each record's label is instead taken from the
      second whitespace-separated token of that record's own header.

    Returns:
    - pandas.DataFrame: Columns "ID", "Label" and "Sequence", present even when
      the file contains no records.

    Raises:
    - IndexError: If description is '' and a record header has no second token.
    """
    columns = ["ID", "Label", "Sequence"]
    data = []

    # Walk through every record in the FASTA file
    for record in SeqIO.parse(fasta_file, "fasta"):
        seq_id = record.id.split()[0]

        if description == '':
            # Resolve the label per record into a local variable. Assigning it
            # back to `description` would make the condition false after the
            # first record, so every later record would reuse the first label.
            header_tokens = record.description.split()
            if len(header_tokens) < 2:
                raise IndexError(
                    f"FASTA record {seq_id!r} has no label token after its ID in the header"
                )
            label = header_tokens[1]
        else:
            label = description

        data.append({
            "ID": seq_id,
            "Label": label,
            "Sequence": str(record.seq)
        })

    # Passing the column names explicitly keeps the schema stable for an empty file
    return pd.DataFrame(data, columns=columns)

In [179]:
# Call the function to convert the FASTA file into a DataFrame.
# A non-empty label is passed, so every row gets 'nAMP' in its "Label" column.
df = fasta_to_dataframe(fasta_file, 'nAMP')

# Inspect the resulting columns
df.columns

Index(['ID', 'Label', 'Sequence'], dtype='object')

In [181]:
def generate_distribution_features(sequence):
    """
    Build distribution descriptors for every amino-acid class of every property.

    For each (property, class) pair, the positions of the residues in `sequence`
    that belong to the class are collected, and six checkpoints along that list
    are reported as a percentage of the sequence length.

    Parameters:
    - sequence (str): The amino acid sequence.

    Returns:
    - dict: Maps "<property>_<class>_<checkpoint>" to a value, where checkpoint
      is one of "first", "20%", "40%", "60%", "80%", "100%". The value is 0 for
      all six checkpoints when the class does not occur in the sequence.
    """

    # Amino-acid groupings, keyed by property and then by class name
    property_classes = {
        "hydrophobicity": {
            "polar": "RKEDQN",
            "neutral": "GASTPHY",
            "hydrophobic": "CLVIMFW"
        },
        "vdw_volume": {
            "small": "GASTPDC",
            "medium": "NVEQIL",
            "large": "MHKFRYW"
        },
        "polarity": {
            "polar": "EDQNKR",
            "neutral": "GASTPHY",
            "nonpolar": "CLVIMFW"
        },
        "polarizability": {
            "low": "GASDT",
            "medium": "CPNVEQIL",
            "high": "KMHFRYW"
        },
        "charge": {
            "negative": "DE",
            "neutral": "ACFGHILMNPQSTVWY",
            "positive": "KR"
        },
        "secondary_structure": {
            "helix": "EALMQKRH",
            "sheet": "VIYCWT",
            "coil": "GNPSD"
        },
        "solvent_accessibility": {
            "buried": "ALFCGIVW",
            "intermediate": "MPSTHY",
            "exposed": "DEKNQR"
        }
    }

    # (label, fraction) pairs, in the order the features are emitted
    checkpoints = (
        ("first", 0),
        ("20%", 0.2),
        ("40%", 0.4),
        ("60%", 0.6),
        ("80%", 0.8),
        ("100%", 1.0),
    )
    seq_len = len(sequence)

    def describe_class(members):
        """Return the six checkpoint descriptors for one amino-acid class."""
        hits = [idx for idx, residue in enumerate(sequence) if residue in members]
        if not hits:
            # The class is absent from the sequence
            return {label: 0 for label, _ in checkpoints}

        n_hits = len(hits)
        last = n_hits - 1
        # floor(n_hits * fraction) indexes into the hit list, clamped so that the
        # 100% checkpoint lands on the final hit instead of running off the end.
        # The 0-based sequence index is shifted to 1-based before scaling.
        return {
            label: (hits[min(math.floor(n_hits * fraction), last)] + 1) / seq_len * 100
            for label, fraction in checkpoints
        }

    features = {}
    for prop, classes in property_classes.items():
        for class_name, members in classes.items():
            for key, value in describe_class(members).items():
                features[f"{prop}_{class_name}_{key}"] = value

    return features

In [182]:
# # Áp dụng hàm generate_distribution_features cho từng chuỗi trong cột 'Sequence'
# features_list = df['Sequence'].apply(generate_distribution_features)

# # Chuyển đổi danh sách các đặc trưng thành DataFrame
# features_df = pd.DataFrame(features_list.tolist())

# # Nối DataFrame gốc với các đặc trưng mới
# df_with_features = pd.concat([df, features_df], axis=1)

# # Lưu DataFrame kết quả vào tệp CSV
# output_file = r'D:\jupyter_lab\transformer_xai_peptide\input\full_feature_input\amp_httpsapp.peptipedia.clactivity2.csv'
# df_with_features.to_csv(output_file, index=False)

# print(f"Kết quả đã được lưu vào {output_file}")


In [183]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import os


# Apply generate_distribution_features to every sequence in the 'Sequence' column
features_list = df['Sequence'].apply(generate_distribution_features)

# Turn the per-sequence feature dicts into a DataFrame (one column per feature)
features_df = pd.DataFrame(features_list.tolist())

# Join the original DataFrame with the new feature columns
df_with_features = pd.concat([df, features_df], axis=1)

# Step 1: pick the columns to normalize (everything numeric except ID, Label, Sequence)
cols_to_normalize = [col for col in df_with_features.columns 
                    if col not in ['ID', 'Label', 'Sequence'] 
                    and pd.api.types.is_numeric_dtype(df_with_features[col])]

# Step 2: apply Min-Max normalization to the selected columns.
# NOTE(review): the scaler is fitted on this file's rows only, so the min/max
# used here are specific to this input; confirm that is intended if several
# parts are later combined.
scaler = MinMaxScaler()
df_with_features[cols_to_normalize] = scaler.fit_transform(df_with_features[cols_to_normalize])

# Step 3: save the normalized table as CSV
output_dir = r'C:\Users\hp\anaconda3\envs\TH_TTNT\src\transformer_xai_peptide\126_dac_trung\nAMP_notGenBank'
# to_csv does not create missing directories, so make sure the target exists first
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, file_name)

df_with_features.to_csv(output_file, index=False)

print(f"Kết quả đã được lưu vào {output_file}")
print("Các cột đã được normalize:", cols_to_normalize)
print("Các cột KHÔNG normalize:", ['ID', 'Label', 'Sequence'])

Kết quả đã được lưu vào C:\Users\hp\anaconda3\envs\TH_TTNT\src\transformer_xai_peptide\126_dac_trung\AMP_notGenBank\AMP_notGenBank_part4.csv
Các cột đã được normalize: ['hydrophobicity_polar_first', 'hydrophobicity_polar_20%', 'hydrophobicity_polar_40%', 'hydrophobicity_polar_60%', 'hydrophobicity_polar_80%', 'hydrophobicity_polar_100%', 'hydrophobicity_neutral_first', 'hydrophobicity_neutral_20%', 'hydrophobicity_neutral_40%', 'hydrophobicity_neutral_60%', 'hydrophobicity_neutral_80%', 'hydrophobicity_neutral_100%', 'hydrophobicity_hydrophobic_first', 'hydrophobicity_hydrophobic_20%', 'hydrophobicity_hydrophobic_40%', 'hydrophobicity_hydrophobic_60%', 'hydrophobicity_hydrophobic_80%', 'hydrophobicity_hydrophobic_100%', 'vdw_volume_small_first', 'vdw_volume_small_20%', 'vdw_volume_small_40%', 'vdw_volume_small_60%', 'vdw_volume_small_80%', 'vdw_volume_small_100%', 'vdw_volume_medium_first', 'vdw_volume_medium_20%', 'vdw_volume_medium_40%', 'vdw_volume_medium_60%', 'vdw_volume_medium_80