In [3]:

import pandas as pd
import numpy as np
from Bio import Entrez, SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Contact e-mail that Biopython attaches to every Entrez request. NCBI asks
# API users to identify themselves so they can be reached about problematic
# usage before access is restricted.
Entrez.email = "ilakiya1909@gmail.com"

# fetch protein using NCBI
def fetch_protein_sequence(accession):
    """Download a single protein sequence from the NCBI protein database.

    Args:
        accession: Identifier passed to ``Entrez.efetch`` as ``id``.

    Returns:
        The sequence as a plain string, or ``None`` when the download or the
        FASTA parsing fails for any reason (the error is printed).
    """
    try:
        handle = Entrez.efetch(db="protein", id=accession, rettype="fasta", retmode="text")
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Release the network handle even when parsing raises; otherwise
            # a malformed or empty response would leak the connection.
            handle.close()
        return str(record.seq)
    except Exception as e:
        # Deliberately best-effort: callers check for None instead of
        # handling network/parse exceptions themselves.
        print(f"Error fetching sequence: {e}")
        return None


# Accession of the protein whose melting temperature is predicted further
# down ("1TKX_A" looks like a PDB-style entry/chain identifier).
hiv_protein_accession = "1TKX_A"
hiv_protein_sequence = fetch_protein_sequence(hiv_protein_accession)

# Echo only the first 50 residues as a quick check that the download worked;
# nothing is printed here when the fetch returned None or an empty string.
if hiv_protein_sequence:
    print(
        f"Fetched Sequence ({hiv_protein_accession}): {hiv_protein_sequence[:50]}..."
    )


def extract_features(sequence):
    """Turn a protein sequence into an amino-acid composition vector.

    Args:
        sequence: Protein sequence string handed to ``ProteinAnalysis``.

    Returns:
        A list holding one composition value per amino acid, in the order in
        which the ``amino_acids_percent`` mapping yields its entries.
    """
    composition = ProteinAnalysis(sequence).amino_acids_percent
    return [composition[residue] for residue in composition]

# Toy training set: five short sequences paired with the "Tm" values used as
# the regression target.
data = {
    "Sequence": [
        "MKTLLILTCLVAVALARPKT",
        "MGDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGQVC",
        "MAKKNVSTVLASSLAVVTAVIL",
        "MKTVFAGVLMNTSK",
        "MTQLLQFSAFAISASALSTF",
    ],
    "Tm": [55.3, 63.5, 47.2, 58.9, 52.7],
}

df = pd.DataFrame(data)

# One composition vector per row, kept in the frame next to its sequence.
df["Features"] = df["Sequence"].map(extract_features)

# Stack the per-row vectors into a 2-D design matrix; the targets form a
# 1-D array aligned with its rows.
X = np.array(df["Features"].tolist())
y = df["Tm"].to_numpy()

# Hold out 20% of the rows for evaluation. With the five-row example dataset
# that is a single test sample, so the error printed below is only a smoke
# test of the pipeline, not a meaningful accuracy estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the tree as well as the split: scikit-learn permutes the candidate
# features at every split, so equally good splits (very likely with this few
# rows) would otherwise be chosen differently from run to run.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Mean absolute error on the held-out rows, in the same unit as "Tm".
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}°C")

# Predict only when the earlier download produced a non-empty sequence.
if hiv_protein_sequence:
    # The model expects a 2-D input, so wrap the single feature vector in an
    # outer list to obtain one row.
    new_features = np.array([extract_features(hiv_protein_sequence)])
    predicted_tm = model.predict(new_features)
    print(f"Predicted Tm for '{hiv_protein_accession}': {predicted_tm[0]:.2f}°C")


Fetched Sequence (1TKX_A): PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGKISKI...
Mean Absolute Error: 16.30°C
Predicted Tm for '1TKX_A': 47.20°C
