<a href="https://colab.research.google.com/github/inshra12/deep-learning-protein-research-inshara/blob/main/notebooks/dna_binding_protein_prediction_using_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Extraction Techniques

In [30]:
pip install biopython




## Import Libraries

In [31]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


## Load FASTA Files

In [32]:
# Load sequences from a FASTA file
def load_fasta_sequences(filepath, label):
    """Parse a FASTA file into a list of labelled record dictionaries.

    Parameters
    ----------
    filepath : str
        Path of the FASTA file handed to ``SeqIO.parse``.
    label : any
        Value stored under the ``"label"`` key of every returned record.

    Returns
    -------
    list of dict
        One dictionary per FASTA entry with the keys ``"id"`` (the record
        id), ``"sequence"`` (the record's sequence converted to ``str``)
        and ``"label"``.
    """
    return [
        {"id": entry.id, "sequence": str(entry.seq), "label": label}
        for entry in SeqIO.parse(filepath, "fasta")
    ]

# Load both positive (label 1) and negative (label 0) classes
# NOTE(review): the "negetive" spelling presumably matches the uploaded file name — confirm.
positive_records = load_fasta_sequences("/content/positive_160.fasta", label=1)
negative_records = load_fasta_sequences("/content/negetive_160.fasta", label=0)

# Combine both lists (positives first, then negatives)
all_records = positive_records + negative_records

# Convert to a DataFrame: one row per record with columns id, sequence, label
df = pd.DataFrame(all_records)

# Save to CSV (optional)
df.to_csv("/content/protein_sequences.csv", index=False)

# Preview the combined data once (pandas truncates long frames when printing)
print(df)


                            id  \
0    sp|A0A0C5B5G6|MOTSC_HUMAN   
1        sp|A6NI15|MSGN1_HUMAN   
2        sp|A8MT69|CENPX_HUMAN   
3        sp|A8MZ59|LEUTX_HUMAN   
4        sp|O00488|ZN593_HUMAN   
..                         ...   
315      sp|Q9UKS7|IKZF2_HUMAN   
316      sp|Q9ULC4|MCTS1_HUMAN   
317        sp|Q9ULZ3|ASC_HUMAN   
318      sp|Q9UPN6|SCAF8_HUMAN   
319       sp|Q9Y3C8|UFC1_HUMAN   

                                              sequence  label  
0                                     MRWQEMGYIFYPRKLR      1  
1    MDNLRETFLSLEDGLGSSDSPGLLSSWDWKDRAGPFELNQASPSQS...      1  
2    MEGAGAGSGFRKELVSRLLHLHFKDDKTKVSGDALQLMVELLKVFV...      1  
3    MFEGPRRYRRPRTRFLSKQLTALRELLEKTMHPSLATMGKLASKLQ...      1  
4    MGRSRRTGAHRAHSLARQMKAKRRRPDLDEIHRELRPQGSARPQPD...      1  
..                                                 ...    ...  
315  METEAIDGYITCDNELSPEREHSNMAIDLTSSTPNGQHASPSHMTS...      0  
316  MFKKFDEKENVSNCIQLKTSVIKGIKNQLIEQFPGIEPWLNQIMPK...      0  
317  MGRARDAILD

## AAC = Amino Acid Composition

In [33]:
# AAC = Amino Acid Composition
def compute_aac(sequence):
    """Compute the Amino Acid Composition (AAC) of a protein sequence.

    Parameters
    ----------
    sequence : str
        Protein sequence in one-letter codes. Matching is case-insensitive.

    Returns
    -------
    list of float
        20 values, one per residue in the order ``ACDEFGHIKLMNPQRSTVWY``.
        Each value is the number of occurrences of that residue divided by
        the full sequence length. Characters outside the 20 listed residues
        count toward the length but not toward any feature, so the values
        may sum to less than 1. An empty sequence yields all zeros.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    # Normalise case so lower-case input is not silently scored as all zeros.
    sequence = sequence.upper()
    seq_len = len(sequence)
    if seq_len == 0:
        # Avoid division by zero; keep the return type consistent (floats).
        return [0.0] * len(amino_acids)
    return [sequence.count(aa) / seq_len for aa in amino_acids]

# Build the AAC feature matrix (one row of features per sequence) and the label vector
X = np.array([compute_aac(seq) for seq in df['sequence']])
y = df['label'].values

### Train/Test Split

In [34]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [35]:
# Candidate classifiers, keyed by the display name used when reporting results.
# random_state is fixed on the tree-based estimators, whose fitting is
# otherwise randomised, so the reported accuracies are reproducible across runs.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}


### Train and Evaluate Models

In [36]:
# Fit every candidate classifier on the training split and report its
# accuracy on the held-out test split.
# For a per-class breakdown, confusion_matrix(y_test, y_pred) and
# classification_report(y_test, y_pred) can be printed inside the loop as well.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")

Logistic Regression Accuracy: 0.65
Random Forest Accuracy: 0.64
Decision Tree Accuracy: 0.60
Naive Bayes Accuracy: 0.66
Support Vector Machine Accuracy: 0.64
K-Nearest Neighbors Accuracy: 0.68


## DPC = Dipeptide Composition

In [37]:
# Dpc =  Dipeptide Composition
from itertools import product

def compute_dpc(sequence):
    """Compute the Dipeptide Composition (DPC) of a protein sequence.

    Parameters
    ----------
    sequence : str
        Protein sequence in one-letter codes. Matching is case-insensitive.

    Returns
    -------
    list of float
        400 values, one per ordered residue pair drawn from
        ``ACDEFGHIKLMNPQRSTVWY`` (``AA``, ``AC``, ..., ``YY``). Each value is
        the number of overlapping occurrences of that pair divided by
        ``len(sequence) - 1``. Pairs containing a character outside the 20
        listed residues are not counted but still contribute to the
        denominator, so the values may sum to less than 1. Sequences
        shorter than two characters yield all zeros.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    dipeptides = [''.join(p) for p in product(amino_acids, repeat=2)]

    # Normalise case so lower-case input is not silently scored as all zeros.
    sequence = sequence.upper()
    total_dipeptides = len(sequence) - 1
    if total_dipeptides <= 0:
        # No adjacent pair exists; also avoids dividing by zero.
        return [0.0] * len(dipeptides)

    dpc_dict = dict.fromkeys(dipeptides, 0)
    # Slide a window of width 2 over the sequence (overlapping pairs).
    for i in range(total_dipeptides):
        dipep = sequence[i:i + 2]
        if dipep in dpc_dict:
            dpc_dict[dipep] += 1

    return [dpc_dict[dp] / total_dipeptides for dp in dipeptides]

# Apply DPC to each sequence
# Build the DPC feature matrix (one row of features per sequence) and the label vector
X = np.array([compute_dpc(seq) for seq in df['sequence']])
y = df['label'].values


### Train/Test Split

In [38]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [39]:
# Fresh, unfitted classifiers for the DPC features, keyed by display name.
# random_state is fixed on the tree-based estimators, whose fitting is
# otherwise randomised, so the reported accuracies are reproducible across runs.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}


### Train and Evaluate Models

In [40]:
# Fit every candidate classifier on the training split and report its
# accuracy on the held-out test split.
# For a per-class breakdown, confusion_matrix(y_test, y_pred) and
# classification_report(y_test, y_pred) can be printed inside the loop as well.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")

Logistic Regression Accuracy: 0.57
Random Forest Accuracy: 0.71
Decision Tree Accuracy: 0.56
Naive Bayes Accuracy: 0.72
Support Vector Machine Accuracy: 0.68
K-Nearest Neighbors Accuracy: 0.59
