<a href="https://colab.research.google.com/github/inshra12/deep-learning-protein-research-inshara/blob/main/notebooks/AAC_Protein_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AAC Feature Extraction


## Import Libraries

In [35]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier








## Read from csv file

In [51]:
# Read the uploaded file
df = pd.read_csv("/content/protein_sequences_200.csv")
df

Unnamed: 0,Sequence,Label
0,KRNERANYEVMYGIKAPHSLNFAIHLTTTPPLITIIATPQCMMLAE...,0
1,KHPVPKDNPAKAMGANTDIQNQFDQQSKRERALVRTIYFCYYEEAV...,0
2,QPIHKFMLWASICRRPDRAHGKVQLPFGMEKMFEGFVYFKCESNQK...,0
3,LKPDYWPWLCPEPWHMTACNIELLAVCIVPGDASPEEFMKISHLQF...,0
4,YHTIYSPFTVPMCAVWRNQMLQQGIRDFVNNFCACETMQNQVSNSA...,0
...,...,...
195,WEMQGRQYEMASHNVLDLQLDCVHDYWFLKSPAMHDYWQMMALIAT...,1
196,LNAPQPFMDFNAIQMEFARCWRLPKCFKWFESHCYIDKFMAWQFGK...,1
197,TETYRKHWGRKRHDPVSAATPPEYWNISVIVPLQMGQHQVFAIEGT...,0
198,GHMVNMVLTITWVWNNKNDGSERNVCNEWDRDLLCPHPRYDDAGNW...,1


## Applying AAC Technique

In [47]:
# AAC = Amino Acid Composition
def compute_aac(sequence):
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    seq_len = len(sequence)
    if seq_len == 0:
        return [0] * len(amino_acids)  # if empty, return zeros
    features = []
    for aa in amino_acids:
        features.append(sequence.count(aa) / seq_len)  # frequency
    return features

# Apply AAC to each sequence
X = df['Sequence'].apply(compute_aac)
X = np.array(X.tolist())
y = df['Label'].values


## Split Data

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Models

In [39]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}



## Train & Predict Model

In [41]:

# Loop through each model and print its accuracy
for name, model in models.items():
    model.fit(X_train, y_train)                      # train the model
    y_pred = model.predict(X_test)                   # test the model
    accuracy = accuracy_score(y_test, y_pred)        # get accuracy
    print(f"{name} Accuracy: {accuracy:.2f}")
   # print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    #print("\nClassification Report:\n", classification_report(y_test, y_pred))




Logistic Regression Accuracy: 0.43
Random Forest Accuracy: 0.48
Decision Tree Accuracy: 0.45
Naive Bayes Accuracy: 0.47
Support Vector Machine Accuracy: 0.42
K-Nearest Neighbors Accuracy: 0.48
