<a href="https://colab.research.google.com/github/inshra12/Protein-Subcellular-Localization-using-Traditional-ML/blob/main/subcelluar_localization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [79]:
!pip install datasets



In [None]:
# Step 1: Import required libraries
from datasets import load_dataset
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Step 2: Load dataset from Hugging Face
# Only the "train" split of the "proteinea/deeploc" dataset is fetched here.
dataset = load_dataset("proteinea/deeploc", split="train")

# Step 3: Convert to pandas DataFrame
# Later cells read the "input" column (sequences) and the "membrane" column
# (labels) from this frame.
df = dataset.to_pandas()

# Step 4: Define the 20 standard amino acids
# The order of the letters in this string fixes the order of the AAC columns
# and of the generated dipeptide (DPC) columns.
AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'

In [None]:
# Step 5: Function to calculate AAC (Amino Acid Composition)
def calculate_aac(sequence):
    """Return the amino acid composition (AAC) of a protein sequence.

    Parameters
    ----------
    sequence : str
        Protein sequence. Case-insensitive: it is upper-cased before counting.

    Returns
    -------
    list of float
        One value per residue in ``AMINO_ACIDS``, in that order: the number of
        occurrences of the residue divided by the full sequence length.
        Characters that are not in ``AMINO_ACIDS`` still count towards the
        length, so the values may sum to less than 1. An empty sequence
        yields all zeros.
    """
    sequence = sequence.upper()
    length = len(sequence)
    if length == 0:
        # Nothing to count; return a zero vector instead of dividing by zero.
        return [0.0] * len(AMINO_ACIDS)
    counts = Counter(sequence)
    # Counter returns 0 for missing keys, so absent residues need no special case.
    return [counts[aa] / length for aa in AMINO_ACIDS]

# Step 6: Apply AAC to all sequences
# `aac_features` is a Series holding one list of AAC values per row of df["input"].
aac_features = df['input'].apply(calculate_aac)
# Expand those lists into a table with one "AAC_<residue>" column per entry of
# AMINO_ACIDS. Building the frame from a plain list gives it a fresh 0..n-1 index.
aac_df = pd.DataFrame(aac_features.tolist(), columns=[f"AAC_{aa}" for aa in AMINO_ACIDS])


In [None]:
# Step 7: DPC Feature Extraction
# Generate all 400 possible dipeptides
# (every ordered pair of residues from AMINO_ACIDS; the first residue varies
# slowest, and this list's order is the order of the DPC feature values).
dipeptides = [aa1 + aa2 for aa1 in AMINO_ACIDS for aa2 in AMINO_ACIDS]

def calculate_dpc(sequence):
    """Return the dipeptide composition (DPC) of a protein sequence.

    The sequence is upper-cased and every overlapping pair of adjacent
    characters is counted. The result has one entry per item of
    ``dipeptides`` (same order): that pair's count divided by the total
    number of pairs in the sequence. Pairs that are not listed in
    ``dipeptides`` still contribute to the total. A sequence shorter than
    two characters has no pairs and yields all zeros.
    """
    seq = sequence.upper()
    last_start = len(seq) - 1
    pair_counts = Counter(seq[pos:pos + 2] for pos in range(last_start))
    n_pairs = sum(pair_counts.values())

    # Guard first: without any pair there is nothing to normalise by.
    if not n_pairs > 0:
        return [0] * len(dipeptides)

    return [pair_counts[pair] / n_pairs for pair in dipeptides]

# Step 8: Apply DPC to all sequences
# `dpc_features` is a Series holding one list of DPC values per row of df["input"].
dpc_features = df["input"].apply(calculate_dpc)
# Expand those lists into a table with one "DPC_<pair>" column per entry of
# `dipeptides`. Building the frame from a plain list gives it a fresh 0..n-1 index.
dpc_df = pd.DataFrame(dpc_features.tolist(), columns=[f"DPC_{dp}" for dp in dipeptides])

In [None]:
# Step 9: Combine AAC + DPC features
# Column-wise concatenation: the AAC columns come first, followed by the DPC columns.
combined_df = pd.concat([aac_df, dpc_df], axis=1)

# Step 10: Add labels
# The target is the "membrane" column of the source frame. The assignment
# aligns on the index.
# NOTE(review): assumes df carries the default 0..n-1 index so rows line up
# with the feature tables — confirm.
combined_df["label"] = df["membrane"]

In [None]:
# Step 11: Prepare data for ML
# Features are every column except "label"; the target is the "label" column.
X = combined_df.drop("label", axis=1)
y = combined_df["label"]
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 12: Define the models to compare
# Estimators with internal randomness get an explicit seed so that a
# Restart & Run All reproduces the same scores. The decision tree was
# previously unseeded and could change between runs.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=31),
    "Decision Tree": DecisionTreeClassifier(random_state=31),
    "Naive Bayes": GaussianNB(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

In [80]:
# Step 13: Train & evaluate each model
# Every model in `models` is fitted on the training split and scored on the
# held-out test split; the results are printed per model.
for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    # Separator line between the reports of consecutive models.
    print("=" * 50)
    print(f"{name} Accuracy: {accuracy:.2f}")
    # Both reports compare the true test labels (first argument) with the predictions.
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


Logistic Regression Accuracy: 0.56
Confusion Matrix:
 [[126  67 124]
 [ 18 128 296]
 [ 21  57 488]]
Classification Report:
               precision    recall  f1-score   support

           M       0.76      0.40      0.52       317
           S       0.51      0.29      0.37       442
           U       0.54      0.86      0.66       566

    accuracy                           0.56      1325
   macro avg       0.60      0.52      0.52      1325
weighted avg       0.58      0.56      0.53      1325

Random Forest Accuracy: 0.59
Confusion Matrix:
 [[142  71 104]
 [ 19 193 230]
 [ 23  99 444]]
Classification Report:
               precision    recall  f1-score   support

           M       0.77      0.45      0.57       317
           S       0.53      0.44      0.48       442
           U       0.57      0.78      0.66       566

    accuracy                           0.59      1325
   macro avg       0.62      0.56      0.57      1325
weighted avg       0.61      0.59      0.58      13