<a href="https://colab.research.google.com/github/inshra12/iACP-Replication-AAC-DPC/blob/main/iACP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# iACP anticancer peptides

## Importing Libraries

In [70]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

## Read CSV File

In [37]:
df = pd.read_csv('/content/iACP_dataset.csv')

In [38]:
#print(df)
#print(df.info())
print("\nClass Distribution:")
print(df['Label'].value_counts())



Class Distribution:
Label
0    206
1    143
Name: count, dtype: int64


## Feature Extraction

### AAC - Amino Acid Compostion

In [39]:
def compute_acc(Sequence):
  amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
  seq_len = len(Sequence)
  if seq_len == 0:
    return [0]*len(amino_acids)
  aac = []
  for aa in amino_acids:
    aac.append(Sequence.count(aa)/seq_len)
  return aac


In [40]:
X = df['Sequence'].apply(compute_acc)
X = np.array(X.tolist())
y = df['Label'].values

In [41]:
print(X,y)

[[0.32352941 0.         0.         ... 0.08823529 0.02941176 0.        ]
 [0.07142857 0.         0.07142857 ... 0.         0.         0.        ]
 [0.05882353 0.         0.05882353 ... 0.17647059 0.         0.        ]
 ...
 [0.08333333 0.         0.11111111 ... 0.05555556 0.         0.        ]
 [0.07142857 0.         0.03571429 ... 0.10714286 0.         0.        ]
 [0.03703704 0.         0.07407407 ... 0.03703704 0.         0.        ]] [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

### DPC - Dipeptide Composition

In [42]:
from itertools import product

# 1. Define 20 amino acids
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

# 2. Create all 400 possible dipeptides (AA, AC, AD, ..., YY)
dipeptides = [aa1 + aa2 for aa1, aa2 in product(amino_acids, repeat=2)]

# 3. Function to calculate DPC
def calculate_dpc(sequence):
    sequence = sequence.upper()
    dpc_counts = {dipep: 0 for dipep in dipeptides}
    total = len(sequence) - 1  # Total possible dipeptides

    for i in range(total):
        pair = sequence[i:i+2]
        if pair in dpc_counts:
            dpc_counts[pair] += 1

    # Normalize
    dpc_vector = [dpc_counts[dipep] / total if total > 0 else 0 for dipep in dipeptides]

    return dpc_vector

# 4. Apply to your dataset
dpc_features = df['Sequence'].apply(calculate_dpc)

# 5. Convert to DataFrame
dpc_df = pd.DataFrame(dpc_features.tolist(), columns=[f'DPC_{d}' for d in dipeptides])
X = np.array(X.tolist())
y = df['Label'].values


## Combine AAC and DPC

In [43]:
# 1. Combine AAC and DPC
X_df = pd.DataFrame(X)
combined_features = pd.concat([X_df, dpc_df], axis=1)

# 2. Add labels (assumes your original df has 'label' column)
combined_features['Label'] = df['Label']

In [44]:
print("Shape of final dataset:", combined_features.shape)
print("First few rows:")
print(combined_features)

Shape of final dataset: (349, 421)
First few rows:
            0         1         2         3         4         5         6  \
0    0.323529  0.000000  0.000000  0.088235  0.000000  0.117647  0.000000   
1    0.071429  0.000000  0.071429  0.071429  0.071429  0.071429  0.000000   
2    0.058824  0.000000  0.058824  0.000000  0.058824  0.176471  0.000000   
3    0.058824  0.000000  0.058824  0.000000  0.058824  0.176471  0.000000   
4    0.058824  0.000000  0.058824  0.000000  0.058824  0.176471  0.000000   
..        ...       ...       ...       ...       ...       ...       ...   
344  0.088235  0.029412  0.117647  0.058824  0.000000  0.000000  0.000000   
345  0.035714  0.000000  0.035714  0.000000  0.035714  0.071429  0.071429   
346  0.083333  0.000000  0.111111  0.083333  0.000000  0.027778  0.000000   
347  0.071429  0.000000  0.035714  0.035714  0.000000  0.035714  0.035714   
348  0.037037  0.000000  0.074074  0.037037  0.000000  0.074074  0.037037   

            7         8 

In [45]:
# 1. Separate features and labels
X = combined_features.drop('Label', axis=1)
y = combined_features['Label']

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [81]:

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=31),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB(),
   "Support Vector Machine": SVC(),
   "K-Nearest Neighbors": KNeighborsClassifier()
}

In [82]:
# Loop through each model and print its accuracy
for name, model in models.items():
    # Convert column names to strings
    X_train.columns = X_train.columns.astype(str)
    X_test.columns = X_test.columns.astype(str)

    model.fit(X_train, y_train)                      # train the model
    y_pred = model.predict(X_test)                   # test the model
    accuracy = accuracy_score(y_test, y_pred)        # get accuracy
    print(f"{name} Accuracy: {accuracy:.2f}")
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.84

Confusion Matrix:
 [[58  1]
 [16 30]]

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.98      0.87        59
           1       0.97      0.65      0.78        46

    accuracy                           0.84       105
   macro avg       0.88      0.82      0.83       105
weighted avg       0.86      0.84      0.83       105

Random Forest Accuracy: 0.90

Confusion Matrix:
 [[57  2]
 [ 9 37]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.97      0.91        59
           1       0.95      0.80      0.87        46

    accuracy                           0.90       105
   macro avg       0.91      0.89      0.89       105
weighted avg       0.90      0.90      0.89       105

Decision Tree Accuracy: 0.83

Confusion Matrix:
 [[52  7]
 [11 35]]

Classification Report:
               precision    recall  f1-score   support

      