<a href="https://colab.research.google.com/github/inshra12/iACP-Replication-AAC-DPC/blob/main/iACP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# iACP anticancer peptides

## Importing Libraries

In [2]:
import pandas as pd
import numpy as np

## Read CSV File

In [3]:
# Load the iACP dataset into a DataFrame; later cells read its 'Sequence' and 'Label' columns.
# NOTE(review): the absolute '/content/...' path presumably targets the Colab filesystem —
# adjust it when running the notebook elsewhere.
df = pd.read_csv('/content/iACP_dataset.csv')

In [4]:
# Sanity check on the loaded data: print how many rows carry each 'Label' value.
# The full-frame dumps below are left disabled.
#print(df)
#print(df.info())
print("\nClass Distribution:")
print(df['Label'].value_counts())



Class Distribution:
Label
0    206
1    143
Name: count, dtype: int64


## Feature Extraction

### AAC - Amino Acid Composition

In [5]:
def compute_acc(Sequence):
  """Compute the amino acid composition (AAC) of a peptide sequence.

  Args:
    Sequence: Peptide sequence as a string of one-letter residue codes.
      Letter case is ignored, in line with how ``calculate_dpc`` upper-cases
      its input before counting.

  Returns:
    A list of 20 floats in ``'ACDEFGHIKLMNPQRSTVWY'`` order, each being the
    number of occurrences of that residue divided by the sequence length.
    Characters outside those 20 letters are not counted but still contribute
    to the length used as the denominator. An empty sequence yields 20 zeros.
  """
  amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
  # Normalise case so that lowercase residues are counted rather than
  # silently contributing zero to every frequency.
  sequence = Sequence.upper()
  seq_len = len(sequence)
  if seq_len == 0:
    # Guard against division by zero; keep the element type consistent (float).
    return [0.0] * len(amino_acids)
  return [sequence.count(aa) / seq_len for aa in amino_acids]


In [6]:
# Build the AAC feature matrix (one row per sequence) and the matching label vector.
X = np.array(df['Sequence'].apply(compute_acc).tolist())
y = df['Label'].values

In [7]:
# Print the AAC feature matrix and the label vector for a quick visual check.
print(X,y)

[[0.32352941 0.         0.         ... 0.08823529 0.02941176 0.        ]
 [0.07142857 0.         0.07142857 ... 0.         0.         0.        ]
 [0.05882353 0.         0.05882353 ... 0.17647059 0.         0.        ]
 ...
 [0.08333333 0.         0.11111111 ... 0.05555556 0.         0.        ]
 [0.07142857 0.         0.03571429 ... 0.10714286 0.         0.        ]
 [0.03703704 0.         0.07407407 ... 0.03703704 0.         0.        ]] [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

### DPC - Dipeptide Composition

In [9]:
from itertools import product

# Alphabet of the 20 standard amino acids (one-letter codes).
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

# Every ordered residue pair: 20 x 20 = 400 dipeptides, from 'AA' through 'YY'.
dipeptides = [''.join(pair) for pair in product(amino_acids, repeat=2)]


def calculate_dpc(sequence):
    """Return the dipeptide composition (DPC) vector of a peptide sequence.

    The sequence is upper-cased, then every pair of adjacent characters is
    tallied if it is one of the 400 entries in ``dipeptides``. Each tally is
    divided by the number of adjacent pairs in the sequence
    (``len(sequence) - 1``), so pairs containing other characters lower the
    total of the vector instead of being redistributed.

    Args:
        sequence: Peptide sequence as a string of one-letter residue codes.

    Returns:
        A list of 400 values ordered like ``dipeptides``. Sequences with
        fewer than two characters produce a list of zeros.
    """
    residues = sequence.upper()
    n_windows = len(residues) - 1

    # Fewer than two residues: there is no dipeptide to count.
    if n_windows <= 0:
        return [0] * len(dipeptides)

    tally = dict.fromkeys(dipeptides, 0)
    # Slide a window of width two along the sequence.
    for left, right in zip(residues, residues[1:]):
        key = left + right
        if key in tally:
            tally[key] += 1

    return [tally[name] / n_windows for name in dipeptides]

# 4. Compute the DPC vector for every sequence in the dataset
dpc_features = df['Sequence'].apply(calculate_dpc)

# 5. Convert to DataFrame (one 'DPC_<dipeptide>' column per dipeptide)
dpc_df = pd.DataFrame(dpc_features.tolist(), columns=[f'DPC_{d}' for d in dipeptides])

# 6. Expose the DPC features and labels as the model inputs.
#    The earlier `X = np.array(X.tolist())` only re-wrapped whatever X already
#    held (the AAC matrix from the previous section), so the DPC features never
#    reached X and the cell failed on a fresh kernel if X was undefined.
X = dpc_df.to_numpy()
y = df['Label'].values


In [10]:
print(dpc_df)

       DPC_AA  DPC_AC  DPC_AD    DPC_AE  DPC_AF    DPC_AG  DPC_AH    DPC_AI  \
0    0.151515     0.0     0.0  0.000000  0.0000  0.030303     0.0  0.000000   
1    0.000000     0.0     0.0  0.076923  0.0000  0.000000     0.0  0.000000   
2    0.000000     0.0     0.0  0.000000  0.0625  0.000000     0.0  0.000000   
3    0.000000     0.0     0.0  0.000000  0.0000  0.000000     0.0  0.000000   
4    0.000000     0.0     0.0  0.000000  0.0000  0.062500     0.0  0.000000   
..        ...     ...     ...       ...     ...       ...     ...       ...   
344  0.000000     0.0     0.0  0.000000  0.0000  0.000000     0.0  0.060606   
345  0.000000     0.0     0.0  0.000000  0.0000  0.000000     0.0  0.000000   
346  0.000000     0.0     0.0  0.000000  0.0000  0.000000     0.0  0.057143   
347  0.000000     0.0     0.0  0.000000  0.0000  0.037037     0.0  0.000000   
348  0.000000     0.0     0.0  0.038462  0.0000  0.000000     0.0  0.000000   

       DPC_AK    DPC_AL  ...  DPC_YM  DPC_YN    DPC