<a href="https://colab.research.google.com/github/kiranshehzadichemist/AIDD_Peptide-targeting-RdRp-of-SARS-CoV-2/blob/main/AI_model_for_design_and_optimization_of_short_peptide_targeting_RdRp_of_SARS_CoV_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load prepared dataset
import joblib
import pandas as pd  # imported here: no earlier cell in this notebook imports pandas

RANDOM_STATE = 42  # fixed seed so the split and the forest are reproducible across runs

ml_df = pd.read_csv('amino_acid_interaction_dataset.csv')

# Split data
X = ml_df[['AA1_encoded', 'AA2_encoded', 'AvgDistance']]
y = ml_df['Interaction']
# Stratify only when every class has at least two rows; train_test_split
# raises a ValueError for smaller classes.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=stratify_labels
)

# Train classifier
model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Evaluate
# NOTE(review): in the dataset-building cells all three features and the label
# are functions of the (AA1, AA2) pair alone, and each pair is repeated many
# times, so identical rows land in both train and test. A perfect score here
# measures memorisation of the pair table, not generalisation.
print("Test accuracy:", model.score(X_test, y_test))

# Save model
joblib.dump(model, 'amino_acid_interaction_model.pkl')

Test accuracy: 1.0


['amino_acid_interaction_model.pkl']

In [None]:
def predict_interaction(aa1, aa2, model):
    """Return the model's probability that two residues interact.

    Args:
        aa1, aa2: residue strings; each is normalised with ``extract_aa``.
        model: fitted classifier exposing ``predict_proba`` (and normally
            ``classes_``).

    Returns:
        Probability assigned to class ``1``; ``0.0`` when the model was
        fitted without any class-1 rows.

    Raises:
        ValueError: from ``le.transform`` if a code was not seen when the
            encoder was fitted.
        KeyError: if a code is missing from ``distance_matrix``.
    """
    import pandas as pd  # local import so the function does not depend on cell order

    aa1_code = extract_aa(aa1)
    aa2_code = extract_aa(aa2)
    encoded_aa1, encoded_aa2 = le.transform([aa1_code, aa2_code])
    avg_dist = distance_matrix.loc[aa1_code, aa2_code]
    features = [[encoded_aa1, encoded_aa2, avg_dist]]

    # A model fitted on a DataFrame remembers its column names; passing the
    # same names back avoids scikit-learn's "valid feature names" warning.
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is not None:
        features = pd.DataFrame(features, columns=list(feature_names))

    # predict_proba columns follow model.classes_, so look up class 1
    # instead of assuming it is the second column.
    classes = list(getattr(model, 'classes_', []))
    if classes and 1 not in classes:
        return 0.0
    positive_column = classes.index(1) if classes else 1
    return model.predict_proba(features)[0][positive_column]

# Example prediction
print("Interaction probability between ARG and ASP:",
      predict_interaction('ARG', 'ASP', model))

Interaction probability between ARG and ASP: 1.0




In [None]:
# NOTE(review): this cell repeats the definition from the preceding cell.
def predict_interaction(aa1, aa2, model):
    """Return the model's probability that two residues interact.

    Args:
        aa1, aa2: residue strings; each is normalised with ``extract_aa``.
        model: fitted classifier exposing ``predict_proba`` (and normally
            ``classes_``).

    Returns:
        Probability assigned to class ``1``; ``0.0`` when the model was
        fitted without any class-1 rows.

    Raises:
        ValueError: from ``le.transform`` if a code was not seen when the
            encoder was fitted.
        KeyError: if a code is missing from ``distance_matrix``.
    """
    import pandas as pd  # local import so the function does not depend on cell order

    aa1_code = extract_aa(aa1)
    aa2_code = extract_aa(aa2)
    encoded_aa1, encoded_aa2 = le.transform([aa1_code, aa2_code])
    avg_dist = distance_matrix.loc[aa1_code, aa2_code]
    features = [[encoded_aa1, encoded_aa2, avg_dist]]

    # A model fitted on a DataFrame remembers its column names; passing the
    # same names back avoids scikit-learn's "valid feature names" warning.
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is not None:
        features = pd.DataFrame(features, columns=list(feature_names))

    # predict_proba columns follow model.classes_, so look up class 1
    # instead of assuming it is the second column.
    classes = list(getattr(model, 'classes_', []))
    if classes and 1 not in classes:
        return 0.0
    positive_column = classes.index(1) if classes else 1
    return model.predict_proba(features)[0][positive_column]

# Example prediction
print("Interaction probability between ARG and ASP:",
      predict_interaction('ARG', 'ASP', model))

In [None]:
import pandas as pd
import numpy as np

# Read the interaction table with both residue columns forced to strings,
# then discard rows where either residue is missing.
df = pd.read_csv(
    '/content/aa_interactions.csv',
    dtype=dict.fromkeys(['Residue1', 'Residue2'], str),
)
df = df.dropna(subset=['Residue1', 'Residue2'])

# Improved amino acid extraction with error handling
def extract_aa(residue):
    """Return the 3-letter amino-acid code found in ``residue``, or ``'UNK'``.

    All non-alphabetic characters are removed, then the first three remaining
    letters are upper-cased and returned.

    Args:
        residue: residue label in any format; non-strings are stringified.

    Returns:
        The upper-cased 3-letter code, or ``'UNK'`` when the value is missing
        (``None`` / NaN), cannot be stringified, or has fewer than three letters.
    """
    # Missing values must be caught before str(): str(None) -> 'None' and
    # str(nan) -> 'nan' would otherwise come back as the codes 'NON' / 'NAN'.
    # `residue != residue` is true only for NaN.
    if residue is None or (isinstance(residue, float) and residue != residue):
        return 'UNK'

    try:
        text = str(residue)
    except Exception:
        # Keep the original best-effort contract without a bare `except:`,
        # which would also swallow KeyboardInterrupt / SystemExit.
        return 'UNK'

    # Remove numbers and special characters
    cleaned = ''.join(c for c in text if c.isalpha())
    if len(cleaned) < 3:
        return 'UNK'
    return cleaned[:3].upper()

# Create validated interaction pairs
# Fail fast on a wrong schema: with a blanket per-row `except`, a missing
# column would silently mark every row as invalid.
required_columns = ['Residue1', 'Residue2', 'Distance', 'Atom1', 'Atom2']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Missing required columns: {missing_columns}")

valid_pairs = []
invalid_count = 0

for _, row in df.iterrows():
    try:
        aa1 = extract_aa(row['Residue1'])
        aa2 = extract_aa(row['Residue2'])
        distance = float(row['Distance'])
    except (TypeError, ValueError):
        # Unparseable residue or distance: count the row as invalid and move on.
        invalid_count += 1
        continue

    # `distance != distance` is true only for NaN; a NaN distance would
    # poison any average computed from these pairs.
    if aa1 == 'UNK' or aa2 == 'UNK' or distance != distance:
        invalid_count += 1
        continue

    valid_pairs.append((aa1, aa2, distance, row['Atom1'], row['Atom2']))

print(f"Cleaned data: {len(valid_pairs)} valid interactions")
print(f"Skipped {invalid_count} invalid entries")

# Create DataFrame from cleaned data
clean_df = pd.DataFrame(valid_pairs, columns=[
    'Residue1', 'Residue2', 'Distance', 'Atom1', 'Atom2'
])

# Proceed with analysis using clean_df
# ... [rest of your analysis code] ...

Cleaned data: 160721 valid interactions
Skipped 0 invalid entries


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from itertools import combinations

# Read the raw residue-residue interaction table
df = pd.read_csv(filepath_or_buffer='/content/aa_interactions.csv')

# Extract amino acid types from residue information
def extract_aa(residue):
    """Return the upper-cased first three characters of ``residue``.

    Args:
        residue: residue label whose first three characters (after removing
            surrounding whitespace) are the amino-acid code.

    Returns:
        The 3-character code in upper case (shorter if the label is shorter).

    Raises:
        TypeError: if ``residue`` is not a string, e.g. NaN from a blank cell.
    """
    if not isinstance(residue, str):
        raise TypeError(
            f"residue must be a string, got {type(residue).__name__}: {residue!r}"
        )
    # Leading whitespace would otherwise be counted as part of the code.
    return residue.strip()[:3].upper()

# Create interaction pairs
# Series.map applies extract_aa element-wise, avoiding the per-row Series that
# iterrows() constructs. Each entry is (code1, code2, distance).
interaction_pairs = list(zip(
    df['Residue1'].map(extract_aa),
    df['Residue2'].map(extract_aa),
    df['Distance'],
))

# Create affinity matrix
# Every residue code seen on either side of a pair, in sorted order.
aa_list = sorted({code for pair in interaction_pairs for code in pair[:2]})
count_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=int)
distance_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=float)
affinity_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=float)

# Tally each contact under both (aa1, aa2) and (aa2, aa1).
# distance_matrix holds running distance sums until the averaging step below.
for aa1, aa2, dist in interaction_pairs:
    for row_aa, col_aa in ((aa1, aa2), (aa2, aa1)):
        count_matrix.loc[row_aa, col_aa] += 1
        distance_matrix.loc[row_aa, col_aa] += dist

# Calculate average distance and affinity percentage
for aa in aa_list:
    row_counts = count_matrix.loc[aa]
    total = row_counts.sum()
    if total > 0:
        # Share of this residue's contacts going to each partner, in percent.
        affinity_matrix.loc[aa] = row_counts / total * 100
        # Sum / count; partners with zero contacts give 0/0 = NaN, filled below.
        distance_matrix.loc[aa] = distance_matrix.loc[aa] / row_counts

# Fill NaN values with max distance
largest_average = distance_matrix.max().max()
distance_matrix = distance_matrix.fillna(largest_average)

print("Affinity Matrix (%):", affinity_matrix.head(), sep="\n")
print("\nAverage Distance Matrix (Å):", distance_matrix.head(), sep="\n")

# Save matrices
for matrix, filename in (
    (affinity_matrix, 'amino_acid_affinity_matrix.csv'),
    (distance_matrix, 'amino_acid_distance_matrix.csv'),
):
    matrix.to_csv(filename)

# Prepare dataset for ML model
SEED = 42  # fixed seed so the sampled negatives are identical on every run

# One positive row per observed contact.
data = [
    {
        'AA1': aa1,
        'AA2': aa2,
        'Affinity': affinity_matrix.loc[aa1, aa2],
        'AvgDistance': distance_matrix.loc[aa1, aa2],
        'Interaction': 1,  # Positive sample
    }
    for aa1, aa2, _ in interaction_pairs
]

# Create negative samples (non-interacting pairs)
all_aa = list(affinity_matrix.index)
existing_pairs = set([(p[0], p[1]) for p in interaction_pairs])

# Every ordered pair of distinct residues never observed in either orientation.
# Enumerating them (instead of rejection sampling without retries) guarantees
# exactly one negative per positive whenever any such pair exists.
candidate_pairs = [
    (aa1, aa2)
    for aa1 in all_aa
    for aa2 in all_aa
    if aa1 != aa2
    and (aa1, aa2) not in existing_pairs
    and (aa2, aa1) not in existing_pairs
]

negative_samples = []
if candidate_pairs:
    rng = np.random.default_rng(SEED)
    # Sample with replacement: there may be far fewer candidates than positives.
    for idx in rng.integers(len(candidate_pairs), size=len(data)):
        aa1, aa2 = candidate_pairs[idx]
        negative_samples.append({
            'AA1': aa1,
            'AA2': aa2,
            'Affinity': 0,
            # NOTE(review): for never-observed pairs distance_matrix holds the
            # fill value (the global maximum), so this feature alone separates
            # negatives from positives.
            'AvgDistance': distance_matrix.loc[aa1, aa2],
            'Interaction': 0
        })
else:
    print("Warning: every residue pair occurs in the data; no negative samples generated")

full_data = data + negative_samples
ml_df = pd.DataFrame(full_data)

# Encode amino acids
le = LabelEncoder()
le.fit(all_aa)
ml_df['AA1_encoded'] = le.transform(ml_df['AA1'])
ml_df['AA2_encoded'] = le.transform(ml_df['AA2'])

# Save ML-ready dataset
ml_df.to_csv('amino_acid_interaction_dataset.csv', index=False)

# Example: Predict potential inhibitors for a target sequence
def predict_inhibitors(target_sequence, top_n=5):
    """
    Rank candidate amino acids by their mean affinity to a target sequence.

    target_sequence: residue strings (e.g., ['ARG', 'LYS', 'ASP']); each one
        is normalised with extract_aa.
    top_n: maximum number of candidates to return.

    Returns a list of (amino_acid, mean_affinity) tuples, highest score first.
    Amino acids that occur in the target itself are left out.
    """
    targets = [extract_aa(item) for item in target_sequence]
    n_targets = len(targets)

    # Sum, per candidate, the affinity from every target residue
    # (repeated target residues count once per occurrence).
    summed = {}
    for target_code in targets:
        for candidate in all_aa:
            summed[candidate] = summed.get(candidate, 0.0) + affinity_matrix.loc[target_code, candidate]

    # Average over the target length and drop residues present in the target.
    excluded = set(targets)
    ranked = [
        (candidate, total / n_targets)
        for candidate, total in summed.items()
        if candidate not in excluded
    ]
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

# Example usage
target = ['LYS', 'ALA', 'LYS', 'ARG', 'ARG', 'CYS', 'ASN', 'ASP', 'ASP']
print(f"\nTop inhibitor candidates for {target}:")
top_candidates = predict_inhibitors(target)
print(top_candidates)

Affinity Matrix (%):
          ALA       ARG       ASN       ASP       C1J       CYS        GLN  \
ALA  8.134851  5.002151  4.208221  5.291564  0.015644  1.286714   4.098713   
ARG  7.902379  4.967563  4.850170  6.827309  0.024714  1.167748   3.978993   
ASN  6.601632  4.816246  5.595435  5.669059  0.018406  0.963249   4.080005   
ASP  7.506658  6.130715  5.126498  4.604971  0.022193  1.287173   3.828229   
C1J  4.545455  4.545455  3.409091  4.545455  0.000000  0.000000  11.363636   

          GLU       GLY       HIS  ...       MET       PHE       PRO  \
ALA  4.732293  4.302085  1.943760  ...  1.849896  5.279831  4.654073   
ARG  6.407167  4.219957  2.069818  ...  1.686747  4.405314  4.442385   
ASN  4.767164  5.129149  2.122830  ...  1.914228  4.846923  5.190502   
ASP  5.043276  4.305371  2.080559  ...  1.925211  4.671549  4.510652   
C1J  4.545455  9.090909  0.000000  ...  0.000000  3.409091  6.818182   

          PTR       SEP       SER        THR       TRP       TYR       VAL  


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from itertools import combinations

# Read the raw residue-residue interaction table
df = pd.read_csv(filepath_or_buffer='/content/aa_interactions.csv')

# Extract amino acid types from residue information
def extract_aa(residue):
    """Return the upper-cased first three characters of ``residue``.

    Args:
        residue: residue label whose first three characters (after removing
            surrounding whitespace) are the amino-acid code.

    Returns:
        The 3-character code in upper case (shorter if the label is shorter).

    Raises:
        TypeError: if ``residue`` is not a string, e.g. NaN from a blank cell.
    """
    if not isinstance(residue, str):
        raise TypeError(
            f"residue must be a string, got {type(residue).__name__}: {residue!r}"
        )
    # Leading whitespace would otherwise be counted as part of the code.
    return residue.strip()[:3].upper()

# Create interaction pairs
# Series.map applies extract_aa element-wise, avoiding the per-row Series that
# iterrows() constructs. Each entry is (code1, code2, distance).
interaction_pairs = list(zip(
    df['Residue1'].map(extract_aa),
    df['Residue2'].map(extract_aa),
    df['Distance'],
))

# Create affinity matrix
# Every residue code seen on either side of a pair, in sorted order.
aa_list = sorted({code for pair in interaction_pairs for code in pair[:2]})
count_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=int)
distance_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=float)
affinity_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=float)

# Tally each contact under both (aa1, aa2) and (aa2, aa1).
# distance_matrix holds running distance sums until the averaging step below.
for aa1, aa2, dist in interaction_pairs:
    for row_aa, col_aa in ((aa1, aa2), (aa2, aa1)):
        count_matrix.loc[row_aa, col_aa] += 1
        distance_matrix.loc[row_aa, col_aa] += dist

# Calculate average distance and affinity percentage
for aa in aa_list:
    row_counts = count_matrix.loc[aa]
    total = row_counts.sum()
    if total > 0:
        # Share of this residue's contacts going to each partner, in percent.
        affinity_matrix.loc[aa] = row_counts / total * 100
        # Sum / count; partners with zero contacts give 0/0 = NaN, filled below.
        distance_matrix.loc[aa] = distance_matrix.loc[aa] / row_counts

# Fill NaN values with max distance
largest_average = distance_matrix.max().max()
distance_matrix = distance_matrix.fillna(largest_average)

print("Affinity Matrix (%):", affinity_matrix.head(), sep="\n")
print("\nAverage Distance Matrix (Å):", distance_matrix.head(), sep="\n")

# Save matrices
for matrix, filename in (
    (affinity_matrix, 'amino_acid_affinity_matrix.csv'),
    (distance_matrix, 'amino_acid_distance_matrix.csv'),
):
    matrix.to_csv(filename)

# Prepare dataset for ML model
SEED = 42  # fixed seed so the sampled negatives are identical on every run

# One positive row per observed contact.
data = [
    {
        'AA1': aa1,
        'AA2': aa2,
        'Affinity': affinity_matrix.loc[aa1, aa2],
        'AvgDistance': distance_matrix.loc[aa1, aa2],
        'Interaction': 1,  # Positive sample
    }
    for aa1, aa2, _ in interaction_pairs
]

# Create negative samples (non-interacting pairs)
all_aa = list(affinity_matrix.index)
existing_pairs = set([(p[0], p[1]) for p in interaction_pairs])

# Every ordered pair of distinct residues never observed in either orientation.
# Enumerating them (instead of rejection sampling without retries) guarantees
# exactly one negative per positive whenever any such pair exists.
candidate_pairs = [
    (aa1, aa2)
    for aa1 in all_aa
    for aa2 in all_aa
    if aa1 != aa2
    and (aa1, aa2) not in existing_pairs
    and (aa2, aa1) not in existing_pairs
]

negative_samples = []
if candidate_pairs:
    rng = np.random.default_rng(SEED)
    # Sample with replacement: there may be far fewer candidates than positives.
    for idx in rng.integers(len(candidate_pairs), size=len(data)):
        aa1, aa2 = candidate_pairs[idx]
        negative_samples.append({
            'AA1': aa1,
            'AA2': aa2,
            'Affinity': 0,
            # NOTE(review): for never-observed pairs distance_matrix holds the
            # fill value (the global maximum), so this feature alone separates
            # negatives from positives.
            'AvgDistance': distance_matrix.loc[aa1, aa2],
            'Interaction': 0
        })
else:
    print("Warning: every residue pair occurs in the data; no negative samples generated")

full_data = data + negative_samples
ml_df = pd.DataFrame(full_data)

# Encode amino acids
le = LabelEncoder()
le.fit(all_aa)
ml_df['AA1_encoded'] = le.transform(ml_df['AA1'])
ml_df['AA2_encoded'] = le.transform(ml_df['AA2'])

# Save ML-ready dataset
ml_df.to_csv('amino_acid_interaction_dataset.csv', index=False)

# Example: Predict potential inhibitors for a target sequence
def predict_inhibitors(target_sequence, top_n=5):
    """
    Rank candidate amino acids by their mean affinity to a target sequence.

    target_sequence: residue strings (e.g., ['ARG', 'LYS', 'ASP']); each one
        is normalised with extract_aa.
    top_n: maximum number of candidates to return.

    Returns a list of (amino_acid, mean_affinity) tuples, highest score first.
    Amino acids that occur in the target itself are left out.
    """
    targets = [extract_aa(item) for item in target_sequence]
    n_targets = len(targets)

    # Sum, per candidate, the affinity from every target residue
    # (repeated target residues count once per occurrence).
    summed = {}
    for target_code in targets:
        for candidate in all_aa:
            summed[candidate] = summed.get(candidate, 0.0) + affinity_matrix.loc[target_code, candidate]

    # Average over the target length and drop residues present in the target.
    excluded = set(targets)
    ranked = [
        (candidate, total / n_targets)
        for candidate, total in summed.items()
        if candidate not in excluded
    ]
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

# Example usage
target = ['LYS', 'ALA', 'LYS', 'ARG', 'ARG', 'CYS', 'ASN', 'ASP', 'ASP']
print(f"\nTop inhibitor candidates for {target}:")
top_candidates = predict_inhibitors(target)
print(top_candidates)

Affinity Matrix (%):
          ALA       ARG       ASN       ASP       C1J       CYS        GLN  \
ALA  8.134851  5.002151  4.208221  5.291564  0.015644  1.286714   4.098713   
ARG  7.902379  4.967563  4.850170  6.827309  0.024714  1.167748   3.978993   
ASN  6.601632  4.816246  5.595435  5.669059  0.018406  0.963249   4.080005   
ASP  7.506658  6.130715  5.126498  4.604971  0.022193  1.287173   3.828229   
C1J  4.545455  4.545455  3.409091  4.545455  0.000000  0.000000  11.363636   

          GLU       GLY       HIS  ...       MET       PHE       PRO  \
ALA  4.732293  4.302085  1.943760  ...  1.849896  5.279831  4.654073   
ARG  6.407167  4.219957  2.069818  ...  1.686747  4.405314  4.442385   
ASN  4.767164  5.129149  2.122830  ...  1.914228  4.846923  5.190502   
ASP  5.043276  4.305371  2.080559  ...  1.925211  4.671549  4.510652   
C1J  4.545455  9.090909  0.000000  ...  0.000000  3.409091  6.818182   

          PTR       SEP       SER        THR       TRP       TYR       VAL  


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from itertools import combinations

# Read the raw residue-residue interaction table
df = pd.read_csv(filepath_or_buffer='/content/aa_interactions.csv')

# Extract amino acid types from residue information
def extract_aa(residue):
    """Return the upper-cased first three characters of ``residue``.

    Args:
        residue: residue label whose first three characters (after removing
            surrounding whitespace) are the amino-acid code.

    Returns:
        The 3-character code in upper case (shorter if the label is shorter).

    Raises:
        TypeError: if ``residue`` is not a string, e.g. NaN from a blank cell.
    """
    if not isinstance(residue, str):
        raise TypeError(
            f"residue must be a string, got {type(residue).__name__}: {residue!r}"
        )
    # Leading whitespace would otherwise be counted as part of the code.
    return residue.strip()[:3].upper()

# Create interaction pairs
# Series.map applies extract_aa element-wise, avoiding the per-row Series that
# iterrows() constructs. Each entry is (code1, code2, distance).
interaction_pairs = list(zip(
    df['Residue1'].map(extract_aa),
    df['Residue2'].map(extract_aa),
    df['Distance'],
))

# Create affinity matrix
# Every residue code seen on either side of a pair, in sorted order.
aa_list = sorted({code for pair in interaction_pairs for code in pair[:2]})
count_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=int)
distance_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=float)
affinity_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=float)

# Tally each contact under both (aa1, aa2) and (aa2, aa1).
# distance_matrix holds running distance sums until the averaging step below.
for aa1, aa2, dist in interaction_pairs:
    for row_aa, col_aa in ((aa1, aa2), (aa2, aa1)):
        count_matrix.loc[row_aa, col_aa] += 1
        distance_matrix.loc[row_aa, col_aa] += dist

# Calculate average distance and affinity percentage
for aa in aa_list:
    row_counts = count_matrix.loc[aa]
    total = row_counts.sum()
    if total > 0:
        # Share of this residue's contacts going to each partner, in percent.
        affinity_matrix.loc[aa] = row_counts / total * 100
        # Sum / count; partners with zero contacts give 0/0 = NaN, filled below.
        distance_matrix.loc[aa] = distance_matrix.loc[aa] / row_counts

# Fill NaN values with max distance
largest_average = distance_matrix.max().max()
distance_matrix = distance_matrix.fillna(largest_average)

print("Affinity Matrix (%):", affinity_matrix.head(), sep="\n")
print("\nAverage Distance Matrix (Å):", distance_matrix.head(), sep="\n")

# Save matrices
for matrix, filename in (
    (affinity_matrix, 'amino_acid_affinity_matrix.csv'),
    (distance_matrix, 'amino_acid_distance_matrix.csv'),
):
    matrix.to_csv(filename)

# Prepare dataset for ML model
SEED = 42  # fixed seed so the sampled negatives are identical on every run

# One positive row per observed contact.
data = [
    {
        'AA1': aa1,
        'AA2': aa2,
        'Affinity': affinity_matrix.loc[aa1, aa2],
        'AvgDistance': distance_matrix.loc[aa1, aa2],
        'Interaction': 1,  # Positive sample
    }
    for aa1, aa2, _ in interaction_pairs
]

# Create negative samples (non-interacting pairs)
all_aa = list(affinity_matrix.index)
existing_pairs = set([(p[0], p[1]) for p in interaction_pairs])

# Every ordered pair of distinct residues never observed in either orientation.
# Enumerating them (instead of rejection sampling without retries) guarantees
# exactly one negative per positive whenever any such pair exists.
candidate_pairs = [
    (aa1, aa2)
    for aa1 in all_aa
    for aa2 in all_aa
    if aa1 != aa2
    and (aa1, aa2) not in existing_pairs
    and (aa2, aa1) not in existing_pairs
]

negative_samples = []
if candidate_pairs:
    rng = np.random.default_rng(SEED)
    # Sample with replacement: there may be far fewer candidates than positives.
    for idx in rng.integers(len(candidate_pairs), size=len(data)):
        aa1, aa2 = candidate_pairs[idx]
        negative_samples.append({
            'AA1': aa1,
            'AA2': aa2,
            'Affinity': 0,
            # NOTE(review): for never-observed pairs distance_matrix holds the
            # fill value (the global maximum), so this feature alone separates
            # negatives from positives.
            'AvgDistance': distance_matrix.loc[aa1, aa2],
            'Interaction': 0
        })
else:
    print("Warning: every residue pair occurs in the data; no negative samples generated")

full_data = data + negative_samples
ml_df = pd.DataFrame(full_data)

# Encode amino acids
le = LabelEncoder()
le.fit(all_aa)
ml_df['AA1_encoded'] = le.transform(ml_df['AA1'])
ml_df['AA2_encoded'] = le.transform(ml_df['AA2'])

# Save ML-ready dataset
ml_df.to_csv('amino_acid_interaction_dataset.csv', index=False)

# Example: Predict potential inhibitors for a target sequence
def predict_inhibitors(target_sequence, top_n=5):
    """
    Rank candidate amino acids by their mean affinity to a target sequence.

    target_sequence: residue strings (e.g., ['ARG', 'LYS', 'ASP']); each one
        is normalised with extract_aa.
    top_n: maximum number of candidates to return.

    Returns a list of (amino_acid, mean_affinity) tuples, highest score first.
    Amino acids that occur in the target itself are left out.
    """
    targets = [extract_aa(item) for item in target_sequence]
    n_targets = len(targets)

    # Sum, per candidate, the affinity from every target residue
    # (repeated target residues count once per occurrence).
    summed = {}
    for target_code in targets:
        for candidate in all_aa:
            summed[candidate] = summed.get(candidate, 0.0) + affinity_matrix.loc[target_code, candidate]

    # Average over the target length and drop residues present in the target.
    excluded = set(targets)
    ranked = [
        (candidate, total / n_targets)
        for candidate, total in summed.items()
        if candidate not in excluded
    ]
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

# Example usage
target = ['LYS', 'ALA', 'LYS', 'ARG', 'ARG', 'CYS', 'ASN', 'ASP', 'ASP']
print(f"\nTop inhibitor candidates for {target}:")
top_candidates = predict_inhibitors(target)
print(top_candidates)

Affinity Matrix (%):
          ALA       ARG       ASN       ASP       C1J       CYS        GLN  \
ALA  8.134851  5.002151  4.208221  5.291564  0.015644  1.286714   4.098713   
ARG  7.902379  4.967563  4.850170  6.827309  0.024714  1.167748   3.978993   
ASN  6.601632  4.816246  5.595435  5.669059  0.018406  0.963249   4.080005   
ASP  7.506658  6.130715  5.126498  4.604971  0.022193  1.287173   3.828229   
C1J  4.545455  4.545455  3.409091  4.545455  0.000000  0.000000  11.363636   

          GLU       GLY       HIS  ...       MET       PHE       PRO  \
ALA  4.732293  4.302085  1.943760  ...  1.849896  5.279831  4.654073   
ARG  6.407167  4.219957  2.069818  ...  1.686747  4.405314  4.442385   
ASN  4.767164  5.129149  2.122830  ...  1.914228  4.846923  5.190502   
ASP  5.043276  4.305371  2.080559  ...  1.925211  4.671549  4.510652   
C1J  4.545455  9.090909  0.000000  ...  0.000000  3.409091  6.818182   

          PTR       SEP       SER        THR       TRP       TYR       VAL  


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from itertools import combinations

# Read the raw residue-residue interaction table
df = pd.read_csv(filepath_or_buffer='/content/aa_interactions.csv')

# Extract amino acid types from residue information
def extract_aa(residue):
    """Return the upper-cased first three characters of ``residue``.

    Args:
        residue: residue label whose first three characters (after removing
            surrounding whitespace) are the amino-acid code.

    Returns:
        The 3-character code in upper case (shorter if the label is shorter).

    Raises:
        TypeError: if ``residue`` is not a string, e.g. NaN from a blank cell.
    """
    if not isinstance(residue, str):
        raise TypeError(
            f"residue must be a string, got {type(residue).__name__}: {residue!r}"
        )
    # Leading whitespace would otherwise be counted as part of the code.
    return residue.strip()[:3].upper()

# Create interaction pairs
# Series.map applies extract_aa element-wise, avoiding the per-row Series that
# iterrows() constructs. Each entry is (code1, code2, distance).
interaction_pairs = list(zip(
    df['Residue1'].map(extract_aa),
    df['Residue2'].map(extract_aa),
    df['Distance'],
))

# Create affinity matrix
# Every residue code seen on either side of a pair, in sorted order.
aa_list = sorted({code for pair in interaction_pairs for code in pair[:2]})
count_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=int)
distance_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=float)
affinity_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=float)

# Tally each contact under both (aa1, aa2) and (aa2, aa1).
# distance_matrix holds running distance sums until the averaging step below.
for aa1, aa2, dist in interaction_pairs:
    for row_aa, col_aa in ((aa1, aa2), (aa2, aa1)):
        count_matrix.loc[row_aa, col_aa] += 1
        distance_matrix.loc[row_aa, col_aa] += dist

# Calculate average distance and affinity percentage
for aa in aa_list:
    row_counts = count_matrix.loc[aa]
    total = row_counts.sum()
    if total > 0:
        # Share of this residue's contacts going to each partner, in percent.
        affinity_matrix.loc[aa] = row_counts / total * 100
        # Sum / count; partners with zero contacts give 0/0 = NaN, filled below.
        distance_matrix.loc[aa] = distance_matrix.loc[aa] / row_counts

# Fill NaN values with max distance
largest_average = distance_matrix.max().max()
distance_matrix = distance_matrix.fillna(largest_average)

print("Affinity Matrix (%):", affinity_matrix.head(), sep="\n")
print("\nAverage Distance Matrix (Å):", distance_matrix.head(), sep="\n")

# Save matrices
for matrix, filename in (
    (affinity_matrix, 'amino_acid_affinity_matrix.csv'),
    (distance_matrix, 'amino_acid_distance_matrix.csv'),
):
    matrix.to_csv(filename)

# Prepare dataset for ML model
SEED = 42  # fixed seed so the sampled negatives are identical on every run

# One positive row per observed contact.
data = [
    {
        'AA1': aa1,
        'AA2': aa2,
        'Affinity': affinity_matrix.loc[aa1, aa2],
        'AvgDistance': distance_matrix.loc[aa1, aa2],
        'Interaction': 1,  # Positive sample
    }
    for aa1, aa2, _ in interaction_pairs
]

# Create negative samples (non-interacting pairs)
all_aa = list(affinity_matrix.index)
existing_pairs = set([(p[0], p[1]) for p in interaction_pairs])

# Every ordered pair of distinct residues never observed in either orientation.
# Enumerating them (instead of rejection sampling without retries) guarantees
# exactly one negative per positive whenever any such pair exists.
candidate_pairs = [
    (aa1, aa2)
    for aa1 in all_aa
    for aa2 in all_aa
    if aa1 != aa2
    and (aa1, aa2) not in existing_pairs
    and (aa2, aa1) not in existing_pairs
]

negative_samples = []
if candidate_pairs:
    rng = np.random.default_rng(SEED)
    # Sample with replacement: there may be far fewer candidates than positives.
    for idx in rng.integers(len(candidate_pairs), size=len(data)):
        aa1, aa2 = candidate_pairs[idx]
        negative_samples.append({
            'AA1': aa1,
            'AA2': aa2,
            'Affinity': 0,
            # NOTE(review): for never-observed pairs distance_matrix holds the
            # fill value (the global maximum), so this feature alone separates
            # negatives from positives.
            'AvgDistance': distance_matrix.loc[aa1, aa2],
            'Interaction': 0
        })
else:
    print("Warning: every residue pair occurs in the data; no negative samples generated")

full_data = data + negative_samples
ml_df = pd.DataFrame(full_data)

# Encode amino acids
le = LabelEncoder()
le.fit(all_aa)
ml_df['AA1_encoded'] = le.transform(ml_df['AA1'])
ml_df['AA2_encoded'] = le.transform(ml_df['AA2'])

# Save ML-ready dataset
ml_df.to_csv('amino_acid_interaction_dataset.csv', index=False)

# Example: Predict potential inhibitors for a target sequence
def predict_inhibitors(target_sequence, top_n=5):
    """
    Rank candidate amino acids by their mean affinity to a target sequence.

    target_sequence: residue strings (e.g., ['ARG', 'LYS', 'ASP']); each one
        is normalised with extract_aa.
    top_n: maximum number of candidates to return.

    Returns a list of (amino_acid, mean_affinity) tuples, highest score first.
    Amino acids that occur in the target itself are left out.
    """
    targets = [extract_aa(item) for item in target_sequence]
    n_targets = len(targets)

    # Sum, per candidate, the affinity from every target residue
    # (repeated target residues count once per occurrence).
    summed = {}
    for target_code in targets:
        for candidate in all_aa:
            summed[candidate] = summed.get(candidate, 0.0) + affinity_matrix.loc[target_code, candidate]

    # Average over the target length and drop residues present in the target.
    excluded = set(targets)
    ranked = [
        (candidate, total / n_targets)
        for candidate, total in summed.items()
        if candidate not in excluded
    ]
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

# Example usage
target = ['LYS', 'LYS', 'ARG', 'ARG']
print(f"\nTop inhibitor candidates for {target}:")
top_candidates = predict_inhibitors(target)
print(top_candidates)

Affinity Matrix (%):
          ALA       ARG       ASN       ASP       C1J       CYS        GLN  \
ALA  8.134851  5.002151  4.208221  5.291564  0.015644  1.286714   4.098713   
ARG  7.902379  4.967563  4.850170  6.827309  0.024714  1.167748   3.978993   
ASN  6.601632  4.816246  5.595435  5.669059  0.018406  0.963249   4.080005   
ASP  7.506658  6.130715  5.126498  4.604971  0.022193  1.287173   3.828229   
C1J  4.545455  4.545455  3.409091  4.545455  0.000000  0.000000  11.363636   

          GLU       GLY       HIS  ...       MET       PHE       PRO  \
ALA  4.732293  4.302085  1.943760  ...  1.849896  5.279831  4.654073   
ARG  6.407167  4.219957  2.069818  ...  1.686747  4.405314  4.442385   
ASN  4.767164  5.129149  2.122830  ...  1.914228  4.846923  5.190502   
ASP  5.043276  4.305371  2.080559  ...  1.925211  4.671549  4.510652   
C1J  4.545455  9.090909  0.000000  ...  0.000000  3.409091  6.818182   

          PTR       SEP       SER        THR       TRP       TYR       VAL  


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from itertools import combinations

# Read the raw residue-residue interaction table
df = pd.read_csv(filepath_or_buffer='/content/aa_interactions.csv')

# Extract amino acid types from residue information
def extract_aa(residue):
    """Return the upper-cased first three characters of ``residue``.

    Args:
        residue: residue label whose first three characters (after removing
            surrounding whitespace) are the amino-acid code.

    Returns:
        The 3-character code in upper case (shorter if the label is shorter).

    Raises:
        TypeError: if ``residue`` is not a string, e.g. NaN from a blank cell.
    """
    if not isinstance(residue, str):
        raise TypeError(
            f"residue must be a string, got {type(residue).__name__}: {residue!r}"
        )
    # Leading whitespace would otherwise be counted as part of the code.
    return residue.strip()[:3].upper()

# Create interaction pairs
# Series.map applies extract_aa element-wise, avoiding the per-row Series that
# iterrows() constructs. Each entry is (code1, code2, distance).
interaction_pairs = list(zip(
    df['Residue1'].map(extract_aa),
    df['Residue2'].map(extract_aa),
    df['Distance'],
))

# Create affinity matrix
# Every residue code seen on either side of a pair, in sorted order.
aa_list = sorted({code for pair in interaction_pairs for code in pair[:2]})
count_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=int)
distance_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=float)
affinity_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=float)

# Tally each contact under both (aa1, aa2) and (aa2, aa1).
# distance_matrix holds running distance sums until the averaging step below.
for aa1, aa2, dist in interaction_pairs:
    for row_aa, col_aa in ((aa1, aa2), (aa2, aa1)):
        count_matrix.loc[row_aa, col_aa] += 1
        distance_matrix.loc[row_aa, col_aa] += dist

# Calculate average distance and affinity percentage
for aa in aa_list:
    row_counts = count_matrix.loc[aa]
    total = row_counts.sum()
    if total > 0:
        # Share of this residue's contacts going to each partner, in percent.
        affinity_matrix.loc[aa] = row_counts / total * 100
        # Sum / count; partners with zero contacts give 0/0 = NaN, filled below.
        distance_matrix.loc[aa] = distance_matrix.loc[aa] / row_counts

# Fill NaN values with max distance
largest_average = distance_matrix.max().max()
distance_matrix = distance_matrix.fillna(largest_average)

print("Affinity Matrix (%):", affinity_matrix.head(), sep="\n")
print("\nAverage Distance Matrix (Å):", distance_matrix.head(), sep="\n")

# Save matrices
for matrix, filename in (
    (affinity_matrix, 'amino_acid_affinity_matrix.csv'),
    (distance_matrix, 'amino_acid_distance_matrix.csv'),
):
    matrix.to_csv(filename)

# Prepare dataset for ML model
SEED = 42  # fixed seed so the sampled negatives are identical on every run

# One positive row per observed contact.
data = [
    {
        'AA1': aa1,
        'AA2': aa2,
        'Affinity': affinity_matrix.loc[aa1, aa2],
        'AvgDistance': distance_matrix.loc[aa1, aa2],
        'Interaction': 1,  # Positive sample
    }
    for aa1, aa2, _ in interaction_pairs
]

# Create negative samples (non-interacting pairs)
all_aa = list(affinity_matrix.index)
existing_pairs = set([(p[0], p[1]) for p in interaction_pairs])

# Every ordered pair of distinct residues never observed in either orientation.
# Enumerating them (instead of rejection sampling without retries) guarantees
# exactly one negative per positive whenever any such pair exists.
candidate_pairs = [
    (aa1, aa2)
    for aa1 in all_aa
    for aa2 in all_aa
    if aa1 != aa2
    and (aa1, aa2) not in existing_pairs
    and (aa2, aa1) not in existing_pairs
]

negative_samples = []
if candidate_pairs:
    rng = np.random.default_rng(SEED)
    # Sample with replacement: there may be far fewer candidates than positives.
    for idx in rng.integers(len(candidate_pairs), size=len(data)):
        aa1, aa2 = candidate_pairs[idx]
        negative_samples.append({
            'AA1': aa1,
            'AA2': aa2,
            'Affinity': 0,
            # NOTE(review): for never-observed pairs distance_matrix holds the
            # fill value (the global maximum), so this feature alone separates
            # negatives from positives.
            'AvgDistance': distance_matrix.loc[aa1, aa2],
            'Interaction': 0
        })
else:
    print("Warning: every residue pair occurs in the data; no negative samples generated")

full_data = data + negative_samples
ml_df = pd.DataFrame(full_data)

# Encode amino acids
le = LabelEncoder()
le.fit(all_aa)
ml_df['AA1_encoded'] = le.transform(ml_df['AA1'])
ml_df['AA2_encoded'] = le.transform(ml_df['AA2'])

# Save ML-ready dataset
ml_df.to_csv('amino_acid_interaction_dataset.csv', index=False)

# Example: Predict potential inhibitors for a target sequence
def predict_inhibitors(target_sequence, top_n=5):
    """
    Rank candidate amino acids by their mean affinity to a target sequence.

    target_sequence: residue strings (e.g., ['ARG', 'LYS', 'ASP']); each one
        is normalised with extract_aa.
    top_n: maximum number of candidates to return.

    Returns a list of (amino_acid, mean_affinity) tuples, highest score first.
    Amino acids that occur in the target itself are left out.
    """
    targets = [extract_aa(item) for item in target_sequence]
    n_targets = len(targets)

    # Sum, per candidate, the affinity from every target residue
    # (repeated target residues count once per occurrence).
    summed = {}
    for target_code in targets:
        for candidate in all_aa:
            summed[candidate] = summed.get(candidate, 0.0) + affinity_matrix.loc[target_code, candidate]

    # Average over the target length and drop residues present in the target.
    excluded = set(targets)
    ranked = [
        (candidate, total / n_targets)
        for candidate, total in summed.items()
        if candidate not in excluded
    ]
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

# Example usage
target = ['LYS', 'ARG', 'ARG']
print(f"\nTop inhibitor candidates for {target}:")
top_candidates = predict_inhibitors(target)
print(top_candidates)

Affinity Matrix (%):
          ALA       ARG       ASN       ASP       C1J       CYS        GLN  \
ALA  8.134851  5.002151  4.208221  5.291564  0.015644  1.286714   4.098713   
ARG  7.902379  4.967563  4.850170  6.827309  0.024714  1.167748   3.978993   
ASN  6.601632  4.816246  5.595435  5.669059  0.018406  0.963249   4.080005   
ASP  7.506658  6.130715  5.126498  4.604971  0.022193  1.287173   3.828229   
C1J  4.545455  4.545455  3.409091  4.545455  0.000000  0.000000  11.363636   

          GLU       GLY       HIS  ...       MET       PHE       PRO  \
ALA  4.732293  4.302085  1.943760  ...  1.849896  5.279831  4.654073   
ARG  6.407167  4.219957  2.069818  ...  1.686747  4.405314  4.442385   
ASN  4.767164  5.129149  2.122830  ...  1.914228  4.846923  5.190502   
ASP  5.043276  4.305371  2.080559  ...  1.925211  4.671549  4.510652   
C1J  4.545455  9.090909  0.000000  ...  0.000000  3.409091  6.818182   

          PTR       SEP       SER        THR       TRP       TYR       VAL  


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from itertools import combinations

# Read the raw residue-residue interaction table
df = pd.read_csv(filepath_or_buffer='/content/aa_interactions.csv')

# Extract amino acid types from residue information
def extract_aa(residue):
    """Return the upper-cased first three characters of ``residue``.

    Args:
        residue: residue label whose first three characters (after removing
            surrounding whitespace) are the amino-acid code.

    Returns:
        The 3-character code in upper case (shorter if the label is shorter).

    Raises:
        TypeError: if ``residue`` is not a string, e.g. NaN from a blank cell.
    """
    if not isinstance(residue, str):
        raise TypeError(
            f"residue must be a string, got {type(residue).__name__}: {residue!r}"
        )
    # Leading whitespace would otherwise be counted as part of the code.
    return residue.strip()[:3].upper()

# Create interaction pairs
# Series.map applies extract_aa element-wise, avoiding the per-row Series that
# iterrows() constructs. Each entry is (code1, code2, distance).
interaction_pairs = list(zip(
    df['Residue1'].map(extract_aa),
    df['Residue2'].map(extract_aa),
    df['Distance'],
))

# Create affinity matrix
# Every residue code seen on either side of a pair, in sorted order.
aa_list = sorted({code for pair in interaction_pairs for code in pair[:2]})
count_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=int)
distance_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=float)
affinity_matrix = pd.DataFrame(0, index=aa_list, columns=aa_list, dtype=float)

# Tally each contact under both (aa1, aa2) and (aa2, aa1).
# distance_matrix holds running distance sums until the averaging step below.
for aa1, aa2, dist in interaction_pairs:
    for row_aa, col_aa in ((aa1, aa2), (aa2, aa1)):
        count_matrix.loc[row_aa, col_aa] += 1
        distance_matrix.loc[row_aa, col_aa] += dist

# Calculate average distance and affinity percentage
for aa in aa_list:
    row_counts = count_matrix.loc[aa]
    total = row_counts.sum()
    if total > 0:
        # Share of this residue's contacts going to each partner, in percent.
        affinity_matrix.loc[aa] = row_counts / total * 100
        # Sum / count; partners with zero contacts give 0/0 = NaN, filled below.
        distance_matrix.loc[aa] = distance_matrix.loc[aa] / row_counts

# Fill NaN values with max distance
largest_average = distance_matrix.max().max()
distance_matrix = distance_matrix.fillna(largest_average)

print("Affinity Matrix (%):", affinity_matrix.head(), sep="\n")
print("\nAverage Distance Matrix (Å):", distance_matrix.head(), sep="\n")

# Save matrices
for matrix, filename in (
    (affinity_matrix, 'amino_acid_affinity_matrix.csv'),
    (distance_matrix, 'amino_acid_distance_matrix.csv'),
):
    matrix.to_csv(filename)

# Prepare dataset for ML model
SEED = 42  # fixed seed so the sampled negatives are identical on every run

# One positive row per observed contact.
data = [
    {
        'AA1': aa1,
        'AA2': aa2,
        'Affinity': affinity_matrix.loc[aa1, aa2],
        'AvgDistance': distance_matrix.loc[aa1, aa2],
        'Interaction': 1,  # Positive sample
    }
    for aa1, aa2, _ in interaction_pairs
]

# Create negative samples (non-interacting pairs)
all_aa = list(affinity_matrix.index)
existing_pairs = set([(p[0], p[1]) for p in interaction_pairs])

# Every ordered pair of distinct residues never observed in either orientation.
# Enumerating them (instead of rejection sampling without retries) guarantees
# exactly one negative per positive whenever any such pair exists.
candidate_pairs = [
    (aa1, aa2)
    for aa1 in all_aa
    for aa2 in all_aa
    if aa1 != aa2
    and (aa1, aa2) not in existing_pairs
    and (aa2, aa1) not in existing_pairs
]

negative_samples = []
if candidate_pairs:
    rng = np.random.default_rng(SEED)
    # Sample with replacement: there may be far fewer candidates than positives.
    for idx in rng.integers(len(candidate_pairs), size=len(data)):
        aa1, aa2 = candidate_pairs[idx]
        negative_samples.append({
            'AA1': aa1,
            'AA2': aa2,
            'Affinity': 0,
            # NOTE(review): for never-observed pairs distance_matrix holds the
            # fill value (the global maximum), so this feature alone separates
            # negatives from positives.
            'AvgDistance': distance_matrix.loc[aa1, aa2],
            'Interaction': 0
        })
else:
    print("Warning: every residue pair occurs in the data; no negative samples generated")

full_data = data + negative_samples
ml_df = pd.DataFrame(full_data)

# Encode amino acids
le = LabelEncoder()
le.fit(all_aa)
ml_df['AA1_encoded'] = le.transform(ml_df['AA1'])
ml_df['AA2_encoded'] = le.transform(ml_df['AA2'])

# Save ML-ready dataset
ml_df.to_csv('amino_acid_interaction_dataset.csv', index=False)

# Example: Predict potential inhibitors for a target sequence
def predict_inhibitors(target_sequence, top_n=5):
    """
    Rank candidate amino acids by their mean affinity to a target sequence.

    target_sequence: residue strings (e.g., ['ARG', 'LYS', 'ASP']); each one
        is normalised with extract_aa.
    top_n: maximum number of candidates to return.

    Returns a list of (amino_acid, mean_affinity) tuples, highest score first.
    Amino acids that occur in the target itself are left out.
    """
    targets = [extract_aa(item) for item in target_sequence]
    n_targets = len(targets)

    # Sum, per candidate, the affinity from every target residue
    # (repeated target residues count once per occurrence).
    summed = {}
    for target_code in targets:
        for candidate in all_aa:
            summed[candidate] = summed.get(candidate, 0.0) + affinity_matrix.loc[target_code, candidate]

    # Average over the target length and drop residues present in the target.
    excluded = set(targets)
    ranked = [
        (candidate, total / n_targets)
        for candidate, total in summed.items()
        if candidate not in excluded
    ]
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

# Example usage
target = ['LYS', 'ARG']
print(f"\nTop inhibitor candidates for {target}:")
top_candidates = predict_inhibitors(target)
print(top_candidates)

Affinity Matrix (%):
          ALA       ARG       ASN       ASP       C1J       CYS        GLN  \
ALA  8.134851  5.002151  4.208221  5.291564  0.015644  1.286714   4.098713   
ARG  7.902379  4.967563  4.850170  6.827309  0.024714  1.167748   3.978993   
ASN  6.601632  4.816246  5.595435  5.669059  0.018406  0.963249   4.080005   
ASP  7.506658  6.130715  5.126498  4.604971  0.022193  1.287173   3.828229   
C1J  4.545455  4.545455  3.409091  4.545455  0.000000  0.000000  11.363636   

          GLU       GLY       HIS  ...       MET       PHE       PRO  \
ALA  4.732293  4.302085  1.943760  ...  1.849896  5.279831  4.654073   
ARG  6.407167  4.219957  2.069818  ...  1.686747  4.405314  4.442385   
ASN  4.767164  5.129149  2.122830  ...  1.914228  4.846923  5.190502   
ASP  5.043276  4.305371  2.080559  ...  1.925211  4.671549  4.510652   
C1J  4.545455  9.090909  0.000000  ...  0.000000  3.409091  6.818182   

          PTR       SEP       SER        THR       TRP       TYR       VAL  
