In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from icecream import ic
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from xgboost import XGBClassifier


In [14]:
# df = pd.read_csv('all_seq722.csv')
df = pd.read_csv('all_seq702.csv')

### ngram

In [15]:
# Re-import necessary libraries after execution state reset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer


# Step 1: Alternative Embedding - k-mer Frequency (simple n-gram approach)
def get_kmer_features(sequences, k=3):
    """Convert sequences into k-mer frequency vectors"""
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(k, k))
    X_kmers = vectorizer.fit_transform(sequences).toarray()
    return X_kmers

X = get_kmer_features(df["Sequences"], k=3)
y = df["AMP"].values

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 3: Baseline Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(kernel="linear", probability=True),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

# Step 4: Model Evaluation
results = []

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    sens = tp / (tp + fn) if (tp + fn) > 0 else 0  # Sensitivity (Recall)
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0  # Specificity

    results.append([model_name, acc, auc, sens, spec])

# Convert results to DataFrame and display
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "AUC", "Sensitivity", "Specificity"])




Parameters: { "use_label_encoder" } are not used.



In [16]:
results_df

Unnamed: 0,Model,Accuracy,AUC,Sensitivity,Specificity
0,Logistic Regression,0.808511,0.889135,0.814286,0.802817
1,Random Forest,0.723404,0.878169,0.957143,0.492958
2,SVM,0.723404,0.874648,0.885714,0.56338
3,XGBoost,0.702128,0.766197,0.471429,0.929577


### onehot

In [17]:
# Fixing the OneHotEncoder issue by using the correct parameter (`sparse=False` for older versions)
from sklearn.preprocessing import OneHotEncoder

def get_onehot_features(sequences):
    """Convert sequences into one-hot encoded vectors."""
    # Flatten all sequences into a set of unique characters
    unique_chars = sorted(set("".join(sequences)))
    
    # Create a mapping from characters to indices
    char_to_index = {char: idx for idx, char in enumerate(unique_chars)}
    
    # Encode each sequence as a list of indices
    indexed_sequences = [[char_to_index[char] for char in seq] for seq in sequences]

    # Pad sequences to the same length (max sequence length)
    max_length = max(len(seq) for seq in sequences)
    padded_sequences = [seq + [len(unique_chars)] * (max_length - len(seq)) for seq in indexed_sequences]  # Padding index
    
    # Convert to NumPy array
    padded_sequences = np.array(padded_sequences)

    # One-Hot Encode with corrected parameter name
    encoder = OneHotEncoder( categories=[list(range(len(unique_chars) + 1))] * max_length)
    X_onehot = encoder.fit_transform(padded_sequences)
    
    return X_onehot

# # Sample DataFrame
# data = {
#     "Sequences": ["WKWLKKWIK", "ILRWKWRWWRWRR", "ILPWKWRWWKWRR", "RWRRKWWWW", "WRKFWKYLK"],
#     "AMP": [1, 1, 1, 1, 1]  # Example data, need to add negatives for proper training
# }
# df = pd.DataFrame(data)

# # Simulate some negative examples (for realistic binary classification)
# negative_sequences = ["MVLSPADKT", "SGRGKQGGKV", "ADEMKRYGQ", "TSLYNRFST", "MGDVEKGKK"]
# negative_labels = [0] * len(negative_sequences)

# df_neg = pd.DataFrame({"Sequences": negative_sequences, "AMP": negative_labels})
# df = pd.concat([df, df_neg], ignore_index=True)

# Apply One-Hot Encoding to Sequences
X = get_onehot_features(df["Sequences"])
y = df["AMP"].values

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 3: Baseline Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(kernel="linear", probability=True)
}

# Step 4: Model Evaluation
results = []

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    sens = tp / (tp + fn) if (tp + fn) > 0 else 0  # Sensitivity (Recall)
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0  # Specificity

    results.append([model_name, acc, auc, sens, spec])

# Convert results to DataFrame and display
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "AUC", "Sensitivity", "Specificity"])

In [18]:
results_df 

Unnamed: 0,Model,Accuracy,AUC,Sensitivity,Specificity
0,Logistic Regression,0.865248,0.932998,0.914286,0.816901
1,Random Forest,0.851064,0.945875,0.9,0.802817
2,SVM,0.858156,0.91006,0.928571,0.788732
