In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### 1- Upload data 

In [2]:
df = pd.read_excel('final_df.xlsx')

In [3]:
# Get number of columns
num_columns = len(df.columns)

# Get column names
column_names = df.columns.tolist()

# Print the results
print(f"🧾 Total columns: {num_columns}")
print("📋 Column names:")
for i, name in enumerate(column_names, start=1):
    print(f"{i}. {name}")

🧾 Total columns: 271
📋 Column names:
1. UniProt ID
2. Gene
3. Mutation
4. Class
5. ANDN920101
6. ARGP820101
7. ARGP820102
8. ARGP820103
9. BEGF750101
10. BEGF750102
11. BEGF750103
12. BHAR880101
13. BIGC670101
14. BIOV880101
15. BIOV880102
16. BROC820101
17. BROC820102
18. BULH740101
19. BULH740102
20. BUNA790101
21. BUNA790102
22. BUNA790103
23. BURA740101
24. BURA740102
25. CHAM810101
26. CHAM820101
27. CHAM820102
28. CHAM830101
29. CHAM830102
30. CHAM830103
31. CHAM830104
32. CHAM830105
33. CHAM830106
34. CHAM830107
35. bf_peptide
36. af_peptide
37. bf_peptide_ratio
38. af_peptide_ratio
39. rsa_-5
40. asa_-5
41. p[q3_H]_-5
42. p[q3_E]_-5
43. p[q3_C]_-5
44. p[q8_G]_-5
45. p[q8_H]_-5
46. p[q8_I]_-5
47. p[q8_B]_-5
48. p[q8_E]_-5
49. p[q8_S]_-5
50. p[q8_T]_-5
51. p[q8_C]_-5
52. phi_-5
53. psi_-5
54. disorder_-5
55. rsa_-4
56. asa_-4
57. p[q3_H]_-4
58. p[q3_E]_-4
59. p[q3_C]_-4
60. p[q8_G]_-4
61. p[q8_H]_-4
62. p[q8_I]_-4
63. p[q8_B]_-4
64. p[q8_E]_-4
65. p[q8_S]_-4
66. p[q8_T]_-4
67. p[


### 2- Clustering data based on secondary structure

In [4]:
# create the columns for the secondary structure
df["secondary_structure"] = None
for i, row in df.iterrows():
    helix_score = (df.at[i, "p[q3_H]_-1"] + df.at[i, "p[q3_H]_1"]) / 2
    strand_score = (df.at[i, "p[q3_E]_-1"] + df.at[i, "p[q3_E]_1"]) / 2
    coil_score = (df.at[i, "p[q3_C]_-1"] + df.at[i, "p[q3_C]_1"]) / 2

    P = max(helix_score, strand_score, coil_score)
    if P == helix_score:
        df.at[i, "secondary_structure"] = "Helix"
    elif P == strand_score:
        df.at[i, "secondary_structure"] = "Strand"
    else:
        df.at[i, "secondary_structure"] = "Coil"


In [5]:
def cluster_data(df):
    """this function take dataframe, and return 3 dataframes, that are filtered by the secondary structure"""
    helix = df[df["secondary_structure"] == "Helix"]
    strand = df[df["secondary_structure"] == "Strand"]
    coil = df[df["secondary_structure"] == "Coil"]
    return helix, strand, coil

### 3- Splitting data 

In [6]:
# === Split into initial train/test ===
training_set, initial_test = train_test_split(
    df, test_size=0.2, stratify=df["Class"], random_state=42
)

# === Filter test set to remove overlaps by (UniProt ID, Position) ===
train_sites = set(zip(training_set["UniProt ID"], training_set["Position"]))
filtered_test = initial_test[~initial_test.apply(
    lambda row: (row["UniProt ID"], row["Position"]) in train_sites, axis=1
)]


test_set = filtered_test.copy()


## 4-Training models:

My data has ealready the column secondary structure.
so will first . train the model, tune the model, use for that 10 folds in cross validation. I want to know the most important features. 
evaluate the modele in the test set , 
or each class in secondary_structure, filter the test set for that class and compute metrics only for those samples.

### SVMs:


In [13]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, matthews_corrcoef, roc_auc_score
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score

import matplotlib.pyplot as plt


# Ensure feature_cols contains only numeric columns
feature_cols = [col for col in training_set.columns if np.issubdtype(training_set[col].dtype, np.number)]

X_train = training_set[feature_cols]
y_train = training_set["Class"]
X_test = test_set[feature_cols]
y_test = test_set["Class"]

# SVM with grid search and 10-fold cross-validation
param_grid = {'C': [0.1, 0.01, 1, 10], 'kernel': ['rbf']}
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid = GridSearchCV(SVC(probability=True), param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best cross-validation accuracy:", grid.best_score_)

# Feature importance for linear SVM
if grid.best_params_['kernel'] == 'linear':
    importances = abs(grid.best_estimator_.coef_[0])
    indices = importances.argsort()[::-1]
    plt.figure(figsize=(10, 6))
    plt.title("Top 20 Feature Importances (SVM)")
    plt.bar(range(20), importances[indices[:20]])
    plt.xticks(range(20), [feature_cols[i] for i in indices[:20]], rotation=90)
    plt.tight_layout()
    plt.show()

# Evaluate on test set
y_pred = grid.predict(X_test)
print("Test set accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Helper to compute metrics for binary classification
def compute_metrics(y_true, y_pred, y_prob=None):
    labels = np.unique(y_true)
    pos_label = labels[1]
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    acc = accuracy_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    if y_prob is not None:
        try:
            au = roc_auc_score(y_true, y_prob[:, list(labels).index(pos_label)])
        except Exception:
            au = np.nan
    else:
        au = np.nan
    return sensitivity, specificity, acc, mcc, au

# Collect results for each secondary_structure class
results = []
for sec_class in test_set['secondary_structure'].unique():
    subset = test_set[test_set['secondary_structure'] == sec_class]
    if len(subset) == 0:
        continue
    X_sub = subset[feature_cols]
    y_sub = subset['Class']
    y_pred_sub = grid.predict(X_sub)
    try:
        y_prob_sub = grid.predict_proba(X_sub)
    except Exception:
        y_prob_sub = None
    sens, spec, acc, mcc, au = compute_metrics(y_sub, y_pred_sub, y_prob_sub)
    results.append({
        'Secondary Structure': sec_class,
        'Sensitivity': sens,
        'Specificity': spec,
        'Accuracy': acc,
        'MCC': mcc,
        'AUROC': au,
        'Samples': len(subset)
    })

# Add overall test set metrics
try:
    y_prob = grid.predict_proba(X_test)
except Exception:
    y_prob = None
sens, spec, acc, mcc, au = compute_metrics(y_test, y_pred, y_prob)
results.append({
    'Secondary Structure': 'Overall',
    'Sensitivity': sens,
    'Specificity': spec,
    'Accuracy': acc,
    'MCC': mcc,
    'AUROC': au,
    'Samples': len(y_test)
})

# Display as DataFrame
results_df = pd.DataFrame(results)
display(results_df)




Best parameters: {'C': 10, 'kernel': 'rbf'}
Best cross-validation accuracy: 0.6471288515406163
Test set accuracy: 0.6532305868405454
              precision    recall  f1-score   support

      Driver       0.64      0.77      0.70      1747
   Passenger       0.68      0.53      0.60      1627

    accuracy                           0.65      3374
   macro avg       0.66      0.65      0.65      3374
weighted avg       0.66      0.65      0.65      3374



Unnamed: 0,Secondary Structure,Sensitivity,Specificity,Accuracy,MCC,AUROC,Samples
0,Coil,0.539535,0.757895,0.650382,0.30507,0.705896,2620
1,Helix,0.501502,0.793187,0.662634,0.309616,0.727779,744
2,Strand,0.75,0.666667,0.7,0.408248,0.708333,10
3,Overall,0.532268,0.765884,0.653231,0.307268,0.711892,3374


### MLP : 

In [None]:
#

### XGBoost:

### Random forests