<a href="https://colab.research.google.com/github/jonglees/transmembrane-practical/blob/main/practical_transmem_ML_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 0: Set things up
Download the training set for a transmembrane predictor
(just run the cells in this section, no changes needed)

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import h5py


In [None]:
"""Download training data"""
!wget https://github.com/jonglees/transmembrane-practical/raw/refs/heads/main/df_tm_surf_train.h5

In [None]:
"""Make training data balanced between transmembrane and non-transmembrane"""
# Load the full training table; the 'tm' column is the binary transmembrane label.
df_train = pd.read_hdf("df_tm_surf_train.h5")

# Split once by label and report the class sizes before balancing.
df_train_pos_tm = df_train[df_train.tm == 1]
df_train_neg_tm = df_train[df_train.tm == 0]
num_pos_tm = df_train_pos_tm.shape[0]
num_neg_tm = df_train_neg_tm.shape[0]
print("number transmembrane", num_pos_tm, "number non transmembrane", num_neg_tm, "before balancing classes")

# Downsample whichever class is larger to the size of the smaller one.
# random_state is fixed so that Restart & Run All always yields the same
# balanced set (and therefore the same downstream split and model).
# Sampling without replacement would raise if asked for more rows than exist,
# hence the explicit branch instead of always sampling the negatives.
if num_neg_tm >= num_pos_tm:
    df_train_neg_tm = df_train_neg_tm.sample(num_pos_tm, random_state=42)
else:
    df_train_pos_tm = df_train_pos_tm.sample(num_neg_tm, random_state=42)

# Balanced frame used by the Task 1 / Task 2 cells.
df_train_tm = pd.concat([df_train_pos_tm, df_train_neg_tm])

# Recount on the balanced frame to confirm both classes now have equal size.
num_pos_tm = df_train_tm[df_train_tm.tm == 1].shape[0]
num_neg_tm = df_train_tm[df_train_tm.tm == 0].shape[0]
print("number transmembrane", num_pos_tm, "number non transmembrane", num_neg_tm, "after balancing classes")

# Task 1: Train and evaluate your own ML method to predict transmembrane proteins
(If you get stuck, you can ask Gemini for help or click Example Answer, but try it yourself first.)

In [None]:
#train and evaluate a random forest classifier using scikit-learn with the df_train_tm data frame, using the embeddings column as inputs and the binary label 'tm' as the output


In [None]:
# @title Example Answer


# Hold out 20% of the balanced transmembrane set for evaluation.
# The embeddings column is turned into a plain list of per-protein vectors.
tm_embeddings = list(df_train_tm['embeddings'])
tm_labels = df_train_tm['tm']
X_train, X_test, y_train, y_test = train_test_split(
    tm_embeddings,
    tm_labels,
    test_size=0.2,
    random_state=42,
)

# Random forest with 50 trees of depth <= 4; you can adjust hyperparameters here
clf_tm = RandomForestClassifier(
    n_estimators=50,
    max_depth=4,
    random_state=42,
)
clf_tm.fit(X_train, y_train)

# Hard 0/1 predictions for the held-out proteins
y_pred = clf_tm.predict(X_test)

# Per-class precision / recall / F1 on the held-out set
tm_report = classification_report(y_test, y_pred)
print(tm_report)

# Task 2: Plot a ROC curve of your data

In [None]:
#write code to plot a ROC curve using scikit-learn on the test set using the trained classifier


In [None]:
# @title Example Answer
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Column 1 of predict_proba holds the score for the positive (tm == 1) class
y_prob = clf_tm.predict_proba(X_test)[:, 1]

# ROC points over all score thresholds, and the area under that curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Draw the classifier's ROC curve together with the diagonal chance line
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()


# Task 3: Train and evaluate your own ML method to predict cell surface proteins

In [6]:
"""First just run this cell if you want a balanced set of positive and negative examples"""
# Split once by the binary cell-surface label and count each class.
df_train_pos_surf = df_train[df_train.surf == 1]
df_train_neg_surf = df_train[df_train.surf == 0]
num_pos_surf = df_train_pos_surf.shape[0]
num_neg_surf = df_train_neg_surf.shape[0]

# Downsample whichever class is larger to the size of the smaller one.
# Sampling without replacement raises a ValueError if more rows are requested
# than exist, so the positives are only sampled when they are the larger class.
# random_state is fixed so a re-run produces the same balanced set.
if num_pos_surf >= num_neg_surf:
    df_train_pos_surf = df_train_pos_surf.sample(num_neg_surf, random_state=42)
else:
    df_train_neg_surf = df_train_neg_surf.sample(num_pos_surf, random_state=42)

# Balanced frame used by the Task 3 cells.
df_train_surf = pd.concat([df_train_pos_surf, df_train_neg_surf])

In [None]:
#train and evaluate a random forest classifier using scikit-learn with the df_train_surf data frame, using the embeddings column as inputs and the binary label 'surf' as the output


In [None]:
# @title Example Answer
# Train and evaluate the cell-surface classifier, then plot its ROC curve on the held-out test set

# Imported in this cell too, so it runs on a fresh kernel even when the
# Task 2 example answer (the only other place these are imported) was skipped.
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Split data into training and testing sets.
# The *_surf names keep the Task 1/2 transmembrane split (X_test, y_test, ...)
# intact, so re-running the Task 2 cell later still evaluates clf_tm on its own data.
X_train_surf, X_test_surf, y_train_surf, y_test_surf = train_test_split(
    list(df_train_surf['embeddings']), df_train_surf['surf'], test_size=0.2, random_state=42
)

# Initialize and train a RandomForestClassifier
clf_surf = RandomForestClassifier(random_state=42,n_estimators=100,max_depth=5)  # You can adjust hyperparameters here
clf_surf.fit(X_train_surf, y_train_surf)

# Make predictions on the test set
y_pred_surf = clf_surf.predict(X_test_surf)

# Evaluate the classifier
print(classification_report(y_test_surf, y_pred_surf))

# Predict probabilities for the positive class (surf == 1)
y_prob_surf = clf_surf.predict_proba(X_test_surf)[:, 1]

# Compute ROC curve and AUC
fpr_surf, tpr_surf, thresholds_surf = roc_curve(y_test_surf, y_prob_surf)
roc_auc_surf = auc(fpr_surf, tpr_surf)

# Plot the ROC curve against the diagonal chance line
plt.figure()
plt.plot(fpr_surf, tpr_surf, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_surf)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# Task 4: Are some human proteins misannotated in the database? Do your ML tools help?
For example, the protein A8MTB9 is not predicted as transmembrane. What do your cell surface and transmembrane classifiers assign this protein as? Does this prediction tool https://services.healthtech.dtu.dk/services/TMHMM-2.0/ agree with your predictions?

In [10]:
# @title Example Answer
# Build the list of embedding vectors once and reuse it for every call below.
# NOTE: df_train includes the rows both classifiers were fitted on, so these
# are not held-out predictions.
human_embeddings = list(df_train['embeddings'])

# Hard 0/1 calls from each classifier
df_train['surf_pred'] = clf_surf.predict(human_embeddings)
df_train['tm_pred'] = clf_tm.predict(human_embeddings)

# Probability of the positive class (column 1 of predict_proba)
surf_proba = clf_surf.predict_proba(human_embeddings)
tm_proba = clf_tm.predict_proba(human_embeddings)
df_train['surf_pred_prob'] = surf_proba[:, 1]
df_train['tm_pred_prob'] = tm_proba[:, 1]

# Show the row(s) for the protein of interest
is_query_protein = df_train["accessions"] == "A8MTB9"
print(df_train[is_query_protein])

# Task 5: Can we apply our method to malaria?
Can you run your classifiers on the *P. falciparum* (malaria) proteome? Are there any new targets? For example, what does your classifier make of the malaria protein O96274?

In [None]:
"""First download the malaria proteome embeddings"""
!wget https://github.com/jonglees/transmembrane-practical/raw/refs/heads/main/df_falcip.h5
df_falcip=pd.read_hdf("df_falcip.h5")

In [None]:
# @title Example answer
# Build the list of embedding vectors once and reuse it for every call below
falcip_embeddings = list(df_falcip['embeddings'])

# Hard 0/1 calls from the classifiers trained in Tasks 1 and 3
df_falcip['surf_pred'] = clf_surf.predict(falcip_embeddings)
df_falcip['tm_pred'] = clf_tm.predict(falcip_embeddings)

# Probability of the positive class (column 1 of predict_proba)
falcip_surf_proba = clf_surf.predict_proba(falcip_embeddings)
falcip_tm_proba = clf_tm.predict_proba(falcip_embeddings)
df_falcip['surf_pred_prob'] = falcip_surf_proba[:, 1]
df_falcip['tm_pred_prob'] = falcip_tm_proba[:, 1]

# Last expression of the cell: display the row(s) for the protein of interest
df_falcip[df_falcip["accessions"] == "O96274"]