In this notebook we adapt the code to train a classical ML classifier on top of a pre-trained protein language model.

In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

Use the following function to load part of our data.

In [None]:
import pandas as pd

def load_sequences(data_csv, n_samples, random_state=None):
    """Load a per-label subsample of sequences from a CSV file.

    The CSV must provide a 'sequence' column and a 'label/fitness' column.
    For every distinct value of 'label/fitness', up to `n_samples` rows are
    drawn without replacement.

    Parameters
    ----------
    data_csv : str or path-like
        Location of the CSV file, passed straight to `pd.read_csv`.
    n_samples : int
        Maximum number of rows to draw per label. A label with fewer rows
        contributes all of its rows instead of raising an error.
    random_state : int or None, optional
        Seed forwarded to `DataFrame.sample`. Pass an integer to get the same
        subsample on every run; the default (None) draws a fresh sample.

    Returns
    -------
    tuple[list, list]
        `(sequences, labels)`: two aligned lists, grouped by label in sorted
        label order. Both lists are empty when the CSV holds no labelled rows.
    """
    df = pd.read_csv(data_csv)

    # Sample whole rows per label so that sequence and label stay paired,
    # capping the draw at the group size (sample() raises if n > len(group)).
    sampled_groups = [
        group.sample(n=min(n_samples, len(group)), random_state=random_state)
        for _, group in df.groupby('label/fitness')
    ]
    if not sampled_groups:
        # pd.concat refuses an empty list; there is simply nothing to return.
        return [], []

    small_df = pd.concat(sampled_groups)

    return small_df['sequence'].tolist(), small_df['label/fitness'].tolist()

Load the pretrained model as usual and obtain our embeddings.

In [None]:
# NOTE(review): EsmModel / EsmTokenizer are not imported in the visible cells
# (they are provided by the `transformers` package), and `sequences` is not
# defined here either; presumably it comes from load_sequences(...). Confirm
# both are available before this cell runs on a fresh kernel.
model_name = "facebook/esm2_t6_8M_UR50D"
pretrained_model = EsmModel.from_pretrained(model_name)
tokenizer = EsmTokenizer.from_pretrained(model_name)

# Tokenize. padding=True pads every sequence to the longest one in the batch;
# without it, sequences of different lengths cannot be stacked into a tensor.
inputs = tokenizer(sequences, return_tensors='pt', padding=True)

# Get embeddings. This is inference only, so gradient tracking is disabled.
with torch.no_grad():
    outputs = pretrained_model(**inputs)

# Per-token embeddings: (n_sequences, n_tokens, hidden_size).
embeddings = outputs.last_hidden_state # Select the appropriate output

# scikit-learn classifiers need one fixed-length vector per sequence, so
# average the token embeddings, using the attention mask to ignore padding.
# Special tokens added by the tokenizer are included in the average.
token_mask = inputs['attention_mask'].unsqueeze(-1).to(embeddings.dtype)
token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
sequence_embeddings = ((embeddings * token_mask).sum(dim=1) / token_counts).numpy()

A random forest model

In [None]:
def train_random_forest(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split and report test metrics.

    The classifier is built with 100 trees, a fixed seed (42) and all
    available CPU cores. After fitting, the accuracy and the classification
    report for the test split are printed.

    Returns the fitted RandomForestClassifier.
    """
    forest_settings = {
        'n_estimators': 100,
        'random_state': 42,
        'n_jobs': -1,  # -1 means: use every available core
    }
    forest = RandomForestClassifier(**forest_settings)
    forest.fit(X_train, y_train)

    # Score the held-out split
    test_predictions = forest.predict(X_test)

    print("Random Forest Results:")
    print("Accuracy:", accuracy_score(y_test, test_predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, test_predictions))

    return forest

An SVM model

def train_svm(X_train, X_test, y_train, y_test):
    """Fit an RBF-kernel SVM on the training split and report test metrics.

    The features are used exactly as passed in; no scaling happens here.
    Other kernels worth trying are 'linear', 'poly' and 'sigmoid'. After
    fitting, the accuracy and the classification report for the test split
    are printed.

    Returns the fitted SVC.
    """
    classifier = SVC(kernel='rbf', random_state=42)
    classifier.fit(X_train, y_train)

    # Score the held-out split
    test_predictions = classifier.predict(X_test)

    print("SVM Results:")
    print("Accuracy:", accuracy_score(y_test, test_predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, test_predictions))

    return classifier

Hyperparameter optimization

In [None]:
from sklearn.model_selection import GridSearchCV

# NOTE(review): X_train / y_train are not defined in the visible cells;
# presumably they come from train_test_split on the embeddings and labels.
# Confirm they exist before running this cell on a fresh kernel.

# For Random Forest
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Seed the forest (same seed as train_random_forest) so that the search gives
# the same winner on every run; n_jobs=-1 spreads the fits across all cores.
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    cv=5,
    n_jobs=-1
)
rf_grid.fit(X_train, y_train)
print("Best RF parameters:", rf_grid.best_params_)

# For SVM
# The linear kernel ignores gamma, so crossing it with every gamma value would
# refit identical linear models. One grid per kernel avoids that wasted work.
svm_params = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto', 0.1, 1]
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10]
    }
]

svm_grid = GridSearchCV(SVC(random_state=42), svm_params, cv=5, n_jobs=-1)
svm_grid.fit(X_train, y_train)
print("Best SVM parameters:", svm_grid.best_params_)