# Traditional ML models

In [3]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

import statsmodels.api as sm
from statsmodels.regression import linear_model
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error


import os
import subprocess
import sys

sys.path.append('../..')
from src import utils



SyntaxError: invalid syntax (3841773743.py, line 17)

In [None]:
class Trainer():
    '''Hold a dataset together with a scikit-learn regressor chosen by name.

    Parameters
    ----------
    data :
        Dataset to associate with the trainer. It is stored unchanged on
        ``self.data``; this class does not inspect it.
    modelname : str
        One of ``'ridge'``, ``'svr'``, ``'rforest'`` or ``'knr'``.

    Raises
    ------
    ValueError
        If ``modelname`` is not one of the recognized names.
    '''

    def __init__(self, data, modelname='ridge'):

        self.data = data
        self.modelname = modelname
        self.get_model()

    def get_model(self):
        '''Instantiate the regressor for ``self.modelname``.

        The new estimator is stored on ``self.model`` and also returned.
        '''

        if self.modelname == 'ridge':
            model = Ridge()
        elif self.modelname == 'svr':
            model = SVR()
        elif self.modelname == 'rforest':
            # n_jobs=-1 lets the forest use all available cores.
            model = RandomForestRegressor(n_jobs=-1)
        elif self.modelname == 'knr':
            model = KNeighborsRegressor()
        else:
            raise ValueError(f'Unrecognized model: {self.modelname}')
        self.model = model

        return self.model


In [None]:

# Names of the sample-weighting schemes. Each name is passed to
# trainutils.get_sample_weights() when the data is loaded and is also searched
# over as the 'weight_type' hyperparameter during model selection.
# Variant that additionally includes the 'None' scheme (currently disabled):
#weight_types = ['None', 'bin_inv', 'bin_inv_sqrt', 'LDS_inv', 'LDS_inv_sqrt', 'LDS_extreme']
weight_types = ['bin_inv', 'bin_inv_sqrt', 'LDS_inv', 'LDS_inv_sqrt', 'LDS_extreme']


def get_target_data(path=None):
    '''Load the pHopt table and organize it by data split.

    Parameters
    ----------
    path : str or None
        Location of the CSV file. When None, "data/pHopt_data.csv" is used.
        The file must provide the columns 'Split', 'Accession', 'Sequence'
        and 'pHopt'; its first column is read as the index.

    Returns
    -------
    dict
        Maps each of 'Training', 'Validation' and 'Testing' to a dict with
        the keys 'accession', 'sequence', 'y' (the pHopt values) and
        'weights' (one entry per name in ``weight_types``).
    '''

    if path is None:
        path = "data/pHopt_data.csv"
    df = pd.read_csv(path, index_col=0)

    data = {}
    for key in ('Training', 'Validation', 'Testing'):
        dfsel = df[df['Split'] == key]
        keydata = {
            'accession': dfsel['Accession'].values,
            'sequence': dfsel['Sequence'].values,
            'y': dfsel['pHopt'].values,
        }
        # Weights are derived from this split's own targets. The outer `data`
        # dict is keyed by split name and has no 'y' entry.
        # NOTE(review): `trainutils` is not among the imports visible in this
        # notebook - confirm where it is imported from.
        keydata['weights'] = {
            name: trainutils.get_sample_weights(keydata['y'], name)
            for name in weight_types
        }
        data[key] = keydata

    return data
    










def _make_sklearn_model(modelname):
    '''Return a default-configured regressor for the given short model name.'''
    if modelname == 'ridge':
        return Ridge()
    if modelname == 'svr':
        return SVR()
    if modelname == 'rforest':
        return RandomForestRegressor(n_jobs=-1)
    if modelname == 'knr':
        return KNeighborsRegressor()
    raise ValueError(f'Unrecognized model: {modelname}')


def train_sklearn_model(data, modelname):
    '''Search hyperparameters for a scikit-learn regressor and evaluate it.

    Parameters
    ----------
    data : dict
        Maps 'Training', 'Validation' and 'Testing' to dicts that provide a
        feature matrix under 'X', targets under 'y' and, under 'weights', a
        dict of sample weights keyed by weight-type name.
    modelname : str
        One of 'ridge', 'svr', 'rforest' or 'knr'.

    Returns
    -------
    dict
        'valperf', 'testperf', 'yvalpred' and 'ytestpred' of the LAST
        hyperparameter setting that was evaluated.

    Raises
    ------
    ValueError
        If ``modelname`` is not recognized, or if no hyperparameter setting
        was sampled.

    Notes
    -----
    NOTE(review): no best-setting selection happens here; doing so needs a
    scalar score from ``trainutils.performance`` - confirm its return type.
    NOTE(review): only 'alpha' is searched. It is a Ridge hyperparameter, so
    ``set_params`` will reject it for the other models, and Ridge itself
    requires a non-negative alpha - were the values meant to be exponents?
    NOTE(review): KNeighborsRegressor.fit does not accept ``sample_weight``.
    '''

    # Fail early on an unknown model name, before any data is touched.
    model = _make_sklearn_model(modelname)

    # Data
    splits = ('Training', 'Validation', 'Testing')
    Xtrain, Xval, Xtest = [data[key]['X'] for key in splits]
    ytrain, yval, ytest = [data[key]['y'] for key in splits]

    # Standardize every split with the training statistics only; the small
    # constant avoids division by zero for constant features.
    means, stds = np.mean(Xtrain, axis=0), np.std(Xtrain, axis=0)
    Xtrain, Xval, Xtest = [(item - means) / (stds + 1e-8) for item in (Xtrain, Xval, Xtest)]

    # Evaluation always uses the 'bin_inv' weights, independent of the weight
    # type used for training, so they are looked up once.
    valweights, testweights = [data[key]['weights']['bin_inv'] for key in ('Validation', 'Testing')]

    # Space
    param_space = {'alpha': np.arange(-8, 8 + 1),
                   'weight_type': weight_types}
    params_list = trainutils.sample_hyperparameters(param_space, n=200)

    # Random/grid search
    results = None
    for params in params_list:

        # Sample weights
        trainweights = data['Training']['weights'][params['weight_type']]

        # 'weight_type' selects the training weights; it is not an estimator
        # parameter and must not reach set_params.
        modelparams = {k: v for k, v in params.items() if k != 'weight_type'}

        # Train and evaluate
        model = model.set_params(**modelparams)
        model = model.fit(Xtrain, ytrain, sample_weight=trainweights)
        yvalpred = model.predict(Xval)
        valperf = trainutils.performance(yval, yvalpred, valweights)
        ytestpred = model.predict(Xtest)
        testperf = trainutils.performance(ytest, ytestpred, testweights)
        results = {'valperf': valperf, 'testperf': testperf,
                   'yvalpred': yvalpred, 'ytestpred': ytestpred}

    if results is None:
        raise ValueError('No hyperparameter settings were sampled')

    return results

In [None]:
def train_SVR(data, weight_type='bin_inv'):
    '''Grid-search SVR hyperparameters and score each setting on validation data.

    Parameters
    ----------
    data : dict
        Maps 'Training' and 'Validation' to dicts that provide a feature
        matrix under 'X', targets under 'y' and, under 'weights', a dict of
        sample weights keyed by weight-type name.
        NOTE(review): this layout mirrors what train_sklearn_model reads -
        confirm it matches the data actually passed in.
    weight_type : str
        Name of the sample weights used for fitting. Validation is always
        scored with the 'bin_inv' weights.

    Returns
    -------
    dict
        'params_list' (every evaluated setting, in grid order), 'rmsew_list'
        (the matching weighted validation RMSE values) and 'best_params'
        (the setting with the lowest weighted validation RMSE).
    '''
    import itertools

    # Read everything from `data` instead of relying on notebook globals.
    Xtrain, Xval = [data[key]['X'] for key in ('Training', 'Validation')]
    ytrain, yval = [data[key]['y'] for key in ('Training', 'Validation')]
    trainweights = data['Training']['weights'][weight_type]
    valweights = data['Validation']['weights']['bin_inv']

    # Same standardization as train_sklearn_model: training statistics only,
    # with a small constant to avoid division by zero.
    means, stds = np.mean(Xtrain, axis=0), np.std(Xtrain, axis=0)
    Xtrain, Xval = [(item - means) / (stds + 1e-8) for item in (Xtrain, Xval)]

    param_space = {'kernel': ['poly', 'rbf'],
                   'gamma': ['scale', 'auto'],
                   'C': 10. ** np.arange(-4, 4)}
    param_grid = list(itertools.product(*param_space.values()))
    param_size = len(param_grid)

    print(f"{param_size} hyperparameter combinations")

    # Train model
    model = SVR()
    params_list, rmsew_list = [], []
    for i, values in enumerate(param_grid):

        # Instantiate model with hyperparameters
        params = dict(zip(param_space.keys(), values))
        model = model.set_params(**params)
        print(f"\n{i+1} of {param_size}")
        print(model)

        # Train/evaluate model with selected parameters
        model = model.fit(Xtrain, ytrain, sample_weight=trainweights)
        ypred = model.predict(Xval)
        rmsew = np.sqrt(mean_squared_error(yval, ypred, sample_weight=valweights))
        params_list.append(params)
        rmsew_list.append(rmsew)
        print(f"rmsew = {round(rmsew, 4)}")

    best_params = params_list[int(np.argmin(rmsew_list))]

    return {'params_list': params_list,
            'rmsew_list': rmsew_list,
            'best_params': best_params}

In [3]:
# Random search over RandomForestRegressor hyperparameters, scored by weighted
# RMSE on the validation split.
# NOTE(review): Xtrain, ytrain, trainweights, Xval, yval and valweights are not
# defined in this cell; they must already exist in the notebook session.
import itertools

# Define hyperparameter space
param_space = {'n_estimators': [10, 20, 50, 100, 200, 500, 1000],
               # 'squared_error' replaces the earlier 'mse' setting; MAE was too slow.
               'criterion': ['squared_error'],
               'max_features': [0.25, 0.5, 0.75, None],
               'max_samples': [0.25, 0.5, 0.75, None],
               'max_depth': [5, 10, None]}
param_grid = list(itertools.product(*param_space.values()))
param_size = len(param_grid)
print(f"{param_size} hyperparameter combinations")

# Model with default hyperparameters
model = RandomForestRegressor(n_jobs=-1, random_state=0)

# Empty lists for storing results
params_list = []
rmsew_list = []

#  Random search
niters = 60
if niters > param_size:
    raise ValueError(f"niters ({niters}) exceeds the {param_size} available combinations")

np.random.seed(0)

# Draw distinct grid points up front. Sampling without replacement guarantees
# that no combination is evaluated twice, with no retry loop needed.
selected_indices = np.random.choice(param_size, size=niters, replace=False)

for i, grid_index in enumerate(selected_indices):

    selected_params = dict(zip(param_space.keys(), param_grid[grid_index]))

    # Train/evaluate model with selected parameters
    model = model.set_params(**selected_params)
    model = model.fit(Xtrain, ytrain, sample_weight=trainweights)
    ypred = model.predict(Xval)
    rmsew = np.sqrt(mean_squared_error(yval, ypred, sample_weight=valweights))

    # Store evaluation results
    params_list.append(selected_params.copy())
    rmsew_list.append(rmsew)

    # Print progress
    print(f"Random search: {i+1} of {niters}")
    print(f"params = {selected_params}")
    print(f"rmsew = {rmsew:.4f}")
    print()



NameError: name 'Users' is not defined