# Random forest model training

Input datasets:
- training_data_no_filter.csv
- training_data_MS_filter.csv
- training_data_MS_iso_filter.csv

This notebook includes:
- Results of hyperparameter grid search
- Training and testing of random forest classifiers (one set using all features, one set using sequence-based features only)
- Feature importance analysis

## Import libraries

In [1]:
import gzip
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import scikitplot as skplt
import seaborn as sns
import os

from imblearn.under_sampling import RandomUnderSampler
from matplotlib import patches
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from matplotlib_venn import venn2
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

## Define paths

In [2]:
# Both folders sit directly under the parent of the current working directory.
_parent_dir = os.path.dirname(os.getcwd())
Data_path = _parent_dir + '/Data'
Model_path = _parent_dir + '/Models'

## Import dataset

### Feature datasets

In [3]:
# Read the three comma-separated training tables in a fixed order:
# no filter, MS filter, MS + isolation-method filter.
df_unfiltered, df_MS_filter, df_MS_iso_filter = (
    pd.read_csv(Data_path + csv_name, sep=',')
    for csv_name in (
        '/training/training_data_no_filter.csv',
        '/training/training_data_MS_filter.csv',
        '/training/training_data_MS_iso_filter.csv',
    )
)

## Create Random Forest models

In [4]:
# Columns treated as continuous; this list is the default `features_cont`
# of split_and_scale, i.e. the columns that get scaled.
continuous = (
    # whole-sequence descriptors
    ['length', 'hydr_count', 'polar_count', 'molecular_weight', 'helix', 'turn', 'sheet']
    # single-letter columns 'A' ... 'Y' (presumably per-residue values)
    + list('ACDEFGHIKLMNPQRSTVWY')
    + ['thsa_netsurfp2', 'tasa_netsurfp2', 'rhsa_netsurfp2', 'disorder']
    # matching '<letter>_exposed' columns, in the same letter order
    + [residue + '_exposed' for residue in 'ACDEFGHIKLMNPQRSTVWY']
    + ['Probability_solubility', 'Aggregation_propensity', 'Aromaticity',
       'Instability_index', 'Gravy', 'Isoelectric_point', 'Charge_at_7',
       'Charge_at_5', 'Polar_exposed', 'Hydrophobic_exposed']
)

In [5]:
def preprocess(df):
    """Separate predictors from the response and balance the classes.

    The "EV" column is the response. The predictors are every column of
    `df` except "id" and "EV". The pair is then passed through a
    RandomUnderSampler seeded with random_state=0.

    Returns the resampled predictors and the resampled response, in that order.
    """
    response = df["EV"]
    predictors = df.drop(columns=["id", "EV"])

    # undersample the majority class (fixed seed)
    sampler = RandomUnderSampler(random_state=0)
    X_balanced, y_balanced = sampler.fit_resample(predictors, response)
    return X_balanced, y_balanced


def split_and_scale(X_balanced, y_balanced, features_cont=continuous, scaler=None):
    """Split the balanced data 80/20 and scale the continuous columns.

    Parameters
    ----------
    X_balanced, y_balanced
        Predictors and response, as returned by `preprocess`.
    features_cont
        Column names to scale; all other columns are left as they are.
    scaler
        Scaler exposing `fit_transform` / `transform`. When omitted, a new
        `RobustScaler` is created for this call.

    Returns
    -------
    train_X_scaled, train_y, test_X_scaled, test_y, scaler
        The scaled splits, their responses, and the scaler fitted on the
        training split.
    """
    # A scaler built in the signature would be created once, at definition
    # time, and shared by every call that omits the argument. Each later call
    # would then refit the very object returned by earlier calls, so separately
    # named results (e.g. scaler_1, scaler_2, scaler_3) would all alias one
    # scaler fitted on the last dataset. Build a fresh one per call instead.
    if scaler is None:
        scaler = RobustScaler()

    # split 80% training and 20% test, stratified on the response, fixed seed
    train_X, test_X, train_y, test_y = train_test_split(
        X_balanced, y_balanced, test_size=0.2, random_state=0, stratify=y_balanced)

    # fit the scaler on the training split only, then apply it to both splits;
    # work on copies so the unscaled splits are not modified
    train_X_scaled = train_X.copy()
    test_X_scaled = test_X.copy()
    train_X_scaled[features_cont] = scaler.fit_transform(train_X[features_cont])
    test_X_scaled[features_cont] = scaler.transform(test_X[features_cont])

    print("Size of training set:", len(train_X_scaled))
    print("Size of test set:", len(test_X_scaled))

    return train_X_scaled, train_y, test_X_scaled, test_y, scaler

### Preprocess data

In [6]:
# Dataset 1: no filtering
print("Dataset with no filtering")
X_balanced_1, y_balanced_1 = preprocess(df_unfiltered)
(train_X_1, train_y_1,
 test_X_1, test_y_1,
 scaler_1) = split_and_scale(X_balanced_1, y_balanced_1)

print("---------------------")

# Dataset 2: MS filtering
print("Dataset with MS filtering")
X_balanced_2, y_balanced_2 = preprocess(df_MS_filter)
(train_X_2, train_y_2,
 test_X_2, test_y_2,
 scaler_2) = split_and_scale(X_balanced_2, y_balanced_2)

print("---------------------")

# Dataset 3: MS and isolation method filtering
print("Dataset with MS and isolation method filtering")
X_balanced_3, y_balanced_3 = preprocess(df_MS_iso_filter)
(train_X_3, train_y_3,
 test_X_3, test_y_3,
 scaler_3) = split_and_scale(X_balanced_3, y_balanced_3)

Dataset with no filtering
Size of training set: 13452
Size of test set: 3364
---------------------
Dataset with MS filtering
Size of training set: 8817
Size of test set: 2205
---------------------
Dataset with MS and isolation method filtering
Size of training set: 9544
Size of test set: 2386


### Train the random forest models (all features)

In [7]:
# The three forests share one set of hyperparameters and one seed;
# only the training data differs.
rf_hyperparams = dict(random_state=0, n_estimators=1000, max_features=10)

# no filtering
rf_1 = RandomForestClassifier(**rf_hyperparams)
rf_1.fit(train_X_1, train_y_1)

# MS filtering
rf_2 = RandomForestClassifier(**rf_hyperparams)
rf_2.fit(train_X_2, train_y_2)

# MS and isolation method filtering
rf_3 = RandomForestClassifier(**rf_hyperparams)
rf_3.fit(train_X_3, train_y_3)

KeyboardInterrupt: 

### Save models (all features)

In [12]:
# Write each forest as a gzip-compressed pickle under Model_path.
# The handle is opened in a `with` block so it is closed even when
# pickling or writing raises, instead of relying on a trailing close().
with gzip.GzipFile(Model_path + '/EV_RF_model_no_filter.pkl', 'wb') as file:
    file.write(pickle.dumps(rf_1))

with gzip.GzipFile(Model_path + '/EV_RF_model_MS_filter.pkl', 'wb') as file:
    file.write(pickle.dumps(rf_2))

with gzip.GzipFile(Model_path + '/EV_RF_model_MS_iso_filter.pkl', 'wb') as file:
    file.write(pickle.dumps(rf_3))

### Save scaler

In [9]:
# Store the scaler returned for the MS + isolation-method dataset as a plain
# (uncompressed) pickle.
with open(Model_path + '/EV_scaler_MS_iso_filter.pkl', 'wb') as file:
    pickle.dump(scaler_3, file)

### Train the random forest models (sequence-based features)

In [13]:
# use only sequence-based features
seq_features = [
    # whole-sequence descriptors
    'length', 'hydr_count', 'polar_count', 'molecular_weight', 'helix', 'turn', 'sheet',
    # single-letter columns 'A' ... 'Y'
    *'ACDEFGHIKLMNPQRSTVWY',
    'thsa_netsurfp2', 'tasa_netsurfp2', 'rhsa_netsurfp2', 'disorder',
    # matching '<letter>_exposed' columns, in the same letter order
    *(residue + '_exposed' for residue in 'ACDEFGHIKLMNPQRSTVWY'),
    'Probability_solubility', 'Aggregation_propensity', 'Aromaticity', 'Instability_index',
    'Gravy', 'Isoelectric_point', 'Charge_at_7', 'Charge_at_5',
    # '*_MSD' columns
    'PTM_MSD', 'Phosphorylation_MSD', 'Glycosylation_MSD', 'Ubiquitination_MSD',
    'SUMOylation_MSD', 'Acetylation_MSD', 'Palmitoylation_MSD', 'Methylation_MSD',
    'coiled_coil', 'RAS_profile', 'ww_domain', 'EGF', 'RRM',
    'TMHMM', 'Polar_exposed', 'Hydrophobic_exposed',
]

# Restrict each train/test split to the seq_features columns.
train_X_1_seq, test_X_1_seq = train_X_1[seq_features], test_X_1[seq_features]
train_X_2_seq, test_X_2_seq = train_X_2[seq_features], test_X_2[seq_features]
train_X_3_seq, test_X_3_seq = train_X_3[seq_features], test_X_3[seq_features]

In [14]:
# Same hyperparameters and seed for all three forests; each one is fitted on
# the seq_features subset of its training split.
rf_seq_hyperparams = dict(random_state=0, n_estimators=1000, max_features=10)

# no filtering
rf_1_seq = RandomForestClassifier(**rf_seq_hyperparams)
rf_1_seq.fit(train_X_1_seq, train_y_1)

# MS filtering
rf_2_seq = RandomForestClassifier(**rf_seq_hyperparams)
rf_2_seq.fit(train_X_2_seq, train_y_2)

# MS and isolation method filtering
rf_3_seq = RandomForestClassifier(**rf_seq_hyperparams)
rf_3_seq.fit(train_X_3_seq, train_y_3)

RandomForestClassifier(max_features=10, n_estimators=1000, random_state=0)

### Save models (sequence-based features)

In [15]:
# Write each sequence-feature forest as a gzip-compressed pickle under
# Model_path. The handle is opened in a `with` block so it is closed even
# when pickling or writing raises, instead of relying on a trailing close().
with gzip.GzipFile(Model_path + '/EV_RF_model_no_filter_seq.pkl', 'wb') as file:
    file.write(pickle.dumps(rf_1_seq))

with gzip.GzipFile(Model_path + '/EV_RF_model_MS_filter_seq.pkl', 'wb') as file:
    file.write(pickle.dumps(rf_2_seq))

with gzip.GzipFile(Model_path + '/EV_RF_model_MS_iso_filter_seq.pkl', 'wb') as file:
    file.write(pickle.dumps(rf_3_seq))