# Test importance features random forest model
In this notebook the importance of the features used in the final model is tested, so no new features are added. Experiments that add new, previously unused features can be found in test_random_forest_with_49_features.

In [1]:
import os
from ms2query.utils import load_pickled_file
from matplotlib import pyplot as plt
# Location of the pickled training/validation data. Set the
# MS2QUERY_TRAINING_DATA environment variable to point at a local copy;
# the fallback is the original author's machine-specific path.
training_data_file = os.environ.get(
    "MS2QUERY_TRAINING_DATA",
    "C:/Users/jonge094/PycharmProjects/PhD_MS2Query/ms2query/data/libraries_and_models/gnps_15_12_2021/ms2q_training_data_with_additional_weigthing_scores.pickle")

# The file is unpacked into four objects: feature scores and labels for the
# training set, followed by feature scores and labels for the validation set.
# NOTE(review): load_pickled_file presumably unpickles the file; only load
# files from a trusted source, since unpickling can execute arbitrary code.
training_scores, training_labels, validation_scores, validation_labels = load_pickled_file(training_data_file)



In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def _as_1d_target(labels):
    """Flatten single-column 2-D labels to 1-D; return anything else unchanged."""
    values = getattr(labels, "values", labels)
    if getattr(values, "ndim", None) == 2 and values.shape[1] == 1:
        return values.ravel()
    return labels


def train_random_forest(selection_of_training_scores, selection_of_validation_scores):
    """Fit a random forest on the selected features and report its performance.

    The regression targets are not passed in: they are read from the
    notebook-level variables ``training_labels`` and ``validation_labels``.
    The function prints the training MSE, the validation MSE and the feature
    importances (highest first).

    Args:
        selection_of_training_scores: Feature table used to fit the model.
            Must expose ``.columns`` (used to name the importances).
        selection_of_validation_scores: Feature table, with the same columns,
            used to compute the validation MSE.

    Returns:
        None. All results are printed.
    """
    # The saved cell outputs show a warning pointing at the rf.fit line,
    # presumably sklearn's complaint about a column-vector y. Single-column
    # targets are therefore flattened to 1-D before they are used.
    train_targets = _as_1d_target(training_labels)
    validation_targets = _as_1d_target(validation_labels)

    # Fixed hyperparameters (noted by the original author as optimised).
    rf = RandomForestRegressor(n_estimators=250,
                               random_state=42,
                               max_depth=5,
                               verbose=1,
                               min_samples_leaf=50,
                               n_jobs=7)
    rf.fit(selection_of_training_scores, train_targets)

    # Error on the data the model was fitted on.
    rf_train_predictions = rf.predict(selection_of_training_scores)
    mse_train_rf = mean_squared_error(train_targets, rf_train_predictions)
    print('Training MSE', mse_train_rf)

    # Error on the held-out validation data.
    rf_predictions = rf.predict(selection_of_validation_scores)
    mse_rf = mean_squared_error(validation_targets, rf_predictions)
    print('Validation MSE', mse_rf)

    # Rank on the raw importances and round only for display, so that features
    # whose importances round to the same value keep their true order.
    ranked_importances = sorted(
        zip(selection_of_training_scores.columns, rf.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True)
    for feature, importance in ranked_importances:
        print('Variable: {:20} Importance: {}'.format(feature, round(importance, 2)))

# Final model

In [3]:
# Final model: train and evaluate on the five features listed below.
subselection_of_features = [
    "query_precursor_mz",
    "precursor_mz_difference",
    "s2v_score",
    "chemical_neighbourhood_no_spectrum_nr_tanimoto_power0",
    "average_tanimoto_score_for_chemical_neighbourhood_score",
]
selection_of_training_scores, selection_of_validation_scores = (
    training_scores[subselection_of_features],
    validation_scores[subselection_of_features],
)
train_random_forest(selection_of_training_scores, selection_of_validation_scores)

  rf.fit(selection_of_training_scores, training_labels)
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    6.6s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:   44.4s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:  1.0min finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.7s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    1.0s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s


Training MSE 0.028182306482397588
Validation MSE 0.025457846784072757
Variable: chemical_neighbourhood_no_spectrum_nr_tanimoto_power0 Importance: 0.62
Variable: precursor_mz_difference Importance: 0.18
Variable: query_precursor_mz   Importance: 0.14
Variable: s2v_score            Importance: 0.05
Variable: average_tanimoto_score_for_chemical_neighbourhood_score Importance: 0.01


[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    0.2s finished


# Show importance of the average over multiple library spectra
To show the importance of this score, one model is trained without it, and another model is trained where only the spectra of 1 library structure are used instead of 10.

In [9]:
# Same features as the final model, but without
# "chemical_neighbourhood_no_spectrum_nr_tanimoto_power0".
subselection_of_features = [
    "query_precursor_mz",
    "precursor_mz_difference",
    "s2v_score",
    "average_tanimoto_score_for_chemical_neighbourhood_score",
]
selection_of_training_scores, selection_of_validation_scores = (
    training_scores[subselection_of_features],
    validation_scores[subselection_of_features],
)
train_random_forest(selection_of_training_scores, selection_of_validation_scores)

  rf.fit(selection_of_training_scores, training_labels)
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    5.0s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:   31.9s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:   44.1s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.8s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    1.1s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s


Training MSE 0.03388704628409759
Validation MSE 0.03305303276407407
Variable: query_precursor_mz   Importance: 0.41
Variable: precursor_mz_difference Importance: 0.29
Variable: s2v_score            Importance: 0.19
Variable: average_tanimoto_score_for_chemical_neighbourhood_score Importance: 0.1


[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    0.2s finished


In [6]:
# Uses "ms2ds_score" in place of
# "chemical_neighbourhood_no_spectrum_nr_tanimoto_power0".
subselection_of_features = [
    "query_precursor_mz",
    "precursor_mz_difference",
    "s2v_score",
    "ms2ds_score",
    "average_tanimoto_score_for_chemical_neighbourhood_score",
]
selection_of_training_scores, selection_of_validation_scores = (
    training_scores[subselection_of_features],
    validation_scores[subselection_of_features],
)
train_random_forest(selection_of_training_scores, selection_of_validation_scores)

  rf.fit(selection_of_training_scores, training_labels)
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:   11.1s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:   53.0s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:  1.2min finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.5s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    0.7s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s


Training MSE 0.03376154641110877
Validation MSE 0.0320487802629392
Variable: query_precursor_mz   Importance: 0.32
Variable: ms2ds_score          Importance: 0.25
Variable: precursor_mz_difference Importance: 0.23
Variable: s2v_score            Importance: 0.12
Variable: average_tanimoto_score_for_chemical_neighbourhood_score Importance: 0.07


[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    0.2s finished


In [5]:
# Adds "average_ms2ds_score_for_inchikey14" on top of "ms2ds_score";
# "chemical_neighbourhood_no_spectrum_nr_tanimoto_power0" is still left out.
subselection_of_features = [
    "query_precursor_mz",
    "precursor_mz_difference",
    "s2v_score",
    "ms2ds_score",
    "average_ms2ds_score_for_inchikey14",
    "average_tanimoto_score_for_chemical_neighbourhood_score",
]
selection_of_training_scores, selection_of_validation_scores = (
    training_scores[subselection_of_features],
    validation_scores[subselection_of_features],
)
train_random_forest(selection_of_training_scores, selection_of_validation_scores)

  rf.fit(selection_of_training_scores, training_labels)
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:   10.4s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:   55.3s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:  1.2min finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.7s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    1.1s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s


Training MSE 0.03264100050590641
Validation MSE 0.030927552294164298
Variable: average_ms2ds_score_for_inchikey14 Importance: 0.38
Variable: query_precursor_mz   Importance: 0.27
Variable: precursor_mz_difference Importance: 0.19
Variable: s2v_score            Importance: 0.14
Variable: ms2ds_score          Importance: 0.02


[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    0.2s finished


# Remove MS2Deepscore

In [7]:
# Feature set for the "Remove MS2Deepscore" experiment.
# NOTE(review): this list is identical to the one in the "Final model" cell,
# so this run reproduces the final model; confirm which feature was meant to
# be removed here.
subselection_of_features = [
    "query_precursor_mz",
    "precursor_mz_difference",
    "s2v_score",
    "chemical_neighbourhood_no_spectrum_nr_tanimoto_power0",
    "average_tanimoto_score_for_chemical_neighbourhood_score",
]
selection_of_training_scores, selection_of_validation_scores = (
    training_scores[subselection_of_features],
    validation_scores[subselection_of_features],
)
train_random_forest(selection_of_training_scores, selection_of_validation_scores)

  rf.fit(selection_of_training_scores, training_labels)
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    7.1s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:   46.4s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:  1.1min finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.9s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    1.3s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s


Training MSE 0.028182306482397588
Validation MSE 0.025457846784072757
Variable: chemical_neighbourhood_no_spectrum_nr_tanimoto_power0 Importance: 0.62
Variable: precursor_mz_difference Importance: 0.18
Variable: query_precursor_mz   Importance: 0.14
Variable: s2v_score            Importance: 0.05
Variable: average_tanimoto_score_for_chemical_neighbourhood_score Importance: 0.01


[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    0.2s finished


# Remove average tanimoto score

In [13]:
# Final-model features without
# "average_tanimoto_score_for_chemical_neighbourhood_score".
subselection_of_features = [
    "query_precursor_mz",
    "precursor_mz_difference",
    "s2v_score",
    "chemical_neighbourhood_no_spectrum_nr_tanimoto_power0",
]
selection_of_training_scores, selection_of_validation_scores = (
    training_scores[subselection_of_features],
    validation_scores[subselection_of_features],
)
train_random_forest(selection_of_training_scores, selection_of_validation_scores)

  rf.fit(selection_of_training_scores, training_labels)
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    7.6s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:   41.9s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:   56.2s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.6s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    0.9s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s


Training MSE 0.028298095713427826
Validation MSE 0.02558334805199175
Variable: chemical_neighbourhood_no_spectrum_nr_tanimoto_power0 Importance: 0.63
Variable: precursor_mz_difference Importance: 0.18
Variable: query_precursor_mz   Importance: 0.14
Variable: s2v_score            Importance: 0.05


[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    0.2s finished


# Without s2v

In [5]:
# Final-model features without "s2v_score".
subselection_of_features = [
    "query_precursor_mz",
    "precursor_mz_difference",
    "chemical_neighbourhood_no_spectrum_nr_tanimoto_power0",
    "average_tanimoto_score_for_chemical_neighbourhood_score",
]
selection_of_training_scores, selection_of_validation_scores = (
    training_scores[subselection_of_features],
    validation_scores[subselection_of_features],
)
train_random_forest(selection_of_training_scores, selection_of_validation_scores)

  rf.fit(selection_of_training_scores, training_labels)
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    4.9s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:   28.4s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:   40.0s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.8s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    1.1s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s


Training MSE 0.028983948368223638
Validation MSE 0.027243584423754196
Variable: chemical_neighbourhood_no_spectrum_nr_tanimoto_power0 Importance: 0.64
Variable: precursor_mz_difference Importance: 0.21
Variable: query_precursor_mz   Importance: 0.14
Variable: average_tanimoto_score_for_chemical_neighbourhood_score Importance: 0.01


[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    0.2s finished


# Without mass difference

In [8]:
# Final-model features without "precursor_mz_difference".
subselection_of_features = [
    "query_precursor_mz",
    "s2v_score",
    "chemical_neighbourhood_no_spectrum_nr_tanimoto_power0",
    "average_tanimoto_score_for_chemical_neighbourhood_score",
]
selection_of_training_scores, selection_of_validation_scores = (
    training_scores[subselection_of_features],
    validation_scores[subselection_of_features],
)
train_random_forest(selection_of_training_scores, selection_of_validation_scores)

  rf.fit(selection_of_training_scores, training_labels)
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    9.1s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:   41.0s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:   53.9s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.8s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    1.1s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s


Training MSE 0.031120032938933288
Validation MSE 0.028257028114222277
Variable: chemical_neighbourhood_no_spectrum_nr_tanimoto_power0 Importance: 0.75
Variable: s2v_score            Importance: 0.15
Variable: query_precursor_mz   Importance: 0.09
Variable: average_tanimoto_score_for_chemical_neighbourhood_score Importance: 0.02


[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    0.2s finished


# Without query precursor mz

In [7]:
# Final-model features without "query_precursor_mz".
subselection_of_features = [
    "precursor_mz_difference",
    "s2v_score",
    "chemical_neighbourhood_no_spectrum_nr_tanimoto_power0",
    "average_tanimoto_score_for_chemical_neighbourhood_score",
]
selection_of_training_scores, selection_of_validation_scores = (
    training_scores[subselection_of_features],
    validation_scores[subselection_of_features],
)
train_random_forest(selection_of_training_scores, selection_of_validation_scores)

  rf.fit(selection_of_training_scores, training_labels)
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    8.9s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:   42.0s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:   56.7s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.8s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    1.1s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s


Training MSE 0.0298760487279545
Validation MSE 0.02653991857407557
Variable: chemical_neighbourhood_no_spectrum_nr_tanimoto_power0 Importance: 0.72
Variable: precursor_mz_difference Importance: 0.19
Variable: s2v_score            Importance: 0.06
Variable: average_tanimoto_score_for_chemical_neighbourhood_score Importance: 0.03


[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:    0.2s finished
