# Development Notes
This notebook runs in the conda environment `msfeast_development` with Python 3.10.
Installing matchms under Python 3.12 fails, so Python 3.10 is used instead.

```bash
conda create --name msfeast_development python=3.10
conda activate msfeast_development
pip install ipykernel
pip install matchms
pip install plotly
```

In [105]:
import matchms
import pandas as pd
import numpy as np
import plotly
import copy

In [106]:
# Extract only the statistically relevant columns (sample file name and
# treatment group) from the GNPS metadata table.
raw_statistical_metadata = pd.read_table("data/metadata.tsv")

# Restrict the table to the two treatment groups that are compared.
treatment_columns = raw_statistical_metadata[["filename", "ATTRIBUTE_ Taxonomy"]]
selection_mask = treatment_columns["ATTRIBUTE_ Taxonomy"].isin(['FB_Hericium', 'FB_Pleurotus'])

# Build the final table in one chain that returns a new frame, so no column
# is ever assigned on a filtered slice (avoids SettingWithCopyWarning).
treatment_table = (
    treatment_columns.loc[selection_mask]
    .rename(columns={"filename": "sample_id", "ATTRIBUTE_ Taxonomy": "treatment"})
    .reset_index(drop=True)
    .astype({"sample_id": "string"})
)
assert not treatment_table.empty, "no samples matched the selected treatment groups"

# The treatment of the first remaining sample serves as the reference category.
reference_category = treatment_table["treatment"].iloc[0]
sample_id_list = treatment_table["sample_id"].to_list()
treatment_table.head(), reference_category, sample_id_list[0:4]

(      sample_id    treatment
 0  P1_pos.mzXML  FB_Hericium
 1  P2_pos.mzXML  FB_Hericium
 2  P3_pos.mzXML  FB_Hericium
 3  P4_pos.mzXML  FB_Hericium
 4  P5_pos.mzXML  FB_Hericium,
 'FB_Hericium',
 ['P1_pos.mzXML', 'P2_pos.mzXML', 'P3_pos.mzXML', 'P4_pos.mzXML'])

In [107]:
# Extract and restructure the quantification table: keep the feature id
# column and the per-sample peak area columns, then reshape to long format.
quantification_table = pd.read_csv("data/quantification_table.csv")
feature_id_column_name = "row ID"
sample_id_suffix = " Peak area"

# Select columns by literal substring match rather than by interpolating the
# constants into a regular expression, so special characters in either
# constant cannot change which columns are selected.
peak_area_columns = [
    column
    for column in quantification_table.columns
    if feature_id_column_name in column or sample_id_suffix in column
]
quantification_table = (
    quantification_table[peak_area_columns]
    # Use the constant instead of repeating the literal column name.
    .rename(columns={feature_id_column_name: "feature_id"})
    # melt() already returns a fresh 0..n-1 index, so no reset_index is needed.
    .melt(id_vars="feature_id")
)
quantification_table.head()


Unnamed: 0,feature_id,variable,value
0,555,MS0_NEW_POS.mzXML Peak area,4106535.0
1,994,MS0_NEW_POS.mzXML Peak area,6008576.5
2,15743,MS0_NEW_POS.mzXML Peak area,12726.544
3,2563,MS0_NEW_POS.mzXML Peak area,56480.33
4,8783,MS0_NEW_POS.mzXML Peak area,19829.701


In [108]:

# Reshape the long-format quantification table (feature_id, variable, value)
# into a wide table with one row per sample and one column per feature.
# NOTE: this cell overwrites quantification_table; re-run the cell that builds
# the long-format table before re-running this one.
quantification_table = quantification_table.rename(columns={"variable": "sample_id"})

# Strip the peak area suffix so only the sample file name remains.
# regex=False makes the literal replacement explicit; the default of `regex`
# differs between pandas versions.
quantification_table["sample_id"] = (
    quantification_table["sample_id"]
    .str.replace(pat=sample_id_suffix, repl="", regex=False)
    .astype("string")
)
# Feature ids are handled as strings so they can be used as column names.
quantification_table["feature_id"] = quantification_table["feature_id"].astype("str")

quantification_table = (
    pd.pivot(quantification_table, columns="feature_id", index="sample_id", values="value")
    .reset_index()  # turn the sample_id index into the first column
    .rename_axis(None, axis="columns")  # drop the leftover "feature_id" axis label
)

# subset the sample_ids to those actually used by the treatments
quantification_table = quantification_table.query("sample_id in @sample_id_list")

quantification_table.head()

Unnamed: 0,sample_id,10001,10010,10012,10013,10015,10023,10026,10041,10043,...,996,9960,9963,9965,9972,9976,9986,9992,9993,9994
0,E10_pos.mzXML,24834.273,0.0,10558.262,0.0,0.0,0.0,132596.05,0.0,0.0,...,16122.999,0.0,300892.8,0.0,0.0,0.0,19187.918,0.0,32858.527,1967.7911
1,E11_pos.mzXML,36303.04,0.0,17791.215,0.0,0.0,0.0,195410.2,0.0,0.0,...,25487.537,0.0,311379.75,0.0,0.0,706.03204,20693.646,0.0,36913.086,4878.62
2,E12_pos.mzXML,29205.367,0.0,13557.281,0.0,0.0,0.0,172219.33,0.0,0.0,...,18186.266,0.0,175308.56,0.0,0.0,0.0,17801.588,0.0,41598.72,3934.7422
3,E1_pos.mzXML,92415.71,0.0,131143.0,5604.7725,0.0,699.12036,538345.1,0.0,0.0,...,0.0,0.0,314285.34,2021.3296,0.0,4883.4067,19394.924,0.0,331399.3,36612.418
4,E2_pos.mzXML,15658.006,0.0,105531.95,5802.375,0.0,0.0,84646.195,1530.7404,0.0,...,0.0,0.0,215472.31,1582.7941,0.0,5076.3384,17537.525,0.0,241999.69,17393.959


In [109]:
# Load the raw spectra and pass them through the matchms cleaning steps.
# Each step consumes the output of the previous one.
raw_spectra = list(matchms.importing.load_from_mgf("data/spectra.mgf"))
tmp_spectra = [matchms.filtering.default_filters(spectrum) for spectrum in raw_spectra]  # input: raw_spectra
tmp_spectra = [matchms.filtering.normalize_intensities(spectrum) for spectrum in tmp_spectra]  # input: default-filtered spectra
tmp_spectra = [matchms.filtering.reduce_to_number_of_peaks(spectrum, n_required=10, n_max=200) for spectrum in tmp_spectra]
# Drop the None entries left behind by the filtering steps.
tmp_spectra = [spectrum for spectrum in tmp_spectra if spectrum is not None]
# Add a feature_id entry to each spectrum of the example data, copied from its
# "scans" entry. A plain loop is used because set() is called purely for its
# side effect; a list comprehension would only build a throwaway list.
for spectrum in tmp_spectra:
    spectrum.set("feature_id", spectrum.get("scans"))
spectra = tmp_spectra

In [110]:
# Inspect the first cleaned spectrum: summary, peak data and metadata.
first_spectrum = spectra[0]
print(first_spectrum)
print(first_spectrum.peaks.intensities)
print(first_spectrum.peaks.mz)
print(first_spectrum.metadata.keys())
print(first_spectrum.get("scans"))  # "scans" is the effective id column
print(first_spectrum.get("feature_id"))
print(first_spectrum.get("precursor_mz"))

Spectrum(precursor m/z=147.11, 16 fragments between 56.1 and 317.8)
[0.1        0.04042553 0.01595745 0.01702128 0.01808511 1.
 0.62765957 0.02021277 0.01914894 0.24468085 0.09255319 0.01702128
 0.01595745 0.01808511 0.01808511 0.01702128]
[ 56.0504  69.034   75.8074  82.0341  83.0501  84.045   84.0814 102.0551
 105.4873 130.0499 130.0865 152.7332 155.1189 173.9068 243.1815 317.8328]
dict_keys(['scans', 'charge', 'collision_energy', 'retention_time', 'ms_level', 'precursor_mz', 'ionmode', 'feature_id'])
32
32
147.1129


# Simplify and shorten the data for trial runs

In [111]:
# Keep only the first 30 spectra for trial runs.
subset_spectra = spectra[:30]
# Feature ids are converted to str; they are assumed to be strings throughout.
feature_ids = list(map(str, (spectrum.get("feature_id") for spectrum in subset_spectra)))
# Restrict the quantification table to sample_id plus the selected features.
subset_qt = quantification_table[["sample_id", *feature_ids]]

In [112]:
# Index both tables by sample_id so they can be aligned on it.
tmp_treatment_table = treatment_table.set_index("sample_id")
tmp_quantification_table = quantification_table.set_index("sample_id")

# Left-join the treatments onto the quantification table and display the
# resulting sample_id / treatment pairing in quantification-table order.
(
    tmp_quantification_table
    .join(tmp_treatment_table, on="sample_id", how="left")
    .reset_index()
    .loc[:, ["sample_id", "treatment"]]
)

Unnamed: 0,sample_id,treatment
0,E10_pos.mzXML,FB_Pleurotus
1,E11_pos.mzXML,FB_Pleurotus
2,E12_pos.mzXML,FB_Pleurotus
3,E1_pos.mzXML,FB_Pleurotus
4,E2_pos.mzXML,FB_Pleurotus
5,E3_pos.mzXML,FB_Pleurotus
6,E4_pos.mzXML,FB_Pleurotus
7,E5_pos.mzXML,FB_Pleurotus
8,E6_pos.mzXML,FB_Pleurotus
9,E7_pos.mzXML,FB_Pleurotus


# msFeaST pipeline usage (under development)


In [113]:
# Deep-copy the inputs so the pipeline works on objects that are independent
# of the notebook-level spectra and tables.
test_spectra, test_quantification_table, test_treatment_data = (
    copy.deepcopy(source_object)
    for source_object in (subset_spectra, subset_qt, treatment_table)
)

In [119]:
%load_ext autoreload
%autoreload 2
from msfeastPipeline import msfeast

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [120]:
# Create a pipeline instance without passing any arguments.
pipelineInstance = msfeast()
# Bare expression as the last line: the notebook displays the instance's repr.
pipelineInstance

msfeast(quantification_table=None, treatment_table=None, spectra_list=None, _dataLoaded=False, _contrastSelected=False, _referenceCategorySelected=False, _similarityMatrixAvailable=False, _kmedoidGridComputed=False, _kmedoidIndexSelected=False, _tnseGridComputed=False, _tsneIndexSelected=False, _statisticsDataComputed=False)

In [121]:
# Hand the copied test data (quantification table, treatments, spectra) to the pipeline.
pipelineInstance.attachData(
    quantification_table=test_quantification_table,
    treatment_table=test_treatment_data,
    spectra=test_spectra,
)

In [122]:
# Print what the pipeline instance now holds: a 5x5 corner of the
# quantification table (omitting some elements), the head of the treatment
# table, the first spectrum, and the data-loaded flag.
for attached_item in (
    pipelineInstance.quantification_table.iloc[:5, :5],
    pipelineInstance.treatment_table.head(),
    pipelineInstance.spectra[0],
    pipelineInstance._dataLoaded,
):
    print(attached_item)


       sample_id           32        112        132          138
0  E10_pos.mzXML    3615.2964  764741.60  44274.230    559256.06
1  E11_pos.mzXML    4908.3003  656580.06  47164.520    441942.38
2  E12_pos.mzXML    1556.7747  912276.20  44902.120    675668.75
3   E1_pos.mzXML  132213.4400  723551.50  27847.564   9628524.00
4   E2_pos.mzXML  120811.7340  761905.06  24129.504  24587472.00
      sample_id    treatment
0  P1_pos.mzXML  FB_Hericium
1  P2_pos.mzXML  FB_Hericium
2  P3_pos.mzXML  FB_Hericium
3  P4_pos.mzXML  FB_Hericium
4  P5_pos.mzXML  FB_Hericium
Spectrum(precursor m/z=147.11, 16 fragments between 56.1 and 317.8)
True


In [142]:
# TODO: replace the manual computation below with the pipeline call
# pipelineInstance.computeSimilarities(method = "modified cosine score")

# Pairwise modified cosine scores among the first three test spectra,
# extracted from the matchms scores object as a plain array.
similarity_measure = matchms.similarity.ModifiedCosine()
pairwise_scores = matchms.calculate_scores(
    test_spectra[:3],
    test_spectra[:3],
    similarity_measure,
    is_symmetric=True,
)
scores = pairwise_scores.to_array(name="ModifiedCosine_score")
scores

array([[1.        , 0.        , 0.07769411],
       [0.        , 1.        , 0.60447929],
       [0.07769411, 0.60447929, 1.        ]])