# Development Notes
This notebook runs in the conda environment `msfeast_development` with Python 3.10.
Installing matchms under Python 3.12 fails.

```bash
conda create --name msfeast_development python=3.10
conda activate msfeast_development
pip install ipykernel
pip install matchms
pip install plotly
```

In [None]:
import matchms
import pandas as pd
import numpy as np
import plotly
import copy

In [None]:
# Build the treatment table (sample_id -> treatment) from the GNPS metadata table.
raw_statistical_metadata = pd.read_table("data/metadata.tsv")

# Keep only the samples whose taxonomy attribute is one of the two selected categories.
selection_mask = raw_statistical_metadata["ATTRIBUTE_ Taxonomy"].isin(['FB_Hericium', 'FB_Pleurotus'])

# Build the table as one chain that returns a fresh frame; assigning columns on a
# boolean-filtered slice can raise SettingWithCopyWarning.
treatment_table = (
    raw_statistical_metadata
    .loc[selection_mask, ["filename", "ATTRIBUTE_ Taxonomy"]]
    .rename(columns={"filename": "sample_id", "ATTRIBUTE_ Taxonomy": "treatment"})
    .astype({"sample_id": "string"})
    .reset_index(drop=True)
)
if treatment_table.empty:
    # Fail with a readable message instead of an IndexError from .iloc[0] below.
    raise ValueError("No samples matched the selected treatment categories in data/metadata.tsv")

# The treatment of the first retained sample serves as the reference category.
reference_category = treatment_table["treatment"].iloc[0]
sample_id_list = treatment_table["sample_id"].to_list()
treatment_table.head(), reference_category, sample_id_list[0:4]

In [None]:
# Extract the quantification table and reshape it to long format.
feature_id_column_name = "row ID"
sample_id_suffix = " Peak area"

quantification_table = (
    pd.read_csv("data/quantification_table.csv")
    # keep the feature id column plus every column whose name contains the suffix
    .filter(regex=f"{feature_id_column_name}|{sample_id_suffix}", axis=1)
    # rename via the variable so a change to feature_id_column_name cannot be silently skipped
    .rename(columns={feature_id_column_name: "feature_id"})
    # long format: one row per (feature_id, sample column) with columns "variable" and "value"
    .melt(id_vars="feature_id")
    .reset_index(drop=True)
)
quantification_table.head()


In [None]:

# Reshape the long quantification table into one row per sample and one column
# per feature, with sample_id as the first column.
quantification_table = (
    quantification_table
    .rename(columns={"variable": "sample_id"})
    .assign(
        # strip the peak-area suffix literally (regex=False keeps this independent of
        # the pandas version's default) so the ids match the treatment table's sample_id
        sample_id=lambda frame: frame["sample_id"].str.replace(sample_id_suffix, "", regex=False).astype("string"),
        # feature ids are handled as plain Python strings in the later cells
        feature_id=lambda frame: frame["feature_id"].astype("str"),
    )
    .pivot(columns="feature_id", index="sample_id", values="value")
    .rename_axis(None, axis="columns")  # drop the leftover "feature_id" label on the column axis
    .reset_index()  # move sample_id from the index back to the first column
)

# Subset the sample_ids to those actually used by the treatments.
quantification_table = quantification_table.query("sample_id in @sample_id_list")
quantification_table.head()

In [None]:
# Load the spectra and run the matchms processing steps in sequence; the first
# step reads raw_spectra, every later step reads the previous step's tmp_spectra.
raw_spectra = list(matchms.importing.load_from_mgf("data/spectra.mgf"))
tmp_spectra = [matchms.filtering.default_filters(spectrum) for spectrum in raw_spectra]
tmp_spectra = [matchms.filtering.normalize_intensities(spectrum) for spectrum in tmp_spectra]
tmp_spectra = [matchms.filtering.reduce_to_number_of_peaks(spectrum, n_required=10, n_max=200) for spectrum in tmp_spectra]
# Drop entries that came back as None from the processing steps.
tmp_spectra = [spectrum for spectrum in tmp_spectra if spectrum is not None]
# Copy the "scans" metadata entry into "feature_id" for the example data; a plain
# loop is used because this is done purely for its side effect.
for spectrum in tmp_spectra:
    spectrum.set("feature_id", spectrum.get("scans"))
spectra = tmp_spectra

In [None]:
# Inspect the first processed spectrum: summary, peak arrays and metadata keys.
first_spectrum = spectra[0]
print(first_spectrum)
print(first_spectrum.peaks.intensities)
print(first_spectrum.peaks.mz)
print(first_spectrum.metadata.keys())
# Selected metadata entries; "scans" is the effective id column.
for metadata_key in ("scans", "feature_id", "precursor_mz"):
    print(first_spectrum.get(metadata_key))

# Simplify and shorten the data for trial runs

In [None]:
# Restrict the trial data to the first 30 spectra and their quantification columns.
subset_spectra = spectra[:30]
# feature ids are strings in matchms and are assumed to be strings throughout
feature_ids = list(map(str, (spectrum.get("feature_id") for spectrum in subset_spectra)))
subset_qt = quantification_table[["sample_id", *feature_ids]]

In [None]:
# Index both tables by sample_id so they can be aligned on it.
tmp_treatment_table = treatment_table.set_index("sample_id")
tmp_quantification_table = quantification_table.set_index("sample_id")

# Left-join the treatments onto the quantification rows and display the
# resulting sample_id / treatment pairs.
(
    tmp_quantification_table
    .join(tmp_treatment_table, on="sample_id", how="left")
    .reset_index()
    .loc[:, ["sample_id", "treatment"]]
)

# msFeaST pipeline usage (development)


In [None]:
# Take independent copies of the trial-run inputs.
test_treatment_data = treatment_table.copy(deep=True)
test_quantification_table = subset_qt.copy(deep=True)
test_spectra = copy.deepcopy(subset_spectra)

# Write the three test inputs to the working directory.
test_quantification_table.to_csv("test_quant_table.csv")
test_treatment_data.to_csv("test_treat_table.csv")
# NOTE(review): matchms' save_as_mgf may append to an already existing file —
# confirm, and remove test_spectra.mgf before re-running this cell if so.
matchms.exporting.save_as_mgf(test_spectra, "test_spectra.mgf")

In [None]:
# Show the size of all three test inputs. They are combined into one tuple
# because a notebook cell only displays its last expression.
len(test_spectra), test_quantification_table.shape, test_treatment_data.shape

In [None]:
%load_ext autoreload
%autoreload 2
from msfeastPipeline import msfeast

In [None]:
# Create a pipeline object from the msfeast class imported from msfeastPipeline.
pipelineInstance = msfeast()
# Bare expression: the notebook displays the object's repr.
pipelineInstance

In [None]:
# Hand the three test inputs to the pipeline instance.
pipelineInstance.attachData(
    quantification_table=test_quantification_table,
    treatment_table=test_treatment_data,
    spectra=test_spectra,
)

In [None]:
# Print what the pipeline instance currently holds.
quantification_preview = pipelineInstance.quantification_table.iloc[:5, :5]  # first 5 rows and 5 columns only
print(quantification_preview)
treatment_preview = pipelineInstance.treatment_table.head()
print(treatment_preview)
first_attached_spectrum = pipelineInstance.spectra[0]
print(first_attached_spectrum)
# presumably the flag recording that data has been attached — confirm in msfeastPipeline
print(pipelineInstance._dataLoaded)
