# Prepare MZmine table in BIOM format

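Take an MZmine feature quantification table with ion identity networking (IIN) annotations, strip it down to the per-sample peak areas, and convert it to a BIOM table.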

In [1]:
import pandas as pd
import numpy as np
import biom

## Inputting the MZmine x IIN table

In [3]:
filename = 'feature_table/1907_EMPv2_INN_GNPS_quant.csv'
#filename= 'feature_table/EMPsubset_after_gapfill_quant.csv' ### TEST FILE

# Per-feature metadata columns that are dropped below so that only the
# per-sample peak-area columns remain in the table.
metadata_columns = [
    'row m/z', 'row retention time', 'correlation group ID',
    'annotation network number', 'best ion', 'auto MS2 verify',
    'identified by n=', 'partners', 'neutral M mass',
]

# The first CSV column is used as the row index.
ft = pd.read_csv(filename, index_col=0, header=0, low_memory=False)
ft = ft.drop(columns=metadata_columns)

# Remove duplicates: pandas renames a repeated header "X Peak area" to
# "X Peak area.1", "X Peak area.2", ...  The dot is escaped and the pattern is
# anchored so that a sample whose own name merely contains "area" is kept.
is_duplicate = ft.columns.str.contains(r'area\.\d+$')

# Remove the placeholder column(s) pandas names "Unnamed: N" when a header cell
# is empty (e.g. a trailing comma on the header line).  Selecting them by name
# instead of dropping "the last column" means no real sample is lost when the
# file has no such column.
is_unnamed = ft.columns.str.match(r'Unnamed: \d+$')

ft = ft.loc[:, ~(is_duplicate | is_unnamed)]

# Remove the " Peak area" / " cropped Peak area" suffix from all columns.
# A suffix-anchored regex is used because str.rstrip() strips a *set of
# characters*, which would also eat into sample names ending in any of them.
ft.columns = ft.columns.str.replace(r'(?: cropped)? Peak area$', '', regex=True)

# Rename the index header for the biom format
ft.index.name = '#SampleID'

# Swap the file extension (whatever its length) for the "_prepared.tsv" suffix.
output = filename.rsplit('.', 1)[0] + '_prepared.tsv'
print(output)
print(ft.shape)
print(ft.columns)
ft.to_csv(output, sep='\t')
ft.tail(2)

feature_table/1907_EMPv2_INN_GNPS_quant_prepared.tsv
(57339, 777)
Index(['1A12_SPE_Blank.mzML', '1A10_1_12_mayer-34-s008-a04.mzML',
       '1A11_1_13_king-27-s003-a04.mzML', '1A1_1_1_metcalf-40-s014-a04.mzML',
       '1A2_1_2_berry-2-s009-a04.mzML', '1A3_1_4_mayer-34-s006-a04.mzML',
       '1A4_1_5_berry-2-s016-a04.mzML', '1A6_1_8_mayer-34-s004-a04.mzML',
       '1A7_1_9_berry-2-s013-a04.mzML', '1A8_1_10_mayer-34-s005-a04.mzML',
       ...
       'Resusp_solvent_post_2E10_Power_beads_tube.mzML',
       'Resusp_solvent_post_2E11_WaterFilter_GPWP_1.mzML',
       'Resusp_solvent_post_2E12_WaterFilter_GPWP_2.mzML',
       'Resusp_solvent_post_2E9_CTAB-1_5.mzML',
       'Resusp_solvent_post_2F1_WaterFilter_GPWP_3.mzML',
       'Resusp_solvent_post_2F2_Blank_Sterivex_SVGV010RS.mzML',
       'Resusp_solvent_post_2F3_Blank_Ziplock.mzML',
       'Resusp_solvent_post_2F4_Blank_Ziplock2.mzML',
       'Resusp_solvent_post_2F6_Blank_Nasco_WhirlPak_12.mzML',
       'Resusp_solvent_post_CTAB.mzML'],


Unnamed: 0_level_0,1A12_SPE_Blank.mzML,1A10_1_12_mayer-34-s008-a04.mzML,1A11_1_13_king-27-s003-a04.mzML,1A1_1_1_metcalf-40-s014-a04.mzML,1A2_1_2_berry-2-s009-a04.mzML,1A3_1_4_mayer-34-s006-a04.mzML,1A4_1_5_berry-2-s016-a04.mzML,1A6_1_8_mayer-34-s004-a04.mzML,1A7_1_9_berry-2-s013-a04.mzML,1A8_1_10_mayer-34-s005-a04.mzML,...,Resusp_solvent_post_2E10_Power_beads_tube.mzML,Resusp_solvent_post_2E11_WaterFilter_GPWP_1.mzML,Resusp_solvent_post_2E12_WaterFilter_GPWP_2.mzML,Resusp_solvent_post_2E9_CTAB-1_5.mzML,Resusp_solvent_post_2F1_WaterFilter_GPWP_3.mzML,Resusp_solvent_post_2F2_Blank_Sterivex_SVGV010RS.mzML,Resusp_solvent_post_2F3_Blank_Ziplock.mzML,Resusp_solvent_post_2F4_Blank_Ziplock2.mzML,Resusp_solvent_post_2F6_Blank_Nasco_WhirlPak_12.mzML,Resusp_solvent_post_CTAB.mzML
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
58035,798.522304,0.0,6418.986587,4980.753708,3621.573537,1889.242919,1274.361585,0.0,5605.869664,0.0,...,11400.262541,12749.689412,5300.133025,37534.350286,2556.600091,3641.306549,1790.213585,3197.230014,5302.175484,559778.888275
58036,1663.679888,12408.32206,2608.418965,1569.330566,1946.27663,893.515113,1494.892767,545.338895,1075.495616,2920.333839,...,132097.028559,231277.649211,155663.516731,93498.310273,184489.230939,217241.335747,149440.932652,203072.320452,64731.220115,0.0


## Converting to BIOM table

In [4]:
%%bash

# Convert the prepared tab-separated feature table into an HDF5 BIOM file
# typed as a metabolite table.
biom convert \
    --input-fp feature_table/1907_EMPv2_INN_GNPS_quant_prepared.tsv \
    --output-fp feature_table/1907_EMPv2_INN_GNPS_quant_prepared.biom \
    --table-type 'Metabolite table' \
    --to-hdf5