# Extract Features

Extract SMILES and their discretised fragments from the MoNA LC-MS-MS ESI dataset (http://mona.fiehnlab.ucdavis.edu/downloads)

In [1]:
import os
from glob import glob
import zipfile

from IPython.display import display, HTML
from collections import defaultdict
from rdkit import Chem
import numpy as np
from SmilesEnumerator import SmilesEnumerator

In [2]:
def _parse_smiles(comment_line):
    """Return every SMILES string found in one MSP ``Comment`` line.

    The line is split on single spaces; each token containing ``SMILES`` has
    its double quotes removed and everything up to and including the first
    ``=`` stripped off.
    """
    found = []
    for token in comment_line.split(' '):
        if 'SMILES' not in token:
            continue
        token = token.replace('"', '')
        # Cut at the FIRST '=' only: the SMILES itself may contain '=' (double bonds).
        found.append(token[token.find('=') + 1:])
    return found


def _encode_spectrum(peaks, max_mz, min_intensity, decimal_point, mul, representation):
    """Turn a non-empty collection of ``(m/z, intensity)`` pairs into the encoded peak list.

    See ``extract_features`` for the meaning of the parameters and of each
    ``representation`` value.
    """
    ordered = sorted(peaks, key=lambda peak: peak[0])  # sort by m/z
    mz = np.array([peak[0] for peak in ordered])
    intensities = np.array([peak[1] for peak in ordered])

    # Discretise the m/z axis.
    if decimal_point == 0:
        mz = mz.astype(int)
    else:
        mz = np.round(mz, decimal_point)

    # Representation 2 keeps the raw intensities (they are only used for filtering).
    if representation in (0, 3):
        intensities = intensities / intensities.max()
    elif representation == 1:
        intensities = intensities / intensities.sum()
    counts = (intensities * mul).astype(int)

    encoded = []
    for mz_value, intensity, count in zip(mz, intensities, counts):
        if not (mz_value < max_mz and intensity > min_intensity):
            continue
        if representation in (0, 1):
            # character-level encoding: the m/z value is repeated
            # int(intensity * mul) times
            encoded.extend([mz_value] * int(count))
        elif representation == 2:
            # word-level encoding: one entry per m/z value, intensity discarded
            encoded.append(mz_value)
        elif representation == 3:
            # word-level encoding including intensity: one [m/z, intensity] pair per peak
            encoded.append([mz_value, intensity])
    return encoded


def extract_features(zipped_input, msp_file, maxlength_smiles, max_mz, min_intensity, decimal_point,
                     mul=10, representation=0):
    """Extract ``(SMILES, encoded spectrum)`` pairs from a zipped MSP file.

    A record starts at a line beginning with ``Name``. Its SMILES strings are
    read from the first following line beginning with ``Comment``, and every
    later line of the record whose first character is a digit is parsed as a
    whitespace-separated ``m/z intensity`` peak.

    Parameters
    ----------
    zipped_input : path or file-like object passed to ``zipfile.ZipFile``.
    msp_file : name of the MSP member inside the archive.
    maxlength_smiles : SMILES strings longer than this are skipped.
    max_mz : only peaks with (discretised) m/z strictly below this are kept.
    min_intensity : only peaks with intensity strictly above this are kept.
        The comparison uses the normalised intensity for representations
        0, 1 and 3 and the raw intensity for representation 2.
    decimal_point : number of decimals m/z is rounded to; 0 converts m/z to
        integers with ``astype(int)``.
    mul : multiplier that turns a normalised intensity into a repeat count
        (representations 0 and 1).
    representation :
        0 - intensities divided by the maximum; each m/z repeated
            ``int(intensity * mul)`` times.
        1 - as 0, but intensities divided by their sum.
        2 - list of m/z values only.
        3 - intensities divided by the maximum; list of ``[m/z, intensity]``.

    Returns
    -------
    list of ``(smiles, encoded_peaks)`` tuples, in file order. A record that
    carries several SMILES yields one tuple per SMILES. Records without any
    peak are skipped.
    """
    smiles_to_intspec = []

    def emit_record(record_smiles, record_peaks):
        # A record without peaks has nothing to encode (and cannot be
        # normalised), so it is skipped instead of raising.
        if not record_peaks:
            return
        for s in record_smiles:
            if len(s) > maxlength_smiles:
                continue
            encoded = _encode_spectrum(record_peaks, max_mz, min_intensity,
                                       decimal_point, mul, representation)
            smiles_to_intspec.append((s, encoded))

    waiting_for_name, waiting_for_comment, reading_peaks = 0, 1, 2
    state = waiting_for_name
    smiles = []
    peaks = set()

    with zipfile.ZipFile(zipped_input) as z:
        with z.open(msp_file) as f:
            for raw_line in f:
                line = raw_line.decode("utf-8").rstrip()
                if line.startswith('Name'):
                    # A 'Name' line both closes the previous record and opens
                    # the next one, so go straight to waiting for its comment
                    # instead of discarding the record that starts here.
                    if state == reading_peaks:
                        emit_record(smiles, peaks)
                    state = waiting_for_comment
                elif state == waiting_for_comment and line.startswith('Comment'):
                    smiles = _parse_smiles(line)
                    peaks = set()
                    state = reading_peaks
                elif state == reading_peaks and line and line[0].isdigit():
                    # split() tolerates tabs and repeated blanks between columns
                    tokens = line.split()
                    peaks.add((float(tokens[0]), float(tokens[1])))

    # The last record has no following 'Name' line to trigger its emission.
    if state == reading_peaks:
        emit_record(smiles, peaks)

    return smiles_to_intspec

In [3]:
def write_data(data, outfile, representation, augment=0):
    """Write ``(smiles, spectrum)`` pairs to ``outfile``, one tab-separated line each.

    Each line is ``<smiles>\\t<comma-joined spectrum>\\n``.

    * For ``representation == 3`` every spectrum entry is an ``(m/z, intensity)``
      pair rendered as ``(<m/z> <intensity>)``. After the original line,
      ``augment`` extra lines are attempted per record: each uses a SMILES
      produced by ``SmilesEnumerator.randomize_smiles`` and keeps only the
      peaks whose intensity exceeds a freshly drawn ``np.random.random()``
      threshold. An attempt that raises ``AttributeError`` is skipped.
    * For any other representation the spectrum entries are written with
      ``str()`` and ``augment`` is ignored.

    Prints ``Written: <outfile>`` once all records have been written.
    """
    enumerator = SmilesEnumerator()

    def render_pairs(pairs):
        # '(mz intensity)' items joined by commas
        return ','.join('(%s %f)' % (mz, intensity) for mz, intensity in pairs)

    with open(outfile, 'w') as handle:
        for smiles, original_spectra in data:

            if representation != 3:
                joined = ','.join(str(value) for value in original_spectra)
                handle.write(smiles + '\t' + joined + '\n')
                continue

            handle.write(smiles + '\t' + render_pairs(original_spectra) + '\n')

            # generate more data if required
            for _ in range(augment):
                try:
                    variant = enumerator.randomize_smiles(smiles)  # alternative SMILES
                    threshold = np.random.random()
                    # keep only the peaks whose intensity is above the random threshold,
                    # so weaker peaks are dropped more often
                    survivors = [peak for peak in original_spectra if peak[1] > threshold]
                    handle.write(variant + '\t' + render_pairs(survivors) + '\n')
                except AttributeError:
                    continue

        print('Written: %s' % outfile)
            

In [4]:
# MSP export to read; the archive in ../data/ is named after it with a '.zip' suffix.
msp_file = 'MoNA-export-LC-MS-MS_Positive_Mode.msp'
zipped_input = '../data/%s.zip' % msp_file
# Spectrum encodings to generate (the `representation` argument of extract_features).
representations = list(range(4))

In [5]:
# (maximum SMILES length, output path template) for the small, medium and large datasets
dataset_variants = (
    (30, '../data/representation_%d/data_small.txt'),
    (60, '../data/representation_%d/data_medium.txt'),
    (90, '../data/representation_%d/data_large.txt'),
)

for rep in representations:

    # only representation 3 gets augmented lines
    augment = 3 if rep == 3 else 0

    for maxlength, path_template in dataset_variants:
        data = extract_features(zipped_input, msp_file, maxlength_smiles=maxlength, max_mz=900,
                                min_intensity=0.1, decimal_point=3, mul=100, representation=rep)
        write_data(data, path_template % rep, rep, augment=augment)

Written: ../data/representation_0/data_small.txt
Written: ../data/representation_0/data_medium.txt
Written: ../data/representation_0/data_large.txt
Written: ../data/representation_1/data_small.txt
Written: ../data/representation_1/data_medium.txt
Written: ../data/representation_1/data_large.txt
Written: ../data/representation_2/data_small.txt
Written: ../data/representation_2/data_medium.txt
Written: ../data/representation_2/data_large.txt
Written: ../data/representation_3/data_small.txt
Written: ../data/representation_3/data_medium.txt
Written: ../data/representation_3/data_large.txt
