# Decoy Generation Methods 

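Two methods for generating decoy spectra from a target database. The first is a naive approach, in which each decoy spectrum is filled with randomly chosen fragment ions. The second is a spectrum-based approach, in which each added fragment ion is chosen based on the fragment ions previously added.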

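Both methods are described in [this paper](file:///C:/Users/kia/Downloads/Metabolomics%20Internship/Papers/Converting%20from%20proteomics%20to%20metabolomics.pdf). Note that the link points to a local copy of the PDF, so it only resolves on the original author's machine.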

## Setup

In [2]:
import random
import matchms
import numpy as np

## Required functions

In [3]:
#Check to see if two mz values are within a p.p.m. tolerance of each other

def within_ppm(mz_1, mz_2, ppm_tolerance):
    """Tell whether two m/z values lie within a p.p.m. tolerance of each other.

    The allowed absolute difference is computed relative to ``mz_1``
    (``mz_1 * ppm_tolerance / 10**6``), so the check is not symmetric in its
    first two arguments.

    Returns 1 if ``|mz_1 - mz_2|`` does not exceed that difference, else 0.
    """
    allowed_difference = (mz_1 * ppm_tolerance) / (10**6)
    if abs(mz_1 - mz_2) <= allowed_difference:
        return 1
    return 0

#Check to see if an mz value is within a p.p.m. tolerance of a list of mzs

def within_ppm_list(mz, mz_list, ppm_tolerance):
    """Tell whether ``mz`` is within a p.p.m. tolerance of any value in ``mz_list``.

    Parameters:
        mz: the m/z value being tested; the tolerance is taken relative to it
            (see ``within_ppm``).
        mz_list: iterable of m/z values to compare against.
        ppm_tolerance: tolerance in parts per million.

    Returns:
        True if at least one value in ``mz_list`` is within tolerance of
        ``mz``, otherwise False. An empty ``mz_list`` gives False, because
        there is nothing for ``mz`` to collide with.
    """
    # The previous loop returned False on the first value that was NOT within
    # tolerance, i.e. it tested "all" instead of "any", and it reported True
    # for an empty list. The callers use this as a collision test, so "any"
    # is the required meaning.
    return any(within_ppm(mz, mz_value, ppm_tolerance) for mz_value in mz_list)

#Draw n distinct items uniformly at random from a list of unique items; each loop iteration only attempts a draw with probability p

def uniform_draw(unique_list, n, p):
    """Draw ``n`` distinct items uniformly at random from ``unique_list``.

    Each loop iteration first draws a uniform number in [0, 1) and only
    attempts to pick an item when that number is below ``p``. Accepted picks
    are uniform over the indices of ``unique_list``, and an item that was
    already drawn is discarded. ``p`` therefore only changes how many
    iterations are needed, not which items can be returned.

    Parameters:
        unique_list: indexable sequence (list, range, array) with no repeated
            items. Repeated items are not detected here and can make it
            impossible to collect ``n`` distinct values.
        n: number of distinct items to return.
        p: probability that an iteration attempts a draw.

    Returns:
        A list with ``n`` distinct items of ``unique_list`` in draw order.

    Raises:
        ValueError: if ``n`` is negative or larger than ``len(unique_list)``,
            or if ``n > 0`` and ``p <= 0``. These requests can never be
            satisfied and used to loop forever.
    """
    list_length = len(unique_list)
    if n < 0 or n > list_length:
        raise ValueError(
            "cannot draw %r distinct items from a list of length %d" % (n, list_length)
        )
    if n > 0 and p <= 0:
        raise ValueError("p must be positive to draw any item, got %r" % (p,))

    draws = []
    while len(draws) < n:

        # Same order of random calls as before (acceptance test first, then
        # the index), so seeded runs give the same results.
        if np.random.random() < p:

            candidate = unique_list[np.random.randint(0, list_length)]
            if candidate not in draws:

                draws.append(candidate)
    return draws

#Checks to see if fragment candidate is NOT within p.p.m. tolerance of added fragments and does not have a greater mz than 
#the precursor_mz of the spectrum (for spectrum-based decoy generation).

def mz_check(fragment_candidate, added_fragments, spectrum, ppm_tolerance):
    """Decide whether a candidate fragment may be added to a decoy spectrum.

    Parameters:
        fragment_candidate: ``[intensity, mz]`` pair being considered.
        added_fragments: list of ``[intensity, mz]`` pairs already added.
        spectrum: object queried with ``spectrum.get("precursor_mz")``.
        ppm_tolerance: tolerance in parts per million passed on to
            ``within_ppm_list``.

    Returns:
        False if the spectrum has a (truthy) precursor m/z and the candidate
        m/z is greater than it, or if ``within_ppm_list`` reports the
        candidate m/z as within tolerance of the m/z values already added.
        True otherwise.
    """
    precursor_mz = spectrum.get("precursor_mz")
    candidate_mz = fragment_candidate[1]

    # The precursor comparison used the undefined name `mz` here; the value
    # that has to be compared is the candidate's own m/z.
    if precursor_mz and precursor_mz < candidate_mz:
        return False

    added_mzs = [fragment[1] for fragment in added_fragments]
    return not within_ppm_list(candidate_mz, added_mzs, ppm_tolerance)


## Naive Method of Generating Decoys

In [35]:
#Generating decoy spectra by the naive method.

# Pool every distinct [intensity, mz] fragment ion found in the target spectra.
# A set of the pairs already seen keeps the duplicate check O(1); scanning the
# whole pool for every peak is quadratic in the total number of peaks.

fragment_ions = []
seen_fragment_ions = set()
for spectrum in spectra:

    intensities = spectrum.peaks.intensities
    mzs = spectrum.peaks.mz
    for peak_intensity, peak_mz in zip(intensities, mzs):

        # Plain floats make a hashable key; the pool keeps the original values.
        fragment_key = (float(peak_intensity), float(peak_mz))
        if fragment_key not in seen_fragment_ions:

            seen_fragment_ions.add(fragment_key)
            fragment_ions += [[peak_intensity, peak_mz]]


#Creating decoy database by adding random fragment ions to empty spectra until desired no. of fragments is reached
#If fragment ion being added has the same mz value as another fragment ion in the decoy spectrum, pass

# Build one decoy per target spectrum from fragment ions drawn at random
# (without replacement) from the pool in fragment_ions.

decoy_spectra = []
no_fragment_ions = len(fragment_ions)
np.random.seed(0)                          #to make it reproducible
for spectrum in spectra:

    desired_no_peaks = len(spectrum.peaks.intensities)
    # np.arange (the original "np.arrange" does not exist in NumPy).
    random_fragments = np.random.choice(np.arange(no_fragment_ions), desired_no_peaks, replace = False)
    decoy_intensities = []
    decoy_mzs = []
    for index in random_fragments:

        candidate_intensity, candidate_mz = fragment_ions[index]

        # A tolerance of 0 p.p.m. means "exactly the same mz", so test that
        # directly. A fragment whose mz is already in the decoy is passed
        # over, which can leave the decoy with fewer peaks than the target.
        if candidate_mz not in decoy_mzs:

            decoy_intensities += [candidate_intensity]
            decoy_mzs += [candidate_mz]

    # The spectrum is built with mz in ascending order. Reorder the
    # intensities with the same permutation so every intensity stays with
    # its own mz (sorting only the mz list pairs them up wrongly).
    mz_array = np.array(decoy_mzs, dtype=float)
    intensity_array = np.array(decoy_intensities, dtype=float)
    mz_order = np.argsort(mz_array)
    decoy_spectrum = matchms.Spectrum(intensities = intensity_array[mz_order], mz = mz_array[mz_order])
    decoy_spectra += [decoy_spectrum]
    


#Needs testing

## Spectrum-Based Method of Generating Decoys

In [53]:
# Generating decoy spectra by the spectrum-based method.


# Build one decoy per target spectrum. Each new peak is drawn from the peaks
# of the spectra that within_ppm_list matches to the peak added last.

ppm_tolerance = 5               # p.p.m. tolerance for all mz comparisons
candidates_per_spectrum = 5     # peaks drawn from each matching spectrum
max_loop_repeats = 1000         # testing guard against an endless search

decoy_spectra = []
np.random.seed(0)                           #to make it reproducible
for spectrum in spectra:

    intensities = spectrum.peaks.intensities
    mzs = spectrum.peaks.mz
    desired_no_peaks = len(intensities)
    decoy_intensities = []
    decoy_mzs = []
    added_fragments = []
    precursor_mz = spectrum.get("precursor_mz")

    # A spectrum without peaks has nothing to seed the decoy with
    # (np.random.randint(0, 0) raises), so it gets an empty decoy. This keeps
    # decoy_spectra aligned one-to-one with spectra.
    if desired_no_peaks == 0:

        decoy_spectra += [matchms.Spectrum(intensities = np.array([], dtype=float), mz = np.array([], dtype=float))]
        continue

    # Add precursor peak if present to decoy spectrum. If not, add random peak.

    if precursor_mz in mzs:

        precursor_index = np.where(mzs == precursor_mz)[0][0]
        first_fragment = [intensities[precursor_index], precursor_mz]
    else:

        random_peak = np.random.randint(0, desired_no_peaks)
        first_fragment = [intensities[random_peak], mzs[random_peak]]

    added_fragments += [first_fragment]
    decoy_intensities += [first_fragment[0]]
    decoy_mzs += [first_fragment[1]]

    # Adding peaks based on peaks already added to the decoy spectrum. If a
    # suitable decoy peak cannot be found, the search is repeated.
    # Currently assume a peak CAN be found in all cases. For testing, the
    # loop_repeats count stops the search after max_loop_repeats rounds, in
    # which case the decoy ends up with fewer peaks than the target.

    loop_repeats = 0
    i = 1
    while i != desired_no_peaks:

        # decoy_mzs is kept in insertion order, so this is the last added peak.
        mz = decoy_mzs[i-1]
        fragment_candidates = []
        for spectrum_2 in spectra:

            # Not checking for spectrum_2 being the same as spectrum

            mzs_2 = spectrum_2.peaks.mz
            intensities_2 = spectrum_2.peaks.intensities
            no_peaks = len(mzs_2)
            if not within_ppm_list(mz, mzs_2, ppm_tolerance):
                continue

            # Large spectra contribute a random subset of their peaks,
            # small ones contribute all of them.
            if no_peaks >= candidates_per_spectrum:
                peak_indices = uniform_draw(range(no_peaks), candidates_per_spectrum, 1/no_peaks)
            else:
                peak_indices = range(no_peaks)

            for peak_index in peak_indices:

                fragment_candidate = [intensities_2[peak_index], mzs_2[peak_index]]
                if fragment_candidate not in fragment_candidates:

                    fragment_candidates += [fragment_candidate]

        # Drawing a fragment from fragment candidates.
        # If there are no candidates, or the drawn candidate is rejected, the
        # whole candidate search is repeated on the next pass of the loop.

        if len(fragment_candidates) > 0:

            fragment_candidate_index = np.random.randint(0, len(fragment_candidates))
            fragment_candidate = fragment_candidates[fragment_candidate_index]

            # mz_check decides whether the candidate is acceptable given the
            # precursor mz and the fragments already in the decoy spectrum.

            if mz_check(fragment_candidate, added_fragments, spectrum, ppm_tolerance):

                added_fragments += [fragment_candidate]
                decoy_intensities += [fragment_candidate[0]]
                decoy_mzs += [fragment_candidate[1]]
                i += 1

        loop_repeats += 1
        if loop_repeats > max_loop_repeats:
            break

    # Build the decoy once per target spectrum, after the search has ended.
    # Doing this inside the loop appended a partial spectrum for every added
    # peak and none at all for single-peak spectra. Intensities are reordered
    # with the same permutation as the mz values so each peak keeps its own
    # intensity.
    mz_array = np.array(decoy_mzs, dtype=float)
    intensity_array = np.array(decoy_intensities, dtype=float)
    mz_order = np.argsort(mz_array)
    decoy_spectrum = matchms.Spectrum(intensities = intensity_array[mz_order], mz = mz_array[mz_order])
    decoy_spectra += [decoy_spectrum]
        
    
        
#Needs testing

In [29]:
#Test Cell

#spec1 = matchms.Spectrum(intensities=np.array([1.0,2.5,3.4]), mz=np.array([3.0,4.5,6.4]))
#spec2 = matchms.Spectrum(intensities=np.array([3.0,1.5,7.4]), mz=np.array([3.0000005,4.000000005,6.000004]))

#spectra = [spec1, spec2]
#decoy_spectra

#c = [1,2,3,4]
#np.random.seed(0)
#uniform_draw(c, 2, 0.5)

# Scratch check of np.where: called with only a condition it returns a tuple
# of index arrays, so b[0] holds the positions where a == 1. The precursor
# lookup in the spectrum-based method indexes its result the same way
# (index[0][0]).
a = np.array([1,2,1])
b = np.where(a == 1)
b[0]


0