In [1]:
import pyopenms
from pyopenms import *
from scipy.spatial.distance import euclidean
from operator import itemgetter
import pandas as pd
from binding_sites_helper import sum_spectra, get_difference, find_closest
from os import listdir
import re

In [2]:
# Directory holding the input CSV spectra (path kept exactly as the rest of the notebook expects it)
input_data_url = 'Data//CSV Data//'

# Every directory entry in sorted order, minus the two names that must never be analysed
input_files = [
    entry
    for entry in sorted(listdir(input_data_url))
    if entry not in ('.DS_Store', 'demo_4.csv')
]

# Split the inputs into unbound / bound files.
# A file counts as "unbound" when its name starts with "u" (re.match anchors at the start of the name);
# every remaining file is treated as "bound".
unbound_pattern = re.compile("u")
unbound_file_matches = list(filter(unbound_pattern.match, input_files))
bound_file_matches = [entry for entry in input_files if entry not in unbound_file_matches]

In [15]:
# Compare every unbound spectrum with every bound spectrum and, for each pair, write an
# Excel workbook ranking the theoretical ubiquitin fragment ions by how strongly the
# (negative part of the) difference spectrum matches them.
#
# Reads : input_data_url, unbound_file_matches, bound_file_matches (defined in an earlier cell)
# Writes: "unbound.mzML" / "bound.mzML" scratch files and one "<unbound>-<bound>.xlsx" per pair

# Only this many of the most intense difference peaks are treated as meaningful
# indications of binding sites
MAX_SIGNIFICANT_PEAKS = 100

# ---------------------------------------------------------------------------
# Theoretical Ub spectrum
# It depends only on the ubiquitin sequence and the generator parameters, so it is
# generated once here instead of once per file pair.
# ---------------------------------------------------------------------------
#Convert the string representation of Ubiquitin into an amino acid sequence object
ubiquitin = AASequence.fromString("MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG")
tsg = TheoreticalSpectrumGenerator()
spectrum = MSSpectrum()
#Initialise parameters object and set the values for parameters
#To change which parameters are set to true (false is default), specific lines can be commented out and vice versa
parameters = Param()
parameters.setValue(b"add_isotopes", b"true", "")
#parameters.setValue(b"add_losses", b"true", "")
parameters.setValue(b"add_b_ions", b"true", "")
parameters.setValue(b"add_y_ions", b"true", "")
parameters.setValue(b"add_a_ions", b"true", "")
parameters.setValue(b"add_c_ions", b"true", "")
parameters.setValue(b"add_x_ions", b"true", "")
parameters.setValue(b"add_z_ions", b"true", "")
parameters.setValue(b"add_metainfo", b"true", "")
tsg.setParameters(parameters)
#Generate the theoretical spectrum of Ubiquitin
tsg.getSpectrum(spectrum, ubiquitin, 1, 2)

#Fetch the peak arrays once; calling get_peaks() per index made the conversion below quadratic
theoretical_mz, theoretical_intensity = spectrum.get_peaks()
#Ion annotation paired with the m/z of the theoretical peak it labels, in spectrum order
theoretical_ions = [
    (ion, theo_peak.getMZ())
    for ion, theo_peak in zip(spectrum.getStringDataArrays()[0], spectrum)
]

for unbound_file in unbound_file_matches:
    #The unbound data is the same for every bound file, so it is read and round-tripped
    #through mzML once per unbound file
    unbound_df = pd.read_csv(input_data_url + unbound_file)
    unbound_mz = list(unbound_df['m/z'].values)
    unbound_intensity = list(unbound_df['intensity'].values)

    unbound_exp = pyopenms.MSExperiment()
    unbound_spectrum = MSSpectrum()
    unbound_spectrum.set_peaks([unbound_mz, unbound_intensity])
    unbound_exp.setSpectra([unbound_spectrum])
    pyopenms.MzMLFile().store("unbound.mzML", unbound_exp)

    unbound = MSExperiment()
    MzMLFile().load("unbound.mzML", unbound)

    for bound_file in bound_file_matches:

        print(unbound_file)
        print(bound_file)

        #read in the bound file and extract its m/z and intensity
        bound_df = pd.read_csv(input_data_url + bound_file)
        bound_mz = list(bound_df['m/z'].values)
        bound_intensity = list(bound_df['intensity'].values)

        #Create the bound experiment, then store it in a file
        bound_exp = pyopenms.MSExperiment()
        bound_spectrum = MSSpectrum()
        bound_spectrum.set_peaks([bound_mz, bound_intensity])
        bound_exp.setSpectra([bound_spectrum])
        pyopenms.MzMLFile().store("bound.mzML", bound_exp)

        #Import the bound mzML file into an experiment, and retrieve a single spectrum for each side
        bound = MSExperiment()
        MzMLFile().load("bound.mzML", bound)
        bound_spectrum = sum_spectra(bound.getSpectra())
        #Summed afresh for every pair so get_difference always receives a new object
        unbound_spectrum = sum_spectra(unbound.getSpectra())

        #In theory, subtracting the unbound spectrum from the bound spectrum should return the effects of the binding with the platin
        binding_effect = get_difference(bound_spectrum, unbound_spectrum)

        #Stores the negative peaks of the difference spectrum in a list of [m/z, i] items (intensity sign flipped)
        difference = [[mz, -i] for mz, i in binding_effect.items() if i < 0]

        #The theoretical spectrum is converted from a pair of arrays to a single list of [m/z, i] items
        #(rebuilt per pair because the list is handed to the external find_closest helper)
        theoretical = [[mz, i] for mz, i in zip(theoretical_mz, theoretical_intensity)]
        len_t = len(theoretical)

        #Sort the difference spectrum by intensity values from largest to smallest
        filtered_difference = sorted(difference, key=itemgetter(1), reverse=True)
        #Never keep more difference peaks than there are peaks in the theoretical spectrum
        if len_t < len(filtered_difference):
            filtered_difference = filtered_difference[:len_t]

        #The number of peaks is further reduced such that only peaks with large enough intensities are considered as meaningful indications of binding sites
        significant_peaks = filtered_difference[:MAX_SIGNIFICANT_PEAKS]

        #Create a list of pairs of peaks matching each peak from significant_peaks to its closest counterpart in the theoretical spectrum
        matched_peaks = [
            [observed_peak, find_closest(observed_peak, theoretical)]
            for observed_peak in significant_peaks
        ]

        #For each pair of matched peaks, their m/z and intensity is added to a dictionary
        #Multiple significant peaks may be matched to the same theoretical peak if that theoretical peak is the closest peak to multiple experimental peaks, in which case their intensities are added together
        #These sums of intensities are referred to as the 'significance' of the theoretical peak which was matched
        matching_significance = {}
        for match in matched_peaks:
            match_mz = match[1][0]
            match_i = match[0][1]
            if match_mz in matching_significance:
                matching_significance[match_mz] += match_i
            else:
                matching_significance[match_mz] = match_i

        #Everything below runs ONCE per file pair, after all matches have been accumulated.
        #(Previously it was nested inside the loop above, so the workbook was rebuilt and
        #rewritten after every single matched peak.)

        #For each peak identified in matching_significance above, the ion of that peak is identified from the original TheoreticalSpectrumGenerator spectrum
        fragments = [
            [ion, ion_mz, matching_significance[ion_mz]]
            for ion, ion_mz in theoretical_ions
            if ion_mz in matching_significance
        ]
        #These fragments (which represent potential binding sites) are sorted from most significant to least
        fragments = sorted(fragments, key=itemgetter(2), reverse=True)

        #Results of potential binding sites
        #fragment type, location, and charge (e.g. y15++), the m/z of that fragment, and the fragment's significance as a potential binding site
        ion_list = [fragment[0] for fragment in fragments]
        mass_charge_list = [fragment[1] for fragment in fragments]
        relative_significance = [fragment[2] for fragment in fragments]

        filename_out = unbound_file.split(".csv")[0] + "-" + bound_file.split(".csv")[0] + ".xlsx"
        results_df = pd.DataFrame({'Ion':ion_list, 'm/z':mass_charge_list, 'Relative Significance': relative_significance})
        results_df.to_excel(filename_out, index=False)

ub_1.csv
c_1.csv
ub_1.csv
c_10.csv
ub_1.csv
c_11.csv
ub_1.csv
c_12.csv
ub_1.csv
c_13.csv
ub_1.csv
c_14.csv
ub_1.csv
c_15.csv
ub_1.csv
c_16.csv
ub_1.csv
c_17.csv
ub_1.csv
c_18.csv
ub_1.csv
c_19.csv
ub_1.csv
c_2.csv
ub_1.csv
c_3.csv
ub_1.csv
c_4.csv
ub_1.csv
c_5.csv
ub_1.csv
c_6.csv
ub_1.csv
c_7.csv
ub_1.csv
c_8.csv
ub_1.csv
c_9.csv
ub_1.csv
o_1.csv
ub_1.csv
o_2.csv
ub_1.csv
o_3.csv
ub_1.csv
o_4.csv
ub_1.csv
o_5.csv
ub_1.csv
o_6.csv
ub_1.csv
t_1.csv
ub_1.csv
t_2.csv
ub_1.csv
t_3.csv
ub_1.csv
t_4.csv
ub_1.csv
t_5.csv
ub_2.csv
c_1.csv
ub_2.csv
c_10.csv
ub_2.csv
c_11.csv
ub_2.csv
c_12.csv
ub_2.csv
c_13.csv
ub_2.csv
c_14.csv
ub_2.csv
c_15.csv
ub_2.csv
c_16.csv
ub_2.csv
c_17.csv
ub_2.csv
c_18.csv
ub_2.csv
c_19.csv
ub_2.csv
c_2.csv
ub_2.csv
c_3.csv
ub_2.csv
c_4.csv
ub_2.csv
c_5.csv
ub_2.csv
c_6.csv
ub_2.csv
c_7.csv
ub_2.csv
c_8.csv
ub_2.csv
c_9.csv
ub_2.csv
o_1.csv
ub_2.csv
o_2.csv
ub_2.csv
o_3.csv
ub_2.csv
o_4.csv
ub_2.csv
o_5.csv
ub_2.csv
o_6.csv
ub_2.csv
t_1.csv
ub_2.csv
t_2.csv
ub_2.csv
t_