In [None]:
import numpy
import matplotlib
import lxml
import pandas
import pyteomics
import csv
import math
import multiprocessing
from pyteomics import mass

In [None]:
# Show the kernel's current working directory (bare `pwd` relies on IPython automagic for %pwd).
pwd

In [None]:
# Sequence string (one character per residue) that is passed to pyteomics'
# mass calculation and sliced into candidate fragments by the search functions.
vclh_seq = 'MNHKVHMHHHHHHADEQEEKAKVRTELIQELAQGLGGIEKKNFPTLGDEDLDHTYMTKLLTYLQEREQAENSWRKRLLKGIQDHALDLVPRGSPGLPGPRGEQGPTGPTGPAGPRGLQGLQGLQGERGEQGPTGPAGPRGLQGERGEQGPTGLAGKAGEAGAKGETGPAGPQGPRGEQGPQGLPGKDGEAGAQGRPGKRGKQGQKGEKGEPGTQGAKGDRGETGPVGPRGERGEAGPAGKDGERGPVGPAGKDGQNGQDGLPGKDGKDGQNGKDGLPGKDGKDGQNGKDGLPGKDGKDGQDGKDGLPGKDGKDGLPGKDGKDGQPGKPGKY'

In [None]:
# Sanity check: display the Python type of the sequence object.
type(vclh_seq)

In [None]:
# CSV file that import_dataframe() reads with pandas.read_csv.
# NOTE(review): absolute, machine-specific Windows path - the notebook only runs
# where this file exists; consider a path relative to a configurable data directory.
file = 'C:\\Users\\ray07c\\Documents\\Parkville_data\\fragment_finder\\files\\VCLH_T-145-DSP-04_input.csv'

In [None]:
def mass_cal(peptide_seq):
    """Return the average mass of ``peptide_seq`` rounded to one decimal place.

    The mass comes from ``pyteomics.mass.calculate_mass`` called with
    ``average=True`` (average rather than monoisotopic mass).
    """
    average_mass = mass.calculate_mass(peptide_seq, average=True)
    return round(average_mass, 1)

In [None]:
# Display the rounded average mass of the full-length sequence.
mass_cal(vclh_seq)

In [None]:
def import_dataframe(file_location):
    """Read a CSV into a DataFrame, renaming the ``'m/z'`` column to ``'M(obs)'``.

    ``file_location`` is handed straight to ``pandas.read_csv``. Columns other
    than ``'m/z'`` keep their names; a file without an ``'m/z'`` column is
    returned unchanged.
    """
    return pandas.read_csv(file_location).rename(columns={'m/z': 'M(obs)'})

In [None]:
def import_obs_masses(dataframe):
    """Return the values of the ``'M(obs)'`` column of ``dataframe`` as a plain list."""
    observed_column = dataframe['M(obs)']
    return list(observed_column)

In [None]:
def mass_diff(prot_mass, obs_masses):
    """Return the smallest ``prot_mass - observed`` difference in whole hundreds.

    The smallest difference over ``obs_masses`` is floor-divided by 100 and
    returned as an ``int``. An empty ``obs_masses`` raises ``ValueError``
    (from ``min``).
    """
    smallest_gap = min(prot_mass - observed for observed in obs_masses)
    hundreds = smallest_gap // 100
    return int(hundreds)

In [None]:
%load_ext lprun

In [None]:
def fragments(prot_seq, obs_masses, tolerance):
    """Find sub-sequences of ``prot_seq`` whose average mass matches an observed mass.

    Every start position of ``prot_seq`` is paired with every observed mass.
    For one observed mass, only fragments whose residue count lies in
    ``[int(mass) // 107, int(mass) // 95)`` are tried; the candidate's average
    mass (pyteomics, rounded to 0.1) must be within ``tolerance`` (absolute)
    of the observed mass.

    Parameters
    ----------
    prot_seq : str
        Sequence to search.
    obs_masses : iterable of numbers
        Observed masses to match.
    tolerance : float
        Absolute tolerance passed to ``math.isclose``.

    Returns
    -------
    list
        Flat list alternating each matched sub-sequence with its calculated
        mass. A sub-sequence is recorded only once. The list is also printed.
    """
    found = []
    seq_len = len(prot_seq)
    for start in range(seq_len):
        for num in obs_masses:
            # The length window depends only on the observed mass. It is shifted
            # by `start` so that internal and C-terminal fragments are tested,
            # not just prefixes beginning at residue 0.
            # NOTE(review): 107 and 95 presumably bracket the mean residue mass - confirm.
            first_end = max(start + int(num) // 107, start + 1)  # never an empty slice
            last_end = min(start + int(num) // 95, seq_len + 1)  # never past the end
            for end in range(first_end, last_end):
                candidate = prot_seq[start:end]
                calc_mass = round(mass.calculate_mass(candidate, average=True), 1)
                if math.isclose(calc_mass, num, abs_tol=tolerance) and candidate not in found:
                    found.append(candidate)
                    found.append(calc_mass)
    print(found)
    return found

In [None]:
def fragments2(prot_seq, obs_masses, dataframe, tolerance):
    """Match sub-sequences of ``prot_seq`` to observed masses and print two tables.

    For each start position, candidate end indexes run over a window that
    starts at ``int(min(obs_masses) // 105)`` .. ``int(max(obs_masses) // 90)``
    and is shifted by one for every start position; end indexes beyond the
    sequence length are not examined. A candidate matches an observed mass
    when its average mass (pyteomics, rounded to 0.1) is within ``tolerance``
    (absolute) of it.

    Matches whose end index equals ``len(prot_seq)`` go to the first table,
    which is merged with the ``'I'`` column of ``dataframe`` on ``'M(obs)'``
    and has the intensity rescaled to a percentage of the largest matched
    intensity. All other matches go to the second table.

    Parameters
    ----------
    prot_seq : str
        Sequence to search.
    obs_masses : list of numbers
        Observed masses; must be non-empty (``min``/``max`` are taken).
    dataframe : pandas.DataFrame
        Must contain the columns ``'M(obs)'`` and ``'I'``.
    tolerance : float
        Absolute tolerance passed to ``math.isclose``.

    Returns
    -------
    None
        Both tables are printed with ``DataFrame.to_string(index=False)``.
    """
    single_cut = []
    double_cut = []
    seq_len = len(prot_seq)
    # NOTE(review): 105 and 90 presumably bound the mean residue mass - confirm.
    first_offset = int(min(obs_masses) // 105)
    last_offset = int(max(obs_masses) // 90)

    for start in range(seq_len):
        cut_label = prot_seq[start] + str(start + 1)
        # Upper bound seq_len + 1 keeps `end` from running past the sequence.
        for end in range(first_offset + start, min(last_offset + start, seq_len + 1)):
            # The calculated mass depends only on the slice, so compute it once
            # here rather than once per observed mass.
            calc_mass = round(mass.calculate_mass(prot_seq[start:end], average=True), 1)
            target = single_cut if end == seq_len else double_cut
            for num in obs_masses:
                if math.isclose(calc_mass, num, abs_tol=tolerance):
                    target.append([cut_label,
                                   str(end),
                                   num,
                                   calc_mass,
                                   round(num - calc_mass, 1)])

    df1 = pandas.DataFrame(
        single_cut,
        columns=['Cutsite (Nterm)', 'Cterm', 'M(obs)', 'M(calc)', 'deltaM'],
    ).sort_values('M(obs)')
    df2 = pandas.DataFrame(
        double_cut,
        columns=['Cutsite (Nterm)', 'Cutsite (Cterm)', 'M(obs)', 'M(calc)', 'deltaM'],
    ).sort_values('M(obs)')

    df_i = dataframe[['M(obs)', 'I']]
    # The right merge keeps every observed mass; dropna then removes the
    # observed masses that had no single-cut match.
    df1_i = pandas.merge(df1, df_i, on='M(obs)', how='right').dropna(how='any')

    intensities = list(df1_i['I'])
    # Take the maximum once; with no matches there is nothing to scale.
    top_intensity = max(intensities) if intensities else None
    percent_i = [round((value / top_intensity) * 100, 2) for value in intensities]
    df1_i = df1_i.assign(I=percent_i).rename(columns={'I': '% Intensity'})

    print(df1_i.to_string(index=False))
    print(df2.to_string(index=False))

In [None]:
# Load the CSV once, then derive the inputs used by the fragment searches below.
dataframe = import_dataframe(file)
# Rounded average mass of the full-length sequence.
whole_prot_mass = mass_cal(vclh_seq)
# Plain list of the values in the 'M(obs)' column.
observed_masses = import_obs_masses(dataframe)

In [None]:
%%time
# Time the serial search over all observed masses with an absolute tolerance of 0.5.
fragments2(vclh_seq, observed_masses, dataframe, 0.5)

In [None]:
def fragments_multi(prot_seq, obs_mass, tolerance):
    """Find sub-sequences of ``prot_seq`` whose average mass matches ``obs_mass``.

    Takes a single observed mass so it can be mapped over many masses (for
    example with ``multiprocessing.Pool.starmap``). For start position
    ``start`` the end indexes tried are
    ``range(int(obs_mass) // 107 + start, int(obs_mass) // 96 + start)``.
    End indexes beyond the sequence are clamped by slicing, so a C-terminal
    fragment may be examined more than once; the membership check keeps it
    from being recorded twice.

    Parameters
    ----------
    prot_seq : str
        Sequence to search.
    obs_mass : number
        Observed mass to match.
    tolerance : float
        Absolute tolerance passed to ``math.isclose``.

    Returns
    -------
    list
        Flat list alternating each matched sub-sequence with its calculated
        mass (rounded to 0.1); empty when nothing matches. A non-empty list is
        also printed. Returning it lets a caller collect the matches instead
        of receiving ``None``.
    """
    found = []
    shortest = int(obs_mass) // 107
    longest = int(obs_mass) // 96
    for start in range(len(prot_seq)):
        for end in range(shortest + start, longest + start):
            candidate = prot_seq[start:end]
            # Compute the mass once and reuse it for both the test and the record.
            calc_mass = round(mass.calculate_mass(candidate, average=True), 1)
            if math.isclose(calc_mass, obs_mass, abs_tol=tolerance) and candidate not in found:
                found.append(candidate)
                found.append(calc_mass)
    if found:
        print(found)
    return found

In [None]:
# One (sequence, observed mass, tolerance) argument tuple per observed mass.
multi = [(vclh_seq, obs_mass, 0.5) for obs_mass in observed_masses]

In [None]:
# Display the argument tuples (each one repeats the full sequence, so the output is long).
multi

In [None]:
%%time
# Run fragments_multi once per argument tuple across two worker processes.
# NOTE(review): with the spawn start method (the default on Windows) workers must
# be able to import the target function; a function defined only in the notebook
# may not be importable - confirm this cell completes on the target platform.
if __name__ == '__main__':
    with multiprocessing.Pool(processes=2) as pool:
        # starmap unpacks each tuple into positional arguments and keeps input order.
        results = pool.starmap(fragments_multi, multi)
    # One entry per tuple: whatever fragments_multi returned for it.
    print(results)

In [None]:
def fragments_speedup(prot_seq, obs_mass, mass_diffs, tolerance):
    """Find sub-sequences of ``prot_seq`` whose average mass matches ``obs_mass``.

    For start position ``start`` the end indexes tried are
    ``range(mass_diffs + start, len(prot_seq) + start)``. End indexes beyond
    the sequence are clamped by slicing, so a C-terminal fragment may be
    examined more than once; the membership check keeps it from being
    recorded twice.

    Parameters
    ----------
    prot_seq : str
        Sequence to search.
    obs_mass : number
        Observed mass to match.
    mass_diffs : int
        Offset of the first end index relative to each start position.
    tolerance : float
        Absolute tolerance passed to ``math.isclose``.

    Returns
    -------
    list
        Flat list alternating each matched sub-sequence with its calculated
        mass (rounded to 0.1). The list is also printed. Returning it lets a
        caller such as ``Pool.starmap`` collect the matches instead of ``None``.
    """
    found = []
    seq_len = len(prot_seq)
    for start in range(seq_len):
        for end in range(mass_diffs + start, seq_len + start):
            candidate = prot_seq[start:end]
            # Compute the mass once and reuse it for both the test and the record.
            calc_mass = round(mass.calculate_mass(candidate, average=True), 1)
            if math.isclose(calc_mass, obs_mass, abs_tol=tolerance) and candidate not in found:
                found.append(candidate)
                found.append(calc_mass)
    print(found)
    return found
    
pool = multiprocessing.Pool(processes=2)
whole_prot_mass = mass_cal(vclh_seq)
observed_masses = import_obs_masses(file)
mass_differences = mass_diff(whole_prot_mass, observed_masses)

input_list = []

part_list = []
for mass in observed_masses[0:2]:
    part_list = (vclh_seq, mass, mass_differences, 0.5)
    input_list.append(part_list)
#print(input_list)

%%time
pool.starmap(fragments_speedup, input_list)
pool.close()