# Fragment Finder
## A Python program that matches protein fragments from a known amino acid sequence to the observed protein masses detected by a mass spectrometer

In [None]:
import numpy
import matplotlib
import lxml
import pandas
import pyteomics
import csv
import math
import multiprocessing
import itertools as it
import xlsxwriter
from pyteomics import mass
from ipywidgets import widgets, interact, interactive
# Import the display function for explicitly displaying widgets in the notebook
from IPython.display import display, HTML
import ipyupload
#To open files
from ipyupload import FileUpload
from frag_finder import import_dataframe_jupyter, import_obs_masses, fragments_multi
#Report lab PDF creator libraries
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Flowable, PageBreak, Table
from reportlab.platypus.tables import TableStyle
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_LEFT, TA_CENTER
import datetime

In [None]:
class MCLine(Flowable):
    """
    Flowable that renders a single straight line on the canvas.

    The line runs from x = 0 to x = ``width`` at y = ``height``.
    http://two.pairlist.net/pipermail/reportlab-users/2005-February/003695.html
    """

    def __init__(self, width, height=0):
        """Store the line's length (``width``) and its y offset (``height``)."""
        super().__init__()
        self.width = width
        self.height = height

    def __repr__(self):
        """Return a short description containing the line width."""
        return f"Line(w={self.width!s})"

    def draw(self):
        """Draw the line on the canvas this flowable is attached to."""
        y_offset = self.height
        self.canv.line(0, y_offset, self.width, y_offset)

In [None]:
# Column labels applied to the rows returned by fragments_multi.
_MATCH_COLUMNS = ['# Cuts', 'Nterm AA', 'Nterm Num', 'Cterm AA', 'Cterm Num',
                  'M(obs)', 'M(calc)', 'deltaM']


def _build_match_tables(results, dataframe):
    """Turn the pooled fragment matches into single-cut and double-cut tables.

    Args:
        results: list of per-mass result lists as returned by
            ``pool.starmap(fragments_multi, ...)``; each inner item is one row
            with the fields named in ``_MATCH_COLUMNS``.
        dataframe: the imported observed-mass table; its ``'M(obs)'`` and
            ``'I'`` columns are merged onto the matches.

    Returns:
        ``(single_cuts, double_cuts)``: rows whose ``'# Cuts'`` equals
        ``'Single'`` and all remaining rows. Both share a 1-based index and a
        matching ``'Peak'`` column.
    """
    rows = [row for chunk in results for row in chunk]
    matches = pandas.DataFrame(rows, columns=_MATCH_COLUMNS)

    # Attach the intensity of every observed mass, then drop masses that had
    # no matching fragment (their match columns are NaN after the right merge).
    table = pandas.merge(matches, dataframe[['M(obs)', 'I']], on='M(obs)', how='right')
    table.dropna(how='any', inplace=True)

    # Express intensity as a percentage of the strongest matched peak. The
    # maximum is computed once instead of once per row.
    strongest = table['I'].max()
    table['I'] = [round((value / strongest) * 100, 2) for value in table['I']]
    table = table.rename(columns={'I': '% Intensity'})
    table = table.sort_values(['# Cuts', 'M(obs)'], ascending=[False, True])

    # Residue numbers come back as floats after the merge; show them as ints.
    table['Nterm Num'] = table['Nterm Num'].astype(dtype='int64')
    table['Cterm Num'] = table['Cterm Num'].astype(dtype='int64')

    # Number the rows 1..n both as a visible 'Peak' column and as the index,
    # so the index can be used as the label printed into the sequence.
    table = table.reset_index(drop=True)
    table.insert(0, 'Peak', range(1, len(table) + 1))
    table.index += 1

    # Series.round returns a new Series; the result must be assigned back.
    table['M(obs)'] = table['M(obs)'].round(2)

    is_single = table['# Cuts'] == 'Single'
    return table[is_single], table[~is_single]


def _tag_cut_sites(residues, positions, labels, opening, closing):
    """Append ``opening + label + closing`` to the residue at ``position - 2``.

    ``residues`` is modified in place; ``positions`` and ``labels`` are
    walked in lockstep.
    """
    # NOTE(review): a position of 1 gives index -1, which marks the LAST
    # residue - confirm whether fragments_multi can report position 1.
    for position, label in zip(positions, labels):
        residues[position - 2] += opening + str(label) + closing


def _append_report_header(story, styles):
    """Append the page header: rule, title, rule, report date, optional image."""
    story.append(MCLine(450))
    story.append(Paragraph('<font size=20>%s</font>' % pdf_title.value, styles["Center"]))
    story.append(Spacer(2, 12))
    story.append(MCLine(450))
    story.append(Spacer(2, 12))

    now = datetime.datetime.now()
    date_created = 'Report Date: ' + str(now.day) + '/' + str(now.month) + '/' + str(now.year)
    story.append(Paragraph('<font size=15>%s</font>' % date_created, styles["Center"]))
    story.append(Spacer(1, 12))

    # Spectrum image, only when the user ticked the image checkbox.
    if image_output.value:
        im = Image(add_image.value.strip('"'))
        im.drawHeight = 189
        im.drawWidth = 500
        im.hAlign = 'CENTER'
        story.append(im)

    story.append(Spacer(1, 12))


def _append_cut_section(story, styles, heading, table, annotated_sequence):
    """Append one results section: heading, results table, annotated sequence."""
    story.append(Paragraph('<font size=15>%s</font>' % heading, styles["Normal"]))
    story.append(Spacer(1, 12))

    # reportlab tables take a list of rows, so prepend the column names.
    rows = [table.columns.values.tolist()] + table.values.tolist()
    story.append(Table(rows, style=[('BACKGROUND', (0, 0), (-1, -1), colors.lightgrey),
                                    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
                                    ('LINEABOVE', (0, 0), (9, 1), 1, colors.black)]))
    story.append(Spacer(1, 12))
    story.append(Paragraph(annotated_sequence, styles["Courier"]))


def _write_pdf_report(sections):
    """Build the PDF report, one page per ``(heading, table, sequence)`` section.

    The file is written to ``<save_loc>/<pdf_title>.pdf``; surrounding double
    quotes are stripped from the pasted save location.
    """
    import os

    # os.sep equals the previously hard-coded backslash on Windows and keeps
    # the path valid on other platforms.
    target = save_loc.value.strip('"') + os.sep + pdf_title.value + '.pdf'
    doc = SimpleDocTemplate(target, pagesize=A4)

    styles = getSampleStyleSheet()
    # The attribute is spelled ``fontName``; the lowercase ``fontname`` keyword
    # was stored but never used, so the sequence was not rendered in Courier.
    styles.add(ParagraphStyle(name='Center', alignment=TA_CENTER, fontName="Helvetica"))
    styles.add(ParagraphStyle(name='Courier', alignment=TA_LEFT, backColor='lightgrey', fontName="Courier"))

    story = []
    for page_number, (heading, table, annotated_sequence) in enumerate(sections):
        if page_number:
            story.append(PageBreak())
        _append_report_header(story, styles)
        _append_cut_section(story, styles, heading, table, annotated_sequence)
    doc.build(story)


def _show_cuts(table, annotated_sequence, empty_message):
    """Display a results table with its annotated sequence, or a fallback message."""
    if len(table) != 0:
        display(HTML(table.to_html(index=False)))
        print(annotated_sequence)
    else:
        print(empty_message)


def on_button_clicked(b):
    """Run the fragment search and present the results (button callback).

    Reads its inputs from the notebook's widgets, runs ``fragments_multi`` for
    every observed mass in a multiprocessing pool, optionally writes a PDF
    report, and shows the selected result tables in the ``out`` widget.

    Args:
        b: the clicked button, supplied by ipywidgets; not used.
    """
    sequence = seq.value
    dataframe = import_dataframe_jupyter(file_loc.value)
    observed_masses = import_obs_masses(dataframe)
    # One argument tuple per observed mass for pool.starmap.
    job_args = [(sequence, observed, mass_cal.value, dataframe, mass_tol.value)
                for observed in observed_masses]

    # Worker processes are only started from the main module. Previously a
    # non-main caller fell through to a NameError on the result tables.
    if __name__ != '__main__':
        return

    with multiprocessing.Pool(processes=cores.value) as pool:
        results = pool.starmap(fragments_multi, job_args)

    df2_i, df3_i = _build_match_tables(results, dataframe)

    # The sequence is kept as a list of residues so that peak labels can be
    # appended at the cut locations; one copy per output flavour (ANSI colours
    # for the notebook, reportlab markup for the PDF).
    single_print, single_save = list(sequence), list(sequence)
    double_print, double_save = list(sequence), list(sequence)

    _tag_cut_sites(single_print, df2_i['Nterm Num'], df2_i.index, '\x1b[0;33;40m', '\x1b[0m')
    _tag_cut_sites(single_save, df2_i['Nterm Num'], df2_i.index, '<a href="#MYANCHOR" color="red">', '</a>')

    _tag_cut_sites(double_print, df3_i['Nterm Num'], df3_i.index, '\x1b[6;36;40m', '\x1b[0m')
    _tag_cut_sites(double_save, df3_i['Nterm Num'], df3_i.index, '<a href="#MYANCHOR" color="red">', '</a>')
    _tag_cut_sites(double_print, df3_i['Cterm Num'], df3_i.index, '\x1b[0;35;40m', '\x1b[0m')
    _tag_cut_sites(double_save, df3_i['Cterm Num'], df3_i.index, '<a href="#MYANCHOR" color="blue">', '</a>')

    # "Nothing matched" means BOTH tables are empty. The old test
    # ``len(a) and len(b) == 0`` only checked the second table.
    if len(df2_i) == 0 and len(df3_i) == 0:
        print('Observed Masses Could Not Be Matched To Amino Acid Sequence')
    elif save_output.value:
        # A section is reported only if the user selected it AND it has rows.
        sections = []
        if single_cut.value and len(df2_i) != 0:
            sections.append(('Single Cuts', df2_i, ''.join(single_save)))
        if double_cut.value and len(df3_i) != 0:
            sections.append(('Double Cuts', df3_i, ''.join(double_save)))
        if sections:
            _write_pdf_report(sections)

    out.clear_output(wait=True)
    with out:
        if single_cut.value:
            _show_cuts(df2_i, ''.join(single_print), 'No Single Cuts Found')
        if double_cut.value:
            _show_cuts(df3_i, ''.join(double_print), 'No Double Cuts Found')

In [None]:
# Shared style so long widget descriptions are not truncated.
style = {'description_width': 'initial'}


def _title_layout():
    """Return a fresh layout for the bold caption rows."""
    return widgets.Layout(width='71%')


def _row_layout(width='71%'):
    """Return a fresh 25px-high, left-aligned layout of the given width."""
    return widgets.Layout(width=width, height='25px', justify_content='flex-start')


def _caption(markup):
    """Return an HTML caption widget showing ``markup``."""
    return widgets.HTML(value=markup,
                        style=style,
                        layout=_title_layout())


def _text_entry(initial, width):
    """Return a single-line text box pre-filled with ``initial``."""
    return widgets.Text(value=initial,
                        layout=_row_layout(width),
                        style=style)


def _tick_box(label):
    """Return an unticked checkbox captioned ``label``."""
    return widgets.Checkbox(value=False,
                            description=label,
                            disabled=False,
                            button_style='danger')


# --- Amino acid sequence -----------------------------------------------------
seq_title = _caption("<b>Paste Your Amino Acid Sequence Here:</b>")
seq = widgets.Textarea(value='Amino Acid Sequence',
                       style=style,
                       layout=_title_layout())

# --- Observed masses file ----------------------------------------------------
file_loc_title = _caption("<b>Right Click and Copy the Full Path for your Observed Masses .xls or .csv file. Paste Here:</b>")
file_loc = _text_entry('Observed Masses .xls or .csv File Path', '71%')

# --- Mass calculation settings -----------------------------------------------
mass_cal = widgets.ToggleButtons(options=['Monoisotopic', 'Average'],
                                 value='Average',
                                 description='Mass Calculation Type:',
                                 layout=_row_layout(),
                                 style=style,
                                 button_style='info')
mass_tol = widgets.FloatSlider(value=0.5,
                               min=0,
                               max=2,
                               step=0.1,
                               style=style,
                               layout=_row_layout(),
                               description='Mass Tolerance (kDa)')

# --- Number of worker processes ----------------------------------------------
cores_title = _caption("<b>Select the Number of Processing Cores your Computer has to Speed up Processing</b>")
cores = widgets.Dropdown(options=[2, 4, 8, 12, 24],
                         value=2,
                         description='Number of Computer Cores:',
                         style=style,
                         layout=_row_layout(),
                         readout=True)

# --- Which cut types to report -----------------------------------------------
cut_title = _caption("<b>Select if you would like to find Single Cuts, Double Cuts or Select Both:</b>")
single_cut = _tick_box('Single Cut?')
double_cut = _tick_box('Double Cut?')
cut_layout = widgets.HBox([single_cut, double_cut])

# --- PDF report location -----------------------------------------------------
save_title = _caption("<b>Right Click and Copy the Full Path Where you Want to Save your PDF Report to:</b>")
save_loc = _text_entry('PDF Report Save Location', '44%')
save_output = _tick_box('Save PDF Report?')
save_layout = widgets.HBox([save_loc, save_output])

# --- Optional spectrum image -------------------------------------------------
image_title = _caption("<b>Right Click and Copy the Full Path for your Spectrum Image .PNG file. Paste Here:</b>")
add_image = _text_entry('Mass Spectrum Peaks Image', '44%')
image_output = _tick_box('Add image to PDF Report?')
image_layout = widgets.HBox([add_image, image_output])

# --- Report title / file name ------------------------------------------------
report_title = _caption("<b>Type the Title to Appear on your PDF Report. Will also be the File Name:</b>")
pdf_title = _text_entry('PDF Report Title and Filename', '44%')

# Disabled experiment: upload widget instead of pasting a file path.
#file_upload = FileUpload(
    # https://developer.mozilla.org/en-US/docs/Web/HTML/Element/input#attr-accept
    # eg. '.txt', '.pdf', 'image/*', 'image/*,.pdf'
#    accept='', # default
    # True to accept multiple files upload else False
#    multiple=False, # default
    # True to disable the button else False to enable it
#    disabled=False, # default
    # CSS transparently passed to button (a button element overlays the input[type=file] element for better styling)
    # e.g. 'color: darkblue; background-color: lightsalmon; width: 180px;'
#    style_button='color:darkblue; background-color:lightsalmon')

# --- Run button and output area ----------------------------------------------
compute = widgets.Button(value=False,
                         description='Find Fragments',
                         layout=widgets.Layout(width='71%', height='45px', justify_content='center'),
                         disabled=False,
                         style={'font_weight': 'bold', 'font-size': '30px'},
                         button_style='info',  # 'success', 'info', 'warning', 'danger' or ''
                         tooltip='Description',
                         icon='check')

out = widgets.Output()

# Clicking the button runs the fragment search.
compute.on_click(on_button_clicked)

# Render the form top to bottom, with the output area last.
_form_rows = (
    seq_title, seq,
    file_loc_title, file_loc,
    mass_cal, mass_tol,
    cores_title, cores,
    cut_title, cut_layout,
    save_title, save_layout,
    image_title, image_layout,
    report_title, pdf_title,
    compute,
    out,
)
display(*_form_rows)

In [None]:
from IPython.display import HTML

# Markup for a "toggle raw code" control: the script hides every
# 'div.input' element once the document is ready, and the form button
# calls code_toggle() to show or hide them again.
_CODE_TOGGLE_MARKUP = '''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>'''

# Left as the cell's final expression so the notebook renders it.
HTML(_CODE_TOGGLE_MARKUP)