Generates a table of hydrophobicity-window statistics (min, max, median, total), molecular mass, isoelectric point, and length for a list of peptides

Requirements:
* `matplotlib`
* `numpy`
* `scipy` 
* `scikit-learn` (imported as `sklearn`)
* `pandas`
* `Biopython`

In [1]:
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

# 1. Load peptides

In [3]:
# One-letter codes accepted as standard amino acids.
standard_amino_acids = list('AGILPVFWYDERHKSTCMNQ')

import pandas as pd
path_to_spreadsheet = 'barcode-lib-all-list.xlsx'
# Peptides are taken from the first column of the first sheet, which is read
# without a header row.  `sheet_name` is the current spelling of the keyword
# formerly called `sheetname`; recent pandas releases reject the old name.
ref_set = list(pd.read_excel(path_to_spreadsheet, sheet_name=0, header=None)[0])
print('len(ref_set):', len(ref_set))
# remove redundancies
nonredundant_set = sorted(set(ref_set))
print('len(nonredundant_set):', len(nonredundant_set))
# remove any sequences containing nonstandard characters (e.g. wildcards)
print('some peptides contained the following nonstandard characters:',
      sorted(set(''.join(nonredundant_set)).difference(standard_amino_acids)))


def is_valid(seq):
    """Return True if `seq` is non-empty and uses only standard amino acids.

    An empty sequence is reported as invalid (so it is filtered out below)
    instead of raising ZeroDivisionError.
    """
    return bool(seq) and all(aa in standard_amino_acids for aa in seq)


# List every rejected sequence, then keep only the valid ones.
for seq in nonredundant_set:
    if not is_valid(seq):
        print('\t{}'.format(seq))
initial_set = [seq for seq in nonredundant_set if is_valid(seq)]
print('len(initial_set): ', len(initial_set))

len(ref_set): 8002
len(nonredundant_set): 5399
some peptides contained the following nonstandard characters: []
len(initial_set):  5399


# 2. Compute hydrophobicity

In [3]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis, ProtParamData
ProtParamData?

In [5]:
# Kyte-Doolittle hydropathy score for each standard amino acid, keyed by
# one-letter code.  Passed to `protein_scale` as the per-residue scale.
kyte_doolittle_scale = {
    'A': 1.8,
    'R': -4.5,
    'N': -3.5,
    'D': -3.5,
    'C': 2.5,
    'Q': -3.5,
    'E': -3.5,
    'G': -0.4,
    'H': -3.2,
    'I': 4.5,
    'L': 3.8,
    'K': -3.9,
    'M': 1.9,
    'F': 2.8,
    'P': -1.6,
    'S': -0.8,
    'T': -0.7,
    'W': -0.9,
    'Y': -1.3,
    'V': 4.2,
}

In [6]:
# Pick one peptide from the filtered set as a worked example for the cells
# that follow, and show its length.
peptide = initial_set[1]
len(peptide)

10

In [7]:
# Show the sequence of the example peptide.
print(initial_set[1])

AAACHHHEWE


In [8]:
# Wrap the example peptide in a Biopython ProteinAnalysis object; the
# compute_* helpers below take this object as their input.
protein_analysis = ProteinAnalysis(peptide)

In [9]:
# Kyte-Doolittle scores for the example peptide with a 5-residue window:
# one value per window position along the sequence.
protein_analysis.protein_scale(kyte_doolittle_scale, window=5)

[0.9399999999999998,
 -0.06000000000000005,
 -1.06,
 -2.12,
 -2.8,
 -2.8600000000000003]

In [10]:
def compute_minhyd(protein_analysis, window=5):
    """Return the lowest windowed Kyte-Doolittle score of a peptide.

    `protein_analysis` must expose `.sequence` and `.protein_scale(...)`
    (a Biopython ProteinAnalysis in this notebook).  `window` is clipped to
    the sequence length so short peptides still yield a score.  The result
    is a 0-d numpy array.
    """
    span = min(window, len(protein_analysis.sequence))
    scores = protein_analysis.protein_scale(kyte_doolittle_scale, window=span)
    lowest = min(scores)
    return np.array(lowest)

In [11]:
def compute_maxhyd(protein_analysis, window=5):
    """Return the highest windowed Kyte-Doolittle score of a peptide.

    `protein_analysis` must expose `.sequence` and `.protein_scale(...)`
    (a Biopython ProteinAnalysis in this notebook).  `window` is clipped to
    the sequence length so short peptides still yield a score.  The result
    is a 0-d numpy array.
    """
    span = min(window, len(protein_analysis.sequence))
    scores = protein_analysis.protein_scale(kyte_doolittle_scale, window=span)
    highest = max(scores)
    return np.array(highest)

In [12]:
def compute_medianhyd(protein_analysis, window=5):
    """Return the median windowed Kyte-Doolittle score of a peptide.

    `protein_analysis` must expose `.sequence` and `.protein_scale(...)`
    (a Biopython ProteinAnalysis in this notebook).  `window` is clipped to
    the sequence length so short peptides still yield a score.  The result
    is a 0-d numpy array.
    """
    span = min(window, len(protein_analysis.sequence))
    scores = protein_analysis.protein_scale(kyte_doolittle_scale, window=span)
    middle = np.median(scores)
    return np.array(middle)

In [13]:
def compute_totalhyd(protein_analysis, window=20):
    """Return the highest Kyte-Doolittle score over windows of up to 20 residues.

    `window` is clipped to the sequence length, so with the default any
    peptide of 20 or fewer residues is scored with a window as long as the
    whole sequence.  For longer peptides the result is the maximum over the
    window scores, not a sum.  The result is a 0-d numpy array.

    NOTE(review): for the 10-residue example AAACHHHEWE the recorded result
    is -1.1636..., whereas the plain mean of its ten scale values is -0.96.
    `protein_scale` therefore does not appear to be a simple average for
    this even window length in the Biopython version used -- confirm before
    interpreting this column as a whole-peptide mean.
    """
    span = min(window, len(protein_analysis.sequence))
    scores = protein_analysis.protein_scale(kyte_doolittle_scale, window=span)
    highest = max(scores)
    return np.array(highest)

In [14]:
def compute_mass(protein_analysis):
    """Return the peptide's molecular weight (in Da) from `molecular_weight()`."""
    weight = protein_analysis.molecular_weight()
    return weight

def compute_iso(protein_analysis):
    """Return the peptide's isoelectric point from `isoelectric_point()`."""
    point = protein_analysis.isoelectric_point()
    return point

In [15]:
# Sanity check: minimum 5-residue window score for the example peptide.
compute_minhyd(protein_analysis)

array(-2.86)

In [16]:
# Sanity check: maximum 5-residue window score for the example peptide.
compute_maxhyd(protein_analysis)

array(0.94)

In [17]:
# Sanity check: the example peptide is shorter than the default window of 20,
# so the window is clipped to the peptide's own length.
compute_totalhyd(protein_analysis)

array(-1.16363636)

In [18]:
# Sanity check: isoelectric point of the example peptide.
compute_iso(protein_analysis)

5.77911376953125

In [19]:
def describe_peptide(x):
    """Build one comma-separated row of computed properties for peptide `x`.

    Fields, in order: the sequence itself, minhyd, maxhyd, medianhyd,
    totalhyd, mass, iso, and the sequence length.  Each computed value is
    converted with `str()`; the row has no trailing newline.
    """
    analysis = ProteinAnalysis(x)
    # Evaluated in column order so the row lines up with the CSV header.
    computed = [
        compute_minhyd(analysis),
        compute_maxhyd(analysis),
        compute_medianhyd(analysis),
        compute_totalhyd(analysis),
        compute_mass(analysis),
        compute_iso(analysis),
        len(x),
    ]
    return ','.join([x] + [str(value) for value in computed])

# One CSV row per peptide, in sorted sequence order.
lines = [describe_peptide(sequence) for sequence in sorted(initial_set)]

In [20]:
# Write the header row, then every pre-formatted peptide row, one per line.
with open('unique-peptides-properties-table.csv', 'w') as f:
    f.write('peptide,minhyd,maxhyd,medianhyd,totalhyd,mass,iso,length\n')
    f.writelines(line + '\n' for line in lines)