# Protein analysis and visualization

Analyzed protein:
* 1FAT - Phytohemagglutinin-L (PHA-L)

In [13]:
from Bio.PDB import *
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils.ProtParam import ProtParamData
import nglview as nv
import ipywidgets

import warnings 
warnings.filterwarnings('ignore')

## Visualizing a CIF file with the MMCIF parser

Creating an MMCIFParser and using nglview to create an interactive visualization

In [14]:
# Parser for mmCIF (.cif) structure files.
parser = MMCIFParser()

In [15]:
# Parse the local mmCIF file into a structure object labelled "1FAT".
structure = parser.get_structure("1FAT", "prot/1fat.cif")

In [16]:
# Build an nglview widget from the Biopython structure; leaving `view` as the
# last expression of the cell renders the widget.
view = nv.show_biopython(structure)
view

NGLWidget()

Accessing the protein information stored in the mmCIF file as a dictionary

In [17]:
# Load the same mmCIF file as a dictionary via MMCIF2Dict.
mmcif_dict = MMCIF2Dict.MMCIF2Dict("prot/1fat.cif")

Counting the items in the created dictionary

In [18]:
# Number of entries in the dictionary built from the mmCIF file.
len(mmcif_dict)

689

Accessing the residue sequence

In [19]:
# Walk every residue of the structure (across all of its models) and print
# its representation, one residue per line.
for residue in structure.get_residues():
    print(residue)

<Residue SER het=  resseq=1 icode= >
<Residue ASN het=  resseq=2 icode= >
<Residue ASP het=  resseq=3 icode= >
<Residue ILE het=  resseq=4 icode= >
<Residue TYR het=  resseq=5 icode= >
<Residue PHE het=  resseq=6 icode= >
<Residue ASN het=  resseq=7 icode= >
<Residue PHE het=  resseq=8 icode= >
<Residue GLN het=  resseq=9 icode= >
<Residue ARG het=  resseq=10 icode= >
<Residue PHE het=  resseq=11 icode= >
<Residue ASN het=  resseq=12 icode= >
<Residue GLU het=  resseq=13 icode= >
<Residue THR het=  resseq=14 icode= >
<Residue ASN het=  resseq=15 icode= >
<Residue LEU het=  resseq=16 icode= >
<Residue ILE het=  resseq=17 icode= >
<Residue LEU het=  resseq=18 icode= >
<Residue GLN het=  resseq=19 icode= >
<Residue ARG het=  resseq=20 icode= >
<Residue ASP het=  resseq=21 icode= >
<Residue ALA het=  resseq=22 icode= >
<Residue SER het=  resseq=23 icode= >
<Residue VAL het=  resseq=24 icode= >
<Residue SER het=  resseq=25 icode= >
<Residue SER het=  resseq=26 icode= >
<Residue SER het=  re

Building the individual polypeptides from the entire protein structure

In [20]:
# Build the polypeptides of the structure and print each sequence together
# with a 1-based running number and its length.
ppb = CaPPBuilder()
# enumerate(start=1) supplies the running number, so no counter has to be
# initialised and incremented by hand.
for counter, pp in enumerate(ppb.build_peptides(structure), start=1):
    # `seq` stays bound to the last peptide's sequence once the loop ends.
    seq = pp.get_sequence()
    print(f'Sequence: {counter} Length: {len(seq)}')
    print(seq)

Sequence: 1 Length: 36
SNDIYFNFQRFNETNLILQRDASVSSSGQLRLTNLN
Sequence: 2 Length: 196
NGEPRVGSLGRAFYSAPIQIWDNTTGTVASFATSFTFNIQVPNNAGPADGLAFALVPVGSQPKDKGGFLGLFDGSNSNFHTVAVEFDTLYNKDWDPTERHIGIDVNSIRSIKTTRWDFVNGENAEVLITYDSSTNLLVASLVYPSQKTSFIVSDTVDLKSVLPEWVSVGFSATTGINKGNVETNDVLSWSFASKLS
Sequence: 3 Length: 233
SNDIYFNFQRFNETNLILQRDASVSSSGQLRLTNLNGNGEPRVGSLGRAFYSAPIQIWDNTTGTVASFATSFTFNIQVPNNAGPADGLAFALVPVGSQPKDKGGFLGLFDGSNSNFHTVAVEFDTLYNKDWDPTERHIGIDVNSIRSIKTTRWDFVNGENAEVLITYDSSTNLLVASLVYPSQKTSFIVSDTVDLKSVLPEWVSVGFSATTGINKGNVETNDVLSWSFASKLS
Sequence: 4 Length: 36
SNDIYFNFQRFNETNLILQRDASVSSSGQLRLTNLN
Sequence: 5 Length: 196
NGEPRVGSLGRAFYSAPIQIWDNTTGTVASFATSFTFNIQVPNNAGPADGLAFALVPVGSQPKDKGGFLGLFDGSNSNFHTVAVEFDTLYNKDWDPTERHIGIDVNSIRSIKTTRWDFVNGENAEVLITYDSSTNLLVASLVYPSQKTSFIVSDTVDLKSVLPEWVSVGFSATTGINKGNVETNDVLSWSFASKLS
Sequence: 6 Length: 35
SNDIYFNFQRFNETNLILQRDASVSSSGQLRLTNL
Sequence: 7 Length: 196
NGEPRVGSLGRAFYSAPIQIWDNTTGTVASFATSFTFNIQVPNNAGPADGLAFALVPVGSQPKDKGGFLGLFDGSNSNFHTVAVEFDTLYNKDWDPT

In [21]:
# Analyse the last polypeptide built from the structure. It is selected
# explicitly here rather than through a loop variable left over from another
# cell, so this cell does not depend on leftover loop state.
last_peptide = ppb.build_peptides(structure)[-1]
analysed_seq = ProteinAnalysis(str(last_peptide.get_sequence()))

Calculating molecular weight

In [22]:
# Molecular weight computed by ProteinAnalysis for the analysed sequence.
analysed_seq.molecular_weight()

21164.201100000006

Calculating protein GRAVY (higher value is more hydrophobic)

In [23]:
# GRAVY value of the analysed sequence.
analysed_seq.gravy()

-0.0959183673469388

Counting the number of each amino acid

In [24]:
# Dictionary mapping each amino-acid letter to its number of occurrences.
analysed_seq.count_amino_acids()

{'A': 13,
 'C': 0,
 'D': 13,
 'E': 7,
 'F': 13,
 'G': 16,
 'H': 2,
 'I': 10,
 'K': 8,
 'L': 14,
 'M': 0,
 'N': 15,
 'P': 9,
 'Q': 4,
 'R': 5,
 'S': 22,
 'T': 17,
 'V': 19,
 'W': 5,
 'Y': 4}

Calculating the share of each amino acid in the sequence (expressed as a fraction between 0 and 1)

In [25]:
# Share of each amino acid in the sequence; the output below shows the values
# as fractions (0-1), not as 0-100 percentages.
analysed_seq.get_amino_acids_percent()

{'A': 0.0663265306122449,
 'C': 0.0,
 'D': 0.0663265306122449,
 'E': 0.03571428571428571,
 'F': 0.0663265306122449,
 'G': 0.08163265306122448,
 'H': 0.01020408163265306,
 'I': 0.05102040816326531,
 'K': 0.04081632653061224,
 'L': 0.07142857142857142,
 'M': 0.0,
 'N': 0.07653061224489796,
 'P': 0.04591836734693878,
 'Q': 0.02040816326530612,
 'R': 0.025510204081632654,
 'S': 0.11224489795918367,
 'T': 0.08673469387755102,
 'V': 0.09693877551020408,
 'W': 0.025510204081632654,
 'Y': 0.02040816326530612}

Calculating the secondary structure fractions: the method returns the fractions of amino acids that tend to be in helices, turns and sheets (in that order)

In [26]:
analysed_seq.secondary_structure_fraction() # returns a 3-tuple of fractions ordered (helix, turn, sheet)

(0.33163265306122447, 0.3163265306122449, 0.17346938775510204)

Analyzing kd index (Kyte & Doolittle index of hydrophobicity)

In [27]:
# Kyte & Doolittle scale profile of the sequence, computed with a window of 7 residues.
analysed_seq.protein_scale(param_dict=ProtParamData.kd, window=7)

[-1.3857142857142857,
 -1.0,
 -0.4000000000000001,
 0.04285714285714288,
 -0.3714285714285714,
 0.5285714285714285,
 0.32857142857142857,
 0.20000000000000004,
 0.19999999999999996,
 -0.08571428571428573,
 -0.2571428571428571,
 1.0285714285714287,
 0.2714285714285714,
 0.5142857142857143,
 0.5714285714285714,
 0.1857142857142857,
 -0.5714285714285714,
 -0.4428571428571429,
 -1.1857142857142857,
 -0.742857142857143,
 -1.4857142857142855,
 -0.7571428571428571,
 1.586032892321652e-17,
 0.38571428571428573,
 0.8857142857142856,
 1.242857142857143,
 1.1999999999999997,
 1.1857142857142857,
 0.9857142857142857,
 0.6285714285714284,
 1.1428571428571428,
 0.2428571428571428,
 0.6285714285714284,
 0.2285714285714285,
 0.9428571428571428,
 0.31428571428571433,
 -0.08571428571428578,
 -0.9857142857142857,
 -0.2285714285714286,
 -0.9285714285714286,
 -0.6571428571428571,
 -0.9999999999999999,
 -1.2714285714285716,
 -0.8285714285714284,
 0.21428571428571433,
 0.21428571428571427,
 0.671428571428571

The scale above represents hydrophobicity values: positive values are hydrophobic and negative values are hydrophilic.

In [28]:
# The Kyte & Doolittle scale itself: one value per amino-acid letter.
ProtParamData.kd

{'A': 1.8,
 'R': -4.5,
 'N': -3.5,
 'D': -3.5,
 'C': 2.5,
 'Q': -3.5,
 'E': -3.5,
 'G': -0.4,
 'H': -3.2,
 'I': 4.5,
 'L': 3.8,
 'K': -3.9,
 'M': 1.9,
 'F': 2.8,
 'P': -1.6,
 'S': -0.8,
 'T': -0.7,
 'W': -0.9,
 'Y': -1.3,
 'V': 4.2}

## Creating a script which brings together all the methods presented above

In [36]:
# Parse the structure, split it into polypeptides and collect the
# ProteinAnalysis metrics of every polypeptide into `all_seqs`.
parser = MMCIFParser()
structure = parser.get_structure("PHA-L", "prot/1fat.cif")

ppb = CaPPBuilder()
# One dictionary of computed properties per polypeptide, in build order.
all_seqs = []

# enumerate(start=1) supplies the 1-based sequence number, so no counter has
# to be initialised and incremented by hand.
for counter, pp in enumerate(ppb.build_peptides(structure), start=1):
    seq = pp.get_sequence()
    analysed_seq = ProteinAnalysis(str(seq))

    seq_information = {
        "Sequence number": counter,
        "Sequence": seq,  # kept as a Seq object, not converted to str
        "Sequence length": len(seq),
        "Molecular weight": round(analysed_seq.molecular_weight(), 4),
        "GRAVY": round(analysed_seq.gravy(), 4),
        "Amino acid count": analysed_seq.count_amino_acids(),
        "Amino acid percent": analysed_seq.get_amino_acids_percent(),
        "Secondary structure": analysed_seq.secondary_structure_fraction(),
    }

    all_seqs.append(seq_information)

Selecting the first sequence returns a dictionary with its computed values

In [37]:
# Record stored for the first polypeptide (index 0).
all_seqs[0]

{'Sequence number': 1,
 'Sequence': Seq('SNDIYFNFQRFNETNLILQRDASVSSSGQLRLTNLN'),
 'Sequence length': 36,
 'Molecular weight': 4176.5167,
 'GRAVY': -0.5611,
 'Amino acid count': {'A': 1,
  'C': 0,
  'D': 2,
  'E': 1,
  'F': 3,
  'G': 1,
  'H': 0,
  'I': 2,
  'K': 0,
  'L': 5,
  'M': 0,
  'N': 6,
  'P': 0,
  'Q': 3,
  'R': 3,
  'S': 5,
  'T': 2,
  'V': 1,
  'W': 0,
  'Y': 1},
 'Amino acid percent': {'A': 0.027777777777777776,
  'C': 0.0,
  'D': 0.05555555555555555,
  'E': 0.027777777777777776,
  'F': 0.08333333333333333,
  'G': 0.027777777777777776,
  'H': 0.0,
  'I': 0.05555555555555555,
  'K': 0.0,
  'L': 0.1388888888888889,
  'M': 0.0,
  'N': 0.16666666666666666,
  'P': 0.0,
  'Q': 0.08333333333333333,
  'R': 0.08333333333333333,
  'S': 0.1388888888888889,
  'T': 0.05555555555555555,
  'V': 0.027777777777777776,
  'W': 0.0,
  'Y': 0.027777777777777776},
 'Secondary structure': (0.3333333333333333,
  0.3333333333333333,
  0.19444444444444445)}

We can access specific values very easily

In [41]:
# Molecular weight stored for the second polypeptide (index 1).
all_seqs[1]['Molecular weight']

21164.2011