In [10]:
import os
from os import path

import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [2]:
# Root directory for input and output data files. This is a relative path, so it
# is resolved against the kernel's current working directory.
DATA_DIR = "../../data"

In [24]:
# from https://github.com/yemilyz/bioviaclinic1920/blob/aa69b4d98c1d98f810f45145ad09a5b4cae5e9a2/source/protparam_features.py
def get_properties_for_sequences(seqs):
    """Build a table of ``ProteinAnalysis`` descriptors, one row per sequence.

    Parameters
    ----------
    seqs : dict
        Mapping of sequence name -> sequence, each passed to ``ProteinAnalysis``.

    Returns
    -------
    pandas.DataFrame
        Indexed by sequence name, with 31 columns:

        * ``aa_percent0`` .. ``aa_percent19`` -- the values of
          ``get_amino_acids_percent()``, in the order that dict yields them;
        * ``aromacity``, ``instability``, ``flexibility`` (mean of the
          ``flexibility()`` values), ``isoelectric``, ``mol_extinct1``,
          ``mol_extinct2``, ``mw``, ``gravy``;
        * ``ss_faction1`` .. ``ss_faction3`` -- the three values returned by
          ``secondary_structure_fraction()``.
    """
    composition_cols = ['aa_percent{}'.format(idx) for idx in range(20)]
    scalar_cols = ['aromacity', 'instability', 'flexibility', 'isoelectric',
                   'mol_extinct1', 'mol_extinct2', 'mw', 'gravy']
    structure_cols = ['ss_faction1', 'ss_faction2', 'ss_faction3']
    all_cols = composition_cols + scalar_cols + structure_cols

    rows = {}
    for seq_name, sequence in seqs.items():
        analysis = ProteinAnalysis(sequence)

        # Per-residue composition; only the values are kept, so the column
        # index (0..19) is the position in the returned dict.
        row = list(analysis.get_amino_acids_percent().values())

        row.append(analysis.aromaticity())
        row.append(analysis.instability_index())
        # flexibility() returns a sequence of values; reduce it to their mean.
        row.append(np.average(analysis.flexibility()))
        row.append(analysis.isoelectric_point())

        # The extinction coefficient comes back as a pair; keep both members.
        extinct_first, extinct_second = analysis.molar_extinction_coefficient()
        row.append(extinct_first)
        row.append(extinct_second)

        row.append(analysis.molecular_weight())
        row.append(analysis.gravy())
        row.extend(list(analysis.secondary_structure_fraction()))

        rows[seq_name] = row

    return pd.DataFrame.from_dict(rows, orient='index', columns=all_cols)

In [11]:
# Load the TAP table from DATA_DIR/tap/TAP_data.csv and preview the first rows.
# Later cells read its "Antibody_ID", "heavy" and "light" columns.
tap_data = pd.read_csv(path.join(DATA_DIR, "tap/TAP_data.csv"))
tap_data.head()

Unnamed: 0,Antibody_ID,heavy,light,CDR_length,PSH,PPC,PNC,SFvCSP,Y
0,Abagovomab,QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQGLD...,DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...,46,129.7603,0.0,0.0,16.32,1
1,Abituzumab,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...,45,115.9106,0.0954,0.0421,-3.1,1
2,Abrilumab,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,45,109.6995,0.0,0.8965,-4.0,1
3,Actoxumab,QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQHKPGKAPKL...,49,112.629,0.0,1.1247,3.1,1
4,Adalimumab,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,48,111.2512,0.0485,1.1364,-19.5,1


In [18]:
def create_seq_dict(df, which_seq, id_col="Antibody_ID"):
    """Map each row's identifier to the value in one of its sequence columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the ``id_col`` and ``which_seq`` columns.
    which_seq : str
        Name of the column whose values become the dict values
        (e.g. ``"heavy"`` or ``"light"``).
    id_col : str, optional
        Name of the column whose values become the dict keys.
        Defaults to ``"Antibody_ID"``.

    Returns
    -------
    dict
        ``{identifier: sequence}`` in row order. If an identifier occurs in
        more than one row, the last row wins.

    Raises
    ------
    KeyError
        If ``id_col`` or ``which_seq`` is not a column of ``df``.
    """
    # Pair the two columns directly rather than looping with iterrows(), which
    # materialises a Series per row (slow) and may coerce value dtypes.
    return dict(zip(df[id_col], df[which_seq]))

In [19]:
# Antibody_ID -> value of the "heavy" column, for every row of tap_data.
heavy_seq_dict = create_seq_dict(tap_data, "heavy")

In [21]:
# Antibody_ID -> value of the "light" column, for every row of tap_data.
light_seq_dict = create_seq_dict(tap_data, "light")

In [25]:
# Compute the ProteinAnalysis descriptors for each "heavy" sequence; the
# resulting frame is indexed by Antibody_ID (the dict keys).
heavy_df = get_properties_for_sequences(heavy_seq_dict)
heavy_df.head()

Unnamed: 0,aa_percent0,aa_percent1,aa_percent2,aa_percent3,aa_percent4,aa_percent5,aa_percent6,aa_percent7,aa_percent8,aa_percent9,...,instability,flexibility,isoelectric,mol_extinct1,mol_extinct2,mw,gravy,ss_faction1,ss_faction2,ss_faction3
Abagovomab,0.10084,0.016807,0.033613,0.033613,0.02521,0.117647,0.008403,0.016807,0.067227,0.058824,...,24.734454,1.000576,9.100953,40910,41035,13030.348,-0.502521,0.268908,0.277311,0.210084
Abituzumab,0.076271,0.016949,0.033898,0.050847,0.033898,0.101695,0.008475,0.025424,0.033898,0.050847,...,39.184746,0.9986,5.736498,33920,34045,13026.3097,-0.390678,0.271186,0.271186,0.211864
Abrilumab,0.059322,0.016949,0.059322,0.050847,0.025424,0.101695,0.008475,0.016949,0.059322,0.059322,...,30.763559,1.005234,4.780921,29450,29575,12757.0231,-0.35,0.271186,0.271186,0.194915
Actoxumab,0.057377,0.016393,0.057377,0.032787,0.032787,0.114754,0.008197,0.032787,0.02459,0.065574,...,38.079508,0.994221,6.767842,36440,36565,13524.0182,-0.179508,0.336066,0.278689,0.180328
Adalimumab,0.090909,0.016529,0.057851,0.041322,0.024793,0.090909,0.016529,0.024793,0.024793,0.090909,...,42.929752,0.99575,5.184364,32430,32555,13239.5465,-0.184298,0.31405,0.272727,0.239669


In [27]:
# Compute the same descriptors for each "light" sequence; the resulting frame
# is indexed by Antibody_ID (the dict keys).
light_df = get_properties_for_sequences(light_seq_dict)
light_df.head()

Unnamed: 0,aa_percent0,aa_percent1,aa_percent2,aa_percent3,aa_percent4,aa_percent5,aa_percent6,aa_percent7,aa_percent8,aa_percent9,...,instability,flexibility,isoelectric,mol_extinct1,mol_extinct2,mw,gravy,ss_faction1,ss_faction2,ss_faction3
Abagovomab,0.056075,0.018692,0.018692,0.046729,0.037383,0.11215,0.037383,0.065421,0.065421,0.093458,...,53.693458,0.999564,7.973724,14440,14565,11556.8384,-0.257009,0.299065,0.299065,0.196262
Abituzumab,0.046729,0.018692,0.046729,0.018692,0.037383,0.084112,0.009346,0.074766,0.056075,0.046729,...,42.514019,1.001379,8.586625,17420,17545,11762.9686,-0.452336,0.280374,0.299065,0.121495
Abrilumab,0.065421,0.018692,0.037383,0.028037,0.046729,0.093458,0.0,0.056075,0.046729,0.056075,...,40.151402,1.002818,7.970307,22460,22585,11548.7104,-0.335514,0.271028,0.336449,0.158879
Actoxumab,0.074766,0.018692,0.037383,0.018692,0.046729,0.084112,0.009346,0.056075,0.046729,0.056075,...,51.517757,1.000328,8.682102,22460,22585,11530.7383,-0.260748,0.271028,0.317757,0.158879
Adalimumab,0.074766,0.018692,0.037383,0.018692,0.028037,0.084112,0.0,0.056075,0.046729,0.065421,...,46.585047,1.000522,9.428646,15930,16055,11664.9632,-0.402804,0.271028,0.28972,0.168224


In [28]:
# Join the two descriptor tables on their index (Antibody_ID). Both frames have
# the same column names, so every heavy-derived column gets the "_x" suffix and
# every light-derived column gets "_y". merge() defaults to an inner join, so
# only IDs present in both frames are kept.
protparam_df = heavy_df.merge(light_df, left_index=True, right_index=True, suffixes=["_x", "_y"])
protparam_df.head()

Unnamed: 0,aa_percent0_x,aa_percent1_x,aa_percent2_x,aa_percent3_x,aa_percent4_x,aa_percent5_x,aa_percent6_x,aa_percent7_x,aa_percent8_x,aa_percent9_x,...,instability_y,flexibility_y,isoelectric_y,mol_extinct1_y,mol_extinct2_y,mw_y,gravy_y,ss_faction1_y,ss_faction2_y,ss_faction3_y
Abagovomab,0.10084,0.016807,0.033613,0.033613,0.02521,0.117647,0.008403,0.016807,0.067227,0.058824,...,53.693458,0.999564,7.973724,14440,14565,11556.8384,-0.257009,0.299065,0.299065,0.196262
Abituzumab,0.076271,0.016949,0.033898,0.050847,0.033898,0.101695,0.008475,0.025424,0.033898,0.050847,...,42.514019,1.001379,8.586625,17420,17545,11762.9686,-0.452336,0.280374,0.299065,0.121495
Abrilumab,0.059322,0.016949,0.059322,0.050847,0.025424,0.101695,0.008475,0.016949,0.059322,0.059322,...,40.151402,1.002818,7.970307,22460,22585,11548.7104,-0.335514,0.271028,0.336449,0.158879
Actoxumab,0.057377,0.016393,0.057377,0.032787,0.032787,0.114754,0.008197,0.032787,0.02459,0.065574,...,51.517757,1.000328,8.682102,22460,22585,11530.7383,-0.260748,0.271028,0.317757,0.158879
Adalimumab,0.090909,0.016529,0.057851,0.041322,0.024793,0.090909,0.016529,0.024793,0.024793,0.090909,...,46.585047,1.000522,9.428646,15930,16055,11664.9632,-0.402804,0.271028,0.28972,0.168224


In [31]:
# Make sure the output directory exists before writing: to_csv() does not
# create missing parent directories and would otherwise raise OSError.
os.makedirs(path.join(DATA_DIR, "tap/protparam"), exist_ok=True)
# Write the combined features; the index (Antibody_ID) is saved as the first column.
protparam_df.to_csv(path.join(DATA_DIR, "tap/protparam/protparam_features_tap.csv"))