## Chen

In [23]:
import pandas as pd
import numpy as np
from os import path

In [2]:
# Relative path to the data root; the dataset paths below are built from it with path.join.
DATA_DIR = "../../data"

In [3]:
# Load the Chen antibody table from DATA_DIR and display it.
chen_csv_path = path.join(DATA_DIR, "chen/chen_data.csv")
chen_data = pd.read_csv(chen_csv_path)
chen_data

Unnamed: 0,Antibody_ID,heavy,light,Y
0,12e8,EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLE...,DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKL...,0
1,15c8,EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYMHWVKQKPEQGLE...,DIVLTQSPAIMSASLGERVTMTCTASSSVSSSNLHWYQQKPGSSPK...,0
2,1a0q,EVQLQESDAELVKPGASVKISCKASGYTFTDHVIHWVKQKPEQGLE...,DIELTQSPSSLSASLGGKVTITCKASQDIKKYIGWYQHKPGKQPRL...,1
3,1a14,QVQLQQSGAELVKPGASVRMSCKASGYTFTNYNMYWVKQSPGQGLE...,DIELTQTTSSLSASLGDRVTISCRASQDISNYLNWYQQNPDGTVKL...,0
4,1a2y,QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLE...,DIVLTQSPASLSASVGETVTITCRASGNIHNYLAWYQQKQGKSPQL...,0
...,...,...,...,...
2404,6s5a,EVKLLESGGGLVQPGGSLKLSCAASGFDFSRYWMNWVRQAPGKGLE...,QAVVTQESALTTSPGETVTLTCRSSTGAVTTSNYANWVQEKPDHLF...,0
2405,6tyb,EVQLVQSGTEVKRPGESLTISCKTSGYSFSGTWISWVRQMPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGISTYLAWYQQKPGKAPKL...,0
2406,6u1t,EVQLVESGGGLVKPGGSLKLSCAASGFTFSSYDMSWVRQTPEKRLE...,DIQMTQSPASQSASLGESVTITCLASQTIGTWLAWYQQKPGKSPQL...,0
2407,7fab,AVQLEQSGPGLVRPSQTLSLTCTVSGTSFDDYYWTWVRQPPGRGLE...,ASVLTQPPSVSGAPGQRVTISCTGSSSNIGAGHNVKWYQQLPGTAP...,0


In [4]:
from PyBioMed import Pyprotein

In [5]:
def calculate_all_descriptors(sequence):
    """Return a flat list of PyBioMed descriptor values for one protein sequence.

    The result is assembled from two PyProtein calls on the same sequence:
    the values of ``GetALL()`` with positions 420..8419 removed, and the
    values of ``GetTPComp()`` spliced in at that position.

    Parameters
    ----------
    sequence
        The sequence handed to ``Pyprotein.PyProtein``.

    Returns
    -------
    list
        ``GetALL`` values [0:420], then all ``GetTPComp`` values, then
        ``GetALL`` values [8420:].
    """
    prot = Pyprotein.PyProtein(sequence)
    full_values = list(prot.GetALL().values())
    tripeptide_values = list(prot.GetTPComp().values())
    # NOTE(review): 420 and 8420 presumably bracket the 8000-wide tripeptide
    # block inside GetALL's output (after 20 amino-acid + 400 dipeptide
    # values) -- TODO confirm against the installed PyBioMed version, since
    # the slice relies on the ordering of the returned dict.
    head = full_values[:420]
    tail = full_values[8420:]
    return [*head, *tripeptide_values, *tail]

In [9]:
def descriptors_for_ab(seqs):
    """Build one descriptor vector for an antibody from its two chains.

    Parameters
    ----------
    seqs
        Anything indexable by the keys ``"heavy"`` and ``"light"`` (the
        notebook passes DataFrame rows via ``DataFrame.apply(axis=1)``).

    Returns
    -------
    numpy.ndarray
        Heavy-chain descriptors followed by light-chain descriptors, each
        computed by ``calculate_all_descriptors``.
    """
    # Heavy chain is evaluated first, then light, matching the output order.
    heavy_desc, light_desc = (
        calculate_all_descriptors(seqs[chain]) for chain in ("heavy", "light")
    )
    return np.asarray(heavy_desc + light_desc)

In [20]:
# Compute the concatenated heavy+light descriptor vector for every Chen antibody;
# result_type="expand" spreads each returned array over one column per descriptor.
chen_sequences = chen_data[["heavy", "light"]]
X_data = chen_sequences.apply(descriptors_for_ab, axis=1, result_type="expand")
X_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19750,19751,19752,19753,19754,19755,19756,19757,19758,19759
0,7.5,3.333,2.5,5.833,1.667,5.833,5.833,9.167,1.667,4.167,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9.244,0.0,3.361,5.882,1.681,4.202,6.723,8.403,2.521,3.361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.882,1.681,3.361,5.042,1.681,5.882,5.042,9.244,1.681,4.202,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6.667,2.5,4.167,5.0,1.667,2.5,7.5,11.667,0.0,1.667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.448,5.172,4.31,6.034,1.724,2.586,6.034,10.345,0.862,2.586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Persist the Chen descriptor table as Feather, with the antibody ID as first column.
if "Ab_ID" in X_data.columns:
    # DataFrame.insert raises ValueError when the column already exists, so
    # drop the ID column left by a previous run to keep this cell re-runnable.
    X_data = X_data.drop(columns="Ab_ID")
# Column labels are integers after apply(..., result_type="expand"); convert
# them to strings before writing (presumably because the Feather writer
# expects string column names).
X_data.columns = X_data.columns.astype(str)
X_data.insert(0, "Ab_ID", chen_data["Antibody_ID"])
X_data.to_feather(path.join(DATA_DIR, "chen/pybiomed/X_data.ftr"))

## TAP

In [22]:
# Load the TAP antibody table from DATA_DIR and display it.
tap_csv_path = path.join(DATA_DIR, "tap/TAP_data.csv")
data_tap = pd.read_csv(tap_csv_path)
data_tap

Unnamed: 0,Antibody_ID,heavy,light,CDR_length,PSH,PPC,PNC,SFvCSP,Y
0,Abagovomab,QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQGLD...,DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...,46,129.7603,0.0000,0.0000,16.32,1
1,Abituzumab,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...,45,115.9106,0.0954,0.0421,-3.10,0
2,Abrilumab,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,45,109.6995,0.0000,0.8965,-4.00,0
3,Actoxumab,QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQHKPGKAPKL...,49,112.6290,0.0000,1.1247,3.10,0
4,Adalimumab,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,48,111.2512,0.0485,1.1364,-19.50,0
...,...,...,...,...,...,...,...,...,...
236,Visilizumab,QVQLVQSGAEVKKPGASVKVSCKASGYTFISYTMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCSASSSVSYMNWYQQKPGKAPKRL...,46,124.0825,0.1417,0.1812,8.40,1
237,Vonlerolizumab,EVQLVQSGAEVKKPGASVKVSCKASGYTFTDSYMSWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLNWYQQKPGKAPKL...,44,118.5559,0.2029,0.3046,0.00,1
238,Zalutumumab,QVQLVESGGGVVQPGRSLRLSCAASGFTFSTYGMHWVRQAPGKGLE...,AIQLTQSPSSLSASVGDRVTITCRASQDISSALVWYQQKPGKAPKL...,52,121.8996,0.0000,1.2505,0.00,1
239,Zanolimumab,QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQDISSWLAWYQHKPGKAPKL...,42,112.5357,0.0000,0.0000,6.51,0


In [None]:
# Same descriptor computation as for the Chen set, applied row-wise to the
# TAP heavy/light sequences. Note that this rebinds X_data.
tap_sequences = data_tap[["heavy", "light"]]
X_data = tap_sequences.apply(descriptors_for_ab, axis=1, result_type="expand")
X_data.head()

In [None]:
# Persist the TAP descriptor table as Feather, with the antibody ID as first column.
if "Ab_ID" in X_data.columns:
    # DataFrame.insert raises ValueError when the column already exists, so
    # drop the ID column left by a previous run to keep this cell re-runnable.
    X_data = X_data.drop(columns="Ab_ID")
# Column labels are integers after apply(..., result_type="expand"); convert
# them to strings before writing (presumably because the Feather writer
# expects string column names).
X_data.columns = X_data.columns.astype(str)
X_data.insert(0, "Ab_ID", data_tap["Antibody_ID"])
X_data.to_feather(path.join(DATA_DIR, "tap/pybiomed/X_TAP_data.ftr"))

In [24]:
# Read the saved TAP descriptor table back from disk as a sanity check.
# NOTE(review): the output recorded for this cell shows no "Ab_ID" column even
# though the save cell inserts one; the file on disk may predate that cell --
# re-run the TAP cells on a fresh kernel to confirm.
tap_features_path = path.join(DATA_DIR, "tap/pybiomed/X_TAP_data.ftr")
pd.read_feather(tap_features_path)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19750,19751,19752,19753,19754,19755,19756,19757,19758,19759
0,10.084,3.361,2.521,3.361,1.681,3.361,5.882,11.765,0.840,1.681,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7.627,4.237,1.695,3.390,1.695,5.085,5.932,10.169,0.847,2.542,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.932,1.695,0.000,5.932,1.695,5.085,6.780,10.169,0.847,1.695,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.738,6.557,4.098,5.738,1.639,3.279,4.918,11.475,0.820,3.279,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9.091,4.959,3.306,5.785,1.653,4.132,4.132,9.091,1.653,2.479,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,10.000,3.333,1.667,4.167,1.667,3.333,5.833,8.333,1.667,1.667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
237,5.983,5.128,1.709,5.128,1.709,5.128,5.128,7.692,0.000,1.709,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
238,5.600,6.400,2.400,6.400,1.600,2.400,4.800,12.000,0.800,2.400,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
239,5.217,2.609,4.348,2.609,1.739,2.609,6.087,9.565,0.870,4.348,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
