# SAbDab Chen et al.

In [1]:
import pandas as pd
import numpy as np
from os import path

In [2]:
DATA_DIR = "../../data"

In [7]:
from tdc.single_pred import Develop
from tdc.utils import retrieve_label_name_list

In [8]:
chen_loader = Develop(name = 'SAbDab_Chen')
chen_data = chen_loader.get_data()
chen_data

Downloading...
100%|██████████| 601k/601k [00:00<00:00, 1.21MiB/s]
Loading...
Done!


Unnamed: 0,Antibody_ID,Antibody,Y
0,12e8,['EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKG...,0
1,15c8,['EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYMHWVKQKPEQG...,0
2,1a0q,['EVQLQESDAELVKPGASVKISCKASGYTFTDHVIHWVKQKPEQG...,1
3,1a14,['QVQLQQSGAELVKPGASVRMSCKASGYTFTNYNMYWVKQSPGQG...,0
4,1a2y,['QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKG...,0
...,...,...,...
2404,6s5a,['EVKLLESGGGLVQPGGSLKLSCAASGFDFSRYWMNWVRQAPGKG...,0
2405,6tyb,['EVQLVQSGTEVKRPGESLTISCKTSGYSFSGTWISWVRQMPGKG...,0
2406,6u1t,['EVQLVESGGGLVKPGGSLKLSCAASGFTFSSYDMSWVRQTPEKR...,0
2407,7fab,['AVQLEQSGPGLVRPSQTLSLTCTVSGTSFDDYYWTWVRQPPGRG...,0


In [16]:
chen_data.loc[1]["Antibody"]

"['EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYMHWVKQKPEQGLEWIAQIDPANGNTKYDPKFQGKATITADTSSNTAYLHLSSLTSEDSAVYYCAADPPYYGHGDYWGQGTTLTVSS', 'DIVLTQSPAIMSASLGERVTMTCTASSSVSSSNLHWYQQKPGSSPKLWIYSTSNLASGVPARFSGSGSGTSYSLTISSMEAEDAATYYCHQYHRSPYTFGGGTKLEIK']"

In [24]:
def split_ab_row(row):
    whole_ab_seq = row["Antibody"]
    ab_seq = whole_ab_seq.replace("[", "").replace("]", "").replace("'", "")
    heavy, light = ab_seq.split(",")
    heavy = heavy.strip()
    light = light.strip()
    return heavy, light

In [25]:
split_ab_row(chen_data.loc[1])

('EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYMHWVKQKPEQGLEWIAQIDPANGNTKYDPKFQGKATITADTSSNTAYLHLSSLTSEDSAVYYCAADPPYYGHGDYWGQGTTLTVSS',
 'DIVLTQSPAIMSASLGERVTMTCTASSSVSSSNLHWYQQKPGSSPKLWIYSTSNLASGVPARFSGSGSGTSYSLTISSMEAEDAATYYCHQYHRSPYTFGGGTKLEIK')

In [26]:
chen_data[["heavy", "light"]] = chen_data.apply(split_ab_row, axis=1, result_type="expand")
chen_data.head()

Unnamed: 0,Antibody_ID,Antibody,Y,heavy,light
0,12e8,['EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKG...,0,EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLE...,DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKL...
1,15c8,['EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYMHWVKQKPEQG...,0,EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYMHWVKQKPEQGLE...,DIVLTQSPAIMSASLGERVTMTCTASSSVSSSNLHWYQQKPGSSPK...
2,1a0q,['EVQLQESDAELVKPGASVKISCKASGYTFTDHVIHWVKQKPEQG...,1,EVQLQESDAELVKPGASVKISCKASGYTFTDHVIHWVKQKPEQGLE...,DIELTQSPSSLSASLGGKVTITCKASQDIKKYIGWYQHKPGKQPRL...
3,1a14,['QVQLQQSGAELVKPGASVRMSCKASGYTFTNYNMYWVKQSPGQG...,0,QVQLQQSGAELVKPGASVRMSCKASGYTFTNYNMYWVKQSPGQGLE...,DIELTQTTSSLSASLGDRVTISCRASQDISNYLNWYQQNPDGTVKL...
4,1a2y,['QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKG...,0,QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLE...,DIVLTQSPASLSASVGETVTITCRASGNIHNYLAWYQQKQGKSPQL...


In [28]:
chen_data = chen_data[["Antibody_ID", "heavy", "light", "Y"]]
chen_data

Unnamed: 0,Antibody_ID,heavy,light,Y
0,12e8,EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLE...,DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKL...,0
1,15c8,EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYMHWVKQKPEQGLE...,DIVLTQSPAIMSASLGERVTMTCTASSSVSSSNLHWYQQKPGSSPK...,0
2,1a0q,EVQLQESDAELVKPGASVKISCKASGYTFTDHVIHWVKQKPEQGLE...,DIELTQSPSSLSASLGGKVTITCKASQDIKKYIGWYQHKPGKQPRL...,1
3,1a14,QVQLQQSGAELVKPGASVRMSCKASGYTFTNYNMYWVKQSPGQGLE...,DIELTQTTSSLSASLGDRVTISCRASQDISNYLNWYQQNPDGTVKL...,0
4,1a2y,QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLE...,DIVLTQSPASLSASVGETVTITCRASGNIHNYLAWYQQKQGKSPQL...,0
...,...,...,...,...
2404,6s5a,EVKLLESGGGLVQPGGSLKLSCAASGFDFSRYWMNWVRQAPGKGLE...,QAVVTQESALTTSPGETVTLTCRSSTGAVTTSNYANWVQEKPDHLF...,0
2405,6tyb,EVQLVQSGTEVKRPGESLTISCKTSGYSFSGTWISWVRQMPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGISTYLAWYQQKPGKAPKL...,0
2406,6u1t,EVQLVESGGGLVKPGGSLKLSCAASGFTFSSYDMSWVRQTPEKRLE...,DIQMTQSPASQSASLGESVTITCLASQTIGTWLAWYQQKPGKSPQL...,0
2407,7fab,AVQLEQSGPGLVRPSQTLSLTCTVSGTSFDDYYWTWVRQPPGRGLE...,ASVLTQPPSVSGAPGQRVTISCTGSSSNIGAGHNVKWYQQLPGTAP...,0


In [36]:
chen_data.to_csv(path.join(DATA_DIR, "chen/chen_data.csv"), index=False)

In [3]:
data_chen = pd.read_csv(path.join(DATA_DIR, "chen/chen_data.csv"))
data_chen

Unnamed: 0,Antibody_ID,heavy,light,Y
0,12e8,EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLE...,DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKL...,0
1,15c8,EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYMHWVKQKPEQGLE...,DIVLTQSPAIMSASLGERVTMTCTASSSVSSSNLHWYQQKPGSSPK...,0
2,1a0q,EVQLQESDAELVKPGASVKISCKASGYTFTDHVIHWVKQKPEQGLE...,DIELTQSPSSLSASLGGKVTITCKASQDIKKYIGWYQHKPGKQPRL...,1
3,1a14,QVQLQQSGAELVKPGASVRMSCKASGYTFTNYNMYWVKQSPGQGLE...,DIELTQTTSSLSASLGDRVTISCRASQDISNYLNWYQQNPDGTVKL...,0
4,1a2y,QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLE...,DIVLTQSPASLSASVGETVTITCRASGNIHNYLAWYQQKQGKSPQL...,0
...,...,...,...,...
2404,6s5a,EVKLLESGGGLVQPGGSLKLSCAASGFDFSRYWMNWVRQAPGKGLE...,QAVVTQESALTTSPGETVTLTCRSSTGAVTTSNYANWVQEKPDHLF...,0
2405,6tyb,EVQLVQSGTEVKRPGESLTISCKTSGYSFSGTWISWVRQMPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGISTYLAWYQQKPGKAPKL...,0
2406,6u1t,EVQLVESGGGLVKPGGSLKLSCAASGFTFSSYDMSWVRQTPEKRLE...,DIQMTQSPASQSASLGESVTITCLASQTIGTWLAWYQQKPGKSPQL...,0
2407,7fab,AVQLEQSGPGLVRPSQTLSLTCTVSGTSFDDYYWTWVRQPPGRGLE...,ASVLTQPPSVSGAPGQRVTISCTGSSSNIGAGHNVKWYQQLPGTAP...,0


In [5]:
data_chen["Y"].sum()

482

In [6]:
2409 - 482

1927

# TAP

In [38]:
label_list = retrieve_label_name_list('TAP')
label_list

['CDR_Length', 'PSH', 'PPC', 'PNC', 'SFvCSP']

In [43]:
tap_loader_0 = Develop(name = 'TAP', label_name = label_list[0])
tap_loader_1 = Develop(name = 'TAP', label_name = label_list[1])
tap_loader_2 = Develop(name = 'TAP', label_name = label_list[2])
tap_loader_3 = Develop(name = 'TAP', label_name = label_list[3])
tap_loader_4 = Develop(name = 'TAP', label_name = label_list[4])
tap_loader_0.get_data()

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!


Unnamed: 0,Antibody_ID,Antibody,Y
0,Abagovomab,['QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQG...,46
1,Abituzumab,['QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQG...,45
2,Abrilumab,['QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKG...,45
3,Actoxumab,['QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKG...,49
4,Adalimumab,['EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKG...,48
...,...,...,...
236,Visilizumab,['QVQLVQSGAEVKKPGASVKVSCKASGYTFISYTMHWVRQAPGQG...,46
237,Vonlerolizumab,['EVQLVQSGAEVKKPGASVKVSCKASGYTFTDSYMSWVRQAPGQG...,44
238,Zalutumumab,['QVQLVESGGGVVQPGRSLRLSCAASGFTFSTYGMHWVRQAPGKG...,52
239,Zanolimumab,['QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKG...,42


In [51]:
tap_0 = tap_loader_0.get_data().rename({"Y": "CDR_length"}, axis=1)
tap_0

Unnamed: 0,Antibody_ID,Antibody,CDR_length
0,Abagovomab,['QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQG...,46
1,Abituzumab,['QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQG...,45
2,Abrilumab,['QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKG...,45
3,Actoxumab,['QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKG...,49
4,Adalimumab,['EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKG...,48
...,...,...,...
236,Visilizumab,['QVQLVQSGAEVKKPGASVKVSCKASGYTFISYTMHWVRQAPGQG...,46
237,Vonlerolizumab,['EVQLVQSGAEVKKPGASVKVSCKASGYTFTDSYMSWVRQAPGQG...,44
238,Zalutumumab,['QVQLVESGGGVVQPGRSLRLSCAASGFTFSTYGMHWVRQAPGKG...,52
239,Zanolimumab,['QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKG...,42


In [53]:
label_list

['CDR_Length', 'PSH', 'PPC', 'PNC', 'SFvCSP']

In [54]:
tap_1 = tap_loader_1.get_data().rename({"Y": "PSH"}, axis=1)
tap_2 = tap_loader_2.get_data().rename({"Y": "PPC"}, axis=1)
tap_3 = tap_loader_3.get_data().rename({"Y": "PNC"}, axis=1)
tap_4 = tap_loader_4.get_data().rename({"Y": "SFvCSP"}, axis=1)

In [59]:
tap_data = pd.concat([tap_0, tap_1["PSH"], tap_2["PPC"], tap_3["PNC"], tap_4["SFvCSP"]], axis=1)
tap_data

Unnamed: 0,Antibody_ID,Antibody,CDR_length,PSH,PPC,PNC,SFvCSP
0,Abagovomab,['QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQG...,46,129.7603,0.0000,0.0000,16.32
1,Abituzumab,['QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQG...,45,115.9106,0.0954,0.0421,-3.10
2,Abrilumab,['QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKG...,45,109.6995,0.0000,0.8965,-4.00
3,Actoxumab,['QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKG...,49,112.6290,0.0000,1.1247,3.10
4,Adalimumab,['EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKG...,48,111.2512,0.0485,1.1364,-19.50
...,...,...,...,...,...,...,...
236,Visilizumab,['QVQLVQSGAEVKKPGASVKVSCKASGYTFISYTMHWVRQAPGQG...,46,124.0825,0.1417,0.1812,8.40
237,Vonlerolizumab,['EVQLVQSGAEVKKPGASVKVSCKASGYTFTDSYMSWVRQAPGQG...,44,118.5559,0.2029,0.3046,0.00
238,Zalutumumab,['QVQLVESGGGVVQPGRSLRLSCAASGFTFSTYGMHWVRQAPGKG...,52,121.8996,0.0000,1.2505,0.00
239,Zanolimumab,['QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKG...,42,112.5357,0.0000,0.0000,6.51


In [62]:
tap_data.loc[1]["Antibody"]

"['QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLEWIGYINPRSGYTEYNEIFRDKATMTTDTSTSTAYMELSSLRSEDTAVYYCASFLGRGAMDYWGQGTTVTVSS'\\n 'DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKLLIYYTSKIHSGVPSRFSGSGSGTDYTFTISSLQPEDIATYYCQQGNTFPYTFGQGTKVEIK']"

In [65]:
def split_ab_row_tap(row):
    whole_ab_seq = row["Antibody"]
    ab_seq = whole_ab_seq.replace("[", "").replace("]", "").replace("'", "")
    heavy, light = ab_seq.split("\\n")
    heavy = heavy.strip()
    light = light.strip()
    return heavy, light

In [66]:
split_ab_row_tap(tap_data.loc[1])

('QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLEWIGYINPRSGYTEYNEIFRDKATMTTDTSTSTAYMELSSLRSEDTAVYYCASFLGRGAMDYWGQGTTVTVSS',
 'DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKLLIYYTSKIHSGVPSRFSGSGSGTDYTFTISSLQPEDIATYYCQQGNTFPYTFGQGTKVEIK')

In [67]:
tap_data[["heavy", "light"]] = tap_data.apply(split_ab_row_tap, axis=1, result_type="expand")
tap_data.head()

Unnamed: 0,Antibody_ID,Antibody,CDR_length,PSH,PPC,PNC,SFvCSP,heavy,light
0,Abagovomab,['QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQG...,46,129.7603,0.0,0.0,16.32,QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQGLD...,DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...
1,Abituzumab,['QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQG...,45,115.9106,0.0954,0.0421,-3.1,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...
2,Abrilumab,['QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKG...,45,109.6995,0.0,0.8965,-4.0,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...
3,Actoxumab,['QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKG...,49,112.629,0.0,1.1247,3.1,QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQHKPGKAPKL...
4,Adalimumab,['EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKG...,48,111.2512,0.0485,1.1364,-19.5,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...


In [68]:
tap_data = tap_data[["Antibody_ID", "heavy", "light", "CDR_length", "PSH", "PPC", "PNC", "SFvCSP"]]
tap_data.head()

Unnamed: 0,Antibody_ID,heavy,light,CDR_length,PSH,PPC,PNC,SFvCSP
0,Abagovomab,QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQGLD...,DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...,46,129.7603,0.0,0.0,16.32
1,Abituzumab,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...,45,115.9106,0.0954,0.0421,-3.1
2,Abrilumab,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,45,109.6995,0.0,0.8965,-4.0
3,Actoxumab,QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQHKPGKAPKL...,49,112.629,0.0,1.1247,3.1
4,Adalimumab,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,48,111.2512,0.0485,1.1364,-19.5


In [78]:
merged = tap_data.merge(data_tap.drop("Y", axis=1), indicator=True, how='outer')
merged[merged['_merge'] == 'both']

Unnamed: 0,Antibody_ID,heavy,light,CDR_length,PSH,PPC,PNC,SFvCSP,_merge
0,Abagovomab,QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQGLD...,DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...,46,129.7603,0.0,0.0,16.32,both


In [102]:
# !! Following guidelines from http://opig.stats.ox.ac.uk/webapps/newsabdab/sabpred/tap
# but the cutoff is made-up !!
def tap_developable(row):
    amber = 0
    if row["CDR_length"] < 37 or row["CDR_length"] > 63:
        return 0
    if row["CDR_length"] <= 43 or row["CDR_length"] >= 55:
        amber += 1
        
    if row["PSH"] < 89.9 or row["PSH"] > 208.67:
        return 0
    if row["PSH"] <= 116.3 or row["PSH"] >= 173.5:
        amber += 1
    
    if row["PPC"] > 3.74:
        return 0
    if row["PPC"] >= 1.26:
        amber += 1
        
    if row["PNC"] > 4.25:
        return 0
    if row["PNC"] >= 1.84:
        amber += 1
        
    if row["SFvCSP"] < -19.5:
        return 0
    if row["SFvCSP"] <= -5.7:
        amber += 1
        
    if amber <= 2:
        return 1
    else:
        return 0

In [99]:
tap_data["Y"]  = tap_data.apply(tap_developable, axis=1)
tap_data.head()

Unnamed: 0,Antibody_ID,heavy,light,CDR_length,PSH,PPC,PNC,SFvCSP,Y
0,Abagovomab,QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQGLD...,DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...,46,129.7603,0.0,0.0,16.32,1
1,Abituzumab,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...,45,115.9106,0.0954,0.0421,-3.1,1
2,Abrilumab,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,45,109.6995,0.0,0.8965,-4.0,1
3,Actoxumab,QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQHKPGKAPKL...,49,112.629,0.0,1.1247,3.1,1
4,Adalimumab,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,48,111.2512,0.0485,1.1364,-19.5,1


In [100]:
tap_data.to_csv(path.join(DATA_DIR, "tap/TAP_data.csv"), index=False)

In [3]:
data_tap = pd.read_csv(path.join(DATA_DIR, "tap/TAP_data.csv"))
data_tap

Unnamed: 0,Antibody_ID,heavy,light,CDR_length,PSH,PPC,PNC,SFvCSP,Y
0,Abagovomab,QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQGLD...,DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...,46,129.7603,0.0000,0.0000,16.32,1
1,Abituzumab,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...,45,115.9106,0.0954,0.0421,-3.10,1
2,Abrilumab,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,45,109.6995,0.0000,0.8965,-4.00,1
3,Actoxumab,QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQHKPGKAPKL...,49,112.6290,0.0000,1.1247,3.10,1
4,Adalimumab,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,48,111.2512,0.0485,1.1364,-19.50,1
...,...,...,...,...,...,...,...,...,...
236,Visilizumab,QVQLVQSGAEVKKPGASVKVSCKASGYTFISYTMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCSASSSVSYMNWYQQKPGKAPKRL...,46,124.0825,0.1417,0.1812,8.40,1
237,Vonlerolizumab,EVQLVQSGAEVKKPGASVKVSCKASGYTFTDSYMSWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLNWYQQKPGKAPKL...,44,118.5559,0.2029,0.3046,0.00,1
238,Zalutumumab,QVQLVESGGGVVQPGRSLRLSCAASGFTFSTYGMHWVRQAPGKGLE...,AIQLTQSPSSLSASVGDRVTITCRASQDISSALVWYQQKPGKAPKL...,52,121.8996,0.0000,1.2505,0.00,1
239,Zanolimumab,QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQDISSWLAWYQHKPGKAPKL...,42,112.5357,0.0000,0.0000,6.51,1


In [22]:
data_chen[data_chen["heavy"] == data_tap.iloc[16]["heavy"]]

Unnamed: 0,Antibody_ID,heavy,light,Y
1327,4nki,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYIMMWVRQAPGKGLE...,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAP...,0


In [4]:
data_chen = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_data.csv"), index_col=0)

In [5]:
count_overlap = 0
for i in range(241):
    if len(data_chen[data_chen["heavy"] == data_tap.iloc[i]["heavy"]]):
        count_overlap += 1
print(count_overlap)

62


In [6]:
overlap = pd.merge(data_chen, data_tap, how='inner', on=['heavy', 'light'])

In [7]:
overlap["Y_x"].value_counts()

0    39
1    18
Name: Y_x, dtype: int64

In [8]:
unique = data_tap.merge(data_chen[["heavy", "light"]], how="outer", on=['heavy', 'light'], indicator=True).loc[lambda x : x['_merge']=='left_only']
unique
#df = df1.merge(df2, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='left_only']

Unnamed: 0,Antibody_ID,heavy,light,CDR_length,PSH,PPC,PNC,SFvCSP,Y,_merge
0,Abagovomab,QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQGLD...,DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...,46.0,129.7603,0.0000,0.0000,16.32,1.0,left_only
1,Abituzumab,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...,45.0,115.9106,0.0954,0.0421,-3.10,1.0,left_only
2,Abrilumab,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,45.0,109.6995,0.0000,0.8965,-4.00,1.0,left_only
3,Actoxumab,QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQHKPGKAPKL...,49.0,112.6290,0.0000,1.1247,3.10,1.0,left_only
7,Alirocumab,EVQLVESGGGLVQPGGSLRLSCAASGFTFNNYAMNWVRQAPGKGLD...,DIVMTQSPDSLAVSLGERATINCKSSQSVLYRSNNRNFLGWYQQKP...,51.0,129.5843,0.0604,0.0750,3.10,1.0,left_only
...,...,...,...,...,...,...,...,...,...,...
235,Veltuzumab,QVQLQQSGAEVKKPGSSVKVSCKASGYTFTSYNMHWVKQAPGQGLE...,DIQLTQSPSSLSASVGDRVTMTCRASSSVSYIHWFQQKPGKAPKPW...,47.0,131.6119,0.0000,0.0000,-6.00,1.0,left_only
236,Visilizumab,QVQLVQSGAEVKKPGASVKVSCKASGYTFISYTMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCSASSSVSYMNWYQQKPGKAPKRL...,46.0,124.0825,0.1417,0.1812,8.40,1.0,left_only
238,Zalutumumab,QVQLVESGGGVVQPGRSLRLSCAASGFTFSTYGMHWVRQAPGKGLE...,AIQLTQSPSSLSASVGDRVTITCRASQDISSALVWYQQKPGKAPKL...,52.0,121.8996,0.0000,1.2505,0.00,1.0,left_only
239,Zanolimumab,QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQDISSWLAWYQHKPGKAPKL...,42.0,112.5357,0.0000,0.0000,6.51,1.0,left_only


In [9]:
unique = unique[["Antibody_ID", "heavy", "light", "Y"]]
unique.to_csv(path.join(DATA_DIR, "tap/tap_not_in_chen.csv"))

## Jain

In [9]:
jain_data = pd.read_csv(path.join(DATA_DIR, "Jain/Jain_data_2.csv"), sep=";")
jain_data.head()

Unnamed: 0,Name,VH,VL,LC Class,Source,Source Detaileda,Disclaimers and Known Issues,Unnamed: 7,Notes,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,abituzumab,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...,kappa,WHO-INN,PL109,,,aPL and RL refer to WHO-INN publications for p...,,,,,,
1,abrilumab,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,kappa,WHO-INN,PL111,,,,,,,,,
2,adalimumab,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,kappa,PDB,4NYL,,,,,,,,,
3,alemtuzumab,QVQLQESGPGLVRPSQTLSLTCTVSGFTFTDFYMNWVRQPPGRGLE...,DIQMTQSPSSLSASVGDRVTITCKASQNIDKYLNWYQQKPGKAPKL...,kappa,PDB,1BEY,,,,,,,,,
4,alirocumab,EVQLVESGGGLVQPGGSLRLSCAASGFTFNNYAMNWVRQAPGKGLD...,DIVMTQSPDSLAVSLGERATINCKSSQSVLYRSNNRNFLGWYQQKP...,kappa,WHO-INN,PL107,,,,,,,,,


In [10]:
jain_abs = set(jain_data["Name"])

In [11]:
common = jain_abs.intersection(set(data_tap["Antibody_ID"].str.lower()))
len(common)

137

## Shehata

In [12]:
shehata_data = pd.read_excel(path.join(DATA_DIR, "shehata/shehata42.xlsx"))
shehata_data.head()

Unnamed: 0,Antibody name,TmApp (°C),PSR Score,HIC Retention Time (min)
0,adalimumab,69.5,0.0,8.92
1,alirocumab,69.0,0.0,ND
2,basiliximab,62.0,0.05959,ND
3,brentuximab,69.5,0.546259,9.843
4,brodalumab,73.5,0.053885,8.882


In [13]:
shehata_abs = set(shehata_data["Antibody name"])

In [14]:
common = shehata_abs.intersection(set(data_tap["Antibody_ID"].str.lower()))
len(common)

42

In [15]:
tap_abs = set(data_tap["Antibody_ID"].str.lower())

In [18]:
len(shehata_abs.intersection(jain_abs))

42