In [1]:
import pandas as pd
import numpy as np
from os import path

In [2]:
DATA_DIR = "../../data/"

In [3]:
chen_data = pd.read_csv(path.join(DATA_DIR, "chen/chen_data.csv"))
chen_data.head()

Unnamed: 0,Antibody_ID,heavy,light,Y
0,12e8,EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLE...,DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKL...,0
1,15c8,EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYMHWVKQKPEQGLE...,DIVLTQSPAIMSASLGERVTMTCTASSSVSSSNLHWYQQKPGSSPK...,0
2,1a0q,EVQLQESDAELVKPGASVKISCKASGYTFTDHVIHWVKQKPEQGLE...,DIELTQSPSSLSASLGGKVTITCKASQDIKKYIGWYQHKPGKQPRL...,1
3,1a14,QVQLQQSGAELVKPGASVRMSCKASGYTFTNYNMYWVKQSPGQGLE...,DIELTQTTSSLSASLGDRVTISCRASQDISNYLNWYQQNPDGTVKL...,0
4,1a2y,QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLE...,DIVLTQSPASLSASVGETVTITCRASGNIHNYLAWYQQKQGKSPQL...,0


# Explore lengths of sequences
## Heavy

In [6]:
# max length
chen_data["heavy"].str.len().max()

171

In [7]:
# min length
chen_data["heavy"].str.len().min()

103

## Light

In [8]:
# max length
chen_data["light"].str.len().max()

119

In [9]:
# min length
chen_data["light"].str.len().min()

93

-> We aim for a vector of 171 + 119 ints

# Encode sequences

In [11]:
# Which amino acids are present? (Make sure we don't have non-standard such as pyrrolysine)
all_aas = set()

def add_aas(aa_list):
    global all_aas
    all_aas = all_aas.union(set(aa_list))
    
chen_data["heavy"].str.split("").apply(add_aas)
chen_data["light"].str.split("").apply(add_aas)

print(sorted(list(all_aas)))
aa_list = sorted(list(all_aas))[1:]


['', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']


In [12]:
aa_list.append("-")
ENCODING = {
    letter: number + 1 for number, letter in enumerate(aa_list)
}
# zero is left for padding
ENCODING

{'A': 1,
 'C': 2,
 'D': 3,
 'E': 4,
 'F': 5,
 'G': 6,
 'H': 7,
 'I': 8,
 'K': 9,
 'L': 10,
 'M': 11,
 'N': 12,
 'P': 13,
 'Q': 14,
 'R': 15,
 'S': 16,
 'T': 17,
 'V': 18,
 'W': 19,
 'Y': 20,
 '-': 21}

In [13]:
tap_data = pd.read_csv(path.join(DATA_DIR, "tap/TAP_data.csv"))
tap_data.head()

Unnamed: 0,Antibody_ID,heavy,light,CDR_length,PSH,PPC,PNC,SFvCSP,Y
0,Abagovomab,QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQGLD...,DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...,46,129.7603,0.0,0.0,16.32,1
1,Abituzumab,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...,45,115.9106,0.0954,0.0421,-3.1,1
2,Abrilumab,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,45,109.6995,0.0,0.8965,-4.0,1
3,Actoxumab,QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQHKPGKAPKL...,49,112.629,0.0,1.1247,3.1,1
4,Adalimumab,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,48,111.2512,0.0485,1.1364,-19.5,1


In [14]:
# check also for TAP dataset
all_aas = set()

def add_aas(aa_list):
    global all_aas
    all_aas = all_aas.union(set(aa_list))
    
tap_data["heavy"].str.split("").apply(add_aas)
tap_data["light"].str.split("").apply(add_aas)

print(sorted(list(all_aas)))

['', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']


In [15]:
TARGET_LEN = chen_data["heavy"].str.len().max() + chen_data["light"].str.len().max()
TARGET_LEN

290

In [16]:
# TAP sequences are shorter
tap_data["heavy"].str.len().max() + tap_data["light"].str.len().max()

243

No weird amino acids in either dataset

In [17]:
def encode(row):
    vector = [ENCODING[aa] for aa in row["heavy"]] + [ENCODING[aa] for aa in row["light"]]
    padding = TARGET_LEN - len(vector)
    vector = vector + [0] * padding
    # which end to pad from ???
    return vector

In [18]:
encoded = chen_data.apply(encode, axis=1, result_type="expand")

In [19]:
encoded.insert(0, "Ab_ID", chen_data["Antibody_ID"])
encoded.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,280,281,282,283,284,285,286,287,288,289
0,12e8,4,18,14,10,14,14,16,6,1,...,0,0,0,0,0,0,0,0,0,0
1,15c8,4,18,14,10,14,14,16,6,1,...,0,0,0,0,0,0,0,0,0,0
2,1a0q,4,18,14,10,14,4,16,3,1,...,0,0,0,0,0,0,0,0,0,0
3,1a14,14,18,14,10,14,14,16,6,1,...,0,0,0,0,0,0,0,0,0,0
4,1a2y,14,18,14,10,14,4,16,6,13,...,0,0,0,0,0,0,0,0,0,0


In [20]:
encoded.drop("Ab_ID", axis=1).sum()

0      20686
1      40697
2      32335
3      24300
4      35502
       ...  
285        0
286        0
287        0
288        0
289        0
Length: 290, dtype: int64

Many columns have only padding -> drop those

In [21]:
encoded = encoded.loc[:,encoded.sum() != 0]

In [70]:
encoded.to_csv(path.join(DATA_DIR, "chen/integer_encoding/chen_integer_encoded.csv"), index=False)

# Encode according to ANARCI numbering

In [9]:
heavy_df = pd.read_csv(path.join(DATA_DIR, "chen/abnumber/chen_heavy_chain_numbering.csv"))
light_df = pd.read_csv(path.join(DATA_DIR, "chen/abnumber/chen_light_chain_numbering.csv"))
heavy_df.head()

Unnamed: 0,Ab_ID,chain_type,species,1,2,3,3A,4,4A,5,...,119,120,121,122,123,124,125,126,127,128
0,12e8,H,mouse,E,V,Q,-,L,-,Q,...,G,Q,G,T,L,V,T,V,S,A
1,15c8,H,mouse,E,V,Q,-,L,-,Q,...,G,Q,G,T,T,L,T,V,S,S
2,1a0q,H,mouse,E,V,Q,-,L,-,Q,...,G,Q,G,T,T,L,T,V,S,S
3,1a14,H,mouse,Q,V,Q,-,L,-,Q,...,G,Q,G,T,T,V,T,V,-,-
4,1a2y,H,mouse,Q,V,Q,-,L,-,Q,...,G,Q,G,T,T,L,T,V,S,S


In [12]:
heavy_df["1"].apply(lambda x: ENCODING[x])

0        4
1        4
2        4
3       14
4       14
        ..
2402     4
2403     4
2404     4
2405     1
2406     1
Name: 1, Length: 2407, dtype: int64

In [14]:
pos_columns = heavy_df.columns[3:]
for col in pos_columns:
    heavy_df[col] = heavy_df[col].apply(lambda x: ENCODING[x])
heavy_df.head()

Unnamed: 0,Ab_ID,chain_type,species,1,2,3,3A,4,4A,5,...,119,120,121,122,123,124,125,126,127,128
0,12e8,H,mouse,4,18,14,21,10,21,14,...,6,14,6,17,10,18,17,18,16,1
1,15c8,H,mouse,4,18,14,21,10,21,14,...,6,14,6,17,17,10,17,18,16,16
2,1a0q,H,mouse,4,18,14,21,10,21,14,...,6,14,6,17,17,10,17,18,16,16
3,1a14,H,mouse,14,18,14,21,10,21,14,...,6,14,6,17,17,18,17,18,21,21
4,1a2y,H,mouse,14,18,14,21,10,21,14,...,6,14,6,17,17,10,17,18,16,16


In [15]:
pos_columns = light_df.columns[3:]
for col in pos_columns:
    light_df[col] = light_df[col].apply(lambda x: ENCODING[x])
light_df.head()

Unnamed: 0,Ab_ID,chain_type,species,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,12e8,K,mouse,3,8,18,11,17,14,16,...,5,6,1,6,17,9,10,4,10,9
1,15c8,K,mouse,3,8,18,10,17,14,16,...,5,6,6,6,17,9,10,4,8,9
2,1a0q,K,rat,3,8,4,10,17,14,16,...,5,6,6,6,17,9,10,4,8,9
3,1a14,K,mouse,3,8,4,10,17,14,17,...,5,6,6,6,17,21,21,21,21,21
4,1a2y,K,mouse,3,8,18,10,17,14,16,...,5,6,6,6,17,9,10,4,8,9


In [16]:
encoded_df = pd.concat([heavy_df.drop(["chain_type", "species"], axis=1), light_df.drop(["chain_type", "species", "Ab_ID"], axis=1)], axis=1)

In [17]:
encoded_df

Unnamed: 0,Ab_ID,1,2,3,3A,4,4A,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,12e8,4.0,18.0,14.0,21.0,10.0,21.0,14.0,14.0,16.0,...,5,6,1,6,17,9,10,4,10,9
1,15c8,4.0,18.0,14.0,21.0,10.0,21.0,14.0,14.0,16.0,...,5,6,6,6,17,9,10,4,8,9
2,1a0q,4.0,18.0,14.0,21.0,10.0,21.0,14.0,4.0,16.0,...,5,6,6,6,17,9,10,4,8,9
3,1a14,14.0,18.0,14.0,21.0,10.0,21.0,14.0,14.0,16.0,...,5,6,6,6,17,21,21,21,21,21
4,1a2y,14.0,18.0,14.0,21.0,10.0,21.0,14.0,4.0,16.0,...,5,6,6,6,17,9,10,4,8,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2404,6s5a,4.0,18.0,14.0,21.0,10.0,21.0,18.0,4.0,16.0,...,5,6,6,6,17,9,10,17,18,10
2405,6tyb,1.0,18.0,14.0,21.0,10.0,21.0,4.0,14.0,16.0,...,5,6,14,6,17,9,18,4,8,9
2406,6u1t,1.0,18.0,9.0,21.0,10.0,21.0,18.0,14.0,1.0,...,5,6,6,6,17,9,10,4,8,9
2407,,,,,,,,,,,...,5,6,6,6,17,9,10,17,18,10


In [18]:
encoded_df.to_csv(path.join(DATA_DIR, "chen/integer_encoding/chen_integers_from_anarci.csv"), index=False)


In [20]:
HEAVY_MAX_LEN = chen_data["heavy"].str.len().max()
LIGHT_MAX_LEN = chen_data["light"].str.len().max()

In [21]:
len(heavy_df)

2407

In [24]:
def encode_separate(row):
    vector = [ENCODING[aa] for aa in row["heavy"]] 
    padding = HEAVY_MAX_LEN - len(vector)
    vector += [0] * padding
    vector_l = [ENCODING[aa] for aa in row["light"]]
    padding = LIGHT_MAX_LEN - len(vector_l)
    vector = vector + vector_l + [0] * padding
    return vector

In [25]:
encoded_separate = chen_data.apply(encode_separate, axis=1, result_type="expand")

In [26]:
encoded_separate.insert(0, "Ab_ID", chen_data["Antibody_ID"])
encoded_separate.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,280,281,282,283,284,285,286,287,288,289
0,12e8,4,18,14,10,14,14,16,6,1,...,0,0,0,0,0,0,0,0,0,0
1,15c8,4,18,14,10,14,14,16,6,1,...,0,0,0,0,0,0,0,0,0,0
2,1a0q,4,18,14,10,14,4,16,3,1,...,0,0,0,0,0,0,0,0,0,0
3,1a14,14,18,14,10,14,14,16,6,1,...,0,0,0,0,0,0,0,0,0,0
4,1a2y,14,18,14,10,14,4,16,6,13,...,0,0,0,0,0,0,0,0,0,0


In [27]:
encoded_separate.drop("Ab_ID", axis=1).sum()

0      20686
1      40697
2      32335
3      24300
4      35502
       ...  
285      220
286       85
287       34
288       36
289       20
Length: 290, dtype: int64

In [29]:
encoded_separate.to_csv(path.join(DATA_DIR, "chen/integer_encoding/chen_integer_encoded_separate.csv"), index=False)

# TAP data

Repeat with the other dataset

In [23]:
# Number of columns kept in the first dataset
len(encoded.columns)

282

In [28]:
encoded_tap = tap_data.apply(encode, axis=1, result_type="expand")
encoded_tap.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,280,281,282,283,284,285,286,287,288,289
0,14,18,9,10,14,4,16,6,1,4,...,0,0,0,0,0,0,0,0,0,0
1,14,18,14,10,14,14,16,6,6,4,...,0,0,0,0,0,0,0,0,0,0
2,14,18,14,10,18,14,16,6,1,4,...,0,0,0,0,0,0,0,0,0,0
3,14,18,14,10,18,4,16,6,6,6,...,0,0,0,0,0,0,0,0,0,0
4,4,18,14,10,18,4,16,6,6,6,...,0,0,0,0,0,0,0,0,0,0


In [29]:
encoded_tap.insert(0, "Ab_ID", tap_data["Antibody_ID"])
encoded_tap.head()

Unnamed: 0,Ab_ID,0,1,2,3,4,5,6,7,8,...,280,281,282,283,284,285,286,287,288,289
0,Abagovomab,14,18,9,10,14,4,16,6,1,...,0,0,0,0,0,0,0,0,0,0
1,Abituzumab,14,18,14,10,14,14,16,6,6,...,0,0,0,0,0,0,0,0,0,0
2,Abrilumab,14,18,14,10,18,14,16,6,1,...,0,0,0,0,0,0,0,0,0,0
3,Actoxumab,14,18,14,10,18,4,16,6,6,...,0,0,0,0,0,0,0,0,0,0
4,Adalimumab,4,18,14,10,18,4,16,6,6,...,0,0,0,0,0,0,0,0,0,0


In [30]:
encoded_tap.drop("Ab_ID", axis=1).sum()

0      2333
1      4216
2      3341
3      2434
4      3922
       ... 
285       0
286       0
287       0
288       0
289       0
Length: 290, dtype: int64

In [32]:
encoded_tap = encoded_tap.iloc[:,:282]
len(encoded_tap.columns)

282

In [35]:
encoded_tap.to_csv(path.join(DATA_DIR, "tap/integer_encoding/tap_integer_encoded.csv"), index=False)