In [2]:
import pandas as pd
import numpy as np 
from os import path

In [3]:
# Root directory of the datasets, given relative to the notebook's working directory.
DATA_DIR = "../data"

In [4]:
# Load the three deduplicated Chen splits (train / valid / test) from DATA_DIR;
# the first CSV column is used as the DataFrame index.
chen_train = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_train_data.csv"), index_col=0)
chen_valid = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_valid_data.csv"), index_col=0)
chen_test = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_test_data.csv"), index_col=0)
# Preview the first rows of the training split.
chen_train.head()

Unnamed: 0,Antibody_ID,heavy,light,Y
2073,6aod,EVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLE...,DIVMTKSPSSLSASVGDRVTITCRASQGIRNDLGWYQQKPGKAPKR...,0
1517,4yny,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,EFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2025,5xcv,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,QFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2070,6and,EVQLVESGGGLVQPGGSLRLSCAASGYEFSRSWMNWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRSSQSIVHSVGNTFLEWYQQKPG...,1
666,2xqy,QVQLQQPGAELVKPGASVKMSCKASGYSFTSYWMNWVKQRPGRGLE...,DIVLTQSPASLALSLGQRATISCRASKSVSTSGYSYMYWYQQKPGQ...,0


In [8]:
from sapiens.roberta import RoBERTa, seq2tokens
from fairseq.models.roberta import RobertaModel

In [29]:
# Load 'checkpoint_best.pt' from the heavy/02_pretrained_2000epochs checkpoint directory.
# NOTE(review): this path is relative to the working directory and does not go through
# DATA_DIR ("../data") like the CSV loading does — confirm that is intended.
roberta = RoBERTa.load(
    "data/models/heavy/02_pretrained_2000epochs/checkpoints",
    'checkpoint_best.pt'
)

In [13]:
def label_fn(label):
    """Translate a predicted class index into its label string.

    The index is offset by the number of special symbols (``nspecial``) of the
    task's label dictionary before being rendered with ``string``.

    Args:
        label: class index, e.g. the value returned by ``get_pred``.

    Returns:
        Whatever ``label_dictionary.string`` yields for the shifted index.
    """
    shifted = label + roberta.interface.task.label_dictionary.nspecial
    return roberta.interface.task.label_dictionary.string([shifted])

In [34]:
def get_pred(seq):
    """Return the predicted class index for one sequence.

    The sequence is tokenized against the task's target dictionary and scored
    by the 'sentence_classification_head' of the module-level ``roberta``
    object; the position of the highest score is returned as a Python scalar.
    Pass the result to ``label_fn`` to get the label string instead of the index.

    Args:
        seq: sequence string to classify.

    Returns:
        Index of the highest-scoring class.
    """
    vocab = roberta.interface.task.target_dictionary
    encoded = seq2tokens(seq, vocab, True)
    scores = roberta.interface.predict('sentence_classification_head', encoded)
    best = scores.argmax()
    return best.item()

In [25]:
# Run get_pred on every value of the "heavy" column and store the class indices
# in a new "predictions" column (mutates chen_test in place).
chen_test["predictions"] = chen_test["heavy"].apply(get_pred)

In [23]:
# Inspect the classification heads attached to the loaded model.
roberta.interface.model.classification_heads

ModuleDict(
  (sentence_classification_head): RobertaClassificationHead(
    (dense): Linear(in_features=128, out_features=128, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
    (out_proj): Linear(in_features=128, out_features=2, bias=True)
  )
)

In [26]:
# Preview the test split together with the new "predictions" column.
chen_test.head()

Unnamed: 0,Antibody_ID,heavy,light,Y,predictions
2372,6obd,EVQLVESGGGLVQPGGSLRLSCAASGFPFSNYWMNWVRQAPGKGLE...,DIVMTQTPLSLSVTPGQPASISCKSSQSLLYSNGKTYLNWVLQKPG...,0,0
359,1sm3,QVQLQESGGGLVQPGGSMKLSCVASGFTFSNYWMNWVRQSPEKGLE...,DIVVTQESALTTSPGETVTLTCRSSTGAVTTSNYANWVQEKPDHLF...,0,0
1539,5a2i,QVQLQESGGGLVQPGGSMKLSCVASGFTFSNYWMNWVRQSPEKGLE...,DIVVTQESALTTSPGETVTLTCRSSTGAVTTSNYANWVQEKPDHLF...,0,0
1112,4f33,EVQLQQSGPELEKPGASVKISCKASGYSFTGYTMNWVKQSHGKSLE...,DIELTQSPAIMSASPGEKVTMTCSASSSVSYMHWYQQKSGTSPKRW...,1,1
314,1q1j,EVQLVESGGGLVKPGGSLRLTCVASGFTFSDVWLNWVRQAPGKGLE...,QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNYVLWYQQFPGTAPK...,0,0


In [27]:
# Count how many test rows were assigned to each predicted class.
chen_test["predictions"].value_counts()

0    108
1     11
Name: predictions, dtype: int64

In [35]:
def evaluation(df):
    """Build a 2x2 confusion matrix from binary labels and predictions.

    Args:
        df: DataFrame with a ``Y`` column (true label) and a ``predictions``
            column (predicted label), both encoded as 0/1. Rows where either
            value is something other than 0 or 1 are not counted.

    Returns:
        DataFrame with integer row index 0/1 (true class) and string column
        labels "0"/"1" (predicted class), laid out as::

            [[TN, FP],
             [FN, TP]]

        Because the column labels are strings, read cells by position
        (``.iloc[row, col]``) or with the string label (``.loc[row, "1"]``).
    """
    # No rows means nothing to count; return all zeros without touching columns.
    if len(df) == 0:
        return pd.DataFrame({"0": [0, 0], "1": [0, 0]})

    # Vectorized boolean masks replace the per-row iterrows() loop.
    actual_pos = df["Y"] == 1
    actual_neg = df["Y"] == 0
    pred_pos = df["predictions"] == 1
    pred_neg = df["predictions"] == 0

    TP = int((actual_pos & pred_pos).sum())
    FN = int((actual_pos & pred_neg).sum())
    FP = int((actual_neg & pred_pos).sum())
    TN = int((actual_neg & pred_neg).sum())
    return pd.DataFrame({"0": [TN, FN], "1": [FP, TP]})

In [50]:
# Confusion matrix for the test split: rows are the true class, columns the predicted class.
conf_mat = evaluation(chen_test)
conf_mat

Unnamed: 0,0,1
0,89,7
1,19,4


In [56]:
# False positives: true class 0 (row 0), predicted class 1 (second column).
# The columns are labelled with the strings "0"/"1", so the cell is read by position
# with .iloc; `conf_mat.loc[0][1]` relied on pandas treating the integer key as a
# position on a string-indexed Series, which is deprecated.
conf_mat.iloc[0, 1]

7

In [58]:
# F1 score = TP / (TP + 0.5 * (FP + FN))
# Cells are read by position (.iloc[row, col]) because the confusion-matrix columns
# are labelled with the strings "0"/"1"; integer keys on the row Series would depend
# on deprecated positional fallback.
conf_mat.iloc[1, 1] / (conf_mat.iloc[1, 1] + 0.5 * (conf_mat.iloc[0, 1] + conf_mat.iloc[1, 0]))

0.23529411764705882

In [37]:
def get_pred(seq):
    """Return the predicted class index for one sequence.

    Reads the module-level ``roberta`` at call time, so it always uses the
    checkpoint most recently loaded by the loop below.
    """
    tokens = seq2tokens(seq, roberta.interface.task.target_dictionary, True)
    return roberta.interface.predict('sentence_classification_head', tokens).argmax().item()


# Evaluate every saved checkpoint (100, 200, ..., 2000) on the test split and
# print its confusion matrix and F1 score.
for chkpt in range(100, 2005, 100):
    roberta = RoBERTa.load(
        "data/models/heavy/02_pretrained_2000epochs/checkpoints",
        f'checkpoint{chkpt}.pt'
    )
    # Overwrites the "predictions" column with this checkpoint's outputs.
    chen_test["predictions"] = chen_test["heavy"].apply(get_pred)
    print(f"Checkpoint: {chkpt}")
    conf_mat = evaluation(chen_test)
    display(conf_mat)
    # Read cells by position: the matrix columns are the strings "0"/"1", so integer
    # keys on a row Series would rely on deprecated positional fallback.
    tp, fp, fn = conf_mat.iloc[1, 1], conf_mat.iloc[0, 1], conf_mat.iloc[1, 0]
    # F1 score = TP / (TP + 0.5 * (FP + FN))
    print(tp / (tp + 0.5 * (fp + fn)))

Checkpoint: 100


Unnamed: 0,0,1
0,14,82
1,2,21


0.3333333333333333
Checkpoint: 200


Unnamed: 0,0,1
0,96,0
1,23,0


0.0
Checkpoint: 300


Unnamed: 0,0,1
0,96,0
1,23,0


0.0
Checkpoint: 400


Unnamed: 0,0,1
0,96,0
1,23,0


0.0
Checkpoint: 500


Unnamed: 0,0,1
0,96,0
1,23,0


0.0
Checkpoint: 600


Unnamed: 0,0,1
0,96,0
1,23,0


0.0
Checkpoint: 700


Unnamed: 0,0,1
0,94,2
1,22,1


0.07692307692307693
Checkpoint: 800


Unnamed: 0,0,1
0,90,6
1,19,4


0.24242424242424243
Checkpoint: 900


Unnamed: 0,0,1
0,89,7
1,19,4


0.23529411764705882
Checkpoint: 1000


Unnamed: 0,0,1
0,88,8
1,20,3


0.17647058823529413
Checkpoint: 1100


Unnamed: 0,0,1
0,88,8
1,20,3


0.17647058823529413
Checkpoint: 1200


Unnamed: 0,0,1
0,88,8
1,20,3


0.17647058823529413
Checkpoint: 1300


Unnamed: 0,0,1
0,89,7
1,19,4


0.23529411764705882
Checkpoint: 1400


Unnamed: 0,0,1
0,89,7
1,19,4


0.23529411764705882
Checkpoint: 1500


Unnamed: 0,0,1
0,89,7
1,20,3


0.18181818181818182
Checkpoint: 1600


Unnamed: 0,0,1
0,90,6
1,19,4


0.24242424242424243
Checkpoint: 1700


Unnamed: 0,0,1
0,86,10
1,19,4


0.21621621621621623
Checkpoint: 1800


Unnamed: 0,0,1
0,87,9
1,19,4


0.2222222222222222
Checkpoint: 1900


Unnamed: 0,0,1
0,88,8
1,19,4


0.22857142857142856
Checkpoint: 2000


Unnamed: 0,0,1
0,87,9
1,19,4


0.2222222222222222


In [30]:
# Recompute the "predictions" column with whichever `roberta` / `get_pred` are
# currently bound in the kernel (overwrites the previous column).
chen_test["predictions"] = chen_test["heavy"].apply(get_pred)

In [33]:
# Rebuild and display the confusion matrix for the refreshed predictions.
conf_mat = evaluation(chen_test)
display(conf_mat)

Unnamed: 0,0,1
0,0,0
1,0,4


In [13]:
# Tokenize the "heavy" sequence of the first test row.
# NOTE(review): this uses task.source_dictionary, whereas get_pred uses
# task.target_dictionary — confirm the two are interchangeable for this task.
tokens = seq2tokens(chen_test.iloc[0]["heavy"], roberta.interface.task.source_dictionary, True)

In [20]:
# Load the same best checkpoint directly through fairseq's RobertaModel.from_pretrained.
# NOTE(review): this rebinds the module-level `roberta`; get_pred accesses
# `roberta.interface`, which presumably is not available on this object — confirm
# before re-running get_pred after this cell.
# NOTE(review): user_dir is an absolute, machine-specific path.
#print('Loading RoBERTa model...')
roberta = RobertaModel.from_pretrained(
    "data/models/heavy/02_pretrained_2000epochs/checkpoints",
    checkpoint_file='checkpoint_best.pt',
    data_name_or_path="data/processed/heavy/train",
    user_dir='/home/brazdilv/diplomka/SW/Sapiens/sapiens/',
    bpe=None
)
roberta.eval() # disable dropout

RobertaHubInterface(
  (model): RobertaModel(
    (encoder): RobertaEncoder(
      (sentence_encoder): TransformerSentenceEncoder(
        (dropout_module): FairseqDropout()
        (embed_tokens): Embedding(25, 128, padding_idx=1)
        (embed_positions): LearnedPositionalEmbedding(146, 128, padding_idx=1)
        (layers): ModuleList(
          (0): TransformerSentenceEncoderLayer(
            (dropout_module): FairseqDropout()
            (activation_dropout_module): FairseqDropout()
            (self_attn): MultiheadAttention(
              (dropout_module): FairseqDropout()
              (k_proj): Linear(in_features=128, out_features=128, bias=True)
              (v_proj): Linear(in_features=128, out_features=128, bias=True)
              (q_proj): Linear(in_features=128, out_features=128, bias=True)
              (out_proj): Linear(in_features=128, out_features=128, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True

In [23]:
# Score the tokenized sequence with the 'sentence_classification_head'.
pred = roberta.predict('sentence_classification_head', tokens)

In [26]:
# Index of the highest-scoring class, as a Python scalar.
pred.argmax().item()

0

In [2]:
# Train a roberta_small model on the sentence_prediction task (2 classes) using the data
# in data/processed/light/train/, starting from the checkpoint given by --restore-file
# with optimizer, dataloader and meters reset. Checkpoints are saved every 100 epochs.
# NOTE(review): checkpoints and tensorboard logs go under data/models/light/..., but the
# tee'd log is written to models/light/... (no data/ prefix) — confirm which is intended.
# NOTE(review): the head is named "di-pred" here, while the prediction cells in this
# notebook use 'sentence_classification_head' — confirm the name expected downstream.
# NOTE(review): the captured log under this cell reports data='data/processed/heavy/train/'
# and classification_head_name='sentence_classification_head', so it appears to come
# from a different invocation — TODO confirm / re-run.
# NOTE(review): --user-dir and --restore-file are absolute, machine-specific paths.
!fairseq-train \
    data/processed/light/train/  \
    --user-dir /home/brazdilv/diplomka/SW/Sapiens/sapiens \
    --init-token 0 --separator-token 2 \
    --restore-file /home/brazdilv/diplomka/SW/Sapiens/sapiens/models/v1/checkpoint_vl.pt \
    --reset-optimizer --reset-dataloader --reset-meters \
    --save-dir data/models/light/01_pretrained_2000epochs/checkpoints \
    --tensorboard-logdir data/models/light/01_pretrained_2000epochs/tensorboard \
    --arch roberta_small \
    --criterion sentence_prediction \
    --task sentence_prediction \
    --classification-head-name di-pred \
    --num-classes 2 \
    --optimizer adam \
    --lr 1e-5 --lr-scheduler inverse_sqrt --warmup-updates 10000 \
    --dropout 0.1 --attention-dropout 0.1 \
    --max-positions 144 \
    --shorten-method truncate \
    --batch-size 256 \
    --max-epoch 2000 \
    --log-format simple \
    --log-interval 1000 \
    --validate-interval 1 \
    --save-interval 100 \
        2>&1 | tee models/light/01_pretrained_2000epochs/log

2022-03-02 20:53:35 | INFO | fairseq_cli.train | Namespace(activation_dropout=0.0, activation_fn='gelu', adam_betas='(0.9, 0.999)', adam_eps=1e-08, add_prev_output_tokens=False, all_gather_list_size=16384, arch='roberta_small', attention_dropout=0.1, batch_size=256, batch_size_valid=256, best_checkpoint_metric='loss', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_shard_count=1, checkpoint_suffix='', classification_head_name='sentence_classification_head', clip_norm=0.0, cpu=False, criterion='sentence_prediction', curriculum=0, data='data/processed/heavy/train/', data_buffer_size=10, dataset_impl=None, ddp_backend='c10d', device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_num_procs=0, distributed_port=-1, distributed_rank=0, distributed_world_size=1, distributed_wrapper='DDP', dropout=0.1, empty_cache_freq=0, encoder_attention_heads=8, encoder_embed_dim=128, encoder_ffn_em

In [None]:
# Train a roberta_small model on the sentence_prediction task (2 classes) using the data
# in data/processed/heavy/train/, starting from the checkpoint given by --restore-file
# with optimizer, dataloader and meters reset. Checkpoints are saved every 20 epochs and
# stdout/stderr are tee'd to models/heavy/01_pretrained_2000epochs/log.
# NOTE(review): --max-epoch is 1900 although the run directory is named
# 01_pretrained_2000epochs — confirm which value is intended.
# NOTE(review): --user-dir and --restore-file are absolute, machine-specific paths
# (and differ from the ones used in the light-chain training cell).
!fairseq-train \
    data/processed/heavy/train/  \
    --user-dir /Users/brazdilk/Projects/BioPhi/biophi/humanization/methods/sapiens \
    --init-token 0 --separator-token 2 \
    --restore-file /Users/brazdilk/Projects/diplomka/di-pred/fairseq/models/heavy/01_pretrained_2000epochs/checkpoint_best.pt \
    --reset-optimizer --reset-dataloader --reset-meters \
    --save-dir models/heavy/01_pretrained_2000epochs/checkpoints \
    --tensorboard-logdir models/heavy/01_pretrained_2000epochs/tensorboard \
    --arch roberta_small \
    --criterion sentence_prediction \
    --task sentence_prediction \
    --num-classes 2 \
    --optimizer adam \
    --lr 1e-5 --lr-scheduler inverse_sqrt --warmup-updates 10000 \
    --dropout 0.1 --attention-dropout 0.1 \
    --max-positions 144 \
    --shorten-method truncate \
    --batch-size 256 \
    --max-epoch 1900 \
    --log-format simple \
    --log-interval 100 \
    --validate-interval 1 \
    --save-interval 20 \
        2>&1 | tee models/heavy/01_pretrained_2000epochs/log

2022-03-04 08:35:42 | INFO | fairseq_cli.train | Namespace(activation_dropout=0.0, activation_fn='gelu', adam_betas='(0.9, 0.999)', adam_eps=1e-08, add_prev_output_tokens=False, all_gather_list_size=16384, arch='roberta_small', attention_dropout=0.1, batch_size=256, batch_size_valid=256, best_checkpoint_metric='loss', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_shard_count=1, checkpoint_suffix='', classification_head_name='sentence_classification_head', clip_norm=0.0, cpu=False, criterion='sentence_prediction', curriculum=0, data='data/processed/heavy/train/', data_buffer_size=10, dataset_impl=None, ddp_backend='c10d', device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_num_procs=0, distributed_port=-1, distributed_rank=0, distributed_world_size=1, distributed_wrapper='DDP', dropout=0.1, empty_cache_freq=0, encoder_attention_heads=8, encoder_embed_dim=128, encoder_ffn_em