In [None]:
!pip install transformers==4.28

In [None]:
!pip install datasets

In [None]:
!pip install accelerate -U

In [None]:
import pandas as pd
import numpy as np
import transformers
import datasets
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, AutoModel
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the tab-separated training split of the drug review data.
train = pd.read_table('drive/MyDrive/ПМО/drugsComTrain_raw.tsv')

In [None]:
# Load the tab-separated test split of the drug review data.
test = pd.read_table('drive/MyDrive/ПМО/drugsComTest_raw.tsv')

In [None]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [None]:
# Preview the first rows of the raw test frame.
test.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4


In [None]:
# Strip the metadata columns in place; only `review` and `rating` are used below.
train.drop(
    ['Unnamed: 0', 'drugName', 'condition', 'date', 'usefulCount'],
    axis='columns',
    inplace=True,
)
train.head()

Unnamed: 0,review,rating
0,"""It has no side effect, I take it in combinati...",9.0
1,"""My son is halfway through his fourth week of ...",8.0
2,"""I used to take another oral contraceptive, wh...",5.0
3,"""This is my first time using any form of birth...",8.0
4,"""Suboxone has completely turned my life around...",9.0


In [None]:
# Strip the metadata columns in place; only `review` and `rating` are used below.
test.drop(
    ['Unnamed: 0', 'drugName', 'condition', 'date', 'usefulCount'],
    axis='columns',
    inplace=True,
)
test.head()

Unnamed: 0,review,rating
0,"""I&#039;ve tried a few antidepressants over th...",10.0
1,"""My son has Crohn&#039;s disease and has done ...",8.0
2,"""Quick reduction of symptoms""",9.0
3,"""Contrave combines drugs that were used for al...",9.0
4,"""I have been on this birth control for one cyc...",9.0


In [None]:
# Bucket the rating into three classes by counting how many thresholds it reaches:
#   0 -> rating < 4 (or missing), 1 -> 4 <= rating < 8, 2 -> rating >= 8
train['label'] = (
    (train['rating'] >= 4).astype('int64')
    + (train['rating'] >= 8).astype('int64')
)
# Downstream tokenization reads the review from a column named `text`.
train.rename(columns={'review': 'text'}, inplace=True)
# The raw rating is no longer needed once the class label exists.
del train['rating']

In [None]:
# Bucket the rating into three classes by counting how many thresholds it reaches:
#   0 -> rating < 4 (or missing), 1 -> 4 <= rating < 8, 2 -> rating >= 8
test['label'] = (
    (test['rating'] >= 4).astype('int64')
    + (test['rating'] >= 8).astype('int64')
)
# Downstream tokenization reads the review from a column named `text`.
test.rename(columns={'review': 'text'}, inplace=True)
# The raw rating is no longer needed once the class label exists.
del test['rating']

In [None]:
# Sanity-check the labelled frame: it should now hold only `text` and `label`.
train.head(10)

Unnamed: 0,text,label
0,"""It has no side effect, I take it in combinati...",2
1,"""My son is halfway through his fourth week of ...",2
2,"""I used to take another oral contraceptive, wh...",1
3,"""This is my first time using any form of birth...",2
4,"""Suboxone has completely turned my life around...",2
5,"""2nd day on 5mg started to work with rock hard...",0
6,"""He pulled out, but he cummed a bit in me. I t...",0
7,"""Abilify changed my life. There is hope. I was...",2
8,""" I Ve had nothing but problems with the Kepp...",0
9,"""I had been on the pill for many years. When m...",2


In [None]:
# Wrap both pandas frames as Hugging Face datasets (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train, test)
)

In [None]:
# Tokenizer for the uncased BERT base checkpoint; lower-casing is requested explicitly.
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

In [None]:
def tokenize_function_bert(examples):
    """Tokenize a batch of reviews with the BERT tokenizer, for use with `Dataset.map`.

    Args:
        examples: A batch from the dataset; its "text" entry holds the review strings.

    Returns:
        The tokenizer output (input ids, attention mask, ...) with every
        sequence padded or truncated to exactly 250 tokens.
    """
    # Return plain Python lists on the CPU. `Dataset.map` stores the result in
    # the dataset anyway, so building tensors and moving them to the GPU here
    # would only add a device round trip and make tokenization require CUDA.
    # The Trainer moves each training batch to the right device itself.
    return tokenizer_bert(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=250,
    )

In [None]:
# Tokenize both splits in batches with the BERT tokenizer (train first, then test).
tokenized_bert_train, tokenized_bert_test = (
    split.map(tokenize_function_bert, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/161297 [00:00<?, ? examples/s]

Map:   0%|          | 0/53766 [00:00<?, ? examples/s]

In [None]:
# Inspect the tokenized training split. The previous name `tokenized_train`
# is never assigned in this notebook and raises NameError on a fresh kernel.
tokenized_bert_train

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 161297
})

In [None]:
# BERT base with a fresh 3-way classification head (labels 0/1/2), placed on the GPU.
# NOTE(review): `.to("cuda")` hard-requires a CUDA device for this cell to run.
model_bert = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3).to("cuda")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
def compute_metrics(eval_pred):
    """Compute classification metrics for a Trainer evaluation pass.

    Args:
        eval_pred: A `(logits, labels)` pair; the predicted class is the
            argmax of the logits along the last axis.

    Returns:
        dict with accuracy, weighted precision and recall, and
        micro/macro/weighted F1.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    def averaged(scorer, average):
        # All averaged metrics share the same (labels, predictions) arguments.
        return scorer(labels, predictions, average=average)

    return {
        'accuracy': accuracy_score(labels, predictions),
        'precision': averaged(precision_score, 'weighted'),
        'recall': averaged(recall_score, 'weighted'),
        'f1_micro': averaged(f1_score, 'micro'),
        'f1_macro': averaged(f1_score, 'macro'),
        'f1_weighted': averaged(f1_score, 'weighted'),
    }

In [None]:
# Training configuration for the BERT run:
# - 3 epochs with batch size 16 for both training and evaluation;
# - logging and checkpointing every 5000 steps; no explicit `eval_steps` is
#   given, and the training log shows evaluation at the same 5000-step interval;
# - the checkpoint with the best `f1_weighted` (a key returned by
#   `compute_metrics`) is reloaded once training finishes;
# - `save_total_limit=1` limits how many checkpoints are kept in `output_dir`.
training_args_bert = TrainingArguments(output_dir="drive/MyDrive/ПМО/trainer_bert",
                                       per_device_train_batch_size=16,
                                       per_device_eval_batch_size=16,
                                       num_train_epochs=3,
                                       evaluation_strategy="steps",
                                       logging_steps=5000,
                                       save_steps=5000,
                                       load_best_model_at_end=True,
                                       metric_for_best_model="f1_weighted",
                                       save_total_limit=1
                                      )

In [None]:
# Trainer for the BERT model.
# NOTE(review): `eval_dataset` is the test split and the training arguments
# enable `load_best_model_at_end`, so the best checkpoint is picked using test
# data. Consider a separate validation split so reported test metrics stay unbiased.
trainer = Trainer(
    model=model_bert,
    args=training_args_bert,
    train_dataset=tokenized_bert_train,
    eval_dataset=tokenized_bert_test,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune BERT; the returned TrainOutput is displayed as the cell result.
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Micro,F1 Macro,F1 Weighted
5000,0.5544,0.513154,0.795596,0.771919,0.795596,0.795596,0.65969,0.762433
10000,0.4696,0.444942,0.818156,0.826152,0.818156,0.818156,0.756054,0.821169
15000,0.3702,0.418337,0.849849,0.845371,0.849849,0.849849,0.786772,0.847292
20000,0.335,0.363629,0.869248,0.863796,0.869248,0.869248,0.812163,0.865659
25000,0.2191,0.422537,0.876613,0.880787,0.876613,0.876613,0.833249,0.878415
30000,0.2025,0.411817,0.890693,0.889353,0.890693,0.890693,0.847831,0.88996


TrainOutput(global_step=30246, training_loss=0.35706584075057995, metrics={'train_runtime': 6692.819, 'train_samples_per_second': 72.3, 'train_steps_per_second': 4.519, 'total_flos': 6.216709707901349e+16, 'train_loss': 0.35706584075057995, 'epoch': 3.0})

In [None]:
# Persist the fine-tuned BERT model and its tokenizer to separate Drive folders.
model_bert.save_pretrained("drive/MyDrive/ПМО/model_bert")
tokenizer_bert.save_pretrained("drive/MyDrive/ПМО/tokenizer_bert")

In [None]:
# Also save the model through the Trainer, into its own directory.
trainer.save_model("drive/MyDrive/ПМО/saved_trainer_bert")

In [None]:
# Tokenizer matching the BioBERT v1.1 checkpoint.
tokenizer_bio_bert = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def tokenize_function_bio_bert(examples):
    """Tokenize a batch of reviews with the BioBERT tokenizer, for use with `Dataset.map`.

    Args:
        examples: A batch from the dataset; its "text" entry holds the review strings.

    Returns:
        The tokenizer output (input ids, attention mask, ...) with every
        sequence padded or truncated to exactly 250 tokens.
    """
    # Return plain Python lists on the CPU. `Dataset.map` stores the result in
    # the dataset anyway, so building tensors and moving them to the GPU here
    # would only add a device round trip and make tokenization require CUDA.
    # The Trainer moves each training batch to the right device itself.
    return tokenizer_bio_bert(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=250,
    )

In [None]:
# Tokenize both splits in batches with the BioBERT tokenizer (train first, then test).
tokenized_bio_bert_train, tokenized_bio_bert_test = (
    split.map(tokenize_function_bio_bert, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/161297 [00:00<?, ? examples/s]

Map:   0%|          | 0/53766 [00:00<?, ? examples/s]

In [None]:
# BioBERT v1.1 with a fresh 3-way classification head (labels 0/1/2), placed on the GPU.
# NOTE(review): `.to("cuda")` hard-requires a CUDA device for this cell to run.
model_bio_bert = AutoModelForSequenceClassification.from_pretrained("dmis-lab/biobert-v1.1", num_labels=3).to("cuda")

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration for the BioBERT run:
# - 3 epochs with batch size 16 for both training and evaluation;
# - logging and checkpointing every 5000 steps; no explicit `eval_steps` is
#   given, and the training log shows evaluation at the same 5000-step interval;
# - the checkpoint with the best `f1_weighted` (a key returned by
#   `compute_metrics`) is reloaded once training finishes;
# - `save_total_limit=1` limits how many checkpoints are kept in `output_dir`.
training_args_bio_bert = TrainingArguments(output_dir="drive/MyDrive/ПМО/trainer_bio_bert",
                                           per_device_train_batch_size=16,
                                           per_device_eval_batch_size=16,
                                           num_train_epochs=3,
                                           evaluation_strategy="steps",
                                           logging_steps=5000,
                                           save_steps=5000,
                                           load_best_model_at_end=True,
                                           metric_for_best_model="f1_weighted",
                                           save_total_limit=1
                                          )

In [None]:
# Trainer for the BioBERT model.
# NOTE(review): `eval_dataset` is the test split and the training arguments
# enable `load_best_model_at_end`, so the best checkpoint is picked using test
# data. Consider a separate validation split so reported test metrics stay unbiased.
trainer_bio_bert = Trainer(
    model=model_bio_bert,
    args=training_args_bio_bert,
    train_dataset=tokenized_bio_bert_train,
    eval_dataset=tokenized_bio_bert_test,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune BioBERT; the returned TrainOutput is displayed as the cell result.
trainer_bio_bert.train()



Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Micro,F1 Macro,F1 Weighted
5000,0.5544,0.512181,0.801789,0.779945,0.801789,0.801789,0.682494,0.775957
10000,0.4763,0.435429,0.825522,0.828914,0.825522,0.825522,0.762512,0.827012
15000,0.3764,0.412718,0.845386,0.840265,0.845386,0.845386,0.780635,0.842414
20000,0.3419,0.373181,0.866217,0.861003,0.866217,0.866217,0.808098,0.862816
25000,0.2298,0.40718,0.876055,0.877649,0.876055,0.876055,0.83036,0.876798
30000,0.2168,0.407012,0.886341,0.885332,0.886341,0.886341,0.842289,0.885782


TrainOutput(global_step=30246, training_loss=0.36445605063544045, metrics={'train_runtime': 6733.2202, 'train_samples_per_second': 71.866, 'train_steps_per_second': 4.492, 'total_flos': 6.216709707901349e+16, 'train_loss': 0.36445605063544045, 'epoch': 3.0})

In [None]:
# Persist the fine-tuned BioBERT model and its tokenizer to separate Drive folders.
model_bio_bert.save_pretrained("drive/MyDrive/ПМО/model_bio_bert")
tokenizer_bio_bert.save_pretrained("drive/MyDrive/ПМО/tokenizer_bio_bert")

('drive/MyDrive/ПМО/tokenizer_bio_bert/tokenizer_config.json',
 'drive/MyDrive/ПМО/tokenizer_bio_bert/special_tokens_map.json',
 'drive/MyDrive/ПМО/tokenizer_bio_bert/vocab.txt',
 'drive/MyDrive/ПМО/tokenizer_bio_bert/added_tokens.json',
 'drive/MyDrive/ПМО/tokenizer_bio_bert/tokenizer.json')

In [None]:
# Also save the model through the Trainer, into its own directory.
trainer_bio_bert.save_model("drive/MyDrive/ПМО/saved_trainer_bio_bert")

In [None]:
# Tokenizer matching the Bio_ClinicalBERT checkpoint.
tokenizer_bio_clinical_bert = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [None]:
def tokenize_function_bio_clinical_bert(examples):
    """Tokenize a batch of reviews with the Bio_ClinicalBERT tokenizer, for use with `Dataset.map`.

    Args:
        examples: A batch from the dataset; its "text" entry holds the review strings.

    Returns:
        The tokenizer output (input ids, attention mask, ...) with every
        sequence padded or truncated to exactly 250 tokens.
    """
    # Return plain Python lists on the CPU. `Dataset.map` stores the result in
    # the dataset anyway, so building tensors and moving them to the GPU here
    # would only add a device round trip and make tokenization require CUDA.
    # The Trainer moves each training batch to the right device itself.
    return tokenizer_bio_clinical_bert(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=250,
    )

In [None]:
# Tokenize both splits in batches with the Bio_ClinicalBERT tokenizer (train first, then test).
tokenized_bio_clinical_bert_train, tokenized_bio_clinical_bert_test = (
    split.map(tokenize_function_bio_clinical_bert, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/161297 [00:00<?, ? examples/s]

Map:   0%|          | 0/53766 [00:00<?, ? examples/s]

In [None]:
# Inspect the tokenized training split produced in the previous cell. The
# previous name `tokenized_biobert_train` is never assigned in this notebook
# and raises NameError on a fresh kernel.
tokenized_bio_clinical_bert_train

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 161297
})

In [None]:
# Bio_ClinicalBERT with a fresh 3-way classification head (labels 0/1/2), placed on the GPU.
# NOTE(review): `.to("cuda")` hard-requires a CUDA device for this cell to run.
model_bio_clinical_bert = AutoModelForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=3).to("cuda")

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

In [None]:
# Training configuration for the Bio_ClinicalBERT run:
# - 3 epochs with batch size 16 for both training and evaluation;
# - logging and checkpointing every 5000 steps; no explicit `eval_steps` is
#   given, and the training log shows evaluation at the same 5000-step interval;
# - the checkpoint with the best `f1_weighted` (a key returned by
#   `compute_metrics`) is reloaded once training finishes;
# - `save_total_limit=1` limits how many checkpoints are kept in `output_dir`.
training_args_bio_clinical_bert  = TrainingArguments(output_dir="drive/MyDrive/ПМО/trainer_bio_clinical_bert",
                                                     per_device_train_batch_size=16,
                                                     per_device_eval_batch_size=16,
                                                     num_train_epochs=3,
                                                     evaluation_strategy="steps",
                                                     logging_steps=5000,
                                                     save_steps=5000,
                                                     load_best_model_at_end=True,
                                                     metric_for_best_model="f1_weighted",
                                                     save_total_limit=1
                                                    )

In [None]:
# Trainer for the Bio_ClinicalBERT model.
# NOTE(review): `eval_dataset` is the test split and the training arguments
# enable `load_best_model_at_end`, so the best checkpoint is picked using test
# data. Consider a separate validation split so reported test metrics stay unbiased.
trainer_bio_clinical_bert = Trainer(
    model=model_bio_clinical_bert,
    args=training_args_bio_clinical_bert,
    train_dataset=tokenized_bio_clinical_bert_train,
    eval_dataset=tokenized_bio_clinical_bert_test,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune Bio_ClinicalBERT; the returned TrainOutput is displayed as the cell result.
trainer_bio_clinical_bert.train()



Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Micro,F1 Macro,F1 Weighted
5000,0.5675,0.5058,0.80151,0.781047,0.80151,0.80151,0.68244,0.776194
10000,0.481,0.448718,0.816576,0.831961,0.816576,0.816576,0.759346,0.822514
15000,0.3813,0.409545,0.842633,0.842239,0.842633,0.842633,0.78241,0.842432
20000,0.3496,0.383353,0.863482,0.856876,0.863482,0.863482,0.801437,0.858348
25000,0.2398,0.41788,0.87267,0.877713,0.87267,0.87267,0.829035,0.874784
30000,0.218,0.405113,0.887271,0.887167,0.887271,0.887271,0.844884,0.887216


TrainOutput(global_step=30246, training_loss=0.3714658502216638, metrics={'train_runtime': 6690.0758, 'train_samples_per_second': 72.33, 'train_steps_per_second': 4.521, 'total_flos': 6.216709707901349e+16, 'train_loss': 0.3714658502216638, 'epoch': 3.0})

In [None]:
# Persist the fine-tuned Bio_ClinicalBERT model and its tokenizer to separate Drive folders.
model_bio_clinical_bert.save_pretrained("drive/MyDrive/ПМО/model_bio_clinical_bert")
tokenizer_bio_clinical_bert.save_pretrained("drive/MyDrive/ПМО/tokenizer_bio_clinical_bert")

('drive/MyDrive/ПМО/tokenizer_bio_clinical_bert/tokenizer_config.json',
 'drive/MyDrive/ПМО/tokenizer_bio_clinical_bert/special_tokens_map.json',
 'drive/MyDrive/ПМО/tokenizer_bio_clinical_bert/vocab.txt',
 'drive/MyDrive/ПМО/tokenizer_bio_clinical_bert/added_tokens.json',
 'drive/MyDrive/ПМО/tokenizer_bio_clinical_bert/tokenizer.json')

In [None]:
# Also save the model through the Trainer, into its own directory.
trainer_bio_clinical_bert.save_model("drive/MyDrive/ПМО/saved_trainer_bio_clinical_bert")

In [None]:
!pip install torch torchvision

In [None]:
from transformers import pipeline

In [None]:
# Zero-shot classification baseline built on BART-large-MNLI with the PyTorch backend.
# NOTE(review): `device=0` presumably selects the first CUDA device - confirm a GPU is attached.
pipe = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli", framework="pt", device=0)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Map each zero-shot sentiment name to the integer class id used in `label`.
d = dict(zip(('negative', 'neutral', 'positive'), range(3)))

In [None]:
from tqdm import tqdm

In [None]:
# Classify every test review with the zero-shot pipeline and collect the
# predicted class ids in the same order as the rows of `test`.
preds = []
# Iterate over the column values themselves. Looking rows up with
# `test["text"][i]` is label-based, so it only works while the frame still has
# its default 0..n-1 index and breaks after any filtering or shuffling.
for text in tqdm(test["text"], total=len(test)):
    result = pipe(text, candidate_labels=["negative", "neutral", "positive"])
    # Pick the candidate label with the highest score.
    sentiment = result['labels'][np.argmax(result['scores'])]
    preds.append(d[sentiment])

100%|██████████| 53766/53766 [1:33:53<00:00,  9.54it/s]


In [None]:
import pickle

In [None]:
# Cache the zero-shot predictions on Drive so the long inference loop need not be rerun.
with open("drive/MyDrive/ПМО/predictions_bart_large_mnli", "wb") as fp:
    pickle.dump(preds, fp)

In [None]:
# Gold class ids of the test split as a plain Python list, in row order.
labels = test['label'].tolist()

In [None]:
# Number of gold labels; it should equal the number of predictions collected.
len(labels)

53766

In [None]:
# Score the zero-shot predictions against the gold labels with the same
# metric set that is reported for the fine-tuned models.
accuracy = accuracy_score(labels, preds)
precision, recall = (
    scorer(labels, preds, average='weighted')
    for scorer in (precision_score, recall_score)
)
f1_micro, f1_macro, f1_weighted = (
    f1_score(labels, preds, average=mode)
    for mode in ('micro', 'macro', 'weighted')
)

# Print one metric per line, in a fixed order.
for caption, score in (
    ('accuracy: ', accuracy),
    ('precision: ', precision),
    ('recall: ', recall),
    ('f1_micro: ', f1_micro),
    ('f1_macro: ', f1_macro),
    ('f1_weighted: ', f1_weighted),
):
    print(caption, score)

accuracy:  0.6762452107279694
precision:  0.7022155489621839
recall:  0.6762452107279694
f1_micro:  0.6762452107279694
f1_macro:  0.48499167028909884
f1_weighted:  0.6368385487205434
