# Install the package from GitHub

In [None]:
!pip3 install --upgrade git+https://github.com/kiangkiangkiang/BERT_Family.git --user

# Append path to environment (for Google Colab)

In [2]:
import os, sys
import site

# `pip install --user` puts packages in the per-user site-packages directory,
# which a running kernel may not have on sys.path (e.g. on Google Colab).
# Ask the interpreter for that directory instead of hard-coding a Python
# version, and add it only once so re-running the cell is harmless.
user_site = site.getusersitepackages()
if user_site not in sys.path:
    sys.path.append(user_site)

# import BERT_Family

In [3]:
import BERT_Family as BF

# Example 1

Workflow:
1. Create a BF object with a specific pretrained model.
2. Set your dataset.
3. Load the pretrained model.
4. Start training, evaluation, and inference.

In [None]:
from datasets import load_dataset
import pandas as pd

# Hugging Face model id; used in this example as both the pretrained model and the tokenizer.
PRETRAINED_MODEL = "albert-base-v2"
# Passed as `max_length` when the BF object is created
# (presumably the maximum tokenized sequence length — confirm).
MAX_LENGTH = 80

## 1. Create BF object with pretrained model

Available models are listed at https://huggingface.co/models?sort=downloads&search=bert

In [34]:
# Build the classification wrapper; the same model id is given for the
# pretrained model and for the tokenizer.
myFamily = BF.BFClassification(
    pretrained_model=PRETRAINED_MODEL,
    tokenizer=PRETRAINED_MODEL,
    max_length=MAX_LENGTH,
)



  0%|          | 0/3 [00:00<?, ?it/s]

Using device:  cuda:0


## 2. Set the dataset
The required parameters of `set_dataset()` are:
1. `raw_data`: a `pd.DataFrame` containing only the feature (x) columns.
2. `raw_target`: a list of the labels for `raw_data`.

In [35]:
dataset = load_dataset("glue", "mrpc")

# Features are the two sentence columns; labels come from the "label" column.
train_split = dataset["train"]
data = pd.DataFrame(train_split)[["sentence1", "sentence2"]]
target = train_split["label"]

# Last expression of the cell, so its return value is displayed.
myFamily.set_dataset(
    raw_data=data,
    raw_target=target,
    batch_size=64,
)

Using  albert-base-v2  to tokenize...


<torch.utils.data.dataloader.DataLoader at 0x7f7498ee8550>

## 3. Load the model

If you have pre-trained weights, pass their path via the `model_path` parameter to initialize the model from them.

In [None]:
# Load the model for this BF object. Per the note accompanying this step, a
# `model_path` argument can be supplied to initialize from saved weights.
myFamily.load_model()

## 4. Start experiment
Once you set the dataset, "train_data_loader" for training will be available in the object.

In [None]:
# Train for one epoch on the loader that set_dataset() attached to the object,
# with save_model=False.
myFamily.train(
    train_data_loader=myFamily.train_data_loader,
    epochs=1,
    save_model=False,
)

In [39]:
# Display the object's status dictionary (model/tokenizer names, data flags, metrics).
myFamily.status

{'BERT_Type': ['BERTFamily', 'BFClassification'],
 'pretrained_model': 'albert-base-v2',
 'tokenizer': 'albert-base-v2',
 'has_train_data': True,
 'has_validation_data': False,
 'has_test_data': False,
 'hasModel': True,
 'isTrained': True,
 'train_acc': 0.645856052344602,
 'train_loss': 38.7393524646759,
 'test_acc': 0.0,
 'test_loss': 0.0,
 'accumulateEpoch': 1}

# Example 2
1. Change the pre-trained model
2. Evaluation and inference
3. Optimizer and learning-rate scheduler

In [None]:
from datasets import load_dataset
import pandas as pd
import torch

PRETRAINED_MODEL = "prajjwal1/bert-tiny"
MAX_LENGTH = 100
BATCH_SIZE = 64

# Load GLUE/MRPC and split each part into features (the two sentence columns) and labels.
dataset = load_dataset('glue', "mrpc")
data = pd.DataFrame(dataset["train"])[["sentence1", "sentence2"]]
target = dataset["train"]["label"]

val_data = pd.DataFrame(dataset["validation"])[["sentence1", "sentence2"]]
val_target = dataset["validation"]["label"]

test_data = pd.DataFrame(dataset["test"])[["sentence1", "sentence2"]]

# Build the BERT_Family object, register the train and validation sets, then load the model.
myFamily = BF.BFClassification(pretrained_model=PRETRAINED_MODEL, tokenizer=PRETRAINED_MODEL, max_length = MAX_LENGTH)
myFamily.set_dataset(raw_data=data, raw_target=target, data_type="train", batch_size=BATCH_SIZE)
myFamily.set_dataset(raw_data=val_data, raw_target=val_target, data_type="validation", batch_size=BATCH_SIZE)
myFamily.load_model()

# Custom optimizer and learning-rate scheduler.
# NOTE(review): neither `optimizer` nor `lr_scheduler` is referenced again in
# this cell, so as written they are not handed to train(). Presumably train()
# accepts them as keyword arguments — confirm its signature and pass them.
optimizer = torch.optim.SGD(myFamily.model.parameters(), lr=1e-4, momentum=0.9)
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.6)

myFamily.train(train_data_loader=myFamily.train_data_loader, epochs=3, save_model=False,
              eval=True, validation_data_loader=myFamily.validation_data_loader)

# Test / inference on the test-split sentences (presumably returns predicted labels — confirm).
result = myFamily.inference(model=myFamily.model, data=test_data)

# Test result: fraction of entries in `result` equal to the test labels.
test_target = dataset["test"]["label"]
torch.sum(torch.tensor(result) == torch.tensor(test_target))/len(result)

# Alternatively, use myFamily.evaluation().


# Example 3 - Automodeling

In [None]:
PRETRAINED_MODEL = "prajjwal1/bert-tiny"
MAX_LENGTH = 100
BATCH_SIZE = 64

dataset = load_dataset("glue", "cola")

# Feature names must be consistent within `dataset` (presumably across its
# splits — confirm); alternatively, specify x_dataframe and y instead of `dataset`.
mymodel = BF.auto_build_model(
    dataset=dataset,
    dataset_x_features=["sentence"],
    dataset_y_features=["label"],
    batch_size=BATCH_SIZE,
    tokenizer=PRETRAINED_MODEL,
    pretrained_model=PRETRAINED_MODEL,
    max_length=MAX_LENGTH,
)

# One epoch of training with evaluation on the validation loader.
mymodel.train(
    train_data_loader=mymodel.train_data_loader,
    validation_data_loader=mymodel.validation_data_loader,
    epochs=1,
    eval=True,
)