# Install the package from GitHub

Two demos on Google Colab:
1. Step by Step: https://colab.research.google.com/drive/1NeurI_grWw_G-w8XosjY3elpdQrYE7U1#scrollTo=HBNJBdBprrEg
2. mrpc: https://colab.research.google.com/drive/15sQPgfSncybmcCp_1XydmHrmBugir_7y?usp=sharing


In [12]:
#check the output where the package install (system may miss your package)
!pip3 install git+https://github.com/kiangkiangkiang/BERT_Family.git

Collecting git+https://github.com/kiangkiangkiang/BERT_Family.git
  Cloning https://github.com/kiangkiangkiang/BERT_Family.git to /tmp/pip-req-build-axs5z5j_
  Running command git clone --filter=blob:none --quiet https://github.com/kiangkiangkiang/BERT_Family.git /tmp/pip-req-build-axs5z5j_
  Resolved https://github.com/kiangkiangkiang/BERT_Family.git to commit 95e76651be8feaf41be340503d1bd8b7ad574c7a
  Preparing metadata (setup.py) ... [?25ldone


# Append the install path to the environment
If the package was installed outside the environment this notebook is running in, Python will not find the module. Add the install location to `sys.path` so that the import below succeeds.

In [13]:
import os, sys
import site

# Make the per-user site-packages directory importable. The location is looked
# up from the running interpreter instead of being hard-coded, so it follows the
# current user and Python version.
user_site = site.getusersitepackages()
if user_site not in sys.path:  # guard keeps re-running this cell from adding duplicates
    sys.path.append(user_site)

# import BERT_Family

In [14]:
import BERT_Family as BF

# Example 1

Workflow:
1. Create a BF object with a specific pretrained model
2. Set your dataset
3. Load the pretrained model
4. Train, evaluate, and run inference

In [15]:
from datasets import load_dataset
import pandas as pd

# Settings for Example 1.
MAX_LENGTH = 80                      # passed as max_length when building the BF object
PRETRAINED_MODEL = "albert-base-v2"  # checkpoint name, used for both model and tokenizer

## 1. Create BF object with pretrained model

Available pretrained models are listed at https://huggingface.co/models?sort=downloads&search=bert

In [16]:
# Build the classification wrapper; the same checkpoint name selects both the
# pretrained model and its tokenizer.
myFamily = BF.BFClassification(
    pretrained_model=PRETRAINED_MODEL,
    tokenizer=PRETRAINED_MODEL,
    max_length=MAX_LENGTH,
)

Using device:  cuda:0


## 2. Set the dataset
The required parameters of `set_dataset()` are:
1. `raw_data`: a `pd.DataFrame` containing only the input (x) features.
2. `raw_target`: a list containing the labels for `raw_data`.

In [17]:
# Fetch the MRPC task of GLUE; this example only uses its training split.
dataset = load_dataset('glue', "mrpc")
train_split = dataset["train"]

# Inputs are the two sentence columns; targets come from the "label" column.
data = pd.DataFrame(train_split)[["sentence1", "sentence2"]]
target = train_split["label"]

# Register the training data on the BF object. This stays the last expression
# so that its return value is displayed as the cell output.
myFamily.set_dataset(raw_data=data, raw_target=target, batch_size=64)

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Found cached dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Using  albert-base-v2  to tokenize...


<torch.utils.data.dataloader.DataLoader at 0x7f5d4ce4db80>

## 3. Load the model

If you have pre-trained weights, pass their path via the `model_path` parameter to initialize the model from them.

In [18]:
# Load the model for the configured checkpoint. Called without arguments here;
# the returned model is displayed as this cell's output.
myFamily.load_model()

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.decoder.bias', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.bias', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

## 4. Start experiment
Once you set the dataset, "train_data_loader" for training will be available in the object.

In [None]:
# Run a single training epoch on the loader created by set_dataset().
# save_model=False presumably skips writing the trained weights to disk — confirm
# against BFClassification.train.
myFamily.train(
    train_data_loader=myFamily.train_data_loader,
    epochs=1,
    save_model=False,
)

In [39]:
# Inspect the summary the object keeps about itself: model/tokenizer names,
# which data splits are set, and the latest training metrics.
myFamily.status

{'BERT_Type': ['BERTFamily', 'BFClassification'],
 'pretrained_model': 'albert-base-v2',
 'tokenizer': 'albert-base-v2',
 'has_train_data': True,
 'has_validation_data': False,
 'has_test_data': False,
 'hasModel': True,
 'isTrained': True,
 'train_acc': 0.645856052344602,
 'train_loss': 38.7393524646759,
 'test_acc': 0.0,
 'test_loss': 0.0,
 'accumulateEpoch': 1}

# Example 2
1. Change the pre-trained model
2. Evaluation and inference
3. Optimizer & learning-rate scheduler

In [None]:
from datasets import load_dataset
import pandas as pd
import torch

# --- Settings -----------------------------------------------------------------
PRETRAINED_MODEL = "prajjwal1/bert-tiny"
MAX_LENGTH = 100
BATCH_SIZE = 64
FEATURE_COLUMNS = ["sentence1", "sentence2"]

# --- Data: train / validation / test splits of GLUE MRPC ----------------------
dataset = load_dataset('glue', "mrpc")
train_split = dataset["train"]
val_split = dataset["validation"]
test_split = dataset["test"]

data = pd.DataFrame(train_split)[FEATURE_COLUMNS]
target = train_split["label"]

val_data = pd.DataFrame(val_split)[FEATURE_COLUMNS]
val_target = val_split["label"]

test_data = pd.DataFrame(test_split)[FEATURE_COLUMNS]
test_target = test_split["label"]

# --- Model: build the BERT_Family object and attach the data -------------------
myFamily = BF.BFClassification(
    pretrained_model=PRETRAINED_MODEL,
    tokenizer=PRETRAINED_MODEL,
    max_length=MAX_LENGTH,
)
myFamily.set_dataset(
    raw_data=data, raw_target=target, data_type="train", batch_size=BATCH_SIZE
)
myFamily.set_dataset(
    raw_data=val_data, raw_target=val_target, data_type="validation", batch_size=BATCH_SIZE
)
myFamily.load_model()

# --- Optimizer and learning-rate schedule --------------------------------------
# NOTE(review): `optimizer` and `lr_scheduler` are created here but are not passed
# to train() below, so as written they do not appear to influence training.
# Presumably they should be forwarded to train() — confirm against its signature.
optimizer = torch.optim.SGD(myFamily.model.parameters(), lr=1e-4, momentum=0.9)
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.6)

# --- Training with evaluation on the validation loader -------------------------
myFamily.train(
    train_data_loader=myFamily.train_data_loader,
    validation_data_loader=myFamily.validation_data_loader,
    epochs=3,
    eval=True,
    save_model=False,
)

# --- Inference on the test split -----------------------------------------------
result = myFamily.inference(model=myFamily.model, data=test_data)

# Share of predictions equal to the test labels (displayed as the cell output).
is_correct = torch.tensor(result) == torch.tensor(test_target)
torch.sum(is_correct) / len(result)

# Alternatively, use myFamily.evaluation()


# Example 3 - Automodeling

In [None]:
# --- Settings -----------------------------------------------------------------
PRETRAINED_MODEL = "prajjwal1/bert-tiny"
MAX_LENGTH = 100
BATCH_SIZE = 64

# CoLA task of GLUE.
dataset = load_dataset('glue', "cola")

# Build the model object and its data loaders in one call.
# Per the original note: the listed features must have the same names in the
# dataset; alternatively, x_dataframe and y can be specified instead of dataset
# (parameter names as given in that note — confirm against auto_build_model).
mymodel = BF.auto_build_model(
    dataset=dataset,
    dataset_x_features=["sentence"],
    dataset_y_features=["label"],
    pretrained_model=PRETRAINED_MODEL,
    tokenizer=PRETRAINED_MODEL,
    max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE,
)

# One epoch of training, with evaluation on the validation loader.
mymodel.train(
    train_data_loader=mymodel.train_data_loader,
    validation_data_loader=mymodel.validation_data_loader,
    epochs=1,
    eval=True,
)