In [1]:
# pretrained_BERT and botswine

import pandas as pd
import datasets
from tqdm.auto import tqdm
import os

# os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

from transformers import AdamW

from pathlib import Path

from transformers import BertForMaskedLM
from transformers import BertConfig
import torch
from transformers import BertTokenizer
import os
from tokenizers import ByteLevelBPETokenizer

#from tokenizers import TFBertModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("../../master-thesis/data/dataset1.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
df = df[['Note', 'varietal_name']]
df.reset_index(inplace=True)

In [4]:
df.dropna(inplace=True)

In [5]:
df.Note.isnull().sum()

0

In [6]:
df.shape

(130197, 3)

In [7]:
df.drop_duplicates("Note", inplace=True)

In [8]:
df.shape

(59211, 3)

In [9]:
# subset dataset1 just for the top contenders
df = df[(df['varietal_name'] == 'Cabernet Sauvignon') \
                    |(df['varietal_name'] == 'Chardonnay') \
                    |(df['varietal_name'] == 'Côte de Beaune White')\
                    |(df['varietal_name'] == 'Côte de Nuits Red')\
                    |(df['varietal_name'] == 'Riesling')\
                    |(df['varietal_name'] == 'Pomerol')]

In [10]:
df.shape

(14167, 3)

In [11]:
train = df.sample(frac = 0.75)

In [12]:
test = df[~df.isin(train)].dropna()

In [13]:
# square True
test.shape[0]+train.shape[0]==df.shape[0]

True

In [14]:
test['index'] = test['index'].astype(int)

In [15]:
train_vn_unqs = train.varietal_name.unique()
train_vn_unqs.sort()

In [16]:
test_vn_unqs = test.varietal_name.unique()
test_vn_unqs.sort()

In [17]:
train.rename(columns={
    "index": "id",
    "Note": "text",
    "varietal_name": "label"
}, inplace=True)

test.rename(columns={
    "index": "id",
    "Note": "text",
    "varietal_name": "label"
}, inplace=True)

In [18]:
train.head()

Unnamed: 0,id,text,label
123022,123022,シャルドネ\nそこまでは。,Chardonnay
123403,123403,butter vanilla oak melon minerals butterscotch,Chardonnay
128918,128918,Goed als je veel dorst hebt.,Côte de Beaune White
121043,121043,"Не кислое, слегка минеральное, мягкое, приятное",Côte de Beaune White
111811,111811,Perfection!\n,Cabernet Sauvignon


In [19]:
train_dataset = datasets.Dataset.from_pandas(train)
test_dataset = datasets.Dataset.from_pandas(test)
dataset = datasets.DatasetDict({"train":train_dataset,
                                        "test":test_dataset})

In [20]:
text_data = []
file_count = 0

for sample in tqdm(dataset['train']):
    sample = sample['text'].replace('\n', '')
    text_data.append(sample)
    if len(text_data) == 10_000:
        # once we git the 10K mark, save to file
        with open(f'text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
# after saving in 10K chunks, we will have ~2082 leftover samples, we save those now too
with open(f'text_{file_count}.txt', 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(text_data))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10625/10625 [00:00<00:00, 22711.49it/s]


In [21]:

paths = [str(x) for x in Path('').glob('*.txt')]

In [22]:
tokenizer = ByteLevelBPETokenizer()

In [23]:
tokenizer.train(files=paths[:5], vocab_size=30_522, min_frequency=2,
                special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])






In [24]:
tokenizer.save_model('models/')

['models/vocab.json', 'models/merges.txt']

In [25]:
# initialize the tokenizer using the tokenizer we initialized and saved to file
#tokenizer = RobertaTokenizer.from_pretrained('bert-base-cased', max_len=512)

tokenizer = BertTokenizer.from_pretrained('bert-base-cased', max_len=512)
#model = TFBertModel.from_pretrained("bert-base-cased")



In [26]:
# test our tokenizer on a simple sentence
tokens = tokenizer('id like a dry red wine with my veal salad')

In [27]:
with open('text_0.txt', 'r', encoding='utf-8') as fp:
    lines = fp.read().split('\n')

In [28]:
batch = tokenizer(lines, max_length=512, padding='max_length', truncation=True)

In [29]:
len(batch)

3

In [30]:
batch.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [31]:
labels = torch.tensor(batch.input_ids)
mask = torch.tensor(batch.attention_mask)

In [32]:
# make copy of labels tensor, this will be input_ids
input_ids = labels.detach().clone()
# create random array of floats with equal dims to input_ids
rand = torch.rand(input_ids.shape)
# mask random 15% where token is not 0 [PAD], 1 [CLS], or 2 [SEP]
mask_arr = (rand < .15) * (input_ids != 0) * (input_ids != 1) * (input_ids != 2)
# loop through each row in input_ids tensor (cannot do in parallel)
for i in range(input_ids.shape[0]):
    # get indices of mask positions from mask array
    selection = torch.flatten(mask_arr[i].nonzero()).tolist()
    # mask input_ids
    input_ids[i, selection] = 3  # our custom [MASK] token == 3

In [33]:
input_ids.shape

torch.Size([10013, 512])

In [34]:
encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}


In [35]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        # store encodings internally
        self.encodings = encodings

    def __len__(self):
        # return the number of samples
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        # return dictionary of input_ids, attention_mask, and labels for index i
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [36]:
dataset = Dataset(encodings)

In [37]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [38]:
config = BertConfig(
    vocab_size=tokenizer.vocab_size,  # we align this to the tokenizer vocab_size
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

In [39]:
model = BertForMaskedLM(config)

In [40]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [41]:
# activate training mode
model.train()

# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-5)
#optim = AdamW()



In [None]:
epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0:   3%|██▊                                                                                                | 18/626 [19:34<11:53:54, 70.45s/it, loss=10.2]

In [None]:
model.save_pretrained('models/winedomainspecificbertpoc')