In [1]:
import pandas as pd
import numpy as np
import torch
import csv
from sentence_transformers import SentenceTransformer, util
import torch.nn as nn
import torch.nn.functional as func
from torch.nn import Linear as lin
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from transformers import DistilBertForMaskedLM
from transformers import BertTokenizerFast
from transformers import BertForMaskedLM
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_test = pd.read_csv("Listing_Titles.tsv.gz", sep="\t", dtype=str, keep_default_na=False, na_values=[""], quoting=csv.QUOTE_NONE)

In [3]:
def get_text(df):
    text = []
    for idx, row in df.iterrows():
        text.append(row["Title"].lower().split(" "))
        if idx%100000 == 0:
            print(idx)
    return text

In [4]:
train, test = train_test_split(get_text(df_test[:10000]), train_size=0.90)

0


In [None]:
train = get_text(df_test[:10000])

In [5]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True, return_tensors="pt")
val_encodings = tokenizer(test, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True, return_tensors="pt")

In [6]:
for i in range(len(train_encodings.input_ids)):
    for j in range(len(train_encodings.input_ids[i])):
        if train_encodings.input_ids[i][j] != 102:
            if random.random() <= 0.15:
                train_encodings.input_ids[i][j] = 103
                train_encodings.attention_mask[i][j] = 0
        else:
            break
for i in range(len(val_encodings.input_ids)):
    for j in range(len(val_encodings.input_ids[i])):
        if val_encodings.input_ids[i][j] != 102:
            if random.random() <= 0.15:
                val_encodings.input_ids[i][j] = 103
                val_encodings.attention_mask[i][j] = 0
        else:
            break

In [7]:
def encode_tags(labels, encodings):
    
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = np.array(doc_labels)[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)]


        encoded_labels.append(doc_enc_labels.tolist())
    

    return encoded_labels

t = encode_tags(train_encodings["input_ids"], train_encodings)
v = encode_tags(val_encodings["input_ids"], val_encodings)

In [8]:
train_tags = torch.where(torch.tensor(train_encodings.input_ids) == tokenizer.mask_token_id, torch.tensor(t), -100)
val_tags = torch.where(torch.tensor(val_encodings.input_ids) == tokenizer.mask_token_id, torch.tensor(v), -100)
train_encodings.pop("offset_mapping")
val_encodings.pop("offset_mapping")
train_encodings.pop("token_type_ids")
val_encodings.pop("token_type_ids")

  train_tags = torch.where(torch.tensor(train_encodings.input_ids) == tokenizer.mask_token_id, torch.tensor(t), -100)
  val_tags = torch.where(torch.tensor(val_encodings.input_ids) == tokenizer.mask_token_id, torch.tensor(v), -100)


tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

In [9]:
class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [10]:
train_dataset = Data(train_encodings, train_tags)
val_dataset = Data(val_encodings, val_tags)

In [11]:
torch.cuda.set_per_process_memory_fraction(1.0, device=0)

In [12]:
torch.save(model.state_dict(), "/home/henry/work/projects/bert_1_epoch.obj")

In [13]:
m = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
m.load_state_dict(torch.load("/home/henry/work/projects/bert_1_epoch.obj"))

In [11]:
model = BertForMaskedLM.from_pretrained("bert-base-uncased").to("cuda")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
model.load_state_dict(torch.load("/home/henry/work/projects/bert_1_epoch.obj"))

In [12]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(output_dir="./results", learning_rate=0.00001, num_train_epochs=1,per_device_train_batch_size=128,per_device_eval_batch_size=8,evaluation_strategy="steps", do_eval=True, eval_steps=250, warmup_steps=500,weight_decay=0.01,logging_dir='./logs',logging_steps=10,dataloader_pin_memory=False)

#model = BertForMaskedLM.from_pretrained("bert-base-uncased").to("cuda")

trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset             
)

trainer.train()

***** Running training *****
  Num examples = 9000
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 71
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhwarzecha[0m ([33muchcia[0m). Use [1m`wandb login --relogin`[0m to force relogin


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
