## Following the guide for BERT as practice
https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [4]:
# Load the pre-formatted dataset and preview its first row.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df = pd.read_pickle("../data/suc3_formatted")
df.head(1)


Unnamed: 0,Sentence,Token_ID,Entities,Entity_ID,Attention_Mask
0,I sin första reaktion på Sovjetledarens varnin...,"[135, 243, 578, 10540, 68, 3380, 7245, 49796, ...","[O, O, O, O, O, O, O, O, LOC, O, PRS, O, O, O,...","[11, 11, 11, 11, 11, 11, 11, 11, 16, 11, 14, 1...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [13]:
from sklearn.model_selection import train_test_split

# Split into training and validation sets.
#
# The model consumes the token ids, the label ids and the attention masks, so all
# three columns are split in ONE call: train_test_split applies the same shuffled
# indices to every array it receives, which keeps inputs, tags and masks aligned
# row-for-row without a second call on a throw-away array.
random_state = 2018  # fixed seed so the split is reproducible
test_size = 0.1      # fraction of sentences held out for validation
r, t = random_state, test_size  # short aliases kept so existing references still resolve

(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(
    df['Token_ID'].to_list(),
    df['Entity_ID'].to_list(),
    df['Attention_Mask'].to_list(),
    random_state=random_state,
    test_size=test_size,
)

# NOTE(review): `y` is not read by any cell visible here; kept for compatibility.
y = tr_masks

In [15]:
import torch
# Turn each split produced by train_test_split into a torch tensor, rebinding the
# same names so the cells below can use them directly.
tr_inputs, val_inputs = torch.tensor(tr_inputs), torch.tensor(val_inputs)
tr_tags, val_tags = torch.tensor(tr_tags), torch.tensor(val_tags)
tr_masks, val_masks = torch.tensor(tr_masks), torch.tensor(val_masks)

In [18]:
#Convert it all into a tensordataset 
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Batch size shared by both loaders
bs = 1

# Bundle (token ids, attention mask, tag ids) per example for each split
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
valid_data = TensorDataset(val_inputs, val_masks, val_tags)

# Training examples are drawn in random order; validation keeps dataset order
train_sampler = RandomSampler(train_data)
valid_sampler = SequentialSampler(valid_data)

train_dataloader = DataLoader(train_data, batch_size=bs, sampler=train_sampler)
valid_dataloader = DataLoader(valid_data, batch_size=bs, sampler=valid_sampler)

# DistilBERT

In [19]:
from transformers import modeling_distilbert as md


In [None]:
# `ppb` is the alias this section uses for the transformers package. It is bound
# here because no visible earlier cell defines it (the cell would otherwise raise
# NameError on the first line).
import transformers as ppb

# Model class, tokenizer class and the name of the pretrained checkpoint to load.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

In [None]:
# Instantiate the tokenizer and the model from the same pretrained checkpoint
tokenizer, model = (
    tokenizer_class.from_pretrained(pretrained_weights),
    model_class.from_pretrained(pretrained_weights),
)

In [None]:
# Encode every entry of batch_1[0] into a list of token ids, letting the
# tokenizer add its special tokens. The result holds one id list per row.
# NOTE(review): batch_1 is not defined in any cell visible here — confirm it is loaded first.
def _encode_sentence(sentence):
    """Return the token ids for one sentence, special tokens included."""
    return tokenizer.encode(sentence, add_special_tokens=True)

tokenized = batch_1[0].apply(_encode_sentence)

In [None]:
# Right-pad every id list with zeros up to the length of the longest one,
# so the lists stack into a single rectangular array.
token_lists = tokenized.values
max_len = max((len(ids) for ids in token_lists), default=0)

padded = np.array([ids + [0] * (max_len - len(ids)) for ids in token_lists])

padded.shape

In [None]:
# Build the attention mask: 0 wherever the padded array holds the pad value 0,
# 1 everywhere else, so the model can ignore the padding positions.
is_padding = padded == 0
attention_mask = np.where(is_padding, 0, 1)
attention_mask.shape

In [None]:
# Wrap the padded ids and the mask (both numpy arrays) in torch tensors
input_ids, attention_mask = torch.tensor(padded), torch.tensor(attention_mask)

In [None]:
# Forward pass for feature extraction only. torch.no_grad() disables gradient
# tracking inside the block: no computation graph is recorded, which saves memory
# and time when no backward pass will follow.
# The model's output is stored in last_hidden_states.
with torch.no_grad():
    last_hidden_states = model(input_ids=input_ids, attention_mask=attention_mask)

In [None]:
# Inspect the shape of the model output.
# Indexing with [0] selects the first element of the model output; per the note
# below, that element is the tensor of hidden states.
# NOTE(review): the 2000 x 59 x 768 figures in the note come from the author's run — confirm they match the data loaded here.

"""
We get a 2000 x 59 x 768 tensor

The first dimension is the sentence
The second dimension is the word
The third dimension is the hidden state

last_hidden_states[0][:,0,:]
All rows (sentences), The first word(the prepended CLS token), all hidden states)
"""


last_hidden_states[0].shape

In [None]:
# The [CLS] token is prepended to every input sequence, so it sits at position 0.
# Its hidden state is used here as a single embedding for the whole sentence.
cls_hidden = last_hidden_states[0][:, 0, :]  # keep position 0 only, for every sentence
features = cls_hidden.numpy()
features.shape

In [None]:
# One embedding per sentence (features) plus one label per sentence (labels)
# turns this into an ordinary supervised-learning problem.
# NOTE(review): batch_1 is not defined in any cell visible here — confirm it is loaded first.
labels = batch_1[1]

# Seed the shuffle so the reported score is reproducible across runs; the seed is
# the `random_state` constant set in the earlier train/validation split cell.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=random_state
)

In [None]:
# LogisticRegression is not imported in any cell visible above, so bring it into
# scope here (the cell would otherwise raise NameError).
from sklearn.linear_model import LogisticRegression

# Fit a plain logistic-regression classifier on the sentence embeddings
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

In [None]:
# Evaluate the fitted classifier on the held-out split
# (score() reports mean accuracy for scikit-learn classifiers).
lr_clf.score(test_features, test_labels)
