In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [6]:
#Load the data
df = pd.read_pickle("../data/sentence_labels")
input_df = df[['Sentence']].head().copy()
label_df = df[['Labels']].head().copy()

In [20]:
# Format the input
from keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer
import torch

#Load swedish tokenizer
tokenizer = AutoTokenizer.from_pretrained("KB/bert-base-swedish-cased-ner")

# Use wordpiece tokenization
input_df['Tokenized'] = input_df[['Sentence']].apply(lambda x: tokenizer.tokenize(x[0]), axis=1)

# Replace words with integers, add the special [CLS] and [SEP] tokens
input_df['Integerized'] = input_df[['Tokenized']].apply(lambda x: tokenizer.encode(x[0], add_special_tokens=True), axis=1)

# Pad and truncate all sentences so they are the same length
length = 50
input_df['Padded'] = input_df[['Integerized']].apply(lambda x: pad_sequences(x, maxlen=length, dtype="long", truncating="post", padding="post"), axis=1)

# Add attention mask. Attention is 0 for padding, else 1
input_df['Attention_Mask'] = input_df[['Padded']].apply(lambda x: (x[0] != 0).astype(int), axis=1)

# The model only accepts Tensor, so convert the Padded Input
input_df['Input_Tensor'] = input_df[['Padded']].apply(lambda x: torch.tensor(x[0]), axis=1)

# The model only accepts Tensor, so convert the Mask
input_df['Attention_Tensor'] = input_df[['Attention_Mask']].apply(lambda x: torch.tensor(x[0]), axis=1)

#Sanity checks for sentences
def check_sentence(index):
    for column in input_df:
        print(column)
        print(input_df[[column]].iloc[index][0])
        print()
    
check_sentence(0)

Sentence
I sin första reaktion på Sovjetledarens varningar deklarerade Litauens president Vytautas Landsbergis att " nu avvisar Gorbatjov vår utsträckta hand med extremt skarpa och hämndlystna ord " .

Tokenized
['I', 'sin', 'första', 'reaktion', 'på', 'Sovjet', '##ledaren', '##s', 'varningar', 'deklarerade', 'Litauen', '##s', 'president', 'V', '##yta', '##uta', '##s', 'Land', '##sberg', '##is', 'att', '"', 'nu', 'avvisar', 'Gorbatjov', 'vår', 'utsträck', '##ta', 'hand', 'med', 'extremt', 'skarpa', 'och', 'hämnd', '##lyst', '##na', 'ord', '"', '.']

Integerized
[2, 135, 243, 578, 10540, 68, 3380, 7245, 49796, 27689, 28413, 26922, 49796, 3206, 121, 3393, 4634, 49796, 2901, 6697, 116, 48, 98, 346, 31843, 24926, 671, 4958, 237, 541, 66, 9926, 21667, 36, 16370, 15191, 42, 696, 98, 7, 3]

Padded
[[    2   135   243   578 10540    68  3380  7245 49796 27689 28413 26922
  49796  3206   121  3393  4634 49796  2901  6697   116    48    98   346
  31843 24926   671  4958   237   541    66  9926 

In [18]:
x = input_df[['Padded']]
x

#Switch to Swedish NER
#Add a NER head
#Switch to ALBERT
#Fix input matrix instaed of single sentence
#Switch to BIO tagging

Unnamed: 0,Padded
0,"[[101, 146, 11850, 175, 19593, 9731, 1161, 123..."
1,"[[101, 146, 4035, 187, 15012, 27629, 5815, 403..."
2,"[[101, 118, 9352, 1116, 5871, 1197, 191, 1182,..."
3,"[[101, 118, 3401, 4035, 1260, 4342, 5970, 1181..."
4,"[[101, 20164, 1197, 250, 1197, 1260, 1204, 173..."


In [29]:
from transformers import AutoModel
model = AutoModel.from_pretrained('KB/bert-base-swedish-cased')


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=964.0, style=ProgressStyle(description_…




In [39]:
x = input_df[['Padded']]
display(x)
input_df.as_matrix(columns=df.Padded[1:])

Unnamed: 0,Padded
0,"[[2, 135, 243, 578, 10540, 68, 3380, 7245, 497..."
1,"[[2, 135, 59, 17275, 2548, 59, 1337, 1207, 68,..."
2,"[[2, 52, 9077, 108, 186, 1696, 9273, 48, 690, ..."
3,"[[2, 52, 299, 59, 41611, 103, 444, 1024, 729, ..."
4,"[[2, 1504, 54, 82, 327, 31, 15894, 302, 3, 0, ..."


AttributeError: 'DataFrame' object has no attribute 'as_matrix'

In [32]:
from transformers import DistilBertModel, DistilBertConfig
#model = DistilBertModel.from_pretrained('distilbert-base-cased')

index = 0
x = input_df[['Input_Tensor']].iloc[index][0]
y = input_df[['Attention_Tensor']].iloc[index][0]

print(x.shape)
embeddings = model.forward(input_ids=x,
    attention_mask=y,
    head_mask=None)

print(embeddings[0].shape)

torch.Size([1, 50])
torch.Size([1, 50, 768])


In [21]:
from sklearn.model_selection import train_test_split

# Split into training and validation sets.

#Albert takes both the sentence tokens (as ids), the labels (as ids) and attention masks as input, so we split those as well.
r=random_state = 2018
t=test_size = 0.1

tr_token, val_token = train_test_split(df['Token_ID'].to_list(),random_state=r, test_size=t)
tr_class, val_class = train_test_split(df['Entity_ID'].to_list(),random_state=r, test_size=t)
tr_masks, val_masks = train_test_split(df['Attention_Mask'].to_list(),random_state=r, test_size=t)

KeyError: 'Token_ID'

In [10]:
import torch
#Convert to tensors

text_enc = tokenizer.encode(text, add_special_tokens=True)
mask_enc = tokenizer.encode(mask, add_special_tokens=True)
tags_enc = tokenizer.encode(tags, add_special_tokens=True)

print(text_enc)
print(mask_enc)
print(tags_enc)

text_enc_tensor = torch.tensor(text_enc).unsqueeze(0)
mask_enc_tensor = torch.tensor(mask_enc).unsqueeze(0)
tags_enc_tensor = torch.tensor(tags_enc).unsqueeze(0)



tr_inputs = torch.tensor(tr_token)
val_inputs = torch.tensor(val_token)
tr_tags = torch.tensor(tr_class)
val_tags = torch.tensor(val_class)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [11]:
#Convert it all into a tensordataset 
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#Set batch size
bs = 1

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [12]:
#Load model
from transformers import DistilBertModel, DistilBertConfig
model = DistilBertModel.from_pretrained('distilbert-base-cased')

In [13]:
#Pass the data throug it to get embeddings
embeddings = model.forward(input_ids=tr_inputs,
    attention_mask=tr_masks,
    head_mask=None)

RuntimeError: index out of range: Tried to access index 41611 out of table with 28995 rows. at /pytorch/aten/src/TH/generic/THTensorEvenMoreMath.cpp:418