In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
from transformers import AutoTokenizer
from transformers import AutoModel

## Prepare the data


## Checklist to make the data compatible
* Get sentences
* Tokenize the sentences
* Put wrongly split tokens together.
* Transform the tokens to id:s
* Define the length L of the input sequence. The longer the sentence the more the context, but also the more processing power needed
* Pad sentences shorter than L with 0
* Crop sentences longer than L
* Tell Albert to ignore padded info (attention_mask = np.where(padded != 0, 1, 0))
* Convert to tensors

## Load data

In [5]:
#Load the dataset
data = pd.read_pickle("../data/adjusted_dataframe")
subset = data.head(100)
subset

Unnamed: 0,Sentence,Padded_Tokenized_Sentence,Entities,Entity_IDS,Attention_Masks
0,I sin första reaktion på Sovjetledarens varnin...,"[135, 243, 578, 10540, 68, 3380, 7245, 49796, ...","[O, O, O, O, O, O, O, O, LOC, O, PRS, O, O, O,...","[22, 22, 22, 22, 22, 22, 22, 22, 11, 22, 9, 22...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,I en ruta talar en kort rad på ryska om att de...,"[135, 59, 17275, 2548, 59, 1337, 1207, 68, 370...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 2...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2,"- Dels har vi inget index att gå efter , vi kr...","[52, 9077, 108, 186, 1696, 9273, 48, 690, 275,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, TME...","[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 2...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
3,"- Men en deporterad blir aldrig fri , säger Ri...","[52, 299, 59, 41611, 103, 444, 1024, 729, 19, ...","[O, O, O, O, O, O, O, O, O, PRS, O, O, O, O, O...","[22, 22, 22, 22, 22, 22, 22, 22, 22, 9, 22, 22...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,Hur är det då i Mellanöstern ?,"[1504, 54, 82, 327, 31, 15894, 302, 0, 0, 0, 0...","[O, O, O, O, O, LOC, O]","[22, 22, 22, 22, 22, 11, 22, 22, 22, 22, 22, 2...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, ..."
...,...,...,...,...,...
95,Den svåraste frågan - hur de 100000 människor ...,"[219, 15255, 2162, 52, 612, 102, 48804, 1372, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 2...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
96,- Så vitt jag vet har vi varken skickat fallsk...,"[52, 1022, 2839, 217, 743, 108, 186, 5077, 155...","[O, O, O, O, O, O, O, O, O, O, O, LOC, O, O, O...","[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 1...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
97,De deporterades egen organisation - Lietuvas T...,"[335, 41611, 439, 1078, 3589, 52, 20639, 16583...","[O, O, O, O, O, ORG, O, O, O, O, O, O]","[22, 22, 22, 22, 22, 4, 22, 22, 22, 22, 22, 22...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
98,Men i stället utlöste den nya hot och hot på a...,"[299, 31, 1484, 35059, 97, 664, 2201, 36, 2201...","[O, O, O, O, O, O, O, O, O, O, O, O, O]","[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 2...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [6]:
#Load the model

#This model is an alpha-version of albert for swedish from huggingface, trained on ca 20gb of text from swedish wikipedia, books, news, govn publications etc
#https://huggingface.co/KB/bert-base-swedish-cased-ner
#Models trained with whole word cased masking
model = AutoModel.from_pretrained('KB/albert-base-swedish-cased-alpha')

In [9]:
epoch = 0
for _ in trange(10, desc="Epoch"):
    epoch += 1

Epoch: 100%|██████████| 10/10 [00:00<00:00, 47233.15it/s]


In [11]:
from transformers import AlbertConfig, AlbertModel
# Initializing an ALBERT-xxlarge style configuration
albert_xxlarge_configuration = AlbertConfig()

# Initializing an ALBERT-base style configuration
albert_base_configuration = AlbertConfig(
    hidden_size=768,
    num_attention_heads=12,
    intermediate_size=3072,
)

# Initializing a model from the ALBERT-base style configuration
model = AlbertModel(albert_xxlarge_configuration)

# Accessing the model configuration
configuration = model.config

In [12]:
model.train(True)

AlbertModel(
  (embeddings): AlbertEmbeddings(
    (word_embeddings): Embedding(30000, 128, padding_idx=0)
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0, inplace=False)
  )
  (encoder): AlbertTransformer(
    (embedding_hidden_mapping_in): Linear(in_features=128, out_features=4096, bias=True)
    (albert_layer_groups): ModuleList(
      (0): AlbertLayerGroup(
        (albert_layers): ModuleList(
          (0): AlbertLayer(
            (full_layer_layer_norm): LayerNorm((4096,), eps=1e-12, elementwise_affine=True)
            (attention): AlbertAttention(
              (query): Linear(in_features=4096, out_features=4096, bias=True)
              (key): Linear(in_features=4096, out_features=4096, bias=True)
              (value): Linear(in_features=4096, out_features=4096, bias=True)
              (dropout): Dropout(p=0, inplace=False)
        

In [18]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification

In [19]:
MAX_LEN = 75
bs = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

torch.cuda.get_device_name(0) 


'GeForce GTX 1050 Ti'

In [26]:
subset

Unnamed: 0,Sentence,Padded_Tokenized_Sentence,Entities,Entity_IDS,Attention_Masks
0,I sin första reaktion på Sovjetledarens varnin...,"[135, 243, 578, 10540, 68, 3380, 7245, 49796, ...","[O, O, O, O, O, O, O, O, LOC, O, PRS, O, O, O,...","[22, 22, 22, 22, 22, 22, 22, 22, 11, 22, 9, 22...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,I en ruta talar en kort rad på ryska om att de...,"[135, 59, 17275, 2548, 59, 1337, 1207, 68, 370...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 2...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2,"- Dels har vi inget index att gå efter , vi kr...","[52, 9077, 108, 186, 1696, 9273, 48, 690, 275,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, TME...","[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 2...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
3,"- Men en deporterad blir aldrig fri , säger Ri...","[52, 299, 59, 41611, 103, 444, 1024, 729, 19, ...","[O, O, O, O, O, O, O, O, O, PRS, O, O, O, O, O...","[22, 22, 22, 22, 22, 22, 22, 22, 22, 9, 22, 22...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,Hur är det då i Mellanöstern ?,"[1504, 54, 82, 327, 31, 15894, 302, 0, 0, 0, 0...","[O, O, O, O, O, LOC, O]","[22, 22, 22, 22, 22, 11, 22, 22, 22, 22, 22, 2...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, ..."
...,...,...,...,...,...
95,Den svåraste frågan - hur de 100000 människor ...,"[219, 15255, 2162, 52, 612, 102, 48804, 1372, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 2...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
96,- Så vitt jag vet har vi varken skickat fallsk...,"[52, 1022, 2839, 217, 743, 108, 186, 5077, 155...","[O, O, O, O, O, O, O, O, O, O, O, LOC, O, O, O...","[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 1...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
97,De deporterades egen organisation - Lietuvas T...,"[335, 41611, 439, 1078, 3589, 52, 20639, 16583...","[O, O, O, O, O, ORG, O, O, O, O, O, O]","[22, 22, 22, 22, 22, 4, 22, 22, 22, 22, 22, 22...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
98,Men i stället utlöste den nya hot och hot på a...,"[299, 31, 1484, 35059, 97, 664, 2201, 36, 2201...","[O, O, O, O, O, O, O, O, O, O, O, O, O]","[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 2...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [69]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(subset['Padded_Tokenized_Sentence'].values, subset['Entity_IDS'].values, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(subset['Attention_Masks'].values, subset['Padded_Tokenized_Sentence'].values,
                                             random_state=2018, test_size=0.1)

In [66]:
#display(tr_inputs)
#display(tr_masks)
#display(tr_tags)
#tr_inputs.tolist()

In [70]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, int64, int32, int16, int8, uint8, and bool.

In [46]:
tr_inputs.to_numpy().shape

(90, 1)

In [44]:
tr_inputs
tr_inputs = torch.tensor(tr_inputs.to_numpy())


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, int64, int32, int16, int8, uint8, and bool.