In [1]:
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 9.3MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 42.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 56.4MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB

In [2]:
from transformers import TFBertModel,  BertConfig, BertTokenizerFast
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv('/content/drive/My Drive/MinorProject2/BERT2/German_train.csv')
data = data[['text', 'task2']]
data.head()

Unnamed: 0,text,task2
0,Deutsche rothaarige porno reife deutsche fraue...,NONE
1,"Lehrstück auch, wie in der linken Jammerfemini...",NONE
2,RT @NDRinfo: Die deutsche Klimaaktivistin Luis...,NONE
3,@ruhrbahn jeden Morgen eine neue „Fahrzeugstör...,NONE
4,@Junge_Freiheit Die Inkas hatten sich schon dä...,NONE


In [4]:
data['task2'] = pd.Categorical(data['task2'])
data['task2'] = data['task2'].cat.codes
data.head()

Unnamed: 0,text,task2
0,Deutsche rothaarige porno reife deutsche fraue...,1
1,"Lehrstück auch, wie in der linken Jammerfemini...",1
2,RT @NDRinfo: Die deutsche Klimaaktivistin Luis...,1
3,@ruhrbahn jeden Morgen eine neue „Fahrzeugstör...,1
4,@Junge_Freiheit Die Inkas hatten sich schon dä...,1


In [5]:
data, data_test = train_test_split(data, test_size = 0.2)
data.shape,data_test.shape

((1961, 2), (491, 2))

In [6]:
model_name = 'bert-base-uncased'
max_length = 64
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
transformer_model = TFBertModel.from_pretrained(model_name, config = config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [7]:
bert = transformer_model.layers[0]
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)
task2 = Dense(units=len(data.task2.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='task2')(pooled_output)
outputs = {'task2': task2}
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')
model.summary()


Model: "BERT_MultiLabel_MultiClass"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 64)]              0         
_________________________________________________________________
bert (TFBertMainLayer)       ((None, 64, 768), (None,  109482240 
_________________________________________________________________
pooled_output (Dropout)      (None, 768)               0         
_________________________________________________________________
task2 (Dense)                (None, 4)                 3076      
Total params: 109,485,316
Trainable params: 109,485,316
Non-trainable params: 0
_________________________________________________________________


In [8]:
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
loss = {'task2': CategoricalCrossentropy(from_logits = True)}
metric = {'task2': CategoricalAccuracy('accuracy')}

In [9]:
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)
y_issue = to_categorical(data['task2'])

In [10]:
x = tokenizer(
    text=data['text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)
history = model.fit(
    x={'input_ids': x['input_ids']},
    y={'task2': y_issue},
    validation_split=0.2,
    batch_size=64,
    epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
test_y_issue = to_categorical(data_test['task2'])
test_x = tokenizer(
    text=data_test['text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)
model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids']},
    y={'task2': test_y_issue}
)



In [12]:
test_x

{'input_ids': <tf.Tensor: shape=(491, 64), dtype=int32, numpy=
array([[  101, 19387,  1030, ...,     0,     0,     0],
       [  101,  5202,  7384, ...,     0,     0,     0],
       [  101,  1030,  2002, ...,     0,     0,     0],
       ...,
       [  101, 22564,  2162, ...,     0,     0,     0],
       [  101,  1030,  2210, ...,     0,     0,     0],
       [  101, 19387,  1030, ...,     0,     0,     0]], dtype=int32)>}

In [13]:
preds = model.predict(test_x['input_ids'])

In [14]:
preds=preds['task2'].argmax(axis=-1)

In [15]:
preds

array([1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 3, 1, 2,
       1, 3, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3,
       3, 3, 1, 1, 3, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 3, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3,
       1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 3, 1, 1, 1, 1, 0, 2, 1, 1, 1, 3, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 3, 1, 1, 1, 1, 1,
       0, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 1, 3, 1,
       1, 3, 1, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1,
       1, 1, 3, 1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 3, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [16]:
test_y_issue=test_y_issue.argmax(axis=-1)

In [17]:
test_y_issue

array([1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       3, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 1, 2, 1, 1, 3, 1, 1,
       2, 3, 1, 3, 1, 1, 3, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       3, 3, 1, 1, 1, 0, 1, 3, 3, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 3, 3, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 0, 3, 2, 1, 3,
       1, 3, 3, 1, 1, 3, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 3, 1, 3,
       1, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 3, 1, 1, 3, 1, 3, 0, 1, 2, 1, 1, 1, 3, 1, 1, 1, 1, 1,
       1, 3, 3, 1, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 3, 1, 3, 3, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 3, 1, 1, 1, 3, 1,
       1, 1, 2, 1, 3, 1, 3, 1, 1, 3, 3, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,

In [18]:
from sklearn.metrics import classification_report
print(classification_report(test_y_issue,preds))

              precision    recall  f1-score   support

           0       0.06      0.08      0.06        13
           1       0.89      0.93      0.91       385
           2       0.27      0.17      0.21        23
           3       0.74      0.60      0.66        70

    accuracy                           0.82       491
   macro avg       0.49      0.44      0.46       491
weighted avg       0.82      0.82      0.82       491

