In [1]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.7-py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.8/249.8 kB[0m [31m905.9 kB/s[0m eta [36m0:00:00[0m
Collecting streamlit
  Downloading streamlit-1.11.0-py2.py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting watchdog
  Downloading watchdog-2.1.9-py3-none-manylinux2014_x86_64.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck>=0.1.dev5
  Downloading pydeck-0.7.1-py2.py3-none-any.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Preprocessing Data

In [2]:
import os
import sys
import json
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('../input/word-annotation/data.csv')

In [4]:
df.describe()

Unnamed: 0,sentence #,word,annotation
count,8947836,8947835,8947836
unique,533182,531621,9
top,sentence: 417846,.,O
freq,50,558818,7299435


In [5]:
df['word'] = df['word'].fillna(method="ffill")

In [6]:
df['sentence #'] = LabelEncoder().fit_transform(df['sentence #'])

In [7]:
df.rename(columns={"sentence #": "sentence_id","word": "words", "annotation": "labels"}, inplace =True)

In [8]:
df["labels"] = df["labels"].str.upper()

In [9]:
size = 3000000
sys.stdout.write(str(size))

3000000

7

In [10]:
X = df[["sentence_id","words"]][:size]
Y = df["labels"][:size]

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state=0)

In [12]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((2100000, 2), (900000, 2), (2100000,), (900000,))

In [13]:
train_data = pd.DataFrame({"sentence_id": X_train["sentence_id"], "words": X_train["words"], "labels": Y_train})
test_data = pd.DataFrame({"sentence_id": X_test["sentence_id"], "words": X_test["words"], "labels": Y_test})

In [14]:
train_data.head()

Unnamed: 0,sentence_id,words,labels
593800,286914,",",O
1160315,500705,ve,O
172048,3504,İslam,O
2561332,62192,oryantal,B-MISC
928292,484708,bölge,O


# Model Training

In [15]:
from simpletransformers.ner import NERModel, NERArgs

In [16]:
label = df["labels"].unique().tolist()
sys.stdout.write(json.dumps(label, indent=4, sort_keys=True))

[
    "B-PERSON",
    "I-PERSON",
    "O",
    "B-LOCATION",
    "B-MISC",
    "B-ORGANIZATION",
    "I-ORGANIZATION",
    "I-MISC",
    "I-LOCATION"
]

151

In [17]:
args = NERArgs()
args.num_train_epochs = 3
args.learning_rate = 1e-4
args.overwrite_output_dir = True
args.train_batch_size = 32
args.eval_batch_size = 32
args.save_steps = -1
args.save_model_every_epoch = False
args.save_eval_checkpoints = False

In [18]:
model = NERModel('bert', 'dbmdz/bert-base-turkish-128k-cased', labels=label, args=args)

Downloading:   0%|          | 0.00/386 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/706M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not ini

Downloading:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

In [19]:
model.train_model(train_data, eval_data=test_data, acc=accuracy_score)

  0%|          | 0/3 [00:01<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/5678 [00:00<?, ?it/s]



Running Epoch 1 of 3:   0%|          | 0/5678 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/5678 [00:00<?, ?it/s]

(17034, 0.24098763203632634)

In [20]:
result, model_outputs, preds_list = model.eval_model(test_data)

  0%|          | 0/3 [00:02<?, ?it/s]

Running Evaluation:   0%|          | 0/5560 [00:00<?, ?it/s]

In [21]:
sys.stdout.write(json.dumps(result, indent=4, sort_keys=True))

{
    "eval_loss": 0.30565422760999406,
    "f1_score": 0.7038454204538231,
    "precision": 0.6862879985125965,
    "recall": 0.7223247774045204
}

147

In [22]:
prediction, model_output = model.predict(["Hilmi Can Taşkıran Fethiye'lidir ."])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [23]:
sys.stdout.write(str(prediction))

[[{'Hilmi': 'O'}, {'Can': 'O'}, {'Taşkıran': 'O'}, {"Fethiye'lidir": 'B-LOCATION'}, {'.': 'O'}]]

96