# set up path

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

In [None]:
input_path = "/content/gdrive/MyDrive/SuperAI/ss3-hackathon-online-natural-language-processing/ne_sample_submission.csv"

In [None]:
q_path = "/content/gdrive/MyDrive/SuperAI/ss3-hackathon-online-natural-language-processing/ne_test.txt"

In [None]:
ner_list_path = "/content/gdrive/MyDrive/SuperAI/ss3-hackathon-online-natural-language-processing/ne_list.txt"

# simple sentence transformer

In [None]:
# Read the test file (one token per line). The context manager guarantees the
# handle is closed, and the explicit encoding keeps Thai text decoding stable.
with open(q_path, "r", encoding="utf-8") as f:
  file_data = f.readlines()

# Trim surrounding whitespace and remove non-breaking spaces; a blank line becomes ''.
texts_test_raw = [line.strip().replace(u'\xa0', u'') for line in file_data]

In [None]:
print(texts_test_raw)

In [None]:
# Remove the trailing blank entry that comes from the file's final empty line.
# Guarded so that re-running this cell cannot keep removing real tokens.
if texts_test_raw and texts_test_raw[-1] == '':
  texts_test_raw = texts_test_raw[:-1]
len(texts_test_raw)

In [None]:
#Replace blank with "_"
def blank_space(x):
  """Return '_' when `x` is the empty string; otherwise return `x` unchanged."""
  return '_' if x == '' else x

# Replace every blank token with "_" (in place, so the list object is preserved)
texts_test_raw[:] = map(blank_space, texts_test_raw)

In [None]:
print(texts_test_raw)

In [None]:
def split_into_sentences(tokens, tokens_per_sentence=20):
    """Cut `tokens` into consecutive chunks of at most `tokens_per_sentence` items.

    The last chunk holds whatever is left over; an empty input gives an empty list.
    """
    chunk_starts = range(0, len(tokens), tokens_per_sentence)
    return [tokens[start:start + tokens_per_sentence] for start in chunk_starts]

In [None]:
my_token = split_into_sentences(texts_test_raw)

In [None]:
print(my_token) # to batch

In [None]:
sent_join = ' '.join(my_token[0])
type(sent_join)

In [None]:
sent_join # join to sentence with " "

In [None]:
# Turn every token chunk into one space-separated sentence string and echo it.
my_token_list = [' '.join(chunk) for chunk in my_token]
for sent_join in my_token_list:
  print(sent_join)

## pip zone

In [None]:
!pip install -q simpletransformers

In [None]:
!pip install gdown

## load data lst20

In [None]:
import pandas as pd
import json
from datasets import load_dataset
import gdown

In [None]:
!tar -xvf "/content/gdrive/MyDrive/SuperAI/AIFORTHAI-LST20Corpus.tar.gz"

In [None]:
#Load file in directory into variable lst20
lst20 = load_dataset("lst20", data_dir="/content/LST20_Corpus")
lst20

In [None]:
train_df = pd.DataFrame(lst20['train'])
validation_df = pd.DataFrame(lst20['validation'])
test_df = pd.DataFrame(lst20['test'])
train_df.head(3)

In [None]:
df_filter = ['id', 'tokens', 'ner_tags']
train_df = train_df[df_filter]
validation_df = validation_df[df_filter]
test_df = test_df[df_filter]
train_df.head(3)

In [None]:
with open(ner_list_path, "r") as file:
    ner_list = file.read()
print(ner_list)

In [None]:
# Parse the raw "[...]" text read from the tag file into clean tag names, then
# order them as O, B_*, I_*, E_*.
# NOTE: `ner_list` changes from str to list here, so this cell cannot be re-run
# without re-reading the file first.
raw_items = ner_list.strip("[]\n").split(",")
ner_list = [raw.strip().strip(" '") for raw in raw_items]
O_list, B_list, I_list, E_list = (
    [tag for tag in ner_list if tag.startswith(prefix)]
    for prefix in ("O", "B_", "I_", "E_")
)
# Final ordering: O, B_, I_, E_
NER_TAGS = [*O_list, *B_list, *I_list, *E_list]
print(NER_TAGS)

In [None]:
# convert to simple transformer format
def convert_data_to_df(df):
  """Flatten a sentence-level frame into the token-level simpletransformers NER format.

  Args:
    df: DataFrame with a 'tokens' column (list of words per row) and a
      'ner_tags' column (list of integer tag ids per row, aligned with tokens).

  Returns:
    DataFrame with one row per token and the columns 'sentence_id' (0-based
    position of the source row), 'words' and 'labels' (the tag id translated
    to its name through the module-level NER_TAGS list).

  Raises:
    IndexError: if a row has fewer tag ids than tokens, or a tag id is outside NER_TAGS.
  """
  sentence_id = []
  words = []
  labels = []

  # Walk the two columns positionally: this avoids a Series lookup per token and
  # also works when df's index is not the default 0..n-1 range.
  for sentence, (tokens, tag_ids) in enumerate(zip(df['tokens'], df['ner_tags'])):
    for position, word in enumerate(tokens):
      sentence_id.append(sentence)
      words.append(word)
      labels.append(NER_TAGS[tag_ids[position]]) # map 0 to "O", 1 to "B_BRN", ...

  return pd.DataFrame(
      {"sentence_id": sentence_id, "words": words, "labels": labels}
  )

In [None]:
train_df.head()

In [None]:
# preprocess

In [None]:
train_data = convert_data_to_df(train_df)
eval_data = convert_data_to_df(validation_df )
test_data = convert_data_to_df(test_df)

In [None]:
train_data.head()

In [None]:
len(NER_TAGS)

In [None]:
import logging
from simpletransformers.ner import NERModel, NERArgs
import torch

# Simple Transformer https://simpletransformers.ai/docs/ner-minimal-start/
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
data = {
    'Model': ['ALBERT', 'BERT', 'BERTweet', 'BigBird', 'CamemBERT', 'DeBERTa', 'DeBERTa', 'DeBERTaV2', 'DistilBERT', 'ELECTRA', 'HerBERT', 'LayoutLM', 'LayoutLMv2', 'Longformer', 'MobileBERT', 'MPNet', 'RemBERT', 'RoBERTa', 'SqueezeBert', 'XLM', 'XLM-RoBERTa', 'XLNet'],
    'Model code for NERModel': ['albert', 'bert', 'bertweet', 'bigbird', 'camembert', 'deberta', 'deberta', 'deberta-v2', 'distilbert', 'electra', 'herbert', 'layoutlm', 'layoutlmv2', 'longformer', 'mobilebert', 'mpnet', 'rembert', 'roberta', 'squeezebert', 'xlm', 'xlmroberta', 'xlnet']
}
modelType_df = pd.DataFrame(data)
modelType_df

In [None]:
train_data.info()

In [None]:
# NOTE(review): `.str.len()` counts characters per entry of the 'words' column, so
# this is the length of the longest single token, not the number of tokens in a
# sentence — the variable name is misleading.
max_seq_length = train_data['words'].str.len().max()
print("Maximum length in column 'words':", max_seq_length)

In [None]:
# Configure the model
ner_args = NERArgs()
ner_args.train_batch_size = 128 #192 is fit for GPU T4, 512 for A100
ner_args.use_multiprocessing = True
ner_args.evaluate_during_training = True
ner_args.eval_batch_size = 1024
ner_args.num_train_epochs = 2
ner_args.overwrite_output_dir = True

# The checkpoint is XLM-RoBERTa, so the model type must be "xlmroberta" (see the
# model-code table above); "roberta" selects the plain RoBERTa classes, which do
# not match this checkpoint's tokenizer.
model = NERModel(
     "xlmroberta", # Model Type
     "xlm-roberta-base",  # Pre-trained checkpoint
     args=ner_args, use_cuda=torch.cuda.is_available(), labels=NER_TAGS # Local Config
)

In [None]:
model.train_model(train_data, eval_data=eval_data)

In [None]:
result, model_outputs, preds_list = model.eval_model(eval_data)
result

In [None]:
!cp -r /content/outputs/best_model /content/gdrive/MyDrive/SuperAI/hack5/model

# fine-tune pythainlp with lst20

In [None]:
!pip install pythainlp
!pip install transformers

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
from pythainlp.tokenize import word_tokenize
import torch

In [None]:
name="pythainlp/thainer-corpus-v2-base-model"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForTokenClassification.from_pretrained(name)

# sparknlp tokenize+ner

In [None]:
! pip install -q pyspark==3.3.0 spark-nlp==4.2.8
! pip install --upgrade -q spark-nlp-display

In [None]:
import json
import pandas as pd
import numpy as np

import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType

In [None]:
spark = sparknlp.start()

print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

spark

In [None]:
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

sentence_detector = SentenceDetector()\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

word_segmenter = WordSegmenterModel.pretrained("wordseg_best", "th")\
    .setInputCols(["sentence"])\
    .setOutputCol("token")

embeddings = WordEmbeddingsModel.pretrained("glove_840B_300", "xx")\
    .setInputCols("document", "token") \
    .setOutputCol("embeddings")

ner = NerDLModel.pretrained("ner_lst20_glove_840B_300d", "th") \
    .setInputCols(["document", "token", "embeddings"]) \
    .setOutputCol("ner")

In [None]:
# `ner_converter` was referenced in the pipeline without ever being defined.
# It groups token-level tags into chunks; the visualizer cell reads 'ner_chunk'.
# NOTE(review): confirm NerConverter handles the LST20 B_/I_/E_ tag scheme as expected.
ner_converter = NerConverter() \
    .setInputCols(["document", "token", "ner"]) \
    .setOutputCol("ner_chunk")

pipeline = Pipeline(stages=[document_assembler, sentence_detector, word_segmenter, embeddings, ner, ner_converter])
example = spark.createDataFrame([['ผมคิดว่าเป็นปลายเหตุที่แทกซี่ไม่ยอมกดมิเตอร์ จริงๆต้องเห็นใจเขาเหมือนกัน หากกดมิเตอร์ไปแล้ว ในช่วงที่รถติดจริงๆ มิเตอร์ไม่ขึ้น เขาก็ไม่ได้ตัง']], ["text"])
result = pipeline.fit(example).transform(example)

In [None]:
result.collect()[0]

In [None]:
from sparknlp_display import NerVisualizer

NerVisualizer().display(
    result = result.collect()[0],
    label_col = 'ner_chunk',
    document_col = 'document'
)

# wangchanberta

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

In [None]:
!pip install -q simpletransformers

In [None]:
!tar -xvf "/content/AIFORTHAI-LST20Corpus.tar.gz"

In [None]:
import pandas as pd
import json
from datasets import load_dataset
import gdown
from sklearn.metrics import f1_score

In [None]:
from datasets import load_dataset
lst20 = load_dataset("lst20", data_dir="/content/gdrive/MyDrive/SuperAI/LST20_Corpus")

In [None]:
lst20

In [None]:
import pandas as pd
train_df = pd.DataFrame(lst20['train'])
validation_df = pd.DataFrame(lst20['validation'])
test_df = pd.DataFrame(lst20['test'])

In [None]:
#train_ = pd.concat([train_df, validation_df] ,ignore_index = True)

In [None]:
train_df

In [None]:
_POS_TAGS = ["NN", "VV", "PU", "CC", "PS", "AX", "AV", "FX", "NU", "AJ", "CL", "PR", "NG", "PA", "XX", "IJ"]
_NER_TAGS = [
        "O",
        "B_BRN",
        "B_DES",
        "B_DTM",
        "B_LOC",
        "B_MEA",
        "B_NUM",
        "B_ORG",
        "B_PER",
        "B_TRM",
        "B_TTL",
        "I_BRN",
        "I_DES",
        "I_DTM",
        "I_LOC",
        "I_MEA",
        "I_NUM",
        "I_ORG",
        "I_PER",
        "I_TRM",
        "I_TTL",
        "E_BRN",
        "E_DES",
        "E_DTM",
        "E_LOC",
        "E_MEA",
        "E_NUM",
        "E_ORG",
        "E_PER",
        "E_TRM",
        "E_TTL",
    ]
_CLAUSE_TAGS = ["O", "B_CLS", "I_CLS", "E_CLS"]

In [None]:
target = 'กระต่าย'

# Print every training row whose token list contains `target`.
for row_pos, row_tokens in enumerate(train_df["tokens"]):
  if target in row_tokens:
    print(train_df.iloc[row_pos])

In [None]:
row = 1
sample_show = pd.DataFrame({"token":train_df.loc[row, "tokens"],
                            "ner":[_NER_TAGS[i] for i in train_df["ner_tags"][row]]})
sample_show

In [None]:
def convert_to_simple_transformer_format(df, field_name, tags, max_rows=20000):
  """Flatten a sentence-level frame into the token-level simpletransformers format.

  Args:
    df: DataFrame with a 'tokens' column (list of words per row) and a
      `field_name` column holding one integer tag id per token.
    field_name: name of the column that holds the tag ids (e.g. "ner_tags").
    tags: sequence that maps a tag id to its label string.
    max_rows: only the first `max_rows` rows of `df` are converted (kept at the
      original 20000 by default, for speed); pass None to convert every row.

  Returns:
    DataFrame with columns 'sentence_id' (the source row's index label),
    'words' and 'labels', one row per token.
  """
  sentence_id = []
  words = []
  labels = []

  # Cap the number of sentences for speed unless the caller lifts the limit.
  subset = df if max_rows is None else df[:max_rows]
  for (idx, r) in subset.iterrows():
    tag_ids = r[field_name]
    for (i, t) in enumerate(r['tokens']):
      sentence_id.append(idx)
      words.append(t)
      labels.append(tags[tag_ids[i]])

  return pd.DataFrame(
      {"sentence_id": sentence_id, "words": words, "labels": labels}
  )

In [None]:
train_ = convert_to_simple_transformer_format(train_df, "ner_tags", _NER_TAGS)
validation_ = convert_to_simple_transformer_format(validation_df, "ner_tags", _NER_TAGS)
test_ = convert_to_simple_transformer_format(test_df, "ner_tags", _NER_TAGS)

In [None]:
train_

In [None]:
import torch
import pandas as pd
from simpletransformers.ner import NERModel, NERArgs

In [None]:
# Configure the training arguments for the WangchanBERTa run.
ner_args = NERArgs()
# NOTE(review): 2002 is an unusual batch size — confirm it is not a typo.
ner_args.train_batch_size = 2002
ner_args.use_multiprocessing = True
# Evaluate on the eval_data passed to train_model while training.
ner_args.evaluate_during_training = True
ner_args.eval_batch_size = 2048
ner_args.num_train_epochs = 20
# Allow re-using the same output directory on repeated runs.
ner_args.overwrite_output_dir = True
ner_args.gradient_accumulation_steps = 16
# NOTE(review): 81 is hard-coded; presumably derived from the data — confirm its origin.
ner_args.max_seq_length = 81

#ner_args.learning_rate = 2e-4
#optimizer = Adafactor(model.parameters(), scale_parameter=False, relative_step=True, warmup_init=True, lr=None)
#ner_args.optimizer = (Adafactor(model.parameters(), scale_parameter=False, relative_step=True, warmup_init=True, lr=None))

In [None]:
model = NERModel(
    "camembert", "airesearch/wangchanberta-base-att-spm-uncased", args=ner_args, use_cuda=torch.cuda.is_available(), labels=_NER_TAGS
)

# Train the model. Evaluation during training uses the validation split, so the
# held-out test split does not influence which checkpoint is kept as best_model.
model.train_model(train_, eval_data=validation_, show_running_loss=True)

In [None]:
!cp -r /content/outputs/best_model /content/gdrive/MyDrive/SuperAI/hack5/model

## load model

In [None]:
ner_args = NERArgs()
ner_args.eval_batch_size = 1024
ner_args.use_multiprocessing = True
test_ner = NERModel("camembert", "/content/gdrive/MyDrive/SuperAI/hack5/model/best_model_wangchanberta_addarg"
, args=ner_args
                    , use_cuda=torch.cuda.is_available(), labels= _NER_TAGS)

In [None]:
predictions, raw_outputs = test_ner.predict(["Obama did his last work at 12 June"])
predictions

In [None]:
!cp -r /content/outputs/best_model /content/gdrive/MyDrive/hack5/model

## make submission

In [None]:
import pandas as pd

test_data = pd.read_csv("/content/gdrive/MyDrive/SuperAI/hack5/nithan-chadok-name-entity-recognition/test.csv")

In [None]:
# Collect the "word" column of the test CSV as a plain list, row by row.
texts_test_raw = [test_data.loc[row, "word"] for row in range(len(test_data))]

In [None]:
def blank_space(x):
  """Map the empty string to '_'; pass every other value through untouched."""
  if x != '':
    return x
  return '_'

# Replace blank tokens with "_" in place
for pos, tok in enumerate(texts_test_raw):
  texts_test_raw[pos] = blank_space(tok)

In [None]:
def split_into_sentences(tokens, tokens_per_sentence=15):
    """Group `tokens` into consecutive pseudo-sentences of `tokens_per_sentence` items.

    The final group may be shorter; an empty input gives an empty list.
    """
    return [
        tokens[offset:offset + tokens_per_sentence]
        for offset in range(0, len(tokens), tokens_per_sentence)
    ]

In [None]:
def check_inside(my_token):
  """Return the total number of items across all chunks in `my_token`."""
  return sum(len(chunk) for chunk in my_token)

In [None]:
my_token = split_into_sentences(texts_test_raw)

In [None]:
sent_join = ' '.join(my_token[0])
type(sent_join)

In [None]:
# Build one space-separated sentence string per token chunk.
my_token_list = []
for chunk in my_token:
  sent_join = ' '.join(chunk)
  my_token_list += [sent_join]

In [None]:
# Test Model
ner_args = NERArgs()
ner_args.eval_batch_size = 128
ner_args.use_multiprocessing = True
model = NERModel(
     "auto", "/content/gdrive/MyDrive/SuperAI/hack5/model/best_model_wangchanberta_addarg", args=ner_args, use_cuda=torch.cuda.is_available(), labels=_NER_TAGS  # your latest model
)

In [None]:
predictions, raw_outputs = model.predict(my_token, False)

In [None]:
test_tag = pd.read_csv("/content/gdrive/MyDrive/SuperAI/hack5/nithan-chadok-name-entity-recognition/tag_list.csv")

In [None]:
test_tag

In [None]:
test_tag.info()

In [None]:
# Lookup table from NER tag name to the competition's class id.
tag2class = {
  test_tag.loc[row, "tag"]: test_tag.loc[row, "class"]
  for row in range(len(test_tag))
}

In [None]:
# Flatten the predictions (a list of sentences, each a list of single-entry
# dicts) into one flat list of the dict values.
final_test_df = [
  label
  for sentence_pred in predictions
  for token_pred in sentence_pred
  for label in token_pred.values()
]

In [None]:
final_result = pd.DataFrame(final_test_df, columns=["id"])
final_result

In [None]:
# Translate every predicted tag into its class id, stored as a string.
final_result["pred"] = [str(tag2class[tag]) for tag in final_result["id"]]

In [None]:
final_result

In [None]:
submission = pd.read_csv("/content/gdrive/MyDrive/SuperAI/hack5/nithan-chadok-name-entity-recognition/sample_submission.csv")

In [None]:
test_df = pd.read_csv("/content/gdrive/MyDrive/SuperAI/hack5/nithan-chadok-name-entity-recognition/test.csv")

In [None]:
test_df

In [None]:
# Rule-based post-processing of the predicted class ids, keyed on the raw token.
# The animal lookup list is built once instead of on every iteration.
ani = "นกมูลไถ,พังพอน,ลิง,หงส์,นกแขกเต้า,เต่า,กวาง,จระเข้,นกหัวขวาน,กบเขียว,ปูทอง,ปู,นกต้อย,นกดุเหว่า,ปลา,นกเค้า,นก,ช้าง,กระต่าย,นาก,เหยี่ยว,หงส์ทอง,หมูป่า,แรด"
ani_ls = ani.split(',')
n_rows = len(test_df)

for i in range(n_rows):
  target = test_df.loc[i, "word"]

  # animal names
  if(target in ani_ls):
    final_result.loc[i, "pred"] = "0"
  # tokens containing "hunter"; non-string cells (e.g. NaN) cannot be substring-tested
  if(isinstance(target, str) and "นายพราน" in target):
    final_result.loc[i, "pred"] = "0"
  # honorific prefixes
  if(target in ["ท้าว", "พญา", "ท่าน", "พระ"]):
    final_result.loc[i, "pred"] = "9"
  # other prefixes
  # NOTE(review): "ท้าว" also matches the previous rule, so it always ends up as "11".
  if(target in ["ท้าว", "เจ้า"]):
    final_result.loc[i, "pred"] = "11"
  # a class-12 token forces the previous token to class 3; there is no previous
  # token at i == 0 (`.loc[-1]` would silently append a bogus row)
  if(i > 0 and final_result.loc[i, "pred"] == "12"):
    final_result.loc[i-1, "pred"] = "3"
  # "_" placeholder and first-person pronouns
  if(target in ["_", "เรา", "ข้าพเจ้า"]):
    final_result.loc[i, "pred"] = "0"
  # "ที่" followed two tokens later by a number (ordinal); guard the look-ahead
  if (target == "ที่") and (i + 2 < n_rows) and str(test_df.loc[i+2, "word"]).isdigit() :
    final_result.loc[i, "pred"] = "4"
  # pair of 4 and 17: the token in between becomes 15; guard the look-ahead
  if (i + 2 < len(final_result)) and (final_result.loc[i, "pred"]=="4") and (final_result.loc[i+2, "pred"]=="17"):
    final_result.loc[i+1, "pred"] = "15"

In [None]:
submission["pred"] = final_result["pred"]

In [None]:
submission

In [None]:
submission = submission[['i','pred']]
submission = submission.set_index('i')
submission.head()

In [None]:
submission.to_csv("submission_wangchanberta-53-unluckies.csv")

In [None]:
final_result.to_csv('/content/myresult.csv',index=False)

In [None]:
submisstion_df = pd.read_csv('/content/gdrive/MyDrive/SuperAI/ss3-hackathon-online-natural-language-processing/ne_sample_submission.csv')
submisstion_df

In [None]:
# Attach the raw tokens and the predictions to the sample-submission frame.
# NOTE(review): `final_result` is a DataFrame built with an "id" column and later
# given a "pred" column; assigning a multi-column DataFrame to the single
# 'Predicted' column may fail — confirm which column is intended here.
submisstion_df['token'] = pd.DataFrame({'Token': texts_test_raw})
submisstion_df['Predicted'] = final_result
submisstion_df.head(20)

In [None]:
submission_df = submisstion_df[['Id','Predicted']]
submission_df = submission_df.set_index('Id')
submission_df.head()

In [None]:
submission_df.to_csv("submission_wangchanberta.csv")

# LSTM + crf
ไม่เวิร์คจ้า

## pip zone

In [None]:
! pip install transformers
! pip install datasets
! pip install pythainlp
! pip install python-crfsuite
! pip install sentencepiece
! pip install keras-crf
! pip install git+https://www.github.com/keras-team/keras-contrib.git
! pip install tf2crf
! pip install keras
! pip install seqeval
! pip install sklearn_crfsuite
! pip install pyyaml h5py

In [None]:
import numpy as np
import pandas as pd
import datasets
import re
import matplotlib.pyplot as plt
import tensorflow as tf

## load + preprocess

In [None]:
ner_tag = [
        "O",
        "B_BRN",
        "B_DES",
        "B_DTM",
        "B_LOC",
        "B_MEA",
        "B_NUM",
        "B_ORG",
        "B_PER",
        "B_TRM",
        "B_TTL",
        "I_BRN",
        "I_DES",
        "I_DTM",
        "I_LOC",
        "I_MEA",
        "I_NUM",
        "I_ORG",
        "I_PER",
        "I_TRM",
        "I_TTL",
        "E_BRN",
        "E_DES",
        "E_DTM",
        "E_LOC",
        "E_MEA",
        "E_NUM",
        "E_ORG",
        "E_PER",
        "E_TRM",
        "E_TTL",
    ]

In [None]:
len(ner_tag)

In [None]:
# Lookup tables between integer tag ids and NER tag names, in both directions.
id_to_ner = dict(enumerate(ner_tag))


ner_to_id = {name: pos for pos, name in enumerate(ner_tag)}

In [None]:
def idx_to_ner(idx, id_to_ner):
    """Translate an iterable of tag ids into tag names using the `id_to_ner` mapping."""
    return [id_to_ner[tag_id] for tag_id in idx]

In [None]:
def ner_to_idx(ner, ner_to_id):
    """Translate an iterable of tag names into tag ids using the `ner_to_id` mapping."""
    return list(map(ner_to_id.__getitem__, ner))

In [None]:
dataset = datasets.load_dataset("lst20", data_dir="/content/gdrive/MyDrive/SuperAI/LST20_Corpus")

In [None]:
dataset

In [None]:
# df = pd.concat([pd.DataFrame(dataset['train']),
#                 pd.DataFrame(dataset['validation']),
#                 pd.DataFrame(dataset['test'])])
df = pd.DataFrame(dataset['train'])
df['len_token'] = [len(i) for i in df.tokens.values]
df = df[df['len_token'] <= 100] # smallest unit <=100
df.head()

In [None]:
df.shape

In [None]:
# Flatten every sentence's token list into one long list of tokens.
tokens = [tok for sent in df.tokens for tok in sent]

len(tokens)

In [None]:
# Flatten every sentence's tag-id list into one long list, aligned with `tokens`.
ner_label = [tag_id for sent_tags in df.ner_tags for tag_id in sent_tags]

len(ner_label)

In [None]:
# Distinct vocabulary. Sorted so the word -> index assignment is the same in
# every session: plain set iteration order changes with string hash
# randomisation, which would make weights saved in one session inconsistent
# with the vocabulary rebuilt in another.
words = sorted(set(tokens))
n_words = len(words)

# Distinct tag ids that occur in the training data, in ascending order.
tags = sorted(set(ner_label))
n_tags = len(tags)

print(tags)

In [None]:
from future.utils import iteritems

# Vocabulary index for every distinct training token, numbered from 0.
# NOTE(review): index 0 is therefore a real word, while the sequences below are
# padded without an explicit value (presumably 0) and the Embedding layer is
# built with mask_zero=True — confirm this collision is acceptable.
word2idx = {w: i for i, w in enumerate(words)}
# The tag mappings reuse the tables built from the fixed `ner_tag` list.
idx2tag = id_to_ner
tag2idx = ner_to_id

## vectorize to same size

In [None]:
from keras.preprocessing import sequence
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Longest sentence (in tokens); every sequence is padded to this length.
maxlen = max(len(s) for s in df.tokens)

# Words -> vocabulary indices, padded at the end to a common length.
X = [[word2idx[w] for w in s] for s in df.tokens]
X = tf.keras.utils.pad_sequences(maxlen=maxlen, sequences=X, padding="post")

# Tag ids padded the same way, then one-hot encoded per token.
y = [list(s) for s in df.ner_tags]
y = tf.keras.utils.pad_sequences(maxlen=maxlen, sequences=y, padding="post")
y = [to_categorical(i, num_classes=n_tags) for i in y]

# Fixed seed: the same rows land in the test split on every run, so a model
# reloaded in a later session is not evaluated on rows it was trained on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(len(X_train), len(X_test), len(y_train), len(y_test))

## model

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
from tensorflow_addons.text import crf_log_likelihood, crf_decode


class CRF(L.Layer):
    """Linear-chain CRF output layer built on `crf_decode` / `crf_log_likelihood`.

    The layer owns a trainable (output_dim x output_dim) transition matrix.
    `call` returns the inputs unchanged in the training phase and the one-hot
    Viterbi decoding otherwise. The matching loss function and accuracy metric
    are exposed through the `loss` and `accuracy` properties.
    """
    def __init__(self,
                 output_dim,
                 sparse_target=True,
                 **kwargs):
        """
        Args:
            output_dim (int): the number of labels to tag each temporal input.
            sparse_target (bool): whether the ground-truth label is represented in one-hot.
        Input shape:
            (batch_size, sentence length, output_dim)
        Output shape:
            (batch_size, sentence length, output_dim)
        """
        super(CRF, self).__init__(**kwargs)
        self.output_dim = int(output_dim)
        self.sparse_target = sparse_target
        self.input_spec = L.InputSpec(min_ndim=3)
        # Masks from earlier layers are not consumed by this layer.
        self.supports_masking = False
        # Both are filled in later: `sequence_lengths` in call(), `transitions` in build().
        self.sequence_lengths = None
        self.transitions = None

    def build(self, input_shape):
        """Validate the input shape and create the trainable transition matrix.

        Raises:
            ValueError: if the last input dimension is undefined or differs from `output_dim`.
        """
        assert len(input_shape) == 3
        f_shape = tf.TensorShape(input_shape)
        input_spec = L.InputSpec(min_ndim=3, axes={-1: f_shape[-1]})

        if f_shape[-1] is None:
            raise ValueError('The last dimension of the inputs to `CRF` '
                             'should be defined. Found `None`.')
        if f_shape[-1] != self.output_dim:
            raise ValueError('The last dimension of the input shape must be equal to output'
                             ' shape. Use a linear layer if needed.')
        self.input_spec = input_spec
        # Tag-to-tag transition scores, learned together with the rest of the model.
        self.transitions = self.add_weight(name='transitions',
                                           shape=[self.output_dim, self.output_dim],
                                           initializer='glorot_uniform',
                                           trainable=True)
        self.built = True

    def compute_mask(self, inputs, mask=None):
        """Return the incoming mask unchanged."""
        # Just pass the received mask from previous layer, to the next layer or
        # manipulate it if this layer changes the shape of the input
        return mask

    def call(self, inputs, sequence_lengths=None, training=None, **kwargs):
        """Decode the most likely tag sequence for `inputs`.

        Args:
            inputs: rank-3 tensor of per-token scores (see the class input shape).
            sequence_lengths: optional int32 tensor of shape (batch, 1) holding the
                real length of every sequence; when omitted, every sequence is
                treated as having the full length of axis 1 (padding included).
            training: accepted but not read in this method.

        Returns:
            The raw input scores in the training phase, otherwise the Viterbi
            decoding one-hot encoded over `output_dim` (via `K.in_train_phase`).
        """
        sequences = tf.convert_to_tensor(inputs, dtype=self.dtype)
        if sequence_lengths is not None:
            assert len(sequence_lengths.shape) == 2
            assert tf.convert_to_tensor(sequence_lengths).dtype == 'int32'
            seq_len_shape = tf.convert_to_tensor(sequence_lengths).get_shape().as_list()
            assert seq_len_shape[1] == 1
            self.sequence_lengths = K.flatten(sequence_lengths)
        else:
            # No lengths given: use the full time dimension for every batch item.
            self.sequence_lengths = tf.ones(tf.shape(inputs)[0], dtype=tf.int32) * (
                tf.shape(inputs)[1]
            )

        viterbi_sequence, _ = crf_decode(sequences,
                                         self.transitions,
                                         self.sequence_lengths)
        output = K.one_hot(viterbi_sequence, self.output_dim)
        return K.in_train_phase(sequences, output)

    @property
    def loss(self):
        """Return a Keras-compatible loss function: the mean negative CRF log-likelihood."""
        def crf_loss(y_true, y_pred):
            y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
            # When sparse_target is set, y_true is reduced to tag indices with argmax.
            # NOTE(review): `self.transitions` is re-bound to the value returned by
            # crf_log_likelihood — confirm this does not replace the trainable weight.
            log_likelihood, self.transitions = crf_log_likelihood(
                y_pred,
                tf.cast(K.argmax(y_true), dtype=tf.int32) if self.sparse_target else y_true,
                self.sequence_lengths,
                transition_params=self.transitions,
            )
            return tf.reduce_mean(-log_likelihood)
        return crf_loss

    @property
    def accuracy(self):
        """Return a metric function: token-level accuracy of the Viterbi decoding."""
        def viterbi_accuracy(y_true, y_pred):
            # -1e10 to avoid zero at sum(mask)
            mask = K.cast(
                K.all(K.greater(y_pred, -1e10), axis=2), K.floatx())
            shape = tf.shape(y_pred)
            # Decode over the full time dimension for every batch item.
            sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
            y_pred, _ = crf_decode(y_pred, self.transitions, sequence_lengths)
            if self.sparse_target:
                y_true = K.argmax(y_true, 2)
            y_pred = K.cast(y_pred, 'int32')
            y_true = K.cast(y_true, 'int32')
            corrects = K.cast(K.equal(y_true, y_pred), K.floatx())
            return K.sum(corrects * mask) / K.sum(mask)
        return viterbi_accuracy

    def compute_output_shape(self, input_shape):
        """Return the input's first two dimensions followed by `output_dim`."""
        tf.TensorShape(input_shape).assert_has_rank(3)
        return input_shape[:2] + (self.output_dim,)

    def get_config(self):
        """Return the layer configuration merged over the base layer's config.

        NOTE(review): 'supports_masking' and 'transitions' are not named
        parameters of __init__, so on reconstruction they would reach the base
        layer through **kwargs — confirm that round-trips as intended.
        """
        config = {
            'output_dim': self.output_dim,
            'sparse_target': self.sparse_target,
            'supports_masking': self.supports_masking,
            'transitions': K.eval(self.transitions)
        }
        base_config = super(CRF, self).get_config()
        return dict(base_config, **config)

In [None]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras import Model
import keras as k

# Input: one padded sequence of word indices per sample.
# (Named `input_layer` so the builtin `input` is no longer shadowed.)
input_layer = Input(shape=(maxlen,))
word_embedding_size = 128

# Embedding Layer
# input_length follows `maxlen` so it always agrees with the Input shape above
# (it was hard-coded to 140).
model = Embedding(input_dim=n_words,
                  output_dim=word_embedding_size,
                  input_length=maxlen,
                  mask_zero=True
                  )(input_layer)

# BI-LSTM Layer
model = Bidirectional(LSTM(units=word_embedding_size,
                           return_sequences=True,
                           dropout=0.5,
                           recurrent_dropout=0.5,
                           kernel_initializer=k.initializers.he_normal()))(model)

model = LSTM(units=word_embedding_size * 2,
             return_sequences=True,
             dropout=0.5,
             recurrent_dropout=0.5,
             kernel_initializer=k.initializers.he_normal())(model)

# TimeDistributed Layer: per-token scores, one per tag, as required by the CRF input.
# NOTE(review): the CRF's own error message suggests a linear layer here; "relu"
# clips negative scores — confirm this activation is intended.
model = TimeDistributed(Dense(n_tags, activation="relu"))(model)

# CRF Layer
crf = CRF(n_tags)
output = crf(model)
model = Model(input_layer, output)

In [None]:
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt

# Optimiser. `learning_rate` is the supported keyword; the old `lr` alias is
# ignored or rejected by current Keras optimizers.
adam = k.optimizers.Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999)

# Compile model with the CRF's own loss and Viterbi accuracy metric.
model.compile(optimizer=adam, loss=crf.loss, metrics=[crf.accuracy])

model.summary()

# Save only the best model (highest validation Viterbi accuracy); stop early
# once that metric has not improved for 3 epochs.
filepath="/content/gdrive/MyDrive/SuperAI/hack5/model/best_model_crf.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_viterbi_accuracy', verbose=1, save_best_only=True, mode='max')
stopping = tf.keras.callbacks.EarlyStopping(monitor='val_viterbi_accuracy', patience=3)
callbacks_list = [checkpoint, stopping]

In [None]:
# Fit the best model
train_hist = model.fit(X_train, np.array(y_train), batch_size=128, epochs=5, validation_split=0.1, verbose=True, callbacks=callbacks_list)

## visualize result

In [None]:
# Plot the graph
plt.style.use('ggplot')

def plot_history(history):
    """Draw training vs. validation curves for accuracy (left) and loss (right).

    Args:
        history: object whose `.history` dict holds the keys 'viterbi_accuracy',
            'val_viterbi_accuracy', 'loss' and 'val_loss' (one value per epoch).
    """
    logs = history.history
    train_acc, val_acc = logs['viterbi_accuracy'], logs['val_viterbi_accuracy']
    train_loss, val_loss = logs['loss'], logs['val_loss']
    epochs = range(1, len(train_acc) + 1)

    # (training curve, validation curve, training label, validation label, title)
    panels = (
        (train_acc, val_acc, 'Training acc', 'Validation acc',
         'Training and validation accuracy'),
        (train_loss, val_loss, 'Training loss', 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (train_curve, val_curve, train_label, val_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_curve, 'b', label=train_label)
        plt.plot(epochs, val_curve, 'r', label=val_label)
        plt.title(title)
        plt.legend()

plot_history(train_hist)

Reference (saving and loading Keras models): https://www.tensorflow.org/tutorials/keras/save_and_load?hl=th

In [None]:
model.load_weights('/content/gdrive/MyDrive/SuperAI/hack5/model/best_model_crf.hdf5')

In [None]:
def pred2label(pred):
    """Convert per-token score vectors into tag names.

    For every sequence in `pred` and every score vector in it, the index of the
    highest score is looked up in the module-level `idx2tag` mapping.
    """
    return [[idx2tag[np.argmax(scores)] for scores in sequence] for sequence in pred]
test_pred = model.predict(X_test, verbose=1)
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_test)

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))

## make submission

In [None]:
# Read the test tokens (one per line) and stand in "_" for every blank line.
with open('/content/gdrive/MyDrive/SuperAI/ss3-hackathon-online-natural-language-processing/ne_test.txt') as f:
    ne_test = f.read().split('\n')

ne_test[:] = ['_' if tok == '' else tok for tok in ne_test]

In [None]:
import pandas as pd

test_data = pd.read_csv("/content/gdrive/MyDrive/SuperAI/hack5/nithan-chadok-name-entity-recognition/test.csv")
test_data.shape
test_data

In [None]:
# Encode every test word as its vocabulary index.
submit_test = []
# Exclusive upper bound for the random fallback index, computed once rather
# than rebuilding the list of all indices for every unknown word.
oov_upper = max(word2idx.values())
for i in range(len(test_data)):
    w = test_data.loc[i, "word"]
    if w not in word2idx:
        # Out-of-vocabulary word: substitute the index of a random known word.
        submit_test.append(np.random.randint(0, oov_upper))
    else:
        submit_test.append(word2idx[w])

submit_test[:10]

In [None]:
def split_sentence(sentence, maxlen):
    """Cut `sentence` into consecutive chunks of at most `maxlen` items.

    Slicing past the end of a sequence never raises, so the former bare
    try/except (which could only hide unrelated errors) is gone.

    Args:
        sentence: any sliceable sequence.
        maxlen: positive chunk size.

    Returns:
        List of chunks; the last one may be shorter. Empty input gives [].

    Raises:
        ValueError: if `maxlen` is 0 (from `range`).
    """
    return [sentence[i:i + maxlen] for i in range(0, len(sentence), maxlen)]

# Chunk the index sequence into rows of `maxlen` and pad the short final row at its end.
# NOTE(review): the pad value is max(submit_test) - 1, i.e. some vocabulary index
# rather than a dedicated padding id — confirm this is intended.
split_test = split_sentence(submit_test, maxlen)
split_test = tf.keras.utils.pad_sequences(maxlen=maxlen, sequences=split_test,
                                          padding="post", value=max(submit_test) - 1)

In [None]:
num_padded = len(split_test.flatten()) - len(submit_test)
num_padded

In [None]:
submit_pred = model.predict(split_test, verbose=1)

In [None]:
submit_labels = pred2label(submit_pred)
print(len(submit_labels))
# Flatten the per-chunk labels and keep exactly one label per real test token:
# the padding sits at the end of the last chunk, so trimming to len(submit_test)
# drops it without relying on a hard-coded row count.
submit_labels = np.array(submit_labels).flatten()[:len(submit_test)]
submit_labels.shape

In [None]:
submit_df = pd.DataFrame({'Id': range(0, len(submit_labels)), 'Predicted': submit_labels})
submit_df

In [None]:
submit_df = submit_df[0:65007]

In [None]:
submit_df

In [None]:
test_tag = pd.read_csv("/content/gdrive/MyDrive/SuperAI/hack5/nithan-chadok-name-entity-recognition/tag_list.csv")

In [None]:
# Lookup table from NER tag name to the competition's class id, stored as a string.
tag2class = {
  test_tag.loc[row, "tag"]: str(test_tag.loc[row, "class"])
  for row in range(len(test_tag))
}

In [None]:
tag2class

In [None]:
# Translate each predicted tag into its class id string; tags that are missing
# from the lookup table fall back to "0".
submit_df["pred"] = [str(tag2class.get(tag, 0)) for tag in submit_df["Predicted"]]

In [None]:
submit_df

In [None]:
submit_df.to_csv("submission_crf_1.csv", index=False)

# Using HoogBERTa-NER-lst20
แถม tokenize ไม่น่าใช้

In [None]:
!pip install attacut
!pip install transformers

In [None]:
# Read the test file (one token per line). The context manager guarantees the
# handle is closed, and the explicit encoding keeps Thai text decoding stable.
with open(q_path, "r", encoding="utf-8") as f:
  file_data = f.readlines()

# Trim surrounding whitespace and remove non-breaking spaces; a blank line becomes ''.
texts_test_raw = [line.strip().replace(u'\xa0', u'') for line in file_data]

In [None]:
# Remove the trailing blank entry that comes from the file's final empty line.
# Guarded so that re-running this cell cannot keep removing real tokens.
if texts_test_raw and texts_test_raw[-1] == '':
  texts_test_raw = texts_test_raw[:-1]

In [None]:
#Replace blank with "_"
def blank_space(x):
  """Substitute '_' for the empty string; return anything else as is."""
  return x if x != '' else '_'

# Replace every blank token with "_" (in place)
texts_test_raw[:] = [blank_space(tok) for tok in texts_test_raw]

In [None]:
def split_into_sentences(tokens, tokens_per_sentence=20):
    """Break `tokens` into back-to-back windows of `tokens_per_sentence` items.

    The trailing window keeps the remainder; no tokens gives no windows.
    """
    windows = []
    cursor = 0
    total = len(tokens)
    # `range` is still evaluated first so an invalid step fails exactly as before.
    for cursor in range(0, total, tokens_per_sentence):
        windows += [tokens[cursor:cursor + tokens_per_sentence]]
    return windows

In [None]:
my_token = split_into_sentences(texts_test_raw)

In [None]:
my_token[0]

In [None]:
sent_join = ''.join(my_token[0])

In [None]:
# Turn every token chunk into one space-separated sentence string and echo it.
my_token_list = []
for chunk in my_token:
  sent_join = ' '.join(chunk)
  print(sent_join)
  my_token_list += [sent_join]

In [None]:
from transformers import RobertaTokenizerFast, RobertaForTokenClassification
from attacut import tokenize
import torch

tokenizer = RobertaTokenizerFast.from_pretrained("Sirinya/ner-finetuned-lst20")
model = RobertaForTokenClassification.from_pretrained("Sirinya/ner-finetuned-lst20")

In [None]:
my_token_list[0]

test test

In [None]:
from transformers import pipeline

nlp = pipeline('token-classification', model=model, tokenizer=tokenizer, aggregation_strategy="none")

sentence = "วันที่ 12 มีนาคมนี้ ฉันจะไปเที่ยววัดพระแก้ว ที่กรุงเทพ"
all_sent = []
sentences = sentence.split(" ")
for sent in sentences:
    all_sent.append(" ".join(tokenize(sent)).replace("_","[!und:]"))

sentence = " _ ".join(all_sent)
print(sentence)

print(nlp(sentence))

In [None]:
from transformers import pipeline

nlp = pipeline('token-classification', model=model, tokenizer=tokenizer, aggregation_strategy="none")

sentence = my_token_list[0]
print(sentence)

# Run inference once and reuse the result for both the raw dump and the
# per-token listing (the model was previously called twice on the same input).
pred = nlp(sentence)
print(pred)

for i in pred:
  print(i['word'], i['entity'])