In [1]:
import os
import numpy as np
import pandas as pd
from itertools import chain

from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
#Load dataset and split it into categories
dataset = open("./dataset/hironsan.txt", "r").read().splitlines()
data = list(map(lambda x: x.split("\t"), dataset))
word = list(map(lambda x:x[0], data))
label = list(map(lambda x:x[-1], data))

In [3]:
#Create dataframe for the dataset
dataframe = pd.DataFrame({"Word":word,
                         "Label": label})

In [4]:
def create_sentence_frame(dataframe):
    """ Loop each row and name the word in sentence #
    
    Parameters: dataframe
    
    Returns:
        dataframe
    """
    # replace the empty row by NaN
    dataframe.Word.replace("", np.nan, inplace=True) 
    dataframe.Label.replace("", np.nan, inplace=True)
    # drop the NaN row
    dataframe = dataframe.dropna()
    sent_a = []
    begin = 0
    index = 0
    for i, word in enumerate(dataframe.Word.tolist()):
        if "。" in word:
            sentence = ["Sent {}".format(index) for _ in range(begin,i+1)]
            begin = begin + (i-begin) + 1
            sent_a.append(sentence)
            index += 1
    dataframe["Sentence"] = list(chain.from_iterable(sent_a))
    return dataframe

In [5]:
#Include a sentence so that a groupby can be carried out
sent_dataframe = create_sentence_frame(dataframe)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [6]:
# Create columns in dataframe to store the sequence tokens
sent_dataframe["Label_Sequence"] = sent_dataframe.Label.astype("category").cat.codes
sent_dataframe["Word_Sequence"] = sent_dataframe.Word.astype("category").cat.codes

# Create dictionaries for mapping
ids_to_words = dict(enumerate(sent_dataframe.Word.astype("category").cat.categories))
ids_to_words[len(ids_to_words)] = "<Pad>" 
ids_to_labels = dict(enumerate(sent_dataframe.Label.astype("category").cat.categories))
words_to_ids = {word:ids for ids, word in ids_to_words.items()}
labels_to_ids = {label:ids for ids, label in ids_to_labels.items()}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# Extract the sequence tokens
X = sent_dataframe.groupby("Sentence")["Word_Sequence"].apply(list).tolist()
y = sent_dataframe.groupby("Sentence")["Label_Sequence"].apply(list).tolist()

In [8]:
#Find the maxlen of each sentence in dataset
length = [len(sent) for sent in X]
maxlen = int(np.mean(length) + 2 * np.std(length))

#Pad the tokens so that each sentence has same length
padded_X = pad_sequences(X, maxlen=maxlen, padding="post", value=words_to_ids["<Pad>"])
padded_y = pad_sequences(y, maxlen=maxlen, padding="post", value=labels_to_ids["O"])

In [9]:
# Convert labels to one-hot
padded_y = [to_categorical(i, num_classes=len(ids_to_labels)) for i in padded_y]

In [10]:
#Splot the dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(padded_X,padded_y, test_size=0.2)

In [11]:
# Create a simplest BiLSTM model
input = Input(shape=(maxlen,))
model = Embedding(input_dim=len(words_to_ids), output_dim=maxlen, input_length=maxlen)(input)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(len(labels_to_ids), activation="softmax"))(model)  # softmax output layer

model = Model(input, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=100, verbose=1)






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
E

Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [13]:
# Predict and test the trained model.
i = 13
p = model.predict(np.array([X_test[i]]))
p = np.argmax(p, axis=-1)
print("{:14} ({:5}): {}".format("Word", "True", "Pred"))
for w,pred in zip(X_test[i],p[0]):
    print("{:14}: {}".format(ids_to_words[w],ids_to_labels[pred]))

Word           (True ): Pred
両             : O
信用金庫          : O
は             : O
経営            : O
破綻            : O
し             : O
、             : O
日本銀行          : O
から            : O
の             : O
公的            : O
融資            : O
を             : O
受け            : O
た             : O
が             : O
1995          : B-DAT
年             : I-DAT
2             : I-DAT
月             : I-DAT
に             : O
解散            : O
し             : O
た             : O
。             : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<Pad>         : O
<