<a href="https://colab.research.google.com/github/hajihye123/MachineLearning_Lecture/blob/main/author_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Read the training set, using the CSV's 'index' column as the row index.
df = pd.read_csv(filepath_or_buffer='/content/train.csv', index_col='index')
# Show the first five rows as a quick sanity check.
df.head(5)

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. A fixed random_state makes the
# split -- and everything derived from it (vocabulary, metrics) --
# reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(
    df.text, df.author, test_size=0.2, random_state=42)

In [None]:
import tensorflow as tf

# Wrap each (texts, labels) pair in a tf.data pipeline that yields one
# example at a time; both splits are built the same way.
train_ds, valid_ds = (
    tf.data.Dataset.from_tensor_slices((texts.values, labels.values))
    for texts, labels in ((X_train, y_train), (X_valid, y_valid))
)

In [None]:
# Peek at a single batch of two raw examples: print the first 200 characters
# of each text together with its author label.
for X_batch, y_batch in train_ds.batch(2).take(1):
    raw_texts, author_ids = X_batch.numpy(), y_batch.numpy()
    for raw_text, author_id in zip(raw_texts, author_ids):
        snippet = raw_text.decode("utf-8")[:200]
        print("Text:", snippet, "...")
        print("Author:", author_id)
        print()

Text: “By thinking so you become worthy. Is she then of noble birth?” ...
Author: 2

Text: The door of the Doctor's room opened, and he came out with odin Darnay. He was so deadly pale--which had not been the case when they went in together--that no vestige of colour was to be seen in his f ...
Author: 0



In [None]:
def preprocess(X_batch, y_batch):
    """Turn a batch of raw texts into padded word tokens.

    Args:
        X_batch: string tensor holding one raw text per example.
        y_batch: labels of the batch; passed through unchanged.

    Returns:
        A tuple ``(tokens, y_batch)``. ``tokens`` is a dense string tensor of
        whitespace-separated words in which shorter examples are right-padded
        with ``b"<pad>"`` up to the longest example of the batch.
    """
    # Keep only the first 300 characters. Counting in UTF-8 characters rather
    # than bytes guarantees that a multi-byte character (the texts contain
    # curly quotes) is never cut in half, which would leave stray bytes behind.
    X_batch = tf.strings.substr(X_batch, 0, 300, unit="UTF8_CHAR")
    # Replace HTML line breaks (<br>, <br/>, <br />) with a space.
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    # Replace everything except ASCII letters and apostrophes with a space.
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    # Split on whitespace; the result is ragged (one word list per example).
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch

preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(2, 61), dtype=string, numpy=
 array([[b'By', b'thinking', b'so', b'you', b'become', b'worthy', b'Is',
         b'she', b'then', b'of', b'noble', b'birth', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>'],
        [b'The', b'door', b'of', b'the', b"Doctor's", b'room', b'opened',
         b'and', b'he', b'came', b'out', b'with', b'odin', b'Darnay',
         b'He', b'was', b'so', b'deadly', b'pale', b'which', b'had',
         b'not', b'been', b'the', b'case', b'when

In [None]:
from collections import Counter

# Count how often each token occurs in the preprocessed training set.
# Padding tokens are part of every dense batch, so they are counted as well.
vocabulary = Counter()
for X_batch, y_batch in train_ds.batch(32).map(preprocess):
    # Flatten the batch row by row and feed all of its tokens in one call.
    vocabulary.update(X_batch.numpy().ravel().tolist())

In [None]:
# The three most frequent tokens and their counts.
vocabulary.most_common(3)

[(b'<pad>', 1455578), (b'the', 50985), (b'I', 33226)]

In [None]:
# Number of distinct tokens counted in the training set.
len(vocabulary)

32407

In [None]:
# Keep only the `vocab_size` most frequent tokens, ordered from most to
# least common.
vocab_size = 10000
truncated_vocabulary = [token for token, _ in vocabulary.most_common(vocab_size)]

In [None]:
# Map every kept word to its rank in the truncated vocabulary.
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}
for word in b"This movie was faaaaaantastic".split():
    # Unknown words fall back to `vocab_size` through the dict default.
    # `get(word) or vocab_size` would also remap the word whose id is 0,
    # because 0 is falsy.
    print(word_to_id.get(word, vocab_size))

200
10000
10
10000


In [None]:
# Build a static lookup table: known words map to their vocabulary index,
# every other word is assigned to one of `num_oov_buckets` additional ids.
num_oov_buckets = 1000
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init, num_oov_buckets=num_oov_buckets)

In [None]:
def encode_words(X_batch, y_batch):
    """Replace every word token of `X_batch` with its integer id from `table`.

    The labels in `y_batch` are returned unchanged.
    """
    return table.lookup(X_batch), y_batch

# Training pipeline. Keras does not shuffle a tf.data input on its own, so
# shuffle here; placing `shuffle` before `repeat` yields a new order on every
# pass instead of replaying identical batches each epoch. The seed keeps runs
# reproducible.
train_set = train_ds.shuffle(10000, seed=42).repeat().batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

# Validation pipeline: one unshuffled pass over the held-out data.
valid_set = valid_ds.batch(32).map(preprocess)
valid_set = valid_set.map(encode_words).prefetch(1)

In [None]:
for X_batch, y_batch in train_set.take(1):
    print(X_batch)
    print(y_batch)

tf.Tensor(
[[ 420  477   34 ...    0    0    0]
 [  36  166    5 ...  445    0    0]
 [ 168  227  930 ...    0    0    0]
 ...
 [   2   65    6 ...    0    0    0]
 [ 157 2633  185 ...    0    0    0]
 [  68   71  125 ...    0    0    0]], shape=(32, 63), dtype=int64)
tf.Tensor([2 0 2 0 1 4 0 1 2 0 3 1 3 4 1 2 2 3 3 1 1 3 2 2 2 0 3 4 4 0 0 1], shape=(32,), dtype=int64)


In [None]:
import keras

embed_size = 128

# Embedding -> two stacked GRUs -> softmax over 5 classes.
model = keras.models.Sequential()
model.add(keras.layers.Embedding(
    input_dim=vocab_size + num_oov_buckets,  # vocabulary ids plus OOV bucket ids
    output_dim=embed_size,
    mask_zero=True,  # treat id 0 as padding so later layers skip it
    input_shape=[None],  # sequences of variable length
))
# The first GRU returns the full sequence so the second GRU can consume it.
model.add(keras.layers.GRU(128, return_sequences=True))
model.add(keras.layers.GRU(128))
model.add(keras.layers.Dense(5, activation="softmax"))

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# `train_set` repeats forever, so the length of an epoch is given explicitly.
history = model.fit(
    train_set,
    epochs=5,
    steps_per_epoch=len(X_train.values) // 32,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluate on the validation set; returns the loss followed by the compiled
# metrics (here: accuracy).
model.evaluate(valid_set)



[1.182963252067566, 0.7158345580101013]

In [None]:
# Load the test texts and attach a placeholder `author` column (-1) so the
# frame has the same (text, author) layout as the training data.
test_df = pd.read_csv('/content/test_x.csv', index_col='index').assign(author=-1)
test_df.head(5)

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,“Not at all. I think she is one of the most ch...,-1
1,"""No,"" replied he, with sudden consciousness, ""...",-1
2,As the lady had stated her intention of scream...,-1
3,“And then suddenly in the silence I heard a so...,-1
4,His conviction remained unchanged. So far as I...,-1


In [None]:
# Dataset of (text, placeholder label) pairs for the test data.
test_ds = tf.data.Dataset.from_tensor_slices(
    (test_df['text'].values, test_df['author'].values)
)

In [None]:
# Batch the test data, then tokenize and encode it with the same two
# mapping functions.
test_set = (
    test_ds
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [None]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a softmax
# output layer, the argmax over the class axis of `predict` gives the same
# class labels.
result = model.predict(test_set).argmax(axis=-1)



In [None]:
# Number of predictions produced for the test set.
len(result)

19617

In [None]:
# Write one "<row id>,<predicted author>" line per prediction, without a header.
# NOTE(review): row ids start at 1 here, whereas the test CSV's index starts
# at 0 -- confirm which numbering the consumer of this file expects.
with open('/content/basic_model.csv', 'w') as out_file:
    out_file.writelines(
        '{0},{1}\n'.format(row_id, author_id)
        for row_id, author_id in enumerate(result, start=1)
    )