<a href="https://colab.research.google.com/github/hululuzhu/chinese-ai-writing-share/blob/main/training/transformer_supervised/poem_Transformer_Source_Code_Share_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Colab to train a Chinese poem-writing Transformer, e.g.

```
标题: 秋思
正文: 秋风吹雨过，秋色满江城。一叶无人到，千山有客情。
标题: 百度
正文: 百尺孤城上，千金万里中。山川无限水，水石有余风。
标题: 湾区春日之谜
正文: 春风吹雨不成秋，春色如何一日休。不是春光无处着，只应春色是人愁。
```

# Imports

In [1]:
import json
import urllib.request
import pandas as pd
!pip install -q "tqdm>=4.36.1" > /tmp/na
from tqdm.notebook import tqdm
!pip install chinese-converter > /tmp/na
import chinese_converter
import pickle
import os
import pandas as pd
import numpy as np
!pip install keras-transformer &> /dev/null
os.environ['TF_KERAS'] = '1'
from keras_transformer import get_model, decode, get_custom_objects
import tensorflow as tf

## TPU

In [2]:
# Resolve the TPU attached to this runtime (no explicit address is passed, so
# the resolver presumably auto-detects it -- confirm outside Colab).
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
# Connect to the TPU cluster and initialize it before creating the strategy.
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
# print("All devices: ", tf.config.list_logical_devices('TPU'))
# Distribution strategy; the model is later built inside `strategy.scope()`.
strategy = tf.distribute.TPUStrategy(resolver)

INFO:tensorflow:Initializing the TPU system: grpc://10.115.204.170:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.115.204.170:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


# Connect to Google Drive for storage
- Useful for storing the model and the dictionary/params

In [None]:
# Mount your Google Drive first if you haven't: the path below is relative, so
# without a mounted Drive the directory is simply created on the local disk.
# WORK_DIR is where the vocabulary, dataframe and model weights are persisted.
WORK_DIR = 'drive/MyDrive/ML/Models/chinese_poem_v1'
# Create the directory from the constant (same effect as `mkdir -p`) so the
# path is spelled in exactly one place.
os.makedirs(WORK_DIR, exist_ok=True)

# Load data, transform it, and persist it to Drive (no need to rerun)

In [None]:
# Data source: https://github.com/chinese-poetry/chinese-poetry
# For every dynasty: 'total' is the number of JSON files to fetch and 'pattern'
# is the file URL template whose {0} slot receives the file offset.
_POEM_JSON_ROOT = "https://raw.githubusercontent.com/chinese-poetry/chinese-poetry/master/json/"
POEM_CONTENT = dict(
    tang=dict(total=58, pattern=_POEM_JSON_ROOT + "poet.tang.{0}.json"),
    song=dict(total=255, pattern=_POEM_JSON_ROOT + "poet.song.{0}.json"),
)

def get_poems(is_test=True, verbose=True):
  """Download the poem JSON files of every dynasty in POEM_CONTENT.

  Args:
    is_test: if True, fetch only the first 3 files per dynasty; otherwise
      fetch as many files as the dynasty's 'total' entry says.
    verbose: if True, print each URL before it is downloaded.

  Returns:
    A single DataFrame made by concatenating the per-file frames with
    pd.concat (each file's own row index is kept as-is).
  """
  df_list = []
  for dynasty, spec in POEM_CONTENT.items():
    size = 3 if is_test else spec['total']
    # The context manager closes the progress bar even when a download raises.
    with tqdm(total=size, desc="Dynasty " + dynasty) as pbar:
      for i in range(size):
        # Files are addressed by offset: 0, 1000, 2000, ...
        url = spec['pattern'].format(i * 1000)
        if verbose:
          print(f"download {url} now")
        df_list.append(pd.read_json(url))
        pbar.update(1)
  return pd.concat(df_list)

In [None]:
# Full download: every file of every dynasty, without per-URL logging.
df = get_poems(is_test=False, verbose=False)

In [None]:
# Flatten each poem's list of paragraphs into one string.
df['concat_paragraphs'] = [
    ''.join(str(line) for line in lines) for lines in df['paragraphs']
]

In [None]:
# Keep only the three columns the rest of the pipeline reads.
df = df.loc[:, ['author', 'title', 'concat_paragraphs']]

## Convert to simplified Chinese

In [None]:
def convert_schinese(tchinese):
  """Return `tchinese` converted by chinese_converter.to_simplified."""
  simplified = chinese_converter.to_simplified(tchinese)
  return simplified

In [None]:
# Add a simplified-Chinese counterpart for each text column, row by row.
for _target, _source in (
    ('s_content', 'concat_paragraphs'),
    ('s_title', 'title'),
    ('s_author', 'author'),
):
  df[_target] = df.apply(
      lambda row, col=_source: convert_schinese(''.join(row[col])), axis=1)

In [None]:
# Take an independent copy of the simplified columns. Later cells assign
# trimmed values back into my_df's columns; doing that on a plain column
# selection of the still-live `df` triggers pandas' SettingWithCopyWarning.
my_df = df[['s_content', 's_title', 's_author']].copy()
my_df

In [None]:
# Print summary statistics of the value lengths of each column.
for key in my_df.columns:
  lengths = my_df[key].apply(len)
  print(lengths.describe())

def trim_author_fn(row):
  """Return at most the first 4 characters of the row's `s_author`."""
  max_author_chars = 4
  author = row.s_author
  return author[:max_author_chars]

def trim_title_fn(row):
  """Return the first 12 characters of `s_title` with spaces and parentheses removed.

  The cut to 12 characters happens before the removal, so the result can be
  shorter than 12 characters.
  """
  title = row.s_title[:12]
  for unwanted in (" ", "(", ")"):
    title = title.replace(unwanted, "")
  return title

def trim_content_fn(row):
  """Cut `s_content` to 64 characters, then keep everything up to the last "。".

  If the first 64 characters contain no "。", the result is the empty string.
  """
  head = row.s_content[:64]
  # rpartition yields ('', '', head) when the separator is absent, so the
  # concatenation below is '' in that case.
  body, full_stop, _ = head.rpartition("。")
  return body + full_stop

# Trim the size
for _column, _trim_fn in (
    ('s_author', trim_author_fn),
    ('s_title', trim_title_fn),
    ('s_content', trim_content_fn),
):
  my_df[_column] = my_df.apply(_trim_fn, axis=1)


# TODO: find a space in the title and keep only the first part
# TODO: find the last period of the content and stop there after trimming

In [None]:
# Rows to drop: empty title, content of 10 characters or fewer, or the literal
# '无正文' in the content or author column.
_no_title = my_df['s_title'].str.len() == 0
_too_short = my_df['s_content'].str.len() <= 10
_placeholder_content = my_df['s_content'] == '无正文'
_placeholder_author = my_df['s_author'] == '无正文'
short_mask = _no_title | _too_short | _placeholder_content | _placeholder_author
filter_my_df = my_df.loc[~short_mask]
filter_my_df

In [None]:
# Sanity check: show any remaining rows whose content has 10 characters or fewer.
_still_short = filter_my_df['s_content'].str.len() <= 10
filter_my_df[_still_short]

## Get Dictionary

In [None]:
# The three special tokens take ids 0, 1 and 2; characters are added after them.
token_dict = {
    token: index for index, token in enumerate(('<PAD>', '<START>', '<END>'))
}

def process_token(token_dict, df):
  """Register every character found in any column of `df` in `token_dict`.

  The dict is updated in place: an unseen character receives the dict's
  current size as its id, a known character keeps its id. Returns None.
  """
  for field in df.columns:
    for text in df[field]:
      for char in text:
        # The default is computed before insertion, i.e. it is the next free id.
        token_dict.setdefault(char, len(token_dict))

# Fill the vocabulary from all columns of the filtered dataframe.
process_token(token_dict, filter_my_df)
# Inverse lookup: id -> token.
rev_token_dict = {index: token for token, index in token_dict.items()}
vocab_size = len(token_dict)

print("vocab_size", vocab_size)

## Persist DF and Dictionary for future use

In [None]:
# Write the vocabulary and the filtered dataframe as pickles under WORK_DIR.
_vocab_path = os.path.join(WORK_DIR, 'vocab_0604_v1.pickle')
_frame_path = os.path.join(WORK_DIR, 'dataframe_300k_0604_v1.pickle')

with open(_vocab_path, 'wb') as handle:
    pickle.dump(token_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

filter_my_df.to_pickle(_frame_path)

In [None]:
# List the work directory (shell command; {WORK_DIR} is interpolated from Python).
!ls -l {WORK_DIR}

# Train model from Title to Content (without author)

## Reload from storage

In [None]:
# Reload the vocabulary and dataframe persisted earlier under WORK_DIR.
# NOTE: unpickling can execute arbitrary code -- only load files you wrote yourself.
# The `with` block closes the file handle as soon as the vocabulary is read.
with open(os.path.join(WORK_DIR, 'vocab_0604_v1.pickle'), 'rb') as handle:
  loaded_token_dict = pickle.load(handle)

loaded_df = pd.read_pickle(
    os.path.join(WORK_DIR, 'dataframe_300k_0604_v1.pickle'))

In [None]:
# Inverse lookup (id -> token) for the reloaded vocabulary.
rev_token_dict = {index: token for token, index in loaded_token_dict.items()}

# Guard against having loaded a vocabulary of an unexpected size.
assert len(rev_token_dict) == 11289

## Encode data

In [None]:
# Fixed sequence lengths, counted in tokens.
MAX_INPUT_SEQ = 12 + 2  # max title length + 2 special tokens
MAX_OUTPUT_SEQ = 64 + 2  # max 64 content length + 2 special tokens
# Ids of the special tokens as stored in the reloaded vocabulary.
START_TOKEN_ID, END_TOKEN_ID, PAD_TOKEN_ID = (
    loaded_token_dict[token] for token in ('<START>', '<END>', '<PAD>')
)


def encode(raw_text, is_decode_input, is_decode_output):
  """Turn `raw_text` into a padded list of token ids.

  Layout: [<START>] + one id per character + <END> + <PAD>..., where <START>
  is omitted when `is_decode_output` is true.

  Args:
    raw_text: text to encode; every character must be a key of
      loaded_token_dict (otherwise a KeyError is raised).
    is_decode_input: pad to MAX_OUTPUT_SEQ instead of MAX_INPUT_SEQ.
    is_decode_output: omit <START> and pad to MAX_OUTPUT_SEQ. Must not be
      set together with `is_decode_input`.

  Returns:
    The list of ids. Sequences already at or beyond the target length are
    returned unpadded and untruncated.
  """
  assert not (is_decode_input and is_decode_output)
  prefix = [] if is_decode_output else [START_TOKEN_ID]
  token_ids = prefix + [loaded_token_dict[char] for char in raw_text] + [END_TOKEN_ID]
  if is_decode_input or is_decode_output:
    target_len = MAX_OUTPUT_SEQ
  else:
    target_len = MAX_INPUT_SEQ
  # A non-positive multiplier yields an empty list, i.e. no padding is added.
  token_ids.extend([PAD_TOKEN_ID] * (target_len - len(token_ids)))
  return token_ids

# NOTE: this definition replaces the `decode` imported from keras_transformer
# at the top of the notebook.
def decode(token_ids):
  """Convert token ids back to text.

  Reading stops at the first id equal to 0; ids 1 and 2 (and any other id
  not greater than 2) are skipped; all remaining ids are looked up in
  rev_token_dict.
  """
  chars = []
  for token_id in token_ids:
    if token_id == 0:
      break
    if token_id > 2:
      chars.append(rev_token_dict[token_id])
  return "".join(chars)

In [None]:
_sample_title = '登竺云山'
_sample_content = '独上千峰与万峰，晴岚淡写海江容'
# The three encodings: title input, content with <START>, content without <START>.
print(encode(_sample_title, is_decode_input=False, is_decode_output=False))
print(encode(_sample_content, is_decode_input=True, is_decode_output=False))
print(encode(_sample_content, is_decode_input=False, is_decode_output=True))

# Decode hand-written id lists back to text.
_title_ids = [1, 546, 4787, 35, 344, 2, 0, 0, 0, 0, 0, 0, 0, 0]
_content_ids = [1, 302, 167, 17, 168, 481, 185, 168, 8, 773, 2281, 3939, 94, 342, 1566, 1563, 2, 0, 0]
print(decode(_title_ids))
print(decode(_content_ids))

In [None]:
# Shuffle the rows, then hold out the first `cutoff` rows as the test set.
# The fixed seed keeps the split identical across kernel restarts, so rows
# held out in one session are not trained on in another.
SPLIT_SEED = 42
cutoff = 6000 # Use 6k as test
shuffle_loaded_df = (
    loaded_df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True))
df_test = shuffle_loaded_df[:cutoff]
df_train = shuffle_loaded_df[cutoff:]

In [None]:
def prepare_ds(df):
  """Encode the titles and contents of `df` into model arrays.

  Returns:
    (x, x_d, y): encoded `s_title` values, encoded `s_content` values with
    is_decode_input=True, and encoded `s_content` values with
    is_decode_output=True plus one extra trailing axis of size 1.
  """
  titles = df['s_title'].values
  contents = df['s_content'].values
  encoder_inputs = np.asarray(
      [encode(title, is_decode_input=False, is_decode_output=False) for title in titles])
  decoder_inputs = np.asarray(
      [encode(content, is_decode_input=True, is_decode_output=False) for content in contents])
  decoder_targets = np.asarray(
      [encode(content, is_decode_input=False, is_decode_output=True) for content in contents])
  # final output need extra 1 dim
  return encoder_inputs, decoder_inputs, decoder_targets[..., np.newaxis]

# Encode the train split first, then the test split.
(train_x, train_x_d, train_y), (test_x, test_x_d, test_y) = (
    prepare_ds(df_train), prepare_ds(df_test))

In [None]:
_splits = ((train_x, train_x_d, train_y), (test_x, test_x_d, test_y))

# Array shapes: train first, then test.
for _x, _x_d, _y in _splits:
  print(_x.shape, _x_d.shape, _y.shape)

# Decode row 1000 of each split back to text as a spot check.
_row = 1000
for _x, _x_d, _y in _splits:
  print(decode(_x[_row]), decode(_x_d[_row]), decode(np.squeeze(_y[_row], -1)))

## Build transformer model

In [None]:
# Build and compile the model inside the TPU strategy scope.
with strategy.scope():
  # Hyper-parameters (the `num_docoders` spelling is kept as the existing name).
  num_encoders, num_docoders, num_heads = 4, 4, 8
  embed_size = 64 * num_docoders
  drop_out_rate = 0.3

  _token_count = len(rev_token_dict)
  # Initial embedding matrix filled with random values, one row per token.
  _initial_embeddings = np.random.random((_token_count, embed_size))

  model = get_model(
    token_num=_token_count,
    embed_dim=embed_size,
    encoder_num=num_encoders,
    decoder_num=num_docoders,
    head_num=num_heads,
    hidden_dim=embed_size,
    attention_activation='gelu',
    feed_forward_activation='gelu',
    dropout_rate=drop_out_rate,
    embed_weights=_initial_embeddings,
  )
  model.compile(
      loss='sparse_categorical_crossentropy',
      optimizer=tf.keras.optimizers.Adam(),
  )

In [None]:
epochs = 60  # 60 is minimal to be meaningful
batch_size = 128
# Inputs are [encoder ids, decoder ids]; the test split is used for validation.
_train_inputs = [train_x, train_x_d]
_validation = ([test_x, test_x_d], test_y)
model.fit(
  x=_train_inputs,
  y=train_y,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=_validation,
)

# Save your model

In [None]:
# Create {WORK_DIR}/model_weights (shell command), then save the model weights.
!mkdir -p {WORK_DIR}/model_weights
# NOTE(review): the same path is used as a directory above and as the
# save_weights target here -- confirm where the weight files actually land.
model.save_weights(f'{WORK_DIR}/model_weights')

# Inference: please see [this colab](https://github.com/hululuzhu/chinese-ai-writing-share/blob/main/RC_01_AI_Writing_Demo_06_2021.ipynb)