## BERT 切词模块测试

In [None]:
import tokenization
# Build the tokenizer from the vocabulary file shipped with the chinese_wwm_ext model.
tokenizer = tokenization.FullTokenizer(
    vocab_file="model/chinese_wwm_ext_L-12_H-768_A-12/vocab.txt",
    do_lower_case=True,
)
# Not the final expression of the cell, so the notebook does not display this result.
tokenizer.tokenize("。，可以吧你，你就记住号码好了，我随后再再具体联系。我我我。")

# Tokenize a few sample utterances and collect the non-empty results into a
# single "document" (the only inner list of all_documents).
sample_lines = [
    "。那这样吧，现在正在忙着。",
    "。，我没有搞这方面的，我不大清楚，唉。",
    "。，那你说一下，大概。",
]
all_documents = [[]]
for line in sample_lines:
    tokens = tokenizer.tokenize(line)
    if not tokens:
        # Lines that tokenize to nothing are skipped.
        continue
    all_documents[-1].append(tokens)
all_documents, all_documents[0][0]


## FAQ 问题 baseline 与检索结果评测

In [1]:
import pandas as pd

# Both splits are read the same way: tab-separated, no header row, three
# columns named a, b and y.
read_opts = dict(header=None, sep="\t", names=("a", "b", "y"))

f = "data/za_data/kd_dev.csv"
df_base = pd.read_csv("data/za_data/kd_train.csv", **read_opts)  # training split
df = pd.read_csv(f, **read_opts)  # dev split, evaluated in the cells below

In [2]:
# Mean of the y column over the training split
# (presumably the positive rate, assuming y is a 0/1 label -- TODO confirm).
df_base.y.mean()

0.42853673935214415

In [3]:
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, f1_score
# Constant baseline: predict 0 for every dev row and measure the accuracy.
df["y_pred"] = 0
accuracy_score(y_true=df["y"], y_pred=df["y_pred"], normalize=True)

0.5771173271173271

In [9]:
import requests

# URL template of the remote service; {s} is replaced by the query text.
url = "http://39.108.171.231:8001/za_bot/q={s}"

# Probe the service with one query. Judging by the recorded output, the reply
# is JSON whose "text" field holds the query split by spaces (TODO confirm).
r = requests.get(url.format(s="单身好不好"))
r.json()

{'text': '单身 好不好'}

In [10]:
def levenshtein_sim(sentence1, sentence2, sim=True):
    """Return the Levenshtein edit distance of two sequences, or a similarity derived from it.

    Args:
        sentence1: First sequence (e.g. a string); compared element by element.
        sentence2: Second sequence.
        sim: If True (default), return the normalised similarity
            ``(max_len - distance) / max_len``. If False, return the raw
            edit distance.

    Returns:
        A float in [0, 1] when ``sim`` is True (1.0 for identical inputs,
        including two empty inputs), otherwise the integer edit distance.
    """
    shorter, longer = sentence1, sentence2
    if len(shorter) > len(longer):
        # Keep the shorter input on the inner loop so the DP row stays small.
        shorter, longer = longer, shorter
    maxlen = len(longer)

    # First DP row: building shorter[:i] from an empty prefix costs i edits.
    distances = list(range(len(shorter) + 1))
    for row, char2 in enumerate(longer):  # walk the longer sequence
        # First column: deleting the first row + 1 elements of the longer sequence.
        new_distances = [row + 1]
        for col, char1 in enumerate(shorter):  # walk the shorter sequence
            if char1 == char2:
                # Matching elements carry the diagonal cost over unchanged.
                new_distances.append(distances[col])
            else:
                # Cheapest of substitution (diagonal), and the two
                # insertion/deletion neighbours, plus one edit.
                cheapest = min(distances[col], distances[col + 1], new_distances[-1])
                new_distances.append(1 + cheapest)
        distances = new_distances
    levenshtein = distances[-1]

    if not sim:
        return levenshtein
    if maxlen == 0:
        # Two empty inputs are identical; avoid dividing by zero.
        return 1.0
    return float((maxlen - levenshtein) / maxlen)

In [18]:
def row_sim(r, min_score=0.3, timeout=10):
    """Predict 1/0 for one row from the edit-distance similarity of its two texts.

    Each of ``r["a"]`` and ``r["b"]`` is sent to the service behind ``url``;
    the ``"text"`` field of the JSON reply is taken with its spaces removed,
    and the two results are compared with ``levenshtein_sim``.

    Args:
        r: Row (or mapping) providing the keys ``"a"`` and ``"b"``.
        min_score: Similarity threshold; the prediction is 1 only when the
            similarity is strictly greater than this value.
        timeout: Seconds to wait for each HTTP request. Without a timeout a
            single stalled connection would block the whole ``df.apply``.

    Returns:
        1 if the similarity exceeds ``min_score``, else 0.

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-success HTTP status.
    """
    def _service_text(text):
        # One round trip to the service; fail loudly on HTTP errors instead of
        # tripping over a missing "text" key or an undecodable body later.
        resp = requests.get(url.format(s=text), timeout=timeout)
        resp.raise_for_status()
        return resp.json()["text"].replace(" ", "")

    v = levenshtein_sim(_service_text(r["a"]), _service_text(r["b"]))
    return 1 if v > min_score else 0

# Row-wise prediction over the dev frame; this issues two HTTP requests per row.
df["y_"] = df.apply(row_sim, axis=1)

In [19]:
# Accuracy of the edit-distance rule on the dev split (run with min_score=0.3).
accuracy_score(y_true=df["y"], y_pred=df["y_"], normalize=True)

0.6668609168609169

In [15]:
# Relative frequency of each label value in the dev split.
df["y"].value_counts(normalize=True)

0    0.577117
1    0.422883
Name: y, dtype: float64

In [17]:
# Relative frequency of each predicted value stored in the y_ column.
# NOTE(review): the recorded output below appears to come from an earlier run.
# This cell's execution count (17) precedes the cell that assigns y_ (18), and
# a 2.2% positive rate cannot yield the 0.667 accuracy reported for the rule.
# Re-run after the prediction cell to refresh it.
df["y_"].value_counts(normalize=True)

0    0.977661
1    0.022339
Name: y_, dtype: float64

In [16]:
# Reference point: accuracy when every dev row is predicted as 1.
accuracy_score(y_true=df["y"], y_pred=[1] * len(df), normalize=True)

0.4228826728826729

## Language model: BERT classifier fine-tuning

In [1]:
%%!
# Train and evaluate run_classifier.py on task za, starting from the
# chinese_L-12_H-768_A-12 checkpoint, with train batch size 32.
# Results are written to result/za_output001/.
python run_classifier.py \
  --task_name=za \
  --do_train=true \
  --do_eval=true \
  --data_dir=data/za_data \
  --vocab_file=model/chinese_L-12_H-768_A-12/vocab.txt \
  --bert_config_file=model/chinese_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=model/chinese_L-12_H-768_A-12/bert_model.ckpt \
  --max_seq_length=30 \
  --train_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=3.0 \
  --output_dir=result/za_output001/

 "INFO:tensorflow:Using config: {'_model_dir': 'result/za_output001/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fa5a10d4320>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, computation_shape=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None), '_cluster': None}",
 'INFO:tensorflow:_TPUContext: eval_on_tpu True',
 'INFO:tensorflow:Writing example 0 of 20591',
 'INFO:tensorflow:*** Example ***',
 'INFO:tensorflow:guid: t

In [2]:
%%!
# Same training and evaluation command as the batch-size-32 run, but with
# train batch size 128. Results are written to result/za_output003/.
python run_classifier.py \
  --task_name=za \
  --do_train=true \
  --do_eval=true \
  --data_dir=data/za_data \
  --vocab_file=model/chinese_L-12_H-768_A-12/vocab.txt \
  --bert_config_file=model/chinese_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=model/chinese_L-12_H-768_A-12/bert_model.ckpt \
  --max_seq_length=30 \
  --train_batch_size=128 \
  --learning_rate=2e-5 \
  --num_train_epochs=3.0 \
  --output_dir=result/za_output003/

 "INFO:tensorflow:Using config: {'_model_dir': 'result/za_output003/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc4ea754320>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, computation_shape=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None), '_cluster': None}",
 'INFO:tensorflow:_TPUContext: eval_on_tpu True',
 'INFO:tensorflow:Writing example 0 of 20591',
 'INFO:tensorflow:*** Example ***',
 'INFO:tensorflow:guid: t