In [1]:
import pandas as pd
import csv

# with open('pos.txt','r',encoding='utf-8') as reader_pos:
#     with open('neg.txt','r',encoding='utf-8') as reader_neg:
#         with open('bert_posneg.txt','w',encoding='utf-8') as bert_write:
#             for pos in reader_pos:
#                 bert_write.write(pos.strip()+'\t'+'1')
#                 bert_write.write('\n')
#             for neg in reader_neg:
#                 bert_write.write(neg.strip()+'\t'+'0')
#                 bert_write.write('\n')


In [3]:
from transformers import BertTokenizer

# Tokenizer for the 'bert-base-chinese' checkpoint; later cells use it to turn
# raw review text into model inputs (convert_example_to_feature and the final
# prediction cell). do_lower_case=True asks the tokenizer to lowercase input.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)

In [20]:
# Token-sequence length handed to the tokenizer as `max_length` in
# convert_example_to_feature.
max_length = 110
# Number of examples per batch when the encoded train/test datasets are batched.
batch_size = 6

In [21]:
def convert_example_to_feature(review):
    """Tokenize a single review into fixed-length BERT inputs.

    Uses the module-level ``tokenizer`` and ``max_length``.

    :param review: raw review text (str).
    :return: the tokenizer's encoding; ``encode_examples`` reads its
             ``input_ids``, ``token_type_ids`` and ``attention_mask`` entries.
    """
    return tokenizer.encode_plus(
        review,
        add_special_tokens=True,      # add [CLS], [SEP]
        max_length=max_length,        # max length of the text that can go to BERT
        # `pad_to_max_length=True` is deprecated; spell out padding AND
        # truncation so every example is exactly `max_length` tokens long,
        # which tf.data.Dataset.from_tensor_slices needs to stack them.
        padding='max_length',         # add [PAD] tokens up to max_length
        truncation=True,              # cut reviews longer than max_length
        return_attention_mask=True,   # add attention mask to not focus on pad tokens
        return_token_type_ids=True,   # downstream code reads 'token_type_ids'
    )

In [41]:
import tensorflow as tf


def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Repack one example's positional fields into ``(features, label)``.

    The features dict is keyed by ``input_ids``, ``token_type_ids`` and
    ``attention_mask``; the label is passed through unchanged.
    """
    features = dict(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_masks,
    )
    return features, label


def encode_examples(ds, limit=-1):
    """Encode ``"<review>\\t<label>"`` rows into a TensorFlow dataset.

    :param ds:    iterable of strings, one example per item, formatted as the
                  review text, a tab, then an integer label (a trailing
                  newline is tolerated).
    :param limit: if > 0, only the first ``limit`` rows are encoded;
                  otherwise every row is used.
    :return: a ``tf.data.Dataset`` whose elements are
             ``({"input_ids", "token_type_ids", "attention_mask"}, label)``
             as produced by ``map_example_to_dict``.
    :raises ValueError: if a non-blank row has no tab or a non-integer label.
    """
    # Function-scope import so this cell does not depend on another cell's imports.
    from itertools import islice

    if limit > 0:
        # `ds` is a plain Python list in this notebook, which has no `.take()`;
        # islice caps any iterable.
        ds = islice(ds, limit)

    # prepare lists, so that we can build up the final TensorFlow dataset from slices.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    for row in ds:
        if not row.strip():
            # Skip blank lines (e.g. a trailing empty line in the data file).
            continue
        # Split on the LAST tab only: the review text itself may contain tabs.
        review, label = row.rstrip('\r\n').rsplit('\t', 1)
        bert_input = convert_example_to_feature(review)
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([int(label)])
    return tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

In [23]:
import random


def data_split(full_list, ratio, shuffle=False, seed=None):
    """Split a list into two sublists by ratio, optionally shuffling first.

    The input list is never modified: when ``shuffle`` is true a copy is
    shuffled, so the caller's ordering is preserved.

    :param full_list: the data list to split.
    :param ratio:     fraction of items that go into the first sublist;
                      its size is ``int(len(full_list) * ratio)``.
    :param shuffle:   if true, randomly permute the items before splitting.
    :param seed:      optional seed for a reproducible shuffle. When ``None``
                      the module-level ``random`` generator is used.
    :return: ``(sublist_1, sublist_2)``. If the list is empty or the first
             sublist would have no items, returns ``([], full_list)``.
    """
    n_total = len(full_list)
    offset = int(n_total * ratio)
    if n_total == 0 or offset < 1:
        return [], full_list
    items = full_list
    if shuffle:
        # Shuffle a copy so the caller's list is not reordered as a side effect.
        items = list(full_list)
        rng = random if seed is None else random.Random(seed)
        rng.shuffle(items)
    sublist_1 = items[:offset]
    sublist_2 = items[offset:]
    return sublist_1, sublist_2


In [24]:
# Read every "<review>\t<label>" line (newline included) of the corpus into
# memory, then split the rows 80/20 into train and test after shuffling.
ds = []
with open('bert_posneg.txt', 'r', encoding='utf-8') as reader:
    ds.extend(reader)
ds_train, ds_test = data_split(ds, ratio=0.8, shuffle=True)

In [25]:
# Sanity check: first ten raw rows of each split, followed by the split sizes.
print(ds_test[:10], ds_train[:10], len(ds_test), len(ds_train), sep='\n')

['我在上大学二年级的时候，英语阅读中有一篇课文就是节选自本书的英文原版。当时读了以后，非常认同作者的观点，并且从那是开始就暗下决心：等我有了孩子，一定要坚持给TA朗读！现在我马上要当妈妈了，所以就买了这本书，希望在宝宝出生之前好好学习一下朗读方面的知识，为今后做好准备。^_^希望更多的爸爸妈妈也能加入给孩子朗读的队伍，一起给宝宝朗读吧！\t1\n', '机子好用，反应快，像素略差，可以接受。\t1\n', '回复@日本评论:辛苦了 #1024 T少&?D# 柔情背后的非暴力不合作[晕]\t0\n', '最大支持容量 8GB  对以后的升级提供很大空间！\t1\n', '[哈哈][哈哈][哈哈]哇！?????作宣??理！[?你]\t1\n', '不好播卡那里太松了！\t0\n', '天儿的下午茶时间吗？[爱你][爱你]还有美女相陪啊！\t1\n', '看来现在经济形势很严峻啊，两个月前办的会员卡，才去过三次，你妹的就空屋转让了啊，我卡里的钱呢钱呢钱呢钱呢[怒][怒][怒][可怜]，@昨夜人生 随便找个剪剪吧再也不要办卡了\t0\n', '呵呵 假货东，华为也跟着混卖 而且还说什么未拆封之类的话 呵呵拆了有问题 就不管了 呵呵呵\t0\n', '【于小彤阚清子主演中国版《#继承者们#》】翻拍疑似更名《#亿万继承人#》，中韩合制共40集，主演千呼万唤始出来，已确认→#于小彤#(金叹)、#阚清子#(车恩尚)，待定中→#崔始源#(崔英道)、蒋梦婕(李宝娜)、#金喜善#，#JPM王子#、#张亮#客串←_←#翻拍是一种绝症#[失望]详细戳图↓→http:"刚进报社那年报社刚停止无息贷款买车[泪] 听说之前貌似还有无息贷款买房哈哈...再等你半年 姐从来都是载人的人啊@星子同学 "( ?o′) 真心被这个黑女给气疯了！太过分啦！RDV之前给我发个message说会晚个半小时，这就算了，等我到学校了，人家又发个短信来说tres malade回复 @Mr_仇:[鼓掌][鼓掌]VV，球总休了你啦！ 捧书的场景快成一道消逝的风景了。我挺喜欢＂想去＂。回味的slogan，简洁的界面，顺畅的交互，精选的内容[爱你]只是有一点我不喜欢，默认的＂专题＂，点进后却是不伦不类的杂志感[衰]第一次时我迟疑了一会不知道这和商品有何关系，就多翻几页直到出现查看更多才找到我想看的商品。现在看专题我都是快速到

In [42]:
# train dataset: encoded, shuffled with a 10000-element buffer, then batched
ds_train_encoded = (
    encode_examples(ds_train)
    .shuffle(10000)
    .batch(batch_size)
)
# test dataset: encoded and batched, order left as-is
ds_test_encoded = (
    encode_examples(ds_test)
    .batch(batch_size)
)

In [32]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf
# Hyperparameters. Recommended Adam learning rates are 5e-5, 3e-5 or 2e-5.
learning_rate = 2e-5
# Only a single epoch is run; more epochs might help as long as the model
# does not start to overfit.
number_of_epochs = 1

# Model initialization from the 'bert-base-chinese' checkpoint.
model = TFBertForSequenceClassification.from_pretrained('bert-base-chinese')

# Adam optimizer.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    epsilon=1e-08,
)

# Labels are integer class ids rather than one-hot vectors, so the sparse
# categorical cross-entropy loss and accuracy metric are used; the loss is
# configured to receive logits (from_logits=True).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)

tf_model.h5:   0%|          | 0.00/478M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# Fine-tune on the batched training set for `number_of_epochs` epochs, using
# the batched test set as validation data.
model.fit(ds_train_encoded, epochs=number_of_epochs, validation_data=ds_test_encoded)
# Save the trained model under 'bert_sentiment'.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so
# this save may never have run — confirm the artifact exists before relying on it.
model.save('bert_sentiment')



KeyboardInterrupt: 

In [None]:
# Example sentence used to try out the fine-tuned model.
test_sentence = "房间OK，但隔音差，服务也不错，地理位置不太理想。"

# Encode the sentence into a TensorFlow tensor of token ids.
predict_input = tokenizer.encode(
    test_sentence,
    truncation=True,
    padding=True,
    return_tensors="tf",
)

# Element 0 of the prediction output is softmaxed below, so it is presumably
# the per-class logits — TODO confirm against the installed transformers version.
tf_output = model.predict(predict_input)[0]
tf_prediction = tf.nn.softmax(tf_output, axis=1)
print(tf_prediction)

# Class index -> human-readable name (0: negative, 1: positive).
labels = ['Negative','Positive']
label = tf.argmax(tf_prediction, axis=1).numpy()
print(labels[label[0]])