In [1]:
import torch
import numpy as np
from transformers import BertTokenizer
import pandas as pd
from torch import nn
from transformers import BertModel

# 預測

In [2]:
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout, a linear head and a ReLU.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
        num_classes: Number of scores produced by the linear head.
        pretrained_name: Model name or path passed to ``BertModel.from_pretrained``.

    The defaults reproduce the previous fixed configuration
    ('bert-base-chinese' encoder, 2 outputs). Submodule names (``bert``,
    ``dropout``, ``linear``, ``relu``) are unchanged, so state dicts saved
    from the earlier definition keep the same keys.
    """

    def __init__(self, dropout=0.5, num_classes=2, pretrained_name='bert-base-chinese'):
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's config instead of hard-coding 768,
        # so a different encoder width does not silently break the layer.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return the ReLU-activated class scores for the given inputs.

        Args:
            input_id: Token ids forwarded to BERT as ``input_ids``
                (assumed shape (batch, seq_len) -- TODO confirm).
            mask: Attention mask forwarded to BERT as ``attention_mask``.

        Returns:
            Output of the linear head after ReLU, one row per input.
        """
        # return_dict=False makes BERT return a tuple; only the second item
        # (the pooled output) is used here.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropped = self.dropout(pooled_output)
        scores = self.linear(dropped)
        # NOTE(review): ReLU clamps negative scores to 0. It is kept because
        # the saved checkpoint was presumably trained with it -- confirm
        # before removing.
        return self.relu(scores)

In [3]:
model_dict_path = './models/model_bert.pth'
# map_location='cpu' lets a checkpoint that holds CUDA tensors load on a
# machine without a GPU; predict() moves the model to CUDA when available.
# NOTE(review): torch.load unpickles the file -- only load checkpoints that
# come from a trusted source.
model_dict = torch.load(model_dict_path, map_location='cpu')

In [4]:
# Rebuild the architecture, then restore the weights stored in the checkpoint.
model = BertClassifier()
trained_weights = model_dict['model_state_dict']
model.load_state_dict(trained_weights)
# Switch to inference mode; as the cell's last expression this also echoes
# the module tree.
model.eval()

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [16]:
def predict(model, text, tokenizer=None, max_length=32):
    """Classify one piece of text and return the predicted class index.

    Args:
        model: ``nn.Module`` invoked as ``model(input_ids, attention_mask)``
            that returns per-class scores with the class axis at dim 1.
        text: Text to classify.
        tokenizer: Optional tokenizer to use. When omitted, the
            'bert-base-chinese' tokenizer is loaded on the first call and
            reused on later calls.
        max_length: Length every input is padded / truncated to.

    Returns:
        int: Index of the highest-scoring class.
    """
    if tokenizer is None:
        # Loading the tokenizer is expensive; do it once rather than on
        # every call (this function is called in a loop over many sentences).
        tokenizer = getattr(predict, '_cached_tokenizer', None)
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
            predict._cached_tokenizer = tokenizer

    encoded = tokenizer(text, padding='max_length', max_length=max_length,
                        truncation=True, return_tensors="pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    attention_mask = encoded['attention_mask'].to(device)
    input_ids = encoded['input_ids'].to(device)

    # Force inference mode so dropout cannot randomise the prediction, and
    # put the model back into whatever mode the caller had it in afterwards.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for prediction; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            scores = model(input_ids, attention_mask)
    finally:
        model.train(was_training)

    return scores.argmax(dim=1).item()

## 舉例

In [17]:
# Four hand-picked sentences used to try the classifier out.
test_text_1, test_text_2, test_text_3, test_text_4 = (
    "只會選舉的草包",
    "實在有夠噁心",
    "垃圾，在那叫什麼",
    "希望台灣能更好",
)

In [18]:
# Classify the first sample sentence and show it next to the predicted class index.
offensive_predict = predict(model, test_text_1)
print(f"{test_text_1} {offensive_predict}")

只會選舉的草包 1


In [19]:
# Classify the second sample sentence and show it next to the predicted class index.
offensive_predict = predict(model, test_text_2)
print(f"{test_text_2} {offensive_predict}")

實在有夠噁心 1


In [20]:
# Classify the third sample sentence and show it next to the predicted class index.
offensive_predict = predict(model, test_text_3)
print(f"{test_text_3} {offensive_predict}")

垃圾，在那叫什麼 1


In [21]:
# Classify the fourth sample sentence and show it next to the predicted class index.
offensive_predict = predict(model, test_text_4)
print(f"{test_text_4} {offensive_predict}")

希望台灣能更好 0


## 隨機抽樣資料進行預測

In [53]:
# Path of the source CSV that is loaded next (relative to the working directory).
data_root = './preprocessing/data.csv'

In [54]:
# Load the dataset and keep only the text and label columns.
df = pd.read_csv(data_root)[['sentence', 'label']]

In [55]:
# Rows to be labelled by the model. The fixed seed makes the draw reproducible
# across kernel restarts, and the size is capped so a dataset with fewer rows
# than requested does not raise.
SAMPLE_SIZE = 300
SAMPLE_SEED = 42
sample_df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

In [56]:
# Plain Python list of the sampled sentences, in sample order.
predict_sentences = sample_df.loc[:, 'sentence'].to_list()

In [57]:
# One predicted class index per sampled sentence, in the same order.
predict_labels = [predict(model, sentence) for sentence in predict_sentences]

In [58]:
# Show the raw predicted class indices (one entry per sampled sentence).
print(predict_labels)

[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [59]:
# Label name -> class index, matching the indices returned by predict().
labelsMap = {
    'Non-offensive': 0,
    'Offensive': 1,
}

# Invert the mapping once so each prediction is a single dictionary lookup;
# an index that is not in the mapping raises a KeyError naming that index.
index_to_label = {index: name for name, index in labelsMap.items()}
translated_predict_labels = [index_to_label[label] for label in predict_labels]

In [60]:
# Show the predictions as label names instead of class indices.
print(translated_predict_labels)

['Offensive', 'Offensive', 'Offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Offensive', 'Non-offensive', 'Non-offensive', 'Offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Offensive', 'Non-offensive', 'Offensive', 'Offensive', 'Offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Offensive', 'Non-offensive', 'Non-offensive', 'Offensive', 'Non-offensive', 'Non-offensive', 'Offensive', 'Non-offensive', 'Non-offensive', 'Offensive', 'Non-offensive', 'Offensive', 'Non-offensive', 'Offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Non-offensive', 'Offensive', 'Non-offe

In [61]:
# Overwrite the sample's `label` column with the model's predicted label names.
# The labels read from the CSV are no longer available on `sample_df` after this.
sample_df['label'] = translated_predict_labels

In [62]:
# Display the sampled sentences together with their predicted labels.
sample_df

Unnamed: 0,sentence,label
631,笑死,Offensive
420,怎麼綠蟑螂超急,Offensive
3868,賣國KMT沒辦法去,Offensive
2746,你議員都黑道一起挺,Non-offensive
4609,搞不好4X早有照片等著在接駁車VIP？接駁車是勞講的東西沒幾個真的，這種讚讚讚，2024穩了...,Non-offensive
...,...,...
1904,很不方每天核酸，存款2099才加強版綠共阿vpn會大賣看不到好萊塢影片我不行護照不能用做核酸了,Non-offensive
3468,人就散啦zzzzz這人有可信度嗎？開啦,Non-offensive
1322,那我決定有高級業務嘴的實力寧願相信世上有鬼也不信小妳講的是哪個平行宇宙的陳疫苗要排,Non-offensive
4549,民進黨不可能認錯,Offensive


In [63]:
# Export the predictions for manual review. The row index is written as the
# first column; 'utf_8_sig' is UTF-8 with a leading BOM.
sample_df.to_csv('predict_sample_data.csv', encoding='utf_8_sig')

# 對 predict_sample_data.csv 進行人工修正（此步驟需在 notebook 外手動完成，完成後再執行後續儲存格）

# 加入到訓練資料

In [64]:
# Path of the CSV holding the accumulated training data (relative to the working directory).
new_data_root = './preprocessing/new_data.csv'

In [65]:
# Load the existing training data and keep only the text and label columns.
new_df = pd.read_csv(new_data_root)[['sentence', 'label']]

In [66]:
# Merge the manually corrected labels rather than the in-memory `sample_df`,
# which still holds the raw model predictions: read the exported
# 'predict_sample_data.csv' back after it has been edited by hand.
corrected_df = pd.read_csv('predict_sample_data.csv', encoding='utf_8_sig')[['sentence', 'label']]
# Outer merge on the shared columns (sentence, label) keeps the rows of both frames.
new_df = pd.merge(new_df, corrected_df, how='outer')

In [67]:
# Display the training data after the sampled rows have been merged in.
new_df

Unnamed: 0,sentence,label
0,希望台灣會更好大推這篇！！推,Non-offensive
1,下面沒人要跟土城找安囉,Non-offensive
2,0每個小瑕疵看起來都很細微過高屏溪都殺人無罪了,Non-offensive
3,淪落到連打一個教嗚嗚嗚，好可憐，都是柯P霸凌慣犯阿苗怎麼不幫幫你的苗粉快去死吧！政治蟑螂直接...,Non-offensive
4,但是司法你以綠能你不能啦要就全國一起玩台南感覺更哇，把新竹選民當白癡耍是第一天認識民進黨？還沒就職,Non-offensive
...,...,...
1191,很不方每天核酸，存款2099才加強版綠共阿vpn會大賣看不到好萊塢影片我不行護照不能用做核酸了,Non-offensive
1192,人就散啦zzzzz這人有可信度嗎？開啦,Non-offensive
1193,那我決定有高級業務嘴的實力寧願相信世上有鬼也不信小妳講的是哪個平行宇宙的陳疫苗要排,Non-offensive
1194,民進黨不可能認錯,Offensive


In [None]:
# Write the merged training data back to the file it was loaded from. Using
# `new_data_root` instead of repeating the literal keeps the read and write
# targets in sync. The row index is written as the first column.
new_df.to_csv(new_data_root, encoding='utf_8_sig')