In [34]:
import speech_recognition as sr
import torch
import numpy as np
from transformers import BertTokenizer
import pandas as pd

In [35]:
# Tokenizer for the 'bert-base-chinese' checkpoint -- the same checkpoint name
# that BertClassifier passes to BertModel.from_pretrained.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [3]:
# Recognizer instance used by the recording cell for both capturing audio
# (r.record) and transcribing it (r.recognize_google).
r = sr.Recognizer()

In [28]:
# Capture 10 seconds of audio from the microphone opened by sr.Microphone().
# Only the capture happens inside the `with` block so the device is released
# before the (potentially slow) network request below.
with sr.Microphone() as source:
    audio_data = r.record(source, duration = 10)

print("Recognizing...")
try:
    # Transcribe via Google's web speech API, requesting Traditional Chinese (zh-TW).
    text = r.recognize_google(audio_data, language="zh-TW")
except sr.UnknownValueError:
    # Raised when no intelligible speech was found in the recording.
    print("Speech could not be understood; no transcript was produced.")
    raise
except sr.RequestError as err:
    # Raised when the recognition service cannot be reached or rejects the request.
    print(f"Speech recognition request failed: {err}")
    raise
print(text)

Recognizing...
胃抽筋


In [18]:
# Location of the fine-tuned checkpoint; it is expected to be a dict holding
# the weights under the 'model_state_dict' key (read when the model is built).
model_dict_path = './models/model_bert.pth'
# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; the model itself is moved to the right device at inference time.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
model_dict = torch.load(model_dict_path, map_location='cpu')

In [19]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    """'bert-base-chinese' encoder followed by dropout, a linear head and a ReLU.

    Args:
        dropout: Drop probability applied to BERT's pooled output.
        num_classes: Number of scores produced by the linear head. Defaults to
            5, the width the head was previously hard-coded to, so existing
            checkpoints and callers keep working unchanged.
    """

    def __init__(self, dropout=0.5, num_classes=5):
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.dropout = nn.Dropout(dropout)
        # Take the input width from the encoder's own config rather than
        # hard-coding 768, so the head always matches the loaded encoder.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return the per-class scores for a batch of tokenized inputs.

        Args:
            input_id: Token-id tensor, forwarded to BERT as ``input_ids``.
            mask: Attention-mask tensor, forwarded to BERT as ``attention_mask``.

        Returns:
            Tensor of non-negative scores (ReLU output), one column per class.
        """
        # return_dict=False makes BERT return a tuple; only the second element
        # (the pooled output) is used here.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # NOTE(review): the ReLU clamps negative scores to zero, so if every
        # score is negative a later argmax falls back to index 0. Kept as-is
        # because the saved checkpoint was presumably trained with it -- confirm.
        final_layer = self.relu(linear_output)

        return final_layer

In [23]:
# Build the architecture with its default arguments, then replace its
# parameters with the fine-tuned weights stored in the loaded checkpoint.
# The load call stays last so the key-matching report is shown as cell output.
model = BertClassifier()
model.load_state_dict(state_dict=model_dict['model_state_dict'])

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [24]:
# Switch to evaluation mode so the Dropout layers are inactive during inference.
# eval() returns the module itself, which is why the full model repr is echoed
# as this cell's output.
model.eval()

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [150]:
# Hard-coded sample input for the classifier. This reassigns `text`, replacing
# any transcript produced earlier by the speech-recognition cell.
text = "你是一個一個"

In [151]:
# Tokenize the single input string, padded/truncated to exactly 32 tokens,
# and return PyTorch tensors.
unlabeled_input = tokenizer(text, padding='max_length', max_length = 32, truncation=True, return_tensors="pt")

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# .to(device) is a no-op when the model already lives on `device`, so no
# separate CUDA branch is needed.
model = model.to(device)
# Re-assert evaluation mode so this cell gives stable predictions even if it is
# run without the earlier model.eval() cell (the classifier contains Dropout).
model.eval()

unlabeled_mask = unlabeled_input['attention_mask'].to(device)
# return_tensors="pt" already yields a (batch, seq_len) tensor for a single
# string, so the former .squeeze(1) was a no-op and has been dropped.
unlabeled_input_id = unlabeled_input['input_ids'].to(device)

# Pure inference: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    unlabeled_output = model(unlabeled_input_id, unlabeled_mask)

# Index of the highest score; .item() requires the batch to hold exactly one example.
pseudo_label = unlabeled_output.argmax(dim=1).item()
text_df = pd.DataFrame([[text, pseudo_label]], columns=['text', 'label'])

In [152]:
# Display the input text next to its predicted class index.
text, pseudo_label

('你是一個一個', 0)

In [133]:
# Human-readable names for the predicted class indices.
# NOTE(review): only indices 0 and 1 are named, while the classifier head is
# built with 5 outputs -- confirm what the remaining indices mean.
labelsMap = {0:'Offensive',
             1:'Non-offensive'
            }

# Values without an entry in labelsMap are kept as-is instead of becoming NaN.
# This also makes the cell safe to re-run: labels that were already converted
# to names pass through unchanged.
text_df['label'] = text_df['label'].map(lambda value: labelsMap.get(value, value))

In [134]:
# Display the one-row result frame after the label mapping.
# NOTE(review): the saved output shows a different text than the one assigned
# in the cells above, and the execution counts are out of order -- restart the
# kernel and run all cells to refresh it.
text_df

Unnamed: 0,text,label
0,你這個王希銘,Offensive
