# Language Model

In [1]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

In [2]:
# Checkpoint identifier handed to `from_pretrained` for both the tokenizer
# and the masked-LM model below.
model_name = 'nghuyong/ernie-1.0'

In [3]:
# Load the tokenizer and the masked-LM model from the same checkpoint name
# (`model_name`).
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

In [4]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# NOTE(review): the trailing `pass` presumably keeps the notebook from echoing
# the value of `model.to(device)` as the cell output — confirm before removing.
pass

In [5]:
def language_model(input_text):
    """Return the model's arg-max token for every position of ``input_text``.

    The text is tokenized as given (no special tokens are added here), passed
    through the module-level masked-LM ``model``, and the highest-scoring
    vocabulary entry at each position is converted back to a token. Every
    position is re-predicted, not only the ``[MASK]`` ones. The first and the
    last predicted tokens are dropped, so callers are expected to wrap the
    text in ``[CLS]`` ... ``[SEP]`` themselves.

    Args:
        input_text: Text to complete, with ``[MASK]`` at the positions to
            fill in.

    Returns:
        The predicted tokens, excluding the first and last positions,
        concatenated without a separator. An empty string is returned when
        the text tokenizes to nothing.
    """
    input_tokens = tokenizer.tokenize(input_text)
    token_ids = tokenizer.convert_tokens_to_ids(input_tokens)
    if not token_ids:
        # Nothing to predict; an empty id list would otherwise become a float
        # tensor that the embedding lookup rejects.
        return ''

    # Shape (1, seq_len): a batch containing the single input sequence.
    input_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
    # zeros_like already inherits dtype and device from input_ids.
    token_type_ids = torch.zeros_like(input_ids)

    model.eval()

    with torch.no_grad():
        outputs = model(input_ids=input_ids, token_type_ids=token_type_ids)

    # Remove only the batch axis. A bare squeeze() would also collapse a
    # length-1 sequence axis and make the arg-max below fail.
    preds = outputs[0].squeeze(0)
    pred_ids = torch.argmax(preds, dim=-1)

    # Drop the first and last positions, then hand plain ints to the tokenizer.
    pred_tokens = tokenizer.convert_ids_to_tokens(pred_ids[1:-1].tolist())
    return ''.join(pred_tokens)

In [6]:
# Two masked tokens followed by "是中国的首都。" ("is the capital of China.").
language_model('[CLS] [MASK] [MASK] 是中国的首都。[SEP]')

'北京是中国的首都。'

In [7]:
# Three masked tokens followed by "是美国的首都。"
# ("is the capital of the United States.").
language_model('[CLS] [MASK] [MASK] [MASK]是美国的首都。[SEP]')

'华盛顿是美国的首都。'

In [8]:
# Three masked tokens followed by "是河北省的省会。"
# ("is the provincial capital of Hebei Province.").
language_model('[CLS] [MASK] [MASK] [MASK] 是河北省的省会。[SEP]')

'石定市是河北省的省会。'

In [9]:
# Two masked tokens followed by "是中国最伟大的企业之一。"
# ("is one of China's greatest companies.").
language_model("[CLS] [MASK] [MASK] 是中国最伟大的企业之一。[SEP]")

'华为是中国最伟大的企业之一。'

In [10]:
# Two masked tokens followed by "是美国最伟大的企业之一。"
# ("is one of America's greatest companies.").
language_model("[CLS] [MASK] [MASK]是美国最伟大的企业之一。[SEP]")

'苹果是美国最伟大的企业之一。'