In [1]:
from datasets import load_dataset
import re

dataset = load_dataset('lansinuote/ChnSentiCorp')

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.strip()
    return text

dataset = dataset.map(lambda x: {'text': clean_text(x['text'])})

In [2]:

from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=3)

tokenizer.save_pretrained('./model')
model.save_pretrained('./model')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def tokenize_function(examples):
    return tokenizer(
        examples['text'],                
        padding='max_length',            
        truncation=True,                 
        max_length=128                   
    )

In [4]:
from transformers import Trainer, TrainingArguments

encoded_dataset = dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    output_dir='./results',           
    num_train_epochs=1,               
    per_device_train_batch_size=8,    
    per_device_eval_batch_size=8,     
    eval_strategy="epoch",      
    logging_dir='./logs',            
    logging_steps=10,                 
    save_strategy="epoch",            
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation']
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3928,0.315378


TrainOutput(global_step=1200, training_loss=0.36329500923554103, metrics={'train_runtime': 310.8684, 'train_samples_per_second': 30.881, 'train_steps_per_second': 3.86, 'total_flos': 631472202547200.0, 'train_loss': 0.36329500923554103, 'epoch': 1.0})

In [5]:
from sklearn.metrics import accuracy_score
def compute_metrics(p):
    preds = p.predictions.argmax(-1)  
    return {"accuracy": accuracy_score(p.label_ids, preds)}

trainer.evaluate(encoded_dataset['test'], metric_key_prefix="eval")

{'eval_loss': 0.2723783254623413,
 'eval_runtime': 4.9566,
 'eval_samples_per_second': 242.1,
 'eval_steps_per_second': 30.262,
 'epoch': 1.0}

In [6]:
model.save_pretrained('./bert_models/sentiment_model')
tokenizer.save_pretrained('./bert_models/sentiment_model')

('./bert_models/sentiment_model\\tokenizer_config.json',
 './bert_models/sentiment_model\\special_tokens_map.json',
 './bert_models/sentiment_model\\vocab.txt',
 './bert_models/sentiment_model\\added_tokens.json')