In [None]:
# Fine-tune bert-base-chinese as a 2-class text classifier on bili_labeled.csv

In [None]:
import sys
import pandas as pd

# Read the annotated CSV, discard rows that lack either a label or a text
# annotation, and cast the label column to integers for later processing.
df = (
    pd.read_csv('bili_labeled.csv')
    .dropna(subset=['label', 'text'])
    .astype({'label': int})
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the annotated rows for validation. Stratifying on the label
# column keeps the class proportions the same in both partitions, and the fixed
# random_state makes the partition repeatable.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

print(f"Number of training samples: {len(train_df)}, number of validation samples: {len(val_df)}")

In [None]:
from transformers import BertTokenizer

# Tokenizer matching the 'bert-base-chinese' checkpoint that is fine-tuned below;
# encode_batch() reads it as a module-level global.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

def encode_batch(texts, max_length=128):
    """Tokenize a collection of texts into fixed-length PyTorch tensors.

    Args:
        texts: Iterable of strings (for example a pandas Series). It is
            materialised with ``list()`` before being passed to the tokenizer.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value that used to be hard-coded, so
            existing single-argument calls behave exactly as before.

    Returns:
        The output of the module-level ``tokenizer`` for the whole batch,
        requested as PyTorch tensors (``return_tensors='pt'``).
    """
    return tokenizer(
        list(texts),
        # Pad every example to max_length (not just to the longest in the
        # batch) so all rows share one shape.
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

# Tokenize the text column of each split up front.
train_encodings, val_encodings = (
    encode_batch(train_df['text']),
    encode_batch(val_df['text']),
)

In [None]:
import torch

class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from field name to
            an indexable batch of values, one entry per example.
        labels: pandas Series of class ids, one per example. Its index is
            discarded so that labels are matched to encodings by position.

    Raises:
        ValueError: If any encoding field has a different number of examples
            than ``labels``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # The Series usually still carries the row index of the source frame;
        # resetting it makes position i correspond to row i of the encodings.
        self.labels = labels.reset_index(drop=True)
        # Fail fast on misaligned inputs instead of surfacing an index error
        # (or silently ignoring trailing rows) in the middle of training.
        for name, values in self.encodings.items():
            if len(values) != len(self.labels):
                raise ValueError(
                    f"encodings[{name!r}] has {len(values)} examples "
                    f"but {len(self.labels)} labels were given"
                )

    def __getitem__(self, idx):
        """Return one example as a dict of encoding fields plus ``'labels'``."""
        item = {k: v[idx] for k, v in self.encodings.items()}
        # .iloc is strictly positional, so negative indices resolve the same
        # way for the label as they do for the encodings above. Plain
        # ``labels[idx]`` is a label-based lookup and raises KeyError for -1.
        item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

# Wrap each split's encodings together with its label column.
train_dataset, val_dataset = (
    SentimentDataset(train_encodings, train_df['label']),
    SentimentDataset(val_encodings, val_df['label']),
)

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

# Seed torch before the model is built: the 2-way classification head is not
# part of the pretrained checkpoint and is initialised when the model is
# constructed, i.e. before Trainer gets a chance to apply its own seed.
SEED = 42
torch.manual_seed(SEED)

model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)

training_args = TrainingArguments(
    output_dir='./bili-bert-model',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy='epoch',   # evaluate on the validation set after every epoch
    save_strategy='epoch',   # write a checkpoint after every epoch
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=10,
    seed=SEED,
)


def compute_metrics(eval_pred):
    """Compute validation accuracy from the Trainer's evaluation output.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the true class ids).

    Returns:
        ``{"accuracy": float}`` - the fraction of examples whose highest-scoring
        class equals the true label.
    """
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": float((predicted == eval_pred.label_ids).mean())}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Without this, the per-epoch evaluation reports only the loss.
    compute_metrics=compute_metrics,
)
train_output = trainer.train()

# Persist the final model together with its tokenizer in output_dir so the
# directory can be loaded for inference on its own, without fetching the
# tokenizer separately.
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)

# Keep the training summary as the cell's displayed result.
train_output

In [None]:
# Smoke test: run one example sentence through a saved checkpoint
from transformers import pipeline

# Build an inference pipeline from a checkpoint written during training.
# The checkpoint directory holds no tokenizer files, so the tokenizer is taken
# from the base 'bert-base-chinese' model instead.
clf = pipeline(
    task="sentiment-analysis",
    model="./bili-bert-model/checkpoint-115",
    tokenizer="bert-base-chinese",
)

print(clf("gpt太好用了！！！！"))