### 1. Import libraries

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import jieba
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
import transformers
import torch

### 2. Load data and preprocess

We use the [ASAP](https://github.com/Meituan-Dianping/asap) dataset by Bu et al. ASAP is a Chinese restaurant review dataset collected from the Dianping app. Each review is written in Chinese and is annotated with a star rating from 1 to 5 and with a sentiment label for each of 18 different aspects.


Each aspect category, for example Location#Transportation, is labeled as 1 (Positive), 0 (Neutral), −1 (Negative), or −2 (Not-Mentioned). The data is already conveniently split into train, dev, and test sets.

[jieba](https://github.com/fxsjy/jieba) is used to segment each review into words, which are then joined with spaces.


In [7]:
def preprocess_text(text):
    """Segment a review with jieba and return its tokens joined by single spaces."""
    return " ".join(jieba.cut(text))


def convert_sentiment(score):
    """Translate a numeric sentiment score into its text label.

    -2 maps to "not_mentioned", -1 to "negative" and 0 to "neutral". Any other
    value (the dataset uses 1) falls through to "positive".
    """
    labelled_scores = ((-2, "not_mentioned"), (-1, "negative"), (0, "neutral"))
    for value, label in labelled_scores:
        if score == value:
            return label
    return "positive"  # score == 1


def load_and_preprocess_data(file_path):
    """Read one dataset CSV and return segmented reviews plus aspect labels.

    Every column other than "id", "review" and "star" is treated as an aspect
    (e.g. Food#Appearance, Service#Price).

    Returns:
        A tuple ``(reviews, labels, aspect_columns)``: the "review" column after
        ``preprocess_text``, a DataFrame of the aspect columns with each score
        passed through ``convert_sentiment``, and the list of aspect column names.
    """
    raw = pd.read_csv(file_path)

    non_aspect_columns = {"id", "review", "star"}
    aspect_columns = [name for name in raw.columns if name not in non_aspect_columns]

    # Work on an object-typed copy so text labels can replace the numeric scores.
    labels = raw[aspect_columns].astype("object")
    for name in aspect_columns:
        labels.loc[:, name] = labels[name].apply(convert_sentiment)

    # Word-segment the raw review text.
    raw["processed_review"] = raw["review"].apply(preprocess_text)

    return raw["processed_review"], labels, aspect_columns


# Locations of the three dataset splits, relative to the notebook.
train_path = "../data/train.csv"
dev_path = "../data/dev.csv"
test_path = "../data/test.csv"

# Load each split. The aspect column list is kept from the training split only;
# the lists returned for dev and test are discarded.
X_train, y_train, aspect_columns = load_and_preprocess_data(train_path)
X_dev, y_dev, _ = load_and_preprocess_data(dev_path)
X_test, y_test, _ = load_and_preprocess_data(test_path)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.304 seconds.
Prefix dict has been built successfully.


In [8]:
# Report the size of each split: reviews first, then the aspect label table.
print(f"Train shape: {X_train.shape}, {y_train.shape}")
print(f"Dev shape: {X_dev.shape}, {y_dev.shape}")
print(f"Test shape: {X_test.shape}, {y_test.shape}\n")

# Preview the first training rows: segmented review next to its aspect labels.
pd.concat([X_train, y_train], axis=1).head()

Train shape: (36850,), (36850, 18)
Dev shape: (4940,), (4940, 18)
Test shape: (4940,), (4940, 18)



Unnamed: 0,processed_review,Location#Transportation,Location#Downtown,Location#Easy_to_find,Service#Queue,Service#Hospitality,Service#Parking,Service#Timely,Price#Level,Price#Cost_effective,Price#Discount,Ambience#Decoration,Ambience#Noise,Ambience#Space,Ambience#Sanitary,Food#Portion,Food#Taste,Food#Appearance,Food#Recommend
0,状元 楼 饭店 第一次 去 ， 因为 地理位置 优越 ： 在 宁波市 和 义 大道 高 、 ...,positive,positive,positive,not_mentioned,positive,not_mentioned,not_mentioned,not_mentioned,not_mentioned,not_mentioned,positive,not_mentioned,not_mentioned,not_mentioned,not_mentioned,positive,not_mentioned,not_mentioned
1,我 最 爱 他们 家 的 猪手 ， 麻辣 鸡爪 ， 肉片 口磨 ， 道 道菜 都 是 家常菜...,positive,not_mentioned,not_mentioned,not_mentioned,positive,not_mentioned,not_mentioned,not_mentioned,not_mentioned,not_mentioned,not_mentioned,not_mentioned,not_mentioned,positive,not_mentioned,positive,not_mentioned,not_mentioned
2,我 是 比较 喜欢 荣 新馆 的 ， 因为 材料 新鲜 ， 服务 又 好 ， 价格 适中 ，...,not_mentioned,not_mentioned,not_mentioned,not_mentioned,positive,not_mentioned,not_mentioned,neutral,not_mentioned,not_mentioned,not_mentioned,not_mentioned,not_mentioned,not_mentioned,not_mentioned,neutral,positive,not_mentioned
3,8.8 秒杀 的 多嘴 肉蟹 煲 ， 第一天 开业 就 去 了 ， 大众 点评 很 给 力 ...,not_mentioned,not_mentioned,not_mentioned,negative,positive,not_mentioned,not_mentioned,neutral,not_mentioned,positive,not_mentioned,not_mentioned,not_mentioned,not_mentioned,positive,positive,not_mentioned,not_mentioned
4,喜欢 KOI 好多年 了 ， 但是 看着 它 的 价格 在 一路 飙涨 ， 真心 是 有点 ...,not_mentioned,positive,negative,not_mentioned,not_mentioned,not_mentioned,not_mentioned,positive,not_mentioned,positive,not_mentioned,not_mentioned,not_mentioned,not_mentioned,positive,positive,not_mentioned,not_mentioned


### 3. Exploratory data analysis

In [9]:
# Directory that receives every EDA figure; created if it does not exist yet.
OUTPUT_DIR = "eda_plots"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Bar colour of each sentiment category. The insertion order of this mapping
# also defines the order of the categories listed in SENTIMENTS.
SENTIMENT_COLORS = {
    "not_mentioned": "#808080",  # Gray
    "negative": "#FF0000",  # Red
    "neutral": "#1F77B4",  # Blue
    "positive": "#2CA02C",  # Green
}
SENTIMENTS = list(SENTIMENT_COLORS)


def plot_aspect_mention_frequency(y, dataset_name):
    """Save a bar chart of the share of reviews that mention each aspect.

    An aspect counts as mentioned whenever its label is not "not_mentioned".
    The figure is written to OUTPUT_DIR and closed without being displayed.
    """
    mentioned_share = y.ne("not_mentioned").mean()

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=mentioned_share.index, y=mentioned_share.values, ax=ax)
    ax.set_title(f"Aspect Mention Frequency in {dataset_name} Dataset")
    # Slant the aspect names so the long labels do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    ax.set_ylabel("Proportion of Reviews Mentioning Aspect")
    annotate_bars(ax)
    fig.tight_layout()

    figure_name = f"aspect_mention_frequency_{dataset_name}.png"
    fig.savefig(os.path.join(OUTPUT_DIR, figure_name))
    plt.close(fig)


def plot_sentiment_distribution(y, dataset_name):
    """Save one count plot per aspect showing how often each sentiment occurs.

    Bars follow the order in SENTIMENTS and use the colours in SENTIMENT_COLORS.
    Each figure is written to OUTPUT_DIR and closed without being displayed.
    """
    for aspect_name in y.columns:
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.countplot(
            data=y,
            x=aspect_name,
            order=SENTIMENTS,
            hue=aspect_name,
            palette=SENTIMENT_COLORS,
            legend=False,
            ax=ax,
        )
        ax.set_title(
            f"Sentiment Distribution for {aspect_name} in {dataset_name} Dataset"
        )
        ax.set_xlabel("Sentiment")
        ax.set_ylabel("Count")
        annotate_bars(ax)

        figure_name = f"sentiment_distribution_{aspect_name}_{dataset_name}.png"
        fig.savefig(os.path.join(OUTPUT_DIR, figure_name))
        plt.close(fig)


def annotate_bars(ax):
    """Write each bar's height just above the bar.

    Heights strictly between 0 and 1 are shown with two decimals; all other
    positive heights are truncated to a whole number. Bars without a positive
    height (zero-count categories, empty histogram bins, NaN heights) are left
    unlabeled.

    Args:
        ax: Axes-like object whose ``patches`` are the bars to label.
    """
    for bar in ax.patches:
        height = bar.get_height()
        # Skip only this bar. Returning here would leave every bar after the
        # first empty one unlabeled. `not height > 0` also skips NaN heights,
        # which would otherwise make int() raise.
        if not height > 0:
            continue

        if height < 1:
            annotation_text = f"{height:.2f}"
        else:
            # Format as whole number for other values
            annotation_text = f"{int(height)}"

        ax.annotate(
            annotation_text,
            (bar.get_x() + bar.get_width() / 2.0, height),
            ha="center",
            va="bottom",
            xytext=(0, 5),
            textcoords="offset points",
        )


def plot_aspect_mention_distribution(y, dataset_name):
    """Plot the distribution of the number of aspects mentioned per review.

    An aspect counts as mentioned whenever its label is not "not_mentioned".
    The histogram is written to OUTPUT_DIR and closed without being displayed.

    Args:
        y: DataFrame of sentiment labels, one column per aspect.
        dataset_name: Split name used in the title and the file name.
    """
    num_aspects_mentioned_per_review = (y != "not_mentioned").sum(axis=1)
    plt.figure(figsize=(10, 6))
    # One bin per possible count 0..len(y.columns). The upper edge must be
    # len(y.columns) + 1: histogram bins are half-open except the last, so
    # stopping the edges at len(y.columns) would merge the two highest counts
    # into a single bar.
    ax = sns.histplot(
        num_aspects_mentioned_per_review, bins=range(0, len(y.columns) + 2), kde=False
    )
    plt.title(
        f"Distribution of Number of Aspects Mentioned per Review in {dataset_name} Dataset"
    )
    plt.xlabel("Number of Aspects Mentioned")
    plt.ylabel("Number of Reviews")
    annotate_bars(ax)
    plt.savefig(os.path.join(OUTPUT_DIR, f"num_aspects_mentioned_{dataset_name}.png"))
    plt.close()


def summarize_dataset(y, dataset_name):
    """Print headline mention statistics for one dataset split.

    Reports the number of reviews, the average number of mentioned aspects per
    review, and the most and least frequently mentioned aspects with their
    counts and share of reviews. An aspect counts as mentioned whenever its
    label is not "not_mentioned".
    """
    print(f"{dataset_name.capitalize()}")
    total_reviews = len(y)
    print(f"Total reviews: {total_reviews}")

    is_mentioned = y != "not_mentioned"
    mean_mentions_per_review = is_mentioned.sum(axis=1).mean()
    mentions_per_aspect = is_mentioned.sum()

    top_aspect = mentions_per_aspect.idxmax()
    top_count = mentions_per_aspect.max()
    top_share = (is_mentioned[top_aspect].sum() / total_reviews) * 100

    bottom_aspect = mentions_per_aspect.idxmin()
    bottom_count = mentions_per_aspect.min()
    bottom_share = (is_mentioned[bottom_aspect].sum() / total_reviews) * 100

    print(
        f"Average number of aspects mentioned per review: {mean_mentions_per_review:.2f}"
    )
    print(
        f"Most frequently mentioned aspect: {top_aspect} {top_count} ({top_share:.2f}%)"
    )
    print(
        f"Least frequently mentioned aspect: {bottom_aspect} {bottom_count} ({bottom_share:.2f}%)\n"
    )


def perform_eda(y, dataset_name):
    """Run every EDA step for one split: the three plots, then the text summary."""
    eda_steps = (
        plot_aspect_mention_frequency,
        plot_sentiment_distribution,
        plot_aspect_mention_distribution,
        summarize_dataset,
    )
    for step in eda_steps:
        step(y, dataset_name)

In [10]:
# Run the full EDA (plots saved to disk, summary printed) for each split.
perform_eda(y_train, "train")
perform_eda(y_dev, "dev")
perform_eda(y_test, "test")

Train
Total reviews: 36850
Average number of aspects mentioned per review: 5.79
Most frequently mentioned aspect: Food#Taste 34872 (94.63%)
Least frequently mentioned aspect: Service#Parking 2476 (6.72%)

Dev
Total reviews: 4940
Average number of aspects mentioned per review: 5.89
Most frequently mentioned aspect: Food#Taste 4672 (94.57%)
Least frequently mentioned aspect: Service#Parking 323 (6.54%)

Test
Total reviews: 4940
Average number of aspects mentioned per review: 5.74
Most frequently mentioned aspect: Food#Taste 4679 (94.72%)
Least frequently mentioned aspect: Service#Parking 326 (6.60%)



### 4. Word embedding

To perform any sort of training, we need to convert raw strings (characters) into numeric vectors that a model can compute on. There are plenty of ways to do this, including Bag of Words (BoW), Word2vec, GloVe, etc.

We shall try them and compare the results.

### 5. Bidirectional Encoder Representations from Transformers (BERT)

In [11]:
from transformers import BertModel, BertTokenizer

# Load the pretrained "bert-base-chinese" tokenizer and encoder.
# Both names are assigned again by the setup cells further down.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertModel.from_pretrained("bert-base-chinese")

In [13]:
# Encode the first training review to inspect the token ids BERT will see.
out = tokenizer.encode(
    text=X_train[0],  # input text
    # Always pad up to the maximum length.
    # NOTE(review): no max_length is passed here, so the padding target
    # presumably falls back to the tokenizer's model maximum; confirm.
    padding="max_length",  # pad when the sequence is shorter than the target length
    add_special_tokens=True,
    return_tensors=None,  # None: no tensor type requested, so a plain list is returned
)
print(out)  # list of token ids for the review
print(tokenizer.decode(out))  # the ids decoded back to text

[101, 4307, 1039, 3517, 7649, 2421, 5018, 671, 3613, 1343, 8024, 1728, 711, 1765, 4415, 855, 5390, 831, 6632, 8038, 1762, 2123, 3797, 2356, 1469, 721, 1920, 6887, 7770, 510, 1920, 510, 677, 8024, 7027, 7481, 6163, 934, 704, 2466, 8024, 5831, 3221, 1765, 6887, 4638, 2123, 3797, 5831, 8024, 1366, 1456, 5283, 3633, 8024, 7004, 3799, 6090, 4294, 3472, 8024, 1391, 1168, 749, 2207, 3198, 952, 4638, 1456, 6887, 8024, 1728, 711, 1343, 749, 3241, 749, 8024, 1762, 1920, 1828, 5023, 749, 671, 833, 1036, 8024, 3309, 7313, 3300, 5763, 3717, 1600, 510, 3302, 1218, 1447, 6820, 680, 872, 5464, 1921, 8024, 1168, 749, 2218, 7623, 3198, 4495, 2692, 1922, 1962, 8024, 3302, 1218, 1447, 6963, 3221, 2207, 6651, 4307, 8024, 3302, 1218, 2578, 2428, 5318, 2190, 679, 2990, 6862, 8024, 3416, 3416, 6963, 3302, 1218, 1168, 855, 8024, 4157, 6983, 3717, 6820, 5447, 2552, 4638, 680, 2769, 812, 6237, 7025, 8024, 2218, 6821, 3416, 5318, 2190, 6206, 1930, 671, 1930, 8024, 4294, 1166, 3221, 2510, 3173, 3215, 510, 3825, 53

In [14]:
from transformers import BertModel, BertTokenizer
import torch
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import numpy as np
import joblib

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained tokenizer and encoder, then place the encoder on the
# chosen device.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertModel.from_pretrained("bert-base-chinese").to(device)
# Evaluation mode: the encoder is only used to extract embeddings.
model.eval()

In [18]:
from transformers import BertModel, BertTokenizer
import torch
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import numpy as np
import joblib
from tqdm import tqdm  # Import tqdm for progress bar

In [19]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained tokenizer and encoder, then place the encoder on the
# chosen device.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertModel.from_pretrained("bert-base-chinese").to(device)
# Evaluation mode: the encoder is only used to extract embeddings.
model.eval()


# Step 1: Define a function to generate BERT embeddings with a progress bar
def get_bert_embeddings(texts, batch_size=32):
    """Encode texts into BERT [CLS] embeddings, one row per text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Sliceable sequence of strings with a length (pandas Series,
            NumPy array, list, ...).
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        A NumPy array with one row per input text, in input order, holding the
        last-layer hidden state of the first ([CLS]) token. Empty input yields
        an array of shape ``(0, model.config.hidden_size)``.
    """
    # np.concatenate rejects an empty list, so answer the empty case directly.
    if len(texts) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []

    # Use tqdm to show a progress bar for the batches (one tick per batch)
    for start in tqdm(
        range(0, len(texts), batch_size),
        desc="Generating BERT embeddings",
    ):
        # list() accepts a Series, an array or a plain list slice alike.
        batch_texts = list(texts[start : start + batch_size])

        # Tokenize the batch. Pad only to the longest text in this batch rather
        # than always to 512 tokens: padded positions are excluded through the
        # attention mask, so the embeddings are unchanged up to floating-point
        # noise while short batches run much faster.
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,  # BERT's max input length
            return_tensors="pt",  # Return PyTorch tensors
            add_special_tokens=True,
        )

        # Move inputs to the device (CPU/GPU)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        # Get BERT embeddings (no gradient computation to save memory)
        with torch.no_grad():
            outputs = model(**inputs)
            # Use the [CLS] token embedding (first token) as the review embedding
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

        embeddings.append(cls_embeddings)

    # Concatenate all batch embeddings
    return np.concatenate(embeddings, axis=0)

In [None]:
# Embed the dev reviews. The recorded progress bar for this cell shows
# several seconds per batch.
print("Generating BERT embeddings for dev data...")
X_dev_bert = get_bert_embeddings(X_dev, batch_size=32)

Generating BERT embeddings for dev data...


Generating BERT embeddings:  12%|█▏        | 19/155 [02:47<19:26,  8.58s/it]

In [None]:
# Embed the training reviews; these are the features the classifiers are fit on.
print("Generating BERT embeddings for training data...")
X_train_bert = get_bert_embeddings(X_train, batch_size=32)


# Embed the test reviews for the final evaluation.
print("Generating BERT embeddings for test data...")
X_test_bert = get_bert_embeddings(X_test, batch_size=32)

In [None]:
# Step 3: Fit one classifier per aspect on the BERT embeddings
print("\nTraining classifiers with BERT embeddings...")
classifiers_bert = {}

for aspect in aspect_columns:
    print(f"Training classifier for {aspect}...")
    # A classifier cannot be fit when the training labels hold a single class.
    if y_train[aspect].nunique() < 2:
        print(f"Skipping {aspect}: only one class in training data")
        classifiers_bert[aspect] = None
        continue

    # Balanced class weights counter the label imbalance.
    clf = LinearSVC(random_state=42, class_weight="balanced", max_iter=1000)
    clf.fit(X_train_bert, y_train[aspect])
    classifiers_bert[aspect] = clf

# Step 4: Per-aspect classification report on the dev set
print("\nValidation Results (Dev Set) with BERT embeddings:")
for aspect in aspect_columns:
    if classifiers_bert[aspect] is None:
        print(f"\n{aspect}: No model trained (single class)")
        continue
    y_pred_dev = classifiers_bert[aspect].predict(X_dev_bert)
    print(f"\n{aspect}:")
    print(classification_report(y_dev[aspect], y_pred_dev))

# Step 5: Per-aspect classification report on the test set
print("\nTesting Results (Test Set) with BERT embeddings:")
for aspect in aspect_columns:
    if classifiers_bert[aspect] is None:
        print(f"\n{aspect}: No model trained (single class)")
        continue
    y_pred_test = classifiers_bert[aspect].predict(X_test_bert)
    print(f"\n{aspect}:")
    print(classification_report(y_test[aspect], y_pred_test))

# Step 6: Persist every trained classifier, one file per aspect
print("\nSaving BERT classifiers...")
for aspect, clf in classifiers_bert.items():
    if clf is None:
        continue
    joblib.dump(clf, f"classifier_bert_{aspect}.pkl")
print("BERT classifiers saved successfully.")