In [1]:
import pandas as pd
from datasets import Dataset

# Load and preprocess your English-labeled data
df = pd.read_csv("1900rows_data.csv")[["text", "LABEL"]]
df = df.rename(columns={"text": "phrase", "LABEL": "label"})

# Encode labels
unique_labels = df["label"].unique().tolist()
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
df["label_id"] = df["label"].map(label2id)

# Train-test split
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_ds = Dataset.from_pandas(train_df[["phrase", "label_id"]])
test_ds = Dataset.from_pandas(test_df[["phrase", "label_id"]])

In [2]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")


def tokenize_function(example):
    return tokenizer(
        example["phrase"], padding="max_length", truncation=True, max_length=128
    )


train_ds = train_ds.map(tokenize_function, batched=True)
test_ds = test_ds.map(tokenize_function, batched=True)

train_ds = train_ds.rename_column("label_id", "labels")
test_ds = test_ds.rename_column("label_id", "labels")
train_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/1524 [00:00<?, ? examples/s]

Map:   0%|          | 0/382 [00:00<?, ? examples/s]

In [3]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased", num_labels=len(label2id)
)

training_args = TrainingArguments(
    output_dir="./multilingual_output",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [4]:
trainer.train()
trainer.save_model("multilingual_output")

  0%|          | 0/573 [00:00<?, ?it/s]

{'loss': 2.1397, 'grad_norm': 12.840332984924316, 'learning_rate': 4.56369982547993e-05, 'epoch': 0.26}
{'loss': 1.7763, 'grad_norm': 15.957270622253418, 'learning_rate': 4.12739965095986e-05, 'epoch': 0.52}
{'loss': 1.2525, 'grad_norm': 7.2755842208862305, 'learning_rate': 3.691099476439791e-05, 'epoch': 0.79}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.9233168363571167, 'eval_accuracy': 0.7146596858638743, 'eval_f1': 0.6857415134116677, 'eval_runtime': 4.1268, 'eval_samples_per_second': 92.565, 'eval_steps_per_second': 11.631, 'epoch': 1.0}
{'loss': 0.9348, 'grad_norm': 11.172807693481445, 'learning_rate': 3.254799301919721e-05, 'epoch': 1.05}
{'loss': 0.6055, 'grad_norm': 2.9885454177856445, 'learning_rate': 2.8184991273996508e-05, 'epoch': 1.31}
{'loss': 0.5925, 'grad_norm': 6.894710540771484, 'learning_rate': 2.382198952879581e-05, 'epoch': 1.57}
{'loss': 0.5689, 'grad_norm': 22.988672256469727, 'learning_rate': 1.9458987783595115e-05, 'epoch': 1.83}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.5195261240005493, 'eval_accuracy': 0.8481675392670157, 'eval_f1': 0.8429289395713538, 'eval_runtime': 3.6841, 'eval_samples_per_second': 103.689, 'eval_steps_per_second': 13.029, 'epoch': 2.0}
{'loss': 0.4215, 'grad_norm': 21.366735458374023, 'learning_rate': 1.5095986038394417e-05, 'epoch': 2.09}
{'loss': 0.2199, 'grad_norm': 11.411602020263672, 'learning_rate': 1.0732984293193717e-05, 'epoch': 2.36}
{'loss': 0.2294, 'grad_norm': 1.2310848236083984, 'learning_rate': 6.369982547993019e-06, 'epoch': 2.62}
{'loss': 0.2625, 'grad_norm': 19.81229591369629, 'learning_rate': 2.006980802792321e-06, 'epoch': 2.88}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.40276291966438293, 'eval_accuracy': 0.8717277486910995, 'eval_f1': 0.8694356763187971, 'eval_runtime': 3.7316, 'eval_samples_per_second': 102.368, 'eval_steps_per_second': 12.863, 'epoch': 3.0}
{'train_runtime': 228.1192, 'train_samples_per_second': 20.042, 'train_steps_per_second': 2.512, 'train_loss': 0.7933010181207307, 'epoch': 3.0}


# Run Zero-Shot Inference on Chinese Text with Your Multilingual Model

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

model_path = "multilingual_output"  # your saved model
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [6]:
from deep_translator import GoogleTranslator
import pandas as pd

# Load and sample your English data
df = pd.read_csv("1900rows_data.csv")[["text", "LABEL"]].rename(
    columns={"text": "phrase", "LABEL": "label"}
)
sample_df = df.sample(10, random_state=42).reset_index(drop=True)

# Translate English to Chinese
sample_df["translated_phrase"] = sample_df["phrase"].apply(
    lambda x: GoogleTranslator(source="en", target="zh-CN").translate(x)
)

In [7]:
# Tokenize and predict
inputs = tokenizer(
    sample_df["translated_phrase"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
)

with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1)

In [8]:
# Label mapping (rebuild from training)
unique_labels = df["label"].unique().tolist()
id2label = {i: label for i, label in enumerate(unique_labels)}

# Add predictions to DataFrame
sample_df["predicted_label"] = [id2label[p.item()] for p in predictions]

# Show results
sample_df[["phrase", "translated_phrase", "label", "predicted_label"]]

Unnamed: 0,phrase,translated_phrase,label,predicted_label
0,I've been feeling really weak in my muscles an...,我的肌肉感觉真的很虚弱，脖子真的很僵硬。我的关节一直在肿胀，我很难四处走动而不会感到僵硬。步...,Chronic Conditions,Chronic Conditions
1,when i extend my leg there is pain in knee joint,当我伸出腿时，膝关节会疼痛,Pain-related Conditions,Pain-related Conditions
2,My bloody stools have caused me to lose a lot ...,我的血腥凳子使我失去了很多东西，包括铁和蓝色。结果，我现在患有贫血，通常我会感到很虚弱。,Gastrointestinal Conditions,Infections
3,My skin is red and scratchy. These can occasio...,我的皮肤是红色的。这些偶尔会剥落。我的脸颊和嘴唇肿胀，这真的很烦人。我偶尔会头痛和流鼻涕，因...,Allergic/Immunologic Reactions,Allergic/Immunologic Reactions
4,I have a cut on my foot that became infected f...,我的脚割伤了，由于在健身房使用淋浴而被感染。,Infections,Infections
5,I have been experiencing symptoms such as a he...,我一直在遇到症状，例如头痛，胸痛，头晕，平衡丧失和困难。,Chronic Conditions,Chronic Conditions
6,My knee hurts when I walk,我走路时膝盖疼,Pain-related Conditions,Pain-related Conditions
7,It feels like I can't take a deep breath,感觉我不能深吸一口气,Respiratory & Sensory Issues,Respiratory & Sensory Issues
8,I have a cut that is red and swollen.,我有一个红色和肿胀的切口。,Infections,Dermatological & Skin Conditions
9,"I have a high temperature, vomiting, chills, a...",我有高温，呕吐，发冷和严重的瘙痒。此外，我一直在说话很多，头痛。我也因恶心和肌肉疼痛而困扰。,Infections,Allergic/Immunologic Reactions


**We fine-tuned bert-base-multilingual-cased on 1,900 English-labeled medical symptom examples. We then evaluated the model on Chinese-translated versions of those examples, without any fine-tuning on Chinese data. The model correctly classified most inputs, achieving strong semantic generalization across languages. These results demonstrate the power of multilingual transformers for cross-lingual medical text classification in a zero-shot setting.**

