In [None]:
pip install transformers torch pandas scikit-learn jieba

In [None]:
import pandas as pd
import re
import jieba
from sklearn.model_selection import train_test_split

# Read the training and development splits from the Kaggle input mounts.
# NOTE(review): the dev path uses the directory 'misogyn' while the train path
# uses 'misogyny' — confirm this difference is intentional and not a typo.
train_df = pd.read_csv('/kaggle/input/misogyny/train.csv')
dev_df = pd.read_csv('/kaggle/input/misogyn/dev.csv')

# Fixed set of Chinese stopwords that preprocessing filters out
chinese_stopwords = {
    "的", "了", "在", "是", "我", "有", "和",
    "就", "不", "人", "都", "一", "一个", "上",
    "也", "很", "到", "说", "要", "去", "你",
    "会", "着", "没有", "看", "好", "自己", "这",
}


# Preprocess function for Chinese text
def preprocess_text(text, stopwords):
    """Clean a raw transcription and return its jieba tokens joined by spaces.

    Steps: strip URLs, strip every character that is neither a word character
    nor whitespace, strip digit runs, segment with ``jieba.lcut``, then keep
    only tokens that are longer than one character, are not whitespace-only
    and are not in ``stopwords``.

    Args:
        text: The raw text. Any non-string value (for example the ``None`` /
            ``NaN`` that pandas uses for a missing cell) is treated as empty.
        stopwords: Container of tokens to discard (membership is tested with
            ``in``).

    Returns:
        A single string of the surviving tokens separated by one space, or
        ``''`` when nothing survives.
    """
    # Missing cells are not strings; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    # Remove URLs, special characters, etc.
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers

    # Nothing but whitespace left: no tokens can survive, skip segmentation.
    if not text.strip():
        return ''

    # Tokenize using jieba
    words = jieba.lcut(text)
    # Drop one-character tokens, stopwords, and (defensively) any token made
    # only of whitespace so it cannot leak into the space-joined output.
    kept = [
        word for word in words
        if len(word) > 1 and word.strip() and word not in stopwords
    ]
    return ' '.join(kept)

# Apply preprocessing to the 'transcriptions' column
train_df['transcriptions'] = train_df['transcriptions'].apply(lambda x: preprocess_text(x, chinese_stopwords))
dev_df['transcriptions'] = dev_df['transcriptions'].apply(lambda x: preprocess_text(x, chinese_stopwords))

# Map string labels to integers
label_map = {
    "Misogyny": 1,
    "Not-Misogyny": 0
}


def _encode_labels(labels, mapping, split_name):
    """Map string labels to integer ids, failing loudly on unknown values.

    ``Series.map`` turns every value missing from ``mapping`` into NaN. That
    would otherwise surface much later as a confusing error (or as an extra
    "class"), so the check is done here. This also catches an accidental
    re-run of this cell on labels that were already encoded.

    Args:
        labels: pandas Series of raw labels.
        mapping: dict from raw label to integer id.
        split_name: Name of the split, used only in the error message.

    Returns:
        A Series of int64 ids with the same index as ``labels``.

    Raises:
        ValueError: If any value of ``labels`` is not a key of ``mapping``.
    """
    encoded = labels.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted({repr(value) for value in labels[unmapped].unique()})
        raise ValueError(
            f"{split_name}: {int(unmapped.sum())} label(s) not in label_map "
            f"(expected one of {sorted(mapping)}): {', '.join(unknown)}"
        )
    return encoded.astype('int64')


train_df['labels'] = _encode_labels(train_df['labels'], label_map, "train")
dev_df['labels'] = _encode_labels(dev_df['labels'], label_map, "dev")

# Verify the labels
print("Train Labels:", train_df['labels'].unique())
print("Dev Labels:", dev_df['labels'].unique())

Chinese BERT (bert-base-chinese)

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import torch

# Custom Dataset class
class ChineseTextDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Iterable of input strings (e.g. a list or a pandas Series).
        labels: Iterable of integer class ids, same length as ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=...,
            return_tensors='pt')`` whose result has ``'input_ids'`` and
            ``'attention_mask'`` entries with a leading batch dimension.
        max_length: Length every sequence is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise as plain lists so that __getitem__ is strictly
        # positional. Indexing a pandas Series with an int is label-based and
        # breaks (KeyError or wrong row) once the index is not 0..n-1, e.g.
        # after filtering or shuffling the DataFrame.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        # int() rejects NaN / non-numeric labels with a clear error instead of
        # letting them be cast into an arbitrary long value.
        label = int(self.labels[idx])
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),  # Remove batch dimension
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Instantiate the pretrained tokenizers through AutoTokenizer
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
roberta_tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')

# Wrap each split in a dataset, once per tokenizer (BERT first, then RoBERTa)
train_dataset_bert = ChineseTextDataset(
    texts=train_df['transcriptions'],
    labels=train_df['labels'],
    tokenizer=bert_tokenizer,
)
dev_dataset_bert = ChineseTextDataset(
    texts=dev_df['transcriptions'],
    labels=dev_df['labels'],
    tokenizer=bert_tokenizer,
)

train_dataset_roberta = ChineseTextDataset(
    texts=train_df['transcriptions'],
    labels=train_df['labels'],
    tokenizer=roberta_tokenizer,
)
dev_dataset_roberta = ChineseTextDataset(
    texts=dev_df['transcriptions'],
    labels=dev_df['labels'],
    tokenizer=roberta_tokenizer,
)

# Batch the datasets: training batches are shuffled, dev batches keep their order
train_loader_bert = DataLoader(dataset=train_dataset_bert, batch_size=16, shuffle=True)
dev_loader_bert = DataLoader(dataset=dev_dataset_bert, batch_size=16)

train_loader_roberta = DataLoader(dataset=train_dataset_roberta, batch_size=16, shuffle=True)
dev_loader_roberta = DataLoader(dataset=dev_dataset_roberta, batch_size=16)

In [None]:
from transformers import BertModel
import torch.nn as nn

class ChineseBERTClassifier(nn.Module):
    """Encoder plus dropout and a linear head producing class logits.

    Args:
        bert_model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object
            exposing ``last_hidden_state``.
        num_classes: Number of output logits.
    """

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.2)  # Dropout for regularization
        # Size the head from the encoder's own config so checkpoints whose
        # width is not 768 work too; fall back to 768 when no config is exposed.
        hidden_size = getattr(getattr(bert_model, 'config', None), 'hidden_size', 768)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits computed from the hidden state at sequence position 0."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token representation
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Load pre-trained BERT model
bert_model = BertModel.from_pretrained('bert-base-chinese')

# Initialize the classifier
# Number of classes = number of distinct, non-missing training labels.
# nunique() ignores NaN, so a label that failed to map earlier is not
# counted as an extra class (len(unique()) would count it).
num_classes = int(train_df['labels'].nunique())
bert_classifier = ChineseBERTClassifier(bert_model, num_classes)

In [None]:
import torch.optim as optim
from sklearn.metrics import classification_report, f1_score
from transformers import get_linear_schedule_with_warmup

def train_and_evaluate(model, train_loader, dev_loader, model_name,
                       num_epochs=10, learning_rate=2e-5, weight_decay=0.01,
                       num_warmup_steps=0):
    """Fine-tune ``model`` on ``train_loader`` and print metrics on ``dev_loader``.

    Training uses cross-entropy loss, AdamW and a linear learning-rate
    schedule that is stepped once per batch. After the last epoch the model is
    evaluated once on ``dev_loader``; a classification report and the macro F1
    score are printed.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits.
        train_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        dev_loader: Iterable of batches with the same keys.
        model_name: Prefix used in every printed line.
        num_epochs: Number of passes over ``train_loader`` (default 10).
        learning_rate: AdamW learning rate (default 2e-5).
        weight_decay: AdamW weight decay (default 0.01).
        num_warmup_steps: Warm-up steps of the linear schedule (default 0).

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1 or ``train_loader``
            yields no batches.
    """
    # Validate first so bad arguments fail before any model or optimizer work.
    if num_epochs < 1:
        raise ValueError(f"num_epochs must be >= 1, got {num_epochs}")
    steps_per_epoch = len(train_loader)
    if steps_per_epoch == 0:
        # Would otherwise end in a ZeroDivisionError when averaging the loss.
        raise ValueError("train_loader is empty; nothing to train on")

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Learning rate scheduler: its horizon is derived from the same epoch
    # count the loop uses, so the two can never drift apart.
    total_steps = steps_per_epoch * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=total_steps,
    )

    # Move model to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()  # Update learning rate
            running_loss += loss.item()

        print(f"{model_name} - Epoch {epoch + 1}, Loss: {running_loss / steps_per_epoch}")

    # Evaluation
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            predicted = outputs.argmax(dim=1)
            # Gold labels are only compared on the host; no device round trip.
            y_true.extend(batch['label'].tolist())
            y_pred.extend(predicted.cpu().tolist())

    # Print evaluation metrics
    print(f"{model_name} - Classification Report:")
    print(classification_report(y_true, y_pred))
    print(f"{model_name} - F1 Score: {f1_score(y_true, y_pred, average='macro')}")

# Fine-tune the BERT classifier and print its dev-set metrics
train_and_evaluate(
    model=bert_classifier,
    train_loader=train_loader_bert,
    dev_loader=dev_loader_bert,
    model_name="BERT",
)



In [None]:
import os
# Make sure a 'saved_models' directory exists in the working directory.
os.makedirs('saved_models', exist_ok=True)
# Save BERT model
# NOTE(review): the checkpoint below is written to the current working
# directory, not into 'saved_models' — confirm which location is intended.
torch.save(bert_classifier.state_dict(), 'chinese_bert_misogyny_model.pth')

mBERT

In [None]:
from transformers import AutoTokenizer, AutoModel

# Fetch the multilingual BERT tokenizer and encoder
mbert_tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
mbert_model = AutoModel.from_pretrained('bert-base-multilingual-cased')

# Wrap both splits with the mBERT tokenizer
train_dataset_mbert = ChineseTextDataset(
    texts=train_df['transcriptions'],
    labels=train_df['labels'],
    tokenizer=mbert_tokenizer,
)
dev_dataset_mbert = ChineseTextDataset(
    texts=dev_df['transcriptions'],
    labels=dev_df['labels'],
    tokenizer=mbert_tokenizer,
)

# Batch the datasets: training batches are shuffled, dev batches keep their order
train_loader_mbert = DataLoader(dataset=train_dataset_mbert, batch_size=16, shuffle=True)
dev_loader_mbert = DataLoader(dataset=dev_dataset_mbert, batch_size=16)

# Define mBERT classifier
class ChineseMBERTClassifier(nn.Module):
    """Encoder plus dropout and a linear head producing class logits.

    Args:
        mbert_model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object
            exposing ``last_hidden_state``.
        num_classes: Number of output logits.
    """

    def __init__(self, mbert_model, num_classes):
        super().__init__()
        self.mbert = mbert_model
        self.dropout = nn.Dropout(0.2)
        # Size the head from the encoder's own config so checkpoints whose
        # width is not 768 work too; fall back to 768 when no config is exposed.
        hidden_size = getattr(getattr(mbert_model, 'config', None), 'hidden_size', 768)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits computed from the hidden state at sequence position 0."""
        outputs = self.mbert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token representation
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Build the mBERT classifier around the pretrained encoder
mbert_classifier = ChineseMBERTClassifier(mbert_model=mbert_model, num_classes=num_classes)

# Fine-tune it and print its dev-set metrics
train_and_evaluate(
    model=mbert_classifier,
    train_loader=train_loader_mbert,
    dev_loader=dev_loader_mbert,
    model_name="mBERT",
)

# Persist the fine-tuned weights
torch.save(mbert_classifier.state_dict(), 'chinese_mbert_misogyny_model.pth')

XLM-R

In [None]:
from transformers import AutoTokenizer, AutoModel

# Fetch the XLM-R tokenizer and encoder
xlmr_tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
xlmr_model = AutoModel.from_pretrained('xlm-roberta-base')

# Wrap both splits with the XLM-R tokenizer
train_dataset_xlmr = ChineseTextDataset(
    texts=train_df['transcriptions'],
    labels=train_df['labels'],
    tokenizer=xlmr_tokenizer,
)
dev_dataset_xlmr = ChineseTextDataset(
    texts=dev_df['transcriptions'],
    labels=dev_df['labels'],
    tokenizer=xlmr_tokenizer,
)

# Batch the datasets: training batches are shuffled, dev batches keep their order
train_loader_xlmr = DataLoader(dataset=train_dataset_xlmr, batch_size=16, shuffle=True)
dev_loader_xlmr = DataLoader(dataset=dev_dataset_xlmr, batch_size=16)

# Define XLM-R classifier
class ChineseXLMRClassifier(nn.Module):
    """Encoder plus dropout and a linear head producing class logits.

    Args:
        xlmr_model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object
            exposing ``last_hidden_state``.
        num_classes: Number of output logits.
    """

    def __init__(self, xlmr_model, num_classes):
        super().__init__()
        self.xlmr = xlmr_model
        self.dropout = nn.Dropout(0.2)
        # Size the head from the encoder's own config so checkpoints whose
        # width is not 768 work too; fall back to 768 when no config is exposed.
        hidden_size = getattr(getattr(xlmr_model, 'config', None), 'hidden_size', 768)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits computed from the hidden state at sequence position 0."""
        outputs = self.xlmr(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 holds the sequence-start token, used here as the sentence summary
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Build the XLM-R classifier around the pretrained encoder
xlmr_classifier = ChineseXLMRClassifier(xlmr_model=xlmr_model, num_classes=num_classes)

# Fine-tune it and print its dev-set metrics
train_and_evaluate(
    model=xlmr_classifier,
    train_loader=train_loader_xlmr,
    dev_loader=dev_loader_xlmr,
    model_name="XLM-R",
)

# Persist the fine-tuned weights
torch.save(xlmr_classifier.state_dict(), 'chinese_xlmr_misogyny_model.pth')