In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Read the Jigsaw comments CSV into a DataFrame
data = pd.read_csv('jigsaw_dataset.csv')

# Hold out 10% of the rows for validation; the fixed random_state keeps the
# split identical from one run to the next
train_data, val_data = train_test_split(
    data,
    random_state=42,
    test_size=0.1,
)

# Tokenizer matching the pre-trained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_data(data, tokenizer, max_length=128):
    """Tokenize every text in ``data`` into fixed-length model inputs.

    Each text is passed to ``tokenizer.encode_plus`` with special tokens
    added, and is truncated and padded to ``max_length``. No token masking
    happens here; this function only produces ids and attention masks.

    Args:
        data: Iterable of texts to encode.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of two lists holding one
        entry per input text, in input order.
    """
    encodings = [
        tokenizer.encode_plus(
            sample,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
        )
        for sample in data
    ]

    all_ids = [encoding['input_ids'] for encoding in encodings]
    all_masks = [encoding['attention_mask'] for encoding in encodings]
    return all_ids, all_masks

# Encode the comment text of the training split into ids and attention masks
train_comments = train_data['comment_text']
train_input_ids, train_attention_masks = encode_data(train_comments, tokenizer)

# Same encoding for the validation split
val_comments = val_data['comment_text']
val_input_ids, val_attention_masks = encode_data(val_comments, tokenizer)


In [None]:
import torch
from transformers import BertForMaskedLM, DataCollatorForLanguageModeling

# Pick the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pre-trained BERT model for masked language modeling
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.to(device)


class EncodedTextDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one dict of tokenizer features per example.

    The masked-language-modeling collator works on examples that are mappings
    with an ``input_ids`` key. The positional tuples produced by a
    ``TensorDataset`` carry no feature names, so ids and attention masks
    cannot be told apart downstream; returning dicts keeps them labeled.
    """

    def __init__(self, input_ids, attention_masks):
        if len(input_ids) != len(attention_masks):
            raise ValueError('input_ids and attention_masks must have the same length')
        self.input_ids = input_ids
        self.attention_masks = attention_masks

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, index):
        return {
            'input_ids': self.input_ids[index],
            'attention_mask': self.attention_masks[index],
        }


# Prepare the datasets for training (one feature dict per encoded comment)
train_dataset = EncodedTextDataset(train_input_ids, train_attention_masks)
val_dataset = EncodedTextDataset(val_input_ids, val_attention_masks)

# Set up the data collator for masked language modeling; it masks 15% of the
# tokens in every batch and builds the matching labels
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Set up the training arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy='epoch',
    # Half precision only works on CUDA devices, so enable it only when a GPU
    # is present; this keeps the CPU fallback above usable.
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
)

# Set up the Trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the BERT model
trainer.train()

# Write the final weights and config directly into the output directory.
# Periodic checkpoints land in `checkpoint-<step>` subfolders (and only every
# `save_steps` steps), so without this call './results' itself may hold no
# loadable model.
trainer.save_model(training_args.output_dir)


In [None]:
from transformers import pipeline

# Load the fine-tuned model.
# NOTE(review): this expects './results' to contain a saved model (config and
# weights) at its top level - confirm the training step writes one there.
model = BertForMaskedLM.from_pretrained('./results')
model.to(device)

# Set up the mask-filling pipeline.
# `torch.device('cuda').index` is None, so pass an explicit ordinal instead:
# GPU 0 when CUDA is available, -1 (CPU) otherwise.
fill_mask = pipeline(
    'fill-mask',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Replace masked positions with the model's top prediction
def fill_masks(text, fill_mask_pipeline, mask_words=None):
    """Fill every masked position in ``text`` with the pipeline's best guess.

    A position is masked when the word is listed in ``mask_words`` (exact
    match on whitespace-separated words) or when the text already contains
    the tokenizer's mask token. All masks are predicted in the context of
    the full sentence with a single pipeline call.

    Args:
        text: Input sentence. Words are split on whitespace and re-joined
            with single spaces.
        fill_mask_pipeline: A fill-mask pipeline-like callable exposing
            ``tokenizer.mask_token`` and returning candidate dicts with a
            ``'token_str'`` entry, best candidate first.
        mask_words: Optional container of words to replace with a
            prediction. When omitted, a module-level ``MASK_WORDS`` is used
            if one exists; otherwise only explicit mask tokens are filled.

    Returns:
        The text with every masked position replaced by the top prediction.
        If there is nothing to fill, the whitespace-normalized text is
        returned and the pipeline is not called.
    """
    # Use the pipeline that was passed in, not a same-named global
    mask_token = fill_mask_pipeline.tokenizer.mask_token

    if mask_words is None:
        # Backward compatibility with notebooks that define MASK_WORDS globally
        mask_words = globals().get('MASK_WORDS', ())

    masked_text = ' '.join(
        mask_token if word in mask_words else word
        for word in text.split()
    )
    if mask_token not in masked_text:
        return masked_text

    predictions = fill_mask_pipeline(masked_text)
    # One mask yields a flat list of candidate dicts; several masks yield one
    # candidate list per mask. Normalize to the nested form.
    if isinstance(predictions[0], dict):
        predictions = [predictions]

    # Stitch the text back together, inserting the top candidate at each mask
    segments = masked_text.split(mask_token)
    pieces = [segments[0]]
    for candidates, segment in zip(predictions, segments[1:]):
        pieces.append(candidates[0]['token_str'].strip())
        pieces.append(segment)
    return ''.join(pieces)

# Example usage: run fill_masks on a sample sentence that contains two
# [MASK] placeholders and print the string it returns
text_with_masks = "The [MASK] jumped over the [MASK]."
filled_text = fill_masks(
    text=text_with_masks,
    fill_mask_pipeline=fill_mask,
)
print(filled_text)
