In [16]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Checkpoint to fine-tune: MuRIL, a model for Indian languages.
model_name = "google/muril-base-cased"

# Instantiate the masked-LM model and its matching tokenizer from the same
# checkpoint. The "weights not used" notice printed on load lists pooler and
# seq_relationship weights, which the masked-LM head does not need.
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at google/muril-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
from datasets import load_dataset
# Read the plain-text book file with the 'text' loader and expose it as the
# 'train' split of the resulting dataset.
data_files = {'train': '../MLM ( masked language modeling)/purana1.txt'}
dataset = load_dataset('text', data_files=data_files)

In [18]:
def tokenize_function(examples):
    """Tokenize a batch of raw text lines for masked-language-model training.

    Args:
        examples: A batch from the dataset; its ``'text'`` entry is passed
            straight to the tokenizer.

    Returns:
        The tokenizer's output for the batch, truncated but not padded.
    """
    # Deliberately no padding here: when this runs under a batched ``map`` the
    # tokenizer would pad every row to the longest line of the whole map batch,
    # and those pad tokens would be stored and later pushed through the model.
    # DataCollatorForLanguageModeling pads each training batch to its own
    # longest sequence, which is all the padding that is needed.
    return tokenizer(examples['text'], truncation=True)

# Apply the tokenizer to the dataset in batches and drop the raw "text"
# column so that only the tokenizer's outputs remain.
tokenized_datasets = dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

# Collator for masked language modeling: masks tokens with probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,
    mlm=True,
    tokenizer=tokenizer,
)


Map: 100%|██████████| 5883/5883 [00:00<00:00, 62741.15 examples/s]


In [19]:
# Training configuration.
# Checkpointing is per epoch rather than every 10_000 steps: the progress bar
# for this run reports 2208 steps in total, so a 10_000-step interval would
# never fire and an interrupted run would leave nothing to resume from.
# save_total_limit keeps only the two most recent checkpoints on disk.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_strategy='epoch',
    save_total_limit=2,
    logging_dir='./logs'
)

# Wire the model, training configuration, train split and MLM collator
# together in a Trainer.
train_split = tokenized_datasets["train"]
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,  # without it no MLM masking/labels are produced
    train_dataset=train_split,
)

# Run fine-tuning; as the last expression, its return value is the cell output.
trainer.train()

  0%|          | 0/2208 [07:16<?, ?it/s]
  2%|▏         | 51/2208 [00:37<25:16,  1.42it/s] 

In [None]:

# Persist the fine-tuned weights and the tokenizer files side by side in one
# directory.
save_dir = './fine_tuned_muril_mlm'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)