**Installing necessary packages**

In [None]:
!pip install -q simpletransformers
!pip install -q datasets
!pip install -q torch
!pip install -q accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.8/250.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m26.7 MB/s[0m et

**Importing necessary libraries**

In [None]:
import numpy as np
import pandas as pd
import datasets
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from tqdm.auto import tqdm
import math
from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, default_data_collator, get_scheduler, pipeline

**Reading the dataset**

In [None]:
# Location of the input CSV (assumes Google Drive is mounted at /content/drive).
DATA_PATH = "/content/drive/MyDrive/agnews.csv"

df = pd.read_csv(DATA_PATH)

# Preview the first three rows.
df.head(3)

Unnamed: 0,Description
0,"RAMALLAH, West Bank, Oct 29 (AFP) - Suha Arafa..."
1,Alessandro Petacchi (Fassa Bortolo) has won St...
2,DoCoMo previously procured FOMA handsets only ...


**Checking for missing values**

In [None]:
# Count missing entries per column (`isna` is the same check as `isnull`).
df.isna().sum()

Description    0
dtype: int64

**Removing duplicates**

In [None]:
# Record the shape before de-duplication so both shapes can be compared.
shape_before = df.shape
df = df.drop_duplicates()

print(shape_before)
print(df.shape)

(30000, 1)
(30000, 1)


**Data Cleaning**

In [None]:
# Anything that is not a word character, whitespace, a period or an apostrophe is removed.
DISALLOWED_CHARS = r'[^\w\s.\']'

cleaned_descriptions = df['Description'].str.replace(DISALLOWED_CHARS, '', regex=True)
df['Description'] = cleaned_descriptions

**Making train-test splits**

In [None]:
# shuffle=False keeps the original row order, so the final 20% of rows become the test set.
# NOTE(review): per scikit-learn's docs random_state only controls shuffling, so it
# should have no effect here - confirm whether shuffling was intended.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=0, shuffle=False)

# Wrap each pandas split in a Hugging Face `Dataset`.
train = Dataset.from_pandas(train_df)
test = Dataset.from_pandas(test_df)

**Combining the train and test splits into a `DatasetDict`**

In [None]:
# Bundle both splits under the keys used by the rest of the notebook.
dataset = DatasetDict({"train": train, "test": test})
dataset

DatasetDict({
    train: Dataset({
        features: ['Description', '__index_level_0__'],
        num_rows: 24000
    })
    test: Dataset({
        features: ['Description', '__index_level_0__'],
        num_rows: 6000
    })
})

**Removing index column**

In [None]:
# Column that carries the pandas index over from `Dataset.from_pandas`.
INDEX_COLUMN = "__index_level_0__"

dataset = dataset.remove_columns([INDEX_COLUMN])
dataset

DatasetDict({
    train: Dataset({
        features: ['Description'],
        num_rows: 24000
    })
    test: Dataset({
        features: ['Description'],
        num_rows: 6000
    })
})

**Initializing the tokenizer and model**

In [None]:
# Single checkpoint name shared by the masked-LM model and its tokenizer.
model_checkpoint = "distilbert-base-uncased"

# Load the model first, then the tokenizer from the same checkpoint.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

**Initializing data collator**

In [None]:
# Masking probability handed to the language-modelling collator.
MLM_PROBABILITY = 0.15

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=MLM_PROBABILITY,
)

**Defining the function for tokenization and tokenizing the dataset**

In [None]:
def tokenize_function(examples):
    """Tokenize the "Description" column of a batch.

    Args:
        examples: Batch mapping that contains a "Description" entry.

    Returns:
        The tokenizer output for the batch. When the tokenizer is a fast
        tokenizer, a "word_ids" entry is added holding the word ids of every
        row in the batch.
    """
    encoded = tokenizer(examples["Description"])
    if not tokenizer.is_fast:
        return encoded

    # One word-id list per encoded row.
    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(row_count)]
    return encoded

# Tokenize both splits batch-wise and drop the raw text column afterwards.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["Description"],
)
tokenized_datasets

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 24000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 6000
    })
})

In [None]:
# Number of tokens per chunk; read by `group_texts` when splitting the concatenated texts.
chunk_size = 128

**Concatenating the tokenized texts and splitting them into fixed-size chunks**

In [None]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Args:
        examples: Mapping of column name to a list of per-example sequences.
            It must contain an "input_ids" column.
        block_size: Length of every output chunk. When omitted, the
            notebook-level ``chunk_size`` is used.

    Returns:
        A dict with the same columns, each holding a list of chunks of exactly
        ``block_size`` items, plus a "labels" column that is a copy of the
        "input_ids" chunk list. Trailing items that do not fill a whole chunk
        are dropped.
    """
    if block_size is None:
        block_size = chunk_size

    # Flatten every column in a single pass. `sum(list_of_lists, [])` builds a
    # new list on each addition and is therefore quadratic in the batch size.
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }

    # All columns have the same flattened length, so measuring the first one is enough.
    total_length = len(next(iter(concatenated_examples.values())))

    # Drop the trailing remainder that is shorter than block_size.
    total_length = (total_length // block_size) * block_size

    # Slice each flattened column into consecutive chunks.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # Labels start as a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Re-chunk every split batch-wise with `group_texts`.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
)
lm_datasets

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 7424
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1864
    })
})

**Defining the function for introducing random masks**

In [None]:
def insert_random_mask(batch):
    """Run the data collator over a batch and return its outputs as "masked_*" columns.

    Args:
        batch: Column-oriented mapping (column name -> list of row values).

    Returns:
        Dict mapping "masked_<key>" to a NumPy array for every key returned by
        the data collator.
    """
    # Turn the column-oriented batch into one dict per row, as the collator is called with a list.
    column_names = list(batch.keys())
    features = [dict(zip(column_names, row)) for row in zip(*batch.values())]

    masked_inputs = data_collator(features)

    # Prefix every collator output so it becomes a separate "masked_" column.
    masked_columns = {}
    for key, tensor in masked_inputs.items():
        masked_columns["masked_" + key] = tensor.numpy()
    return masked_columns

**Removing useless columns and introducing masks**

In [None]:
# Drop the "word_ids" column only when every split still has it. The guard keeps
# this cell safe to re-run (the column is already gone on the second run) and
# covers the case where `tokenize_function` never added it (non-fast tokenizer).
if all("word_ids" in columns for columns in lm_datasets.column_names.values()):
    lm_datasets = lm_datasets.remove_columns(["word_ids"])

# Apply the random masking to the test split once, replacing the original columns
# with the "masked_*" ones produced by `insert_random_mask`.
eval_dataset = lm_datasets["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=lm_datasets["test"].column_names,
)

# Rename the masked columns back to the names the model expects.
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

Map:   0%|          | 0/1864 [00:00<?, ? examples/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


**Preparing training and evaluation data loader**

In [None]:
batch_size = 32

# Training batches are shuffled and passed through the language-modelling collator.
train_dataloader = DataLoader(
    dataset=lm_datasets["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# The evaluation set was masked ahead of time, so the default collator is used here.
eval_dataloader = DataLoader(
    dataset=eval_dataset,
    batch_size=batch_size,
    collate_fn=default_data_collator,
)

**Initializing the AdamW optimizer and the accelerator**

In [None]:
# Learning rate passed to AdamW.
LEARNING_RATE = 5e-5

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
accelerator = Accelerator()

# Hand the model, optimizer and both dataloaders to the accelerator and keep the prepared versions.
(
    model,
    optimizer,
    train_dataloader,
    eval_dataloader,
) = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)

**Setting the number of training epochs and defining the learning-rate scheduler**

In [None]:
num_train_epochs = 16

# One optimizer update per training batch, so the step count follows the dataloader length.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_update_steps_per_epoch * num_train_epochs

# Linear schedule without warm-up over the full number of training steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

**Defining model name and output directory for the trained model**

In [None]:
# Last path component of the checkpoint identifier.
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Directory the training loop writes the model and tokenizer to.
output_dir = "/content/drive/MyDrive/nlp/masked language modelling/ag_news_model"

**Model Training**

In [None]:
# Fine-tuning loop: for every epoch, train on all training batches, compute
# perplexity on the pre-masked evaluation set, then save the model and tokenizer.
# The progress bar covers every optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward().
        accelerator.backward(loss)

        # Update the weights, advance the LR schedule, then clear gradients for the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # Repeat the batch loss batch_size times so the gathered tensor has one entry per example slot.
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    # Keep at most len(eval_dataset) entries, trimming the surplus slots added by the repeat above.
    losses = losses[: len(eval_dataset)]
    try:
        # Perplexity is the exponential of the mean evaluation loss.
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save the model and tokenizer to output_dir (same directory every epoch; nothing is uploaded here).
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

  0%|          | 0/3712 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 16.445947713904776
>>> Epoch 1: Perplexity: 14.491306774530374
>>> Epoch 2: Perplexity: 13.280370219537994
>>> Epoch 3: Perplexity: 12.488642265488018
>>> Epoch 4: Perplexity: 11.900936539923157
>>> Epoch 5: Perplexity: 11.37212808336508
>>> Epoch 6: Perplexity: 11.02653669417355
>>> Epoch 7: Perplexity: 10.810085859332137
>>> Epoch 8: Perplexity: 10.605967524440468
>>> Epoch 9: Perplexity: 10.424960672965014
>>> Epoch 10: Perplexity: 10.311426697031775
>>> Epoch 11: Perplexity: 10.147429776668359
>>> Epoch 12: Perplexity: 10.040455735224072
>>> Epoch 13: Perplexity: 9.978651299160546
>>> Epoch 14: Perplexity: 9.917808974895038
>>> Epoch 15: Perplexity: 9.905506556257043


**Taking user input and making predictions from the trained model**

In [None]:
# Load the fine-tuned model saved by the training loop into a fill-mask pipeline.
mask_filler = pipeline("fill-mask", model="/content/drive/MyDrive/nlp/masked language modelling/ag_news_model")

user_input = input("Please enter the text: ")

preds = mask_filler(user_input)

# With a single mask token the pipeline returns a flat list of candidate dicts;
# with several mask tokens it returns one such list per mask. Normalise to the
# nested form so both cases are printed instead of failing on pred['sequence'].
if preds and isinstance(preds[0], dict):
    preds = [preds]

for mask_candidates in preds:
    for pred in mask_candidates:
        print(f">>> {pred['sequence']}")

Please enter the text: LEVI Strauss has scrapped plans to sell its Dockers brand because potential [MASK] would not meet the companys price for the casual clothing line.
>>> levi strauss has scrapped plans to sell its dockers brand because potential buyers would not meet the companys price for the casual clothing line.
>>> levi strauss has scrapped plans to sell its dockers brand because potential sales would not meet the companys price for the casual clothing line.
>>> levi strauss has scrapped plans to sell its dockers brand because potential buyer would not meet the companys price for the casual clothing line.
>>> levi strauss has scrapped plans to sell its dockers brand because potential customers would not meet the companys price for the casual clothing line.
>>> levi strauss has scrapped plans to sell its dockers brand because potential sale would not meet the companys price for the casual clothing line.
