<a href="https://colab.research.google.com/github/kamat-v/HF_Transformers_Notebooks/blob/main/Masked_Language_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Kaggle page referenced for this dataset:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
# Fetch the CSV copy used by this notebook; -nc (no-clobber) skips the download
# when bbc_text_cls.csv is already present in the working directory.
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

In [None]:
!pip install transformers

In [3]:
import re
import textwrap
from pprint import pprint

import numpy as np
import pandas as pd
from transformers import pipeline

In [4]:
# Load the BBC text-classification CSV from the working directory into a DataFrame.
# Later cells read its 'labels' and 'text' columns.
df = pd.read_csv('bbc_text_cls.csv')

In [None]:
# Preview the first rows to check the columns and a few sample values.
df.head()

In [None]:
# Collect the distinct values of the 'labels' column.
labels = {label for label in df['labels']}
labels

In [None]:
# Keep only the 'text' entries of rows labelled 'business'.
texts = df['text'][df['labels'] == 'business']
texts.head()

In [None]:
# Seed NumPy's global RNG so the same article is selected on every run.
np.random.seed(1234)
# Draw one row position among the business articles.
i = np.random.choice(len(texts))
doc = texts.iloc[i]
# Wrap the article for display; replace_whitespace=False keeps its existing newlines.
# NOTE(review): pprint shows the string's repr (quotes, escaped newlines); print()
# would show the wrapped text itself — confirm which output is intended.
pprint(
    textwrap.fill(
        doc,
        replace_whitespace=False,
        fix_sentence_endings=True,
    )
)

In [None]:
# Name the checkpoint explicitly instead of relying on the library's implicit
# default for 'fill-mask' (which also triggers a "no model was supplied" warning).
# This keeps the run reproducible across transformers versions. This checkpoint's
# tokenizer uses '<mask>' as its mask token.
mlm = pipeline('fill-mask', model='distilbert/distilroberta-base')

In [None]:
# Build the prompt with the mask token of whichever tokenizer `mlm` currently
# wraps, rather than a hard-coded '<mask>': the literal only matches RoBERTa-style
# tokenizers and the pipeline rejects inputs that contain no mask token.
text = (
    'Shares in train and plane-making '
    f'giant Bombardier have fallen to a 10-year {mlm.tokenizer.mask_token} following the departure '
    'of its chief executive and two members of the board.'
)
# Show the candidate fills returned by the pipeline for the masked position.
pprint(mlm(text))

In [None]:
# Colab-specific helper: this import only succeeds inside a Google Colab runtime.
from google.colab import files

# Ask the user to upload a file from their machine.
# NOTE(review): `uploaded` is not referenced by any later cell visible in this
# notebook; the next cell opens the document by a hard-coded file name instead.
uploaded = files.upload()

In [None]:
def load_document(file_path):
    """Return the full contents of the file at `file_path`, decoded as UTF-8.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The whole file as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Read the document that the later cells analyse and perturb.
# The file name is hard-coded, so a file with exactly this name must exist in the
# working directory (presumably the one uploaded in the previous cell — confirm).
doc_text = load_document("New Text Document.txt")
# Print only the first 500 characters as a quick sanity check.
print(doc_text[:500])

![](https://deeplearningcourses.com/notebooks_v3_pxl?sc=8Jty5lhP77FmKx1Scgr7YA&n=Pipeline+Masked+Language+Modeling)

In [56]:
# Example sentence about graph theory.
# NOTE(review): this rebinds `text` (previously the Bombardier prompt) and the new
# value is not read by any later cell visible in this notebook — confirm it is needed.
text='Graph theory is a branch of mathematics focused on studying graphs, ' + \
     'which are mathematical structures used to represent relationships between objects'

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import nltk
# sent_tokenize relies on the Punkt sentence tokenizer data. Newer NLTK releases
# load it from the 'punkt_tab' resource while older ones use 'punkt'; fetching only
# 'punkt' raises LookupError on the newer releases, so download both.
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize

# Split the document into sentences; each sentence is one "document" for TF-IDF.
sentences = sent_tokenize(doc_text)

# English stop words are dropped so they cannot rank among the top terms.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(sentences)

# Vocabulary terms, in the same order as the columns of tfidf_matrix.
feature_names = vectorizer.get_feature_names_out()

# Average TF-IDF score per word across sentences
average_scores = tfidf_matrix.mean(axis=0).A1  # convert matrix to flat array

# Map each term to its mean score.
word_scores = dict(zip(feature_names, average_scores))

# Sort and pick top 10
top_words = sorted(word_scores, key=word_scores.get, reverse=True)[:10]
print("Top 10 TF-IDF words:", top_words)

In [None]:
# Rebind `mlm` to a fill-mask pipeline backed by the bert-base-uncased checkpoint.
mlm = pipeline(task="fill-mask", model="bert-base-uncased")
# Take the mask token from the tokenizer instead of hard-coding it.
mask_token = mlm.tokenizer.mask_token

def mask_first_occurrence(text, word, mask_token):
    """Replace the first whole-word occurrence of `word` in `text` with `mask_token`.

    Matching is case-insensitive and anchored on word boundaries, so 'graph'
    matches 'Graph' but not 'graphs'. `word` is escaped, so regex
    metacharacters in it are matched literally.

    Args:
        text: The text to search.
        word: The word to mask.
        mask_token: The string inserted in place of the first match.

    Returns:
        `text` with the first match replaced, or `text` unchanged when
        there is no match.
    """
    pattern = re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
    # Use a callable replacement so mask_token is inserted verbatim. A plain
    # string would be treated as a replacement template, where backslashes
    # and group references such as \g<0> are interpreted.
    return pattern.sub(lambda _match: mask_token, text, count=1)

In [None]:
def sample_from_fill_mask_results(results):
    """Pick one candidate token at random, weighted by its score.

    Args:
        results: Sequence of dicts, each providing a 'token_str' and a 'score'.

    Returns:
        The chosen token, as returned by np.random.choice. Scores are
        rescaled to sum to 1 before sampling.
    """
    candidates = [entry['token_str'] for entry in results]
    raw_scores = np.array([entry['score'] for entry in results])
    # Renormalise: the candidate scores need not sum to exactly 1.
    weights = raw_scores / raw_scores.sum()
    return np.random.choice(candidates, p=weights)

# Start from the original document; every accepted replacement is applied on top
# of the previous ones.
modified_text = doc_text

# NOTE(review): the whole document is passed to the model on every iteration;
# BERT-style checkpoints usually have a maximum input length — confirm doc_text fits.
for word in top_words:
    # Put a mask token where the current word first occurs.
    masked_text = mask_first_occurrence(modified_text, word, mask_token)

    if masked_text.count(mask_token) == 1:
        # Exactly one mask: ask the model for candidate fills.
        results = mlm(masked_text)

        # Draw one candidate at random, weighted by its score.
        replacement = sample_from_fill_mask_results(results)

        print(f"Replacing '{word}' with '{replacement}'")

        # Substitute the drawn token for the mask and carry the text forward.
        modified_text = masked_text.replace(mask_token, replacement, 1)
    else:
        # The word was not found (or the text already held a mask token): leave
        # modified_text untouched and move on.
        print(f"Skipping '{word}': no single occurrence found.")

In [None]:
# Write the perturbed document to disk as UTF-8, overwriting any existing file.
output_file = "modified_text.txt"
with open(output_file, mode='w', encoding='utf-8') as out_handle:
    out_handle.write(modified_text)

print(f"Modified text saved to {output_file}")

In [None]:
# Read the saved file back and print it in full.
with open("modified_text.txt", mode="r", encoding="utf-8") as saved:
    modified_content = saved.read()

print(modified_content)