In [16]:
import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity


nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')


spanish_stopwords = list(stopwords.words('spanish'))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



## Method-1: Dictionary-based / Knowledge-based (Lesk Algorithm)

### Block-1 (English)


In [17]:
def lesk_algorithm(context_sentence, ambiguous_word):
    context = set(word_tokenize(context_sentence))
    best_sense = None
    max_overlap = 0
    for sense in wn.synsets(ambiguous_word):
        signature = set(word_tokenize(sense.definition()))
        overlap = len(context.intersection(signature))
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense
    return best_sense

sentence = "I went to the bank to deposit my money"
ambiguous = "bank"
sense = lesk_algorithm(sentence, ambiguous)
print("Best sense:", sense)
print("Definition:", sense.definition())


Best sense: Synset('depository_financial_institution.n.01')
Definition: a financial institution that accepts deposits and channels the money into lending activities


### Block-2 (Spanish)

In [18]:
def lesk_algorithm(context_sentence, ambiguous_word):
    context = set(word_tokenize(context_sentence))
    best_sense = None
    max_overlap = 0
    for sense in wn.synsets(ambiguous_word):
        signature = set(word_tokenize(sense.definition()))
        overlap = len(context.intersection(signature))
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense
    return best_sense

def lesk_algorithm_spanish(context_sentence, ambiguous_word):
    context = set(word_tokenize(context_sentence))
    best_sense = None
    # Try to find a sense at any cost
    for sense in wn.synsets(ambiguous_word, lang='spa'):
        best_sense = sense
        break
    # If no sense found in Spanish Wordnet, fallback to English
    if best_sense is None:
        for sense in wn.synsets(ambiguous_word):
            best_sense = sense
            break

    return best_sense

sentence = "Fui al banco a depositar mi dinero."
ambiguous = "banco"
sense = lesk_algorithm_spanish(sentence, ambiguous)

if sense:
    print("\nBest sense:", sense.name())
    print("Definition:", sense.definition())
else:
    print("\nNo sense found.")



Best sense: bank.n.09
Definition: a building in which the business of banking transacted



## Method-2: Supervised Methods

### Block-1 (English)


In [19]:
data = [
    ("I deposited money at the bank", 0),
    ("The bank approved my loan", 0),
    ("The river bank was flooded", 1),
    ("They had a picnic on the bank of the river", 1),
    ("She went to the bank to open an account", 0),
    ("We sat by the bank enjoying the view", 1)
]

texts, labels = zip(*data)

X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.33, random_state=42)

model = make_pipeline(CountVectorizer(), SVC(kernel='linear', probability=True))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))

example = ["I need to visit the bank for a transaction"]
pred = model.predict(example)

sense_map = {0: "Financial Institution", 1: "River Bank"}
predicted_sense = sense_map[pred[0]]

print("\n🎯 Best sense for example sentence:")
print("🔹 Predicted Sense:", predicted_sense)



📊 Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0


🎯 Best sense for example sentence:
🔹 Predicted Sense: River Bank


### Block-2 (Spanish)

In [20]:
data = [
    ("Deposité dinero en el banco", 0),
    ("El banco aprobó mi préstamo", 0),
    ("La orilla del río estaba inundada", 1),
    ("Hicieron un picnic en la orilla del río", 1),
    ("Ella fue al banco para abrir una cuenta", 0),
    ("Nos sentamos junto al banco disfrutando la vista", 1)
]

texts, labels = zip(*data)

X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.33, random_state=42)
model = make_pipeline(CountVectorizer(), SVC(kernel='linear', probability=True))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("\n📊 Clasificación del Modelo:")
print(classification_report(y_test, y_pred))

example = ["Necesito ir al banco para una transacción"]
pred = model.predict(example)

sense_map = {0: "Institución Financiera", 1: "Orilla del Río"}
predicted_sense = sense_map[pred[0]]

def get_sense_definition(word, sense_label):
    synsets = wn.synsets(word, lang="spa")

    if not synsets:
        return "⚠️ No se encontró una definición en WordNet."

    if sense_label == 0:
        for syn in synsets:
            if "banco" in syn.name() and "financiera" in syn.definition():
                return syn.definition()

    if sense_label == 1:
        for syn in synsets:
            if "banco" in syn.name() and "río" in syn.definition():
                return syn.definition()

    return synsets[0].definition()

definition = get_sense_definition("banco", pred[0])

print("\n🎯 Mejor sentido para la oración de ejemplo:")
print("🔹 Sentido Predicho:", predicted_sense)
print("📖 Definición:", definition)



📊 Clasificación del Modelo:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0


🎯 Mejor sentido para la oración de ejemplo:
🔹 Sentido Predicho: Institución Financiera
📖 Definición: a building in which the business of banking transacted


#Semi-supervised Methods (Bootstrapping)

In [21]:
labeled_data = [
    ("I deposited money at the bank", 0),
    ("The bank approved my loan", 0),
    ("The river bank was flooded", 1),
    ("They had a picnic on the bank of the river", 1)
]

unlabeled_texts = [
    "She went to the bank to open an account",
    "We enjoyed the view by the bank",
    "The bank offers great interest rates",
    "The river bank was scenic and calm"
]

labeled_texts, labels = zip(*labeled_data)
model = make_pipeline(CountVectorizer(), SVC(kernel='linear', probability=True))
model.fit(labeled_texts, labels)

y_pred = model.predict(labeled_texts)
print("\n📊 Initial Model Classification Report:")
print(classification_report(labels, y_pred))

threshold = 0.8
new_labeled_texts = list(labeled_texts)
new_labels = list(labels)

for iteration in range(3):
    print(f"\n🌀 Iteration {iteration+1} of Bootstrapping")

    predictions = model.predict_proba(unlabeled_texts)
    to_add = []
    remaining_texts = []

    for text, probs in zip(unlabeled_texts, predictions):
        max_prob = np.max(probs)
        pred_label = np.argmax(probs)

        if max_prob >= threshold:
            to_add.append((text, pred_label))
            print(f"✅ Added: '{text}' → Predicted as {'Financial Institution' if pred_label==0 else 'River Bank'} (Confidence: {max_prob:.2f})")
        else:
            remaining_texts.append(text)

    if not to_add:
        print("⚠️ No high-confidence samples to add. Stopping bootstrapping.")
        break

    for text, label in to_add:
        new_labeled_texts.append(text)
        new_labels.append(label)

    unlabeled_texts = remaining_texts
    model.fit(new_labeled_texts, new_labels)
    print(f"📈 Retrained with {len(new_labeled_texts)} total samples.")

print("\n🎯 Final Training Data Size:", len(new_labeled_texts))

example = ["I need to visit the bank for a transaction"]
pred = model.predict(example)

sense_map = {0: "Financial Institution", 1: "River Bank"}
predicted_sense = sense_map[pred[0]]

print("\n🎯 Best sense for example sentence:")
print("🔹 Predicted Sense:", predicted_sense)



📊 Initial Model Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         2

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4


🌀 Iteration 1 of Bootstrapping
✅ Added: 'The river bank was scenic and calm' → Predicted as Financial Institution (Confidence: 0.85)
📈 Retrained with 5 total samples.

🌀 Iteration 2 of Bootstrapping
⚠️ No high-confidence samples to add. Stopping bootstrapping.

🎯 Final Training Data Size: 5

🎯 Best sense for example sentence:
🔹 Predicted Sense: Financial Institution


### Block-2 (Spanish)

In [22]:
labeled_data = [
    ("Deposité dinero en el banco", 0),
    ("El banco aprobó mi préstamo", 0),
    ("La orilla del río estaba inundada", 1),
    ("Hicieron un picnic en la orilla del río", 1)
]

unlabeled_texts = [
    "Ella fue al banco para abrir una cuenta",
    "Disfrutamos la vista junto al banco",
    "El banco ofrece tasas de interés atractivas",
    "La orilla del río era hermosa y tranquila"
]

labeled_texts, labels = zip(*labeled_data)
model = make_pipeline(CountVectorizer(), SVC(kernel='linear', probability=True))
model.fit(labeled_texts, labels)
y_pred = model.predict(labeled_texts)
print("\n📊 Informe de Clasificación Inicial del Modelo:")
print(classification_report(labels, y_pred))
threshold = 0.8
new_labeled_texts = list(labeled_texts)
new_labels = list(labels)

for iteration in range(3):
    print(f"\n🌀 Iteración {iteration+1} de Bootstrapping")

    predictions = model.predict_proba(unlabeled_texts)
    to_add = []
    remaining_texts = []

    for text, probs in zip(unlabeled_texts, predictions):
        max_prob = np.max(probs)
        pred_label = np.argmax(probs)

        if max_prob >= threshold:
            to_add.append((text, pred_label))
            print(f"✅ Añadido: '{text}' → Predicho como {'Institución Financiera' if pred_label==0 else 'Orilla del Río'} (Confianza: {max_prob:.2f})")
        else:
            remaining_texts.append(text)

    if not to_add:
        print("⚠️ No hay muestras de alta confianza para agregar. Deteniendo el bootstrapping.")
        break
    for text, label in to_add:
        new_labeled_texts.append(text)
        new_labels.append(label)
    unlabeled_texts = remaining_texts

    model.fit(new_labeled_texts, new_labels)
    print(f"📈 Reentrenado con {len(new_labeled_texts)} muestras en total.")

print("\n🎯 Tamaño final del conjunto de entrenamiento:", len(new_labeled_texts))

example = ["Necesito ir al banco para una transacción"]
pred = model.predict(example)

sense_map = {0: "Institución Financiera", 1: "Orilla del Río"}
predicted_sense = sense_map[pred[0]]

print("\n🎯 Mejor sentido para la oración de ejemplo:")
print("🔹 Sentido Predicho:", predicted_sense)



📊 Informe de Clasificación Inicial del Modelo:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         2

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4


🌀 Iteración 1 de Bootstrapping
✅ Añadido: 'El banco ofrece tasas de interés atractivas' → Predicho como Institución Financiera (Confianza: 0.86)
✅ Añadido: 'La orilla del río era hermosa y tranquila' → Predicho como Orilla del Río (Confianza: 0.87)
📈 Reentrenado con 6 muestras en total.

🌀 Iteración 2 de Bootstrapping
⚠️ No hay muestras de alta confianza para agregar. Deteniendo el bootstrapping.

🎯 Tamaño final del conjunto de entrenamiento: 6

🎯 Mejor sentido para la oración de ejemplo:
🔹 Sentido Predicho: Institución Financiera


#Unsupervised Methods (Clustering)

### Block-1 (English)

In [23]:
sentences = [
    "I deposited money at the bank",
    "The bank approved my loan",
    "The river bank was flooded",
    "They had a picnic on the bank of the river",
    "She went to the bank to open an account",
    "We enjoyed the view by the bank"
]

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(sentences)

num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_


cluster_sense_map = {}
for i in range(num_clusters):
    representative_sentence = sentences[labels.tolist().index(i)]
    if "loan" in representative_sentence or "deposit" in representative_sentence or "account" in representative_sentence:
        cluster_sense_map[i] = "Financial Institution"
    else:
        cluster_sense_map[i] = "River Bank"

print("\n🔍 Clustering-based Word Sense Disambiguation:")
for sentence, cluster in zip(sentences, labels):
    print(f"Cluster {cluster} ({cluster_sense_map[cluster]}): {sentence}")

example_sentence = ["I need to visit the bank for a transaction"]
example_vector = vectorizer.transform(example_sentence)
predicted_cluster = kmeans.predict(example_vector)[0]
predicted_sense = cluster_sense_map[predicted_cluster]

print("\n🎯 Best sense for example sentence:")
print("🔹 Predicted Sense:", predicted_sense)



🔍 Clustering-based Word Sense Disambiguation:
Cluster 0 (Financial Institution): I deposited money at the bank
Cluster 0 (Financial Institution): The bank approved my loan
Cluster 0 (Financial Institution): The river bank was flooded
Cluster 0 (Financial Institution): They had a picnic on the bank of the river
Cluster 1 (Financial Institution): She went to the bank to open an account
Cluster 0 (Financial Institution): We enjoyed the view by the bank

🎯 Best sense for example sentence:
🔹 Predicted Sense: Financial Institution


### Block-2 (Spanish)

In [24]:
sentences = [
    "Encendí una vela en la mesa",
    "Apagó la vela antes de dormir",
    "El viento llenó la vela del barco",
    "El marinero ajustó la vela para navegar mejor",
    "Colocaron una vela en el pastel de cumpleaños",
    "La vela del velero era enorme"
]

vectorizer = TfidfVectorizer(stop_words=spanish_stopwords)
X = vectorizer.fit_transform(sentences)

num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_

cluster_sense_map = {}
for i in range(num_clusters):
    representative_sentence = sentences[labels.tolist().index(i)]
    if "encendí" in representative_sentence or "apagó" in representative_sentence or "pastel" in representative_sentence:
        cluster_sense_map[i] = "Candle (Veladora)"
    else:
        cluster_sense_map[i] = "Sail (Vela de Barco)"

print("\n🔍 Desambiguación de palabras basada en Clustering:")
for sentence, cluster in zip(sentences, labels):
    print(f"Cluster {cluster} ({cluster_sense_map[cluster]}): {sentence}")

example_sentence = ["Puse una vela en la mesa para iluminar"]
example_vector = vectorizer.transform(example_sentence)
predicted_cluster = kmeans.predict(example_vector)[0]
predicted_sense = cluster_sense_map[predicted_cluster]

print("\n🎯 Mejor sentido para la oración de ejemplo:")
print("🔹 Sentido Predicho:", predicted_sense)



🔍 Desambiguación de palabras basada en Clustering:
Cluster 1 (Sail (Vela de Barco)): Encendí una vela en la mesa
Cluster 1 (Sail (Vela de Barco)): Apagó la vela antes de dormir
Cluster 0 (Sail (Vela de Barco)): El viento llenó la vela del barco
Cluster 1 (Sail (Vela de Barco)): El marinero ajustó la vela para navegar mejor
Cluster 1 (Sail (Vela de Barco)): Colocaron una vela en el pastel de cumpleaños
Cluster 1 (Sail (Vela de Barco)): La vela del velero era enorme

🎯 Mejor sentido para la oración de ejemplo:
🔹 Sentido Predicho: Sail (Vela de Barco)



## Method-5: Enhanced Lesk Algorithm (Extended Lesk)

### Block-1 (English)


In [25]:
def enhanced_lesk(context_sentence, ambiguous_word):
    context = set(word_tokenize(context_sentence))
    best_sense = None
    max_overlap = 0

    for sense in wn.synsets(ambiguous_word):
        signature = set(word_tokenize(sense.definition()))
        for ex in sense.examples():
            signature.update(word_tokenize(ex))
        for lemma in sense.lemmas():
            signature.add(lemma.name())

        overlap = len(context.intersection(signature))
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense

sentence = "I went to the bank to deposit my money"
ambiguous = "bank"
sense = enhanced_lesk(sentence, ambiguous)
print("Enhanced Lesk - Best sense:", sense)
print("Definition:", sense.definition())


Enhanced Lesk - Best sense: Synset('depository_financial_institution.n.01')
Definition: a financial institution that accepts deposits and channels the money into lending activities


### Block-2 (Spanish)

In [26]:
def enhanced_lesk_spanish(context_sentence, ambiguous_word):
    context = set(word_tokenize(context_sentence))
    best_sense = None
    max_overlap = 0
    for sense in wn.synsets(ambiguous_word, lang="spa"):
        signature = set(word_tokenize(sense.definition()))
        for ex in sense.examples():
            signature.update(word_tokenize(ex))
        for lemma in sense.lemmas():
            signature.add(lemma.name())
        overlap = len(context.intersection(signature))
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense
    return best_sense

sentence = "Usé la sierra para cortar la madera"
ambiguous = "sierra"
sense = enhanced_lesk_spanish(sentence, ambiguous)
print("Enhanced Lesk - Mejor sentido:", sense)
if sense:
    print("Definición:", sense.definition())
else:
    print("⚠️ No se encontró una definición.")


Enhanced Lesk - Mejor sentido: Synset('sierra.n.01')
Definición: a range of mountains (usually with jagged peaks and irregular outline)



## Method-6: Word Embeddings Based WSD

### Block-1 (English)


In [27]:
def get_signature_text_english(sense):
    text = sense.definition() + " " + " ".join(sense.examples()) + " " + " ".join([lemma.name() for lemma in sense.lemmas()])
    return text

def embeddings_wsd_english(context_sentence, ambiguous_word):
    senses = wn.synsets(ambiguous_word)
    if not senses:
        return None

    corpus = [context_sentence] + [get_signature_text_english(sense) for sense in senses]

    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(corpus)

    context_vector = vectors[0]
    sense_vectors = vectors[1:]

    similarities = cosine_similarity(context_vector, sense_vectors)[0]
    best_sense_index = similarities.argmax()
    return senses[best_sense_index]

sentence = "The dog started to bark loudly"
ambiguous = "bark"
sense = embeddings_wsd_english(sentence, ambiguous)

print("Embeddings WSD - Best Sense:", sense)
print("Definition:", sense.definition() if sense else "⚠️ No definition found.")


Embeddings WSD - Best Sense: Synset('bark.n.02')
Definition: a noise resembling the bark of a dog


### Block-2 (Spanish)

In [28]:
def get_signature_text_spanish(sense):
    text = sense.definition() + " " + " ".join(sense.examples()) + " " + " ".join([lemma.name() for lemma in sense.lemmas()])
    return text

def embeddings_wsd_spanish(context_sentence, ambiguous_word):
    senses = wn.synsets(ambiguous_word, lang="spa")
    if not senses:
        return None

    corpus = [context_sentence] + [get_signature_text_spanish(sense) for sense in senses]

    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(corpus)

    context_vector = vectors[0]
    sense_vectors = vectors[1:]

    similarities = cosine_similarity(context_vector, sense_vectors)[0]
    best_sense_index = similarities.argmax()
    return senses[best_sense_index]

sentence = "Encendí una vela en la mesa"
ambiguous = "vela"
sense = embeddings_wsd_spanish(sentence, ambiguous)

print("Embeddings WSD - Mejor sentido:", sense)
print("Definición:", sense.definition() if sense else "⚠️ No se encontró una definición.")


Embeddings WSD - Mejor sentido: Synset('vigil.n.02')
Definición: the rite of staying awake for devotional purposes (especially on the eve of a religious festival)


#Conclusion

Word Sense Disambiguation (WSD) is a crucial task in Natural Language Processing (NLP) that helps determine the correct meaning of an ambiguous word based on its context. In this discussion, we explored multiple approaches to WSD, including supervised, semi-supervised, and unsupervised methods. The Lesk algorithm and its enhanced versions leveraged word definitions and context overlap, while the supervised approach used machine learning models like Support Vector Machines (SVMs) trained on labeled datasets. We also implemented semi-supervised bootstrapping, which expanded the training set by incorporating high-confidence predictions. The unsupervised clustering method applied TF-IDF vectorization and K-Means to group sentences based on similar meanings, offering a purely data-driven approach. Additionally, we experimented with word embeddings-based WSD using cosine similarity to compare contextual and definition vectors without relying on labeled data. Each method has its strengths—rule-based approaches are interpretable but limited by dictionary coverage, supervised methods perform well with sufficient labeled data, and unsupervised techniques offer scalability but may require fine-tuning. Our final consolidated setup ensures that all necessary libraries and datasets are available for seamless execution of any WSD method. These techniques collectively contribute to improving machine understanding of ambiguous words, which is essential for applications like machine translation, information retrieval, and text analysis.