In [1]:
import xml.etree.cElementTree as ET
import codecs

import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.wsd import lesk

import numpy as np
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

import gensim.downloader as api

nltk.download("wordnet")
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("punkt_tab")
word2vec = api.load("word2vec-google-news-300")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!




In [56]:
class WSDInstance:
    def __init__(self, my_id, lemma, context, index):
        self.id = my_id         # id of the WSD instance
        self.lemma = lemma      # lemma of the word whose sense is to be resolved
        self.context = context  # lemma of all the words in the sentential context
        self.index = index      # index of lemma within the context
    def __str__(self):
        '''
        For printing purposes.
        '''
        return '%s\t%s\t%s\t%d' % (self.id, self.lemma, ' '.join(self.context), self.index)

def load_instances(f):
    '''
    Load two lists of cases to perform WSD on. The structure that is returned is a dict, where
    the keys are the ids, and the values are instances of WSDInstance.
    '''
    tree = ET.parse(f)
    root = tree.getroot()

    dev_instances = {}
    test_instances = {}

    for text in root:
        if text.attrib['id'].startswith('d001'):
            instances = dev_instances
        else:
            instances = test_instances
        for sentence in text:
            # construct sentence context
            context = [to_ascii(el.attrib['lemma']) for el in sentence]
            context = [word.decode('utf-8') if isinstance(word, bytes) else word for word in context]
            for i, el in enumerate(sentence):
                if el.tag == 'instance':
                    my_id = el.attrib['id']
                    my_id = my_id.decode('utf-8') if isinstance(my_id,bytes) else my_id
                    lemma = to_ascii(el.attrib['lemma'])
                    lemma = lemma.decode('utf-8') if isinstance(lemma,bytes) else lemma
                    lemma = lemma.lower()
                    instances[my_id] = WSDInstance(my_id, lemma, context, i)
    return dev_instances, test_instances

def load_key(f):
    '''
    Load the solutions as dicts.
    Key is the id
    Value is the list of correct sense keys.
    '''
    dev_key = {}
    test_key = {}
    for line in open(f):
        if len(line) <= 1: continue
        #print (line)
        doc, my_id, sense_key = line.strip().split(' ', 2)
        if doc == 'd001':
            dev_key[my_id] = sense_key.split()
        else:
            test_key[my_id] = sense_key.split()
    return dev_key, test_key

def to_ascii(s):
    # remove all non-ascii characters
    return codecs.encode(s, 'ascii', 'ignore')




# Preprocess a sentence
def preprocess_sentence(sentence):
    """
    Preprocess a sentence: tokenize, lemmatize, remove stopwords, and handle multi-word phrases.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))

    tokens = word_tokenize(sentence.lower())  # Lowercase for consistency
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]
    tokens = [token.replace("_", " ") for token in tokens]  # Convert underscores to spaces

    return [token for token in tokens if token not in stop_words]


# Most Frequent Sense
def most_frequent_sense(lemma):
    """
    Retrieve the most frequent sense of a word from WordNet.
    """
    synsets = wn.synsets(lemma)
    return synsets[0] if synsets else None

# Lesk's Algorithm
def lesk_method(context, lemma):
    """
    Apply Lesk's algorithm using NLTK.
    """
    return lesk(context, lemma)

def synset_to_sense_keys(synset):
    """
    Convert a WordNet synset to its corresponding lemma sense keys.
    """
    return [lemma.key() for lemma in synset.lemmas()]


# Evaluate WSD methods
def evaluate_wsd_with_keys(instances, keys):
    """
    Evaluate Most Frequent Sense and Lesk methods on the dataset,
    using lemma sense keys for comparison.
    """
    correct_mfs = 0
    correct_lesk = 0
    total_cases = len(instances)

    for instance_id, instance in instances.items():
        lemma = instance.lemma
        context = instance.context
        gold_sense_keys = keys.get(instance_id, [])

        # Most Frequent Sense
        mfs_sense = most_frequent_sense(lemma)
        if mfs_sense:
            mfs_sense_keys = synset_to_sense_keys(mfs_sense)
            if any(key in gold_sense_keys for key in mfs_sense_keys):
                correct_mfs += 1

        # Lesk Algorithm
        lesk_sense = lesk_method(context, lemma)
        if lesk_sense:
            lesk_sense_keys = synset_to_sense_keys(lesk_sense)
            if any(key in gold_sense_keys for key in lesk_sense_keys):
                correct_lesk += 1

    mfs_accuracy = correct_mfs / total_cases
    lesk_accuracy = correct_lesk / total_cases

    return mfs_accuracy, lesk_accuracy



def lesk_disambiguate_tune(instance, pos_tag):
    # Tokenize, lemmatize, and remove stopwords from the context
    context = preprocess_sentence(' '.join(instance.context))

    # Apply Lesk's algorithm for the specified POS tag
    sense = lesk(context, instance.lemma, pos_tag)

    # Extract only the lemma from the predicted sense
    predicted_sense = sense.name().split('.')[0] if sense else None

    return predicted_sense

best_accuracy = 0
pos_tags_to_evaluate = ['n', 'v', 'a']  # You can add more POS tags as needed

for pos_tag in pos_tags_to_evaluate:
        # Use Lesk's algorithm on the dev set for the current POS tag
        dev_predictions_lesk = {k: lesk_disambiguate_tune(v, pos_tag) for k, v in dev_instances.items()}

        # Evaluate accuracy for Lesk's algorithm for the current POS tag
        accuracy_lesk = sum(1 for k, v in dev_predictions_lesk.items() if v in [sense.split('%')[0] for sense in dev_key.get(k, [])]) / len(dev_predictions_lesk)

        accuracy_percentage = accuracy_lesk * 100
        print(f'Accuracy for Lesk\'s algorithm ({pos_tag}): {accuracy_percentage:.2f}%')


        # Update the best accuracy and POS tag if the current accuracy is higher
        if accuracy_lesk > best_accuracy:
            best_accuracy = accuracy_lesk
            best_pos_tag = pos_tag

    # Print the result with the highest accuracy
print(f'Highest Accuracy: {best_accuracy * 100:.2f}% (POS Tag: {best_pos_tag})')




def get_embedding(word, model):
    """
    Get the Word2Vec embedding for a word. Return a zero vector if the word is not in the vocabulary.
    """
    if word in model:
        return model[word]
    return np.zeros(model.vector_size)


def compute_sense_embedding(synset, model):
    """
    Compute the embedding for a WordNet synset based on its definition and examples.
    """
    definition = synset.definition()
    examples = synset.examples()
    words = preprocess_sentence(definition + " " + " ".join(examples))
    embeddings = [get_embedding(word, model) for word in words if word in model]
    if embeddings:
        return np.mean(embeddings, axis=0)
    return np.zeros(model.vector_size)


def compute_context_embedding(context, target_index, model, window=3):
    """
    Compute the embedding for the context around a target word.
    """
    start = max(0, target_index - window)
    end = min(len(context), target_index + window + 1)
    words = [word for i, word in enumerate(context) if i != target_index]
    embeddings = [get_embedding(word, model) for word in words if word in model]
    if embeddings:
        return np.mean(embeddings, axis=0)
    return np.zeros(model.vector_size)


def word2vec_wsd(instance, model):
    """
    Perform WSD using pre-trained Word2Vec embeddings.
    """
    lemma = instance.lemma
    context = preprocess_sentence(" ".join(instance.context))
    target_index = instance.index

    context_embedding = compute_context_embedding(context, target_index, model)
    if np.linalg.norm(context_embedding) == 0:
        return None  # No context embedding available

    best_sense = None
    max_similarity = -1

    for synset in wn.synsets(lemma):
        sense_embedding = compute_sense_embedding(synset, model)
        if np.linalg.norm(sense_embedding) == 0:
            continue
        similarity = cosine_similarity(
            [context_embedding], [sense_embedding]
        )[0][0]
        if similarity > max_similarity:
            max_similarity = similarity
            best_sense = synset

    return best_sense

def evaluate_wsd_with_word2vec(instances, keys, model):
    """
    Evaluate the Word2Vec-based WSD method on the given instances and keys.
    """
    correct = 0
    total = len(instances)

    for instance_id, instance in instances.items():
        gold_sense_keys = keys.get(instance_id, [])
        predicted_synset = word2vec_wsd(instance, model)

        if predicted_synset:
            predicted_sense_keys = [lemma.key() for lemma in predicted_synset.lemmas()]
            if any(key in gold_sense_keys for key in predicted_sense_keys):
                correct += 1

    accuracy = correct / total
    return accuracy

# Main script
if __name__ == "__main__":
    # File paths (update these to your dataset's actual paths)
    data_f = '/content/drive/MyDrive/multilingual-all-words.en.xml'
    key_f = '/content/drive/MyDrive/wordnet.en.key'

    # Load instances and keys
    dev_instances, test_instances = load_instances(data_f)
    dev_key, test_key = load_key(key_f)

    # Filter instances not in keys
    dev_instances = {k: v for k, v in dev_instances.items() if k in dev_key}
    test_instances = {k: v for k, v in test_instances.items() if k in test_key}

    # Evaluate on dev and test sets
    dev_mfs_acc, dev_lesk_acc = evaluate_wsd_with_keys(dev_instances, dev_key)
    print(f"Development Set Accuracy (Most Frequent Sense): {dev_mfs_acc * 100:.2f}%")
    print(f"Development Set Accuracy (Lesk): {dev_lesk_acc * 100:.2f}%")

    test_mfs_acc, test_lesk_acc = evaluate_wsd_with_keys(test_instances, test_key)
    print(f"Test Set Accuracy (Most Frequent Sense): {test_mfs_acc * 100:.2f}%")
    print(f"Test Set Accuracy (Lesk): {test_lesk_acc * 100:.2f}%")

    print("Evaluating Development Set...")
    dev_accuracy = evaluate_wsd_with_word2vec(dev_instances, dev_key, word2vec)
    print(f"Development Set Accuracy: {dev_accuracy * 100:.2f}%")

    print("\nEvaluating Test Set...")
    test_accuracy = evaluate_wsd_with_word2vec(test_instances, test_key, word2vec)
    print(f"Test Set Accuracy: {test_accuracy * 100:.2f}%")


Accuracy for Lesk's algorithm (n): 49.34%
Accuracy for Lesk's algorithm (v): 18.94%
Accuracy for Lesk's algorithm (a): 2.20%
Highest Accuracy: 49.34% (POS Tag: n)
Development Set Accuracy (Most Frequent Sense): 67.53%
Development Set Accuracy (Lesk): 34.02%
Test Set Accuracy (Most Frequent Sense): 62.34%
Test Set Accuracy (Lesk): 34.07%
Evaluating Development Set...
Development Set Accuracy: 47.94%

Evaluating Test Set...
Test Set Accuracy: 48.55%


In [59]:
import random
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import ast

def load_unlabeled_data(file_path):
    with open(file_path, "r") as f:
        lines = f.readlines()
    return [preprocess_sentence(line.strip()) for line in lines]

def load_annotated_data(file_path):
    # Load annotated data from a file where each line contains a tuple of (sentence, label).
    annotated_data = []
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue  # Skip empty lines
                try:
                    # Use `ast.literal_eval` to safely evaluate the tuple
                    sentence_label_tuple = ast.literal_eval(line)
                    if isinstance(sentence_label_tuple, tuple) and len(sentence_label_tuple) == 2:
                        annotated_data.append(sentence_label_tuple)
                    else:
                        raise ValueError(f"Invalid format in line: {line}")
                except Exception as e:
                    print(f"Error parsing line: {line}. Error: {e}")
    except FileNotFoundError:
        print(f"Error: The file {file_path} does not exist.")

    return annotated_data

training_set = [
    ("The fishermen sat by the bank of the river, waiting for a catch.", "bank.n.01"),  # Geography
    ("The river's bank was covered in lush green grass.", "bank.n.01"),
    ("A flock of birds rested on the bank of the tranquil stream.", "bank.n.01"),
    ("The children played on the muddy bank after the rainstorm.", "bank.n.01"),
    ("The hikers paused on the bank to admire the flowing water.", "bank.n.01"),
    ("The boat drifted close to the bank before coming to a stop.", "bank.n.01"),
    ("She deposited her paycheck into her savings account at the bank.", "bank.n.02"),  # Financial
    ("The bank approved their loan application for the new house.", "bank.n.02"),
    ("The investment bank advised the corporation on its merger.", "bank.n.02"),
    ("He withdrew cash from the ATM located inside the bank.", "bank.n.02"),
    ("The bank offers competitive interest rates on savings accounts.", "bank.n.02"),
    ("The local bank recently opened a new branch in the city.", "bank.n.02"),
    ("The blood bank ensures that there is always a supply for emergencies.", "bank.n.03"),  # Storage
    ("She donated her old books to the library's book bank.", "bank.n.03"),
    ("The seed bank preserves rare plant species for future generations.", "bank.n.03"),
    ("The knowledge bank is a valuable resource for researchers.", "bank.n.03"),
    ("They relied on the food bank to get through the tough month.", "bank.n.03"),
    ("The power bank kept his phone charged during the road trip.", "bank.n.03"),
    ("The airplane began to bank sharply to the left.", "bank.v.01"),  # To tilt or incline
    ("The pilot banked the aircraft to avoid turbulence.", "bank.v.01"),
    ("The racing car banked as it sped around the sharp curve.", "bank.v.01"),
    ("The glider banked slightly, following the flow of the wind.", "bank.v.01"),
    ("The bird banked sharply to avoid the predator chasing it.", "bank.v.01"),
    ("The plane banked and descended toward the runway.", "bank.v.01"),
]


def yarowsky_bootstrapping_wordnet(training_set, unlabeled_data, confidence_threshold=0.8):
    """
    Implements Yarowsky's algorithm with logistic regression and bootstrapping using WordNet senses.
    """
    # Separate seed data into features and labels
    seed_texts = [text for text,label in training_set]
    seed_labels = [label for text,label in training_set]
    # Initialize vectorizer
    vectorizer = CountVectorizer()
    X_seed = vectorizer.fit_transform(seed_texts)
    y_seed = np.array(seed_labels)

    # Initialize Logistic Regression
    model = LogisticRegression(C= 1,max_iter=100)

    # Iterative bootstrapping
    for iteration in range(5):
        print(f"Iteration {iteration + 1}")

        # Train on the current labeled data
        model.fit(X_seed, y_seed)

        # Predict on unlabeled data
        X_unlabeled = vectorizer.transform(unlabeled_data)
        probs = model.predict_proba(X_unlabeled)
        predictions = model.predict(X_unlabeled)

        # Select confident predictions
        confident_indices = np.where(probs.max(axis=1) >= confidence_threshold)[0]
        confident_texts = [unlabeled_data[i] for i in confident_indices]
        confident_labels = [predictions[i] for i in confident_indices]


        if not confident_texts:
            print("No confident predictions, stopping early.")
            break

        # Update seed set with confident predictions
        X_new = vectorizer.transform(confident_texts)
        X_seed = np.vstack((X_seed, X_new))
        y_seed = np.hstack((y_seed, confident_labels))

        # Remove confident predictions from unlabeled data
        unlabeled_data = [text for i, text in enumerate(unlabeled_data) if i not in confident_indices]

        print(f"Added {len(confident_texts)} new examples to the seed set.")

        # Terminate if no unlabeled data remains
        if not unlabeled_data:
            print("All unlabeled data processed, stopping.")
            break

    return model, vectorizer

def evaluate_model_wordnet(model, vectorizer, test_instances):
    """
    Evaluate the trained model on labeled test instances using WordNet synsets.
    """
    test_texts = [text for text, label in test_instances]
    test_labels = [label for text, label in test_instances]

    test_texts = [preprocess_sentence(text) for text in test_texts]

    test_texts = [" ".join(row) for row in test_texts]
    X_test = vectorizer.transform(test_texts)

    # Predict and calculate accuracy
    y_pred = model.predict(X_test)

    # Display overall accuracy
    accuracy = accuracy_score(test_labels, y_pred)
    print(f"\n--- Final Test Accuracy: {accuracy:.2%} ---")

test_set = [
    ("He slipped and fell on the muddy bank of the lake.", "bank.n.01"),
    ("A flock of geese gathered on the bank of the stream.", "bank.n.01"),
    ("The tree's roots extended down to the bank of the creek.", "bank.n.01"),
    ("The erosion caused the bank of the river to collapse.", "bank.n.01"),
    ("The river overflowed, flooding the bank near the village.", "bank.n.01"),
    ("A picnic area was set up on the bank of the peaceful pond.", "bank.n.01"),
    ("The fishermen relaxed on the bank as they waited for a bite.", "bank.n.01"),
    ("The kayakers paddled close to the bank to avoid the strong current.", "bank.n.01"),
    ("He withdrew some cash from the bank's ATM.", "bank.n.02"),
    ("The bank offers competitive interest rates on savings accounts.", "bank.n.02"),
    ("The investment bank handled the company's IPO.", "bank.n.02"),
    ("They visited the bank to discuss their mortgage options.", "bank.n.02"),
    ("The bank recently introduced a new credit card with no annual fees.", "bank.n.02"),
    ("The local bank opened a new branch in the neighboring town.", "bank.n.02"),
    ("The customer was impressed with the bank's mobile app.", "bank.n.02"),
    ("The bank is known for its excellent customer service.", "bank.n.02"),
    ("The seed bank preserves plant species for future generations.", "bank.n.03"),
    ("They maintained a data bank of customer preferences.", "bank.n.03"),
    ("The knowledge bank provides resources for research and development.", "bank.n.03"),
    ("The power bank kept his phone charged during the trip.", "bank.n.03"),
    ("The blood bank issued an urgent call for donations.", "bank.n.03"),
    ("The library created a book bank to support underprivileged students.", "bank.n.03"),
    ("The researchers accessed a data bank to analyze historical trends.", "bank.n.03"),
    ("The hikers carried a power bank to keep their devices charged.", "bank.n.03"),
    ("The eagle banked gracefully as it glided through the air.", "bank.v.01"),
    ("He watched the jet bank and descend toward the runway.", "bank.v.01"),
    ("The drone banked sharply to avoid the obstacle.", "bank.v.01"),
    ("The plane banked as it prepared for landing.", "bank.v.01"),
    ("The pilot banked the airplane to avoid turbulence.", "bank.v.01"),
    ("The racing car banked as it sped around the sharp corner.", "bank.v.01"),
    ("The glider banked smoothly, following the wind currents.", "bank.v.01"),
    ("The bird banked sharply to avoid the oncoming predator.", "bank.v.01"),
]

unlabeled_data = load_unlabeled_data("/content/drive/MyDrive/bank.txt")
unlabeled_data = [" ".join(row) for row in unlabeled_data]
model, vectorizer = yarowsky_bootstrapping_wordnet(training_set, unlabeled_data)
evaluate_model_wordnet(model, vectorizer, test_set)


Iteration 1
No confident predictions, stopping early.

--- Final Test Accuracy: 56.25% ---
