# Programming assignment 2

WordNet interface documentation: https://www.nltk.org/howto/wordnet.html

In [1]:
import copy
import xml.etree.cElementTree as ET

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import WordNetError
import nltk.wsd

import numpy as np

import sklearn
import sklearn.naive_bayes
import sklearn.utils.testing

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Helgi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Helgi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load dataset using loader.py
# Apply word tokenization
# Apply word lemmatization
# Remove stop words

# Compare the following two methods for Word Sense Disambiguation:
# - The most frequenst sense baseline (First synset)
# - NLTK's implementation of Lesk's algorithm (nltk.wsd.lesk)

# Develop two additional methods to solve this problem.
# - One must use the idea of bootstrapping. This may require you to acquire additional texts in English.
#   Since bootstrapping often requires you to specify knowledge about words using heuristics or by
#   specifying a seed set, be sure that your method to start the bootstrapping process covers at least
#   five different lexical items for which you are performing WSD.
# - Any other method of your design. The two methods must be entirely different

# Justify decisions about any other parameters to the algorithms, such as what exactly to include
#   in the sense and context representations, how to compute overlap, the use of the development set,
#   which the started code will load for you.

# You may use any heauristic, probabilistic model or any other statistical method that we have discussed
#   in class.

# Evaluation metric: Accuracy

## Load data

In [3]:
class WSDInstance:
    def __init__(self, my_id, lemma, pos, context, index):
        self.id = my_id         # id of the WSD instance
        self.lemma = lemma      # lemma of the word whose sense is to be resolved
        self.pos = pos          # The position tag
        self.context = context  # lemma of all the words in the sentential context
        self.index = index      # index of lemma within the context
    def __str__(self):
        '''
        For printing purposes.
        '''
        return f"{self.id}\\{self.lemma}\\{self.pos}\\{' '.join(self.context)}\\{self.index}"

class WSDKey:
    def __init__(self, sense_key):
        self.lemma, lex_sense = sense_key.split('%')
        self.ss_type, self.lex_filenum, self.lex_id, self.head_word, self.head_id = lex_sense.split(':')
    def __str__(self):
        '''
        For printing purposes.
        '''
        return f"{self.lemma}%{self.ss_type}:{self.lex_filenum}:{self.lex_id}:{self.head_word}:{self.head_id}"

def ss_type_to_str(ss_type):
    return {
        '1': 'n', # Noun
        '2': 'v', # Verb
        '3': 'a', # Adjective
        '4': 's', # Adjective satellite
        '5': 'r'  # Adverb
    }[ss_type]

def to_synset(wsd_key):
    ss_type_str = ss_type_to_str(wsd_key.ss_type)
    synset_name = f'{wsd_key.lemma}.{ss_type_str}.{wsd_key.lex_id}'
    try:
        return wn.synset(synset_name)
    except WordNetError as e:
        print(f'WordNetError: WSDKey {wsd_key} has no corresponding synset {synset_name}.')
        return None

def load_instances(file_name):
    '''
    Load two lists of cases to perform WSD on. The structure that is returned is a dict, where
    the keys are the ids, and the values are instances of WSDInstance.
    '''
    tree = ET.parse(file_name)
    root = tree.getroot()
    
    train_instances = {}
    valid_instances = {}
    test_instances = {}
    
    for text in root:
        if text.attrib['id'].startswith('d001'):
            instances = test_instances
        elif text.attrib['id'].startswith('d007'):
            instances = valid_instances
        else:
            instances = train_instances
        for sentence in text:
            # construct sentence context
            context = [el.attrib['lemma'] for el in sentence]
            for i, el in enumerate(sentence):
                if el.tag == 'instance':
                    my_id = el.attrib['id']
                    lemma = el.attrib['lemma']
                    pos = el.attrib['pos'][0].lower()
                    instances[my_id] = WSDInstance(my_id, lemma, pos, context, i)
    return train_instances, valid_instances, test_instances

def load_key(file_name):
    '''
    Load the solutions as dicts.
    Key is the id
    Value is the list of correct sense keys. 
    '''
    train_key = {}
    valid_key = {}
    test_key = {}
    
    for line in open(file_name, encoding="utf-8"):
        if len(line) <= 1:
            continue
        
        doc, my_id, sense_keys = line.strip().split(' ', 2)
        synsets = [to_synset(WSDKey(sense_key)) for sense_key in sense_keys.split()]
        synsets = [synset for synset in synsets if synset is not None]
        if len(synsets) is 0:
            print(f'\tID {my_id} has no synsets. Dropping.')
            continue
        
        if doc == 'd001':
            test_key[my_id] = synsets
        elif doc == 'd007':
            valid_key[my_id] = synsets
        else:
            train_key[my_id] = synsets
    return train_key, valid_key, test_key

data_folder = 'E:\\Repos\\comp550\\assignment2\\data'
data_file = f'{data_folder}\\multilingual-all-words.en.xml'
key_file = f'{data_folder}\\wordnet.en.key'

train_instances, valid_instances, test_instances = load_instances(data_file)
train_key, valid_key, test_key = load_key(key_file)

# IMPORTANT: keys contain fewer entries than the instances; need to remove them
train_instances = {k:v for (k,v) in train_instances.items() if k in train_key}
valid_instances = {k:v for (k,v) in valid_instances.items() if k in valid_key}
test_instances = {k:v for (k,v) in test_instances.items() if k in test_key}

print('Dataset stats:')
total_instances = len(train_instances) + len(valid_instances) + len(test_instances)
print(f'Train instances: {len(train_instances)}/{total_instances} = {100*(len(train_instances)/total_instances):.2f}%')
print(f'Valid instances: {len(valid_instances)}/{total_instances} = {100*(len(valid_instances)/total_instances):.2f}%')
print(f'Test instances: {len(test_instances)}/{total_instances} = {100*(len(test_instances)/total_instances):.2f}%')

WordNetError: WSDKey budget%1:21:03:: has no corresponding synset budget.n.03.
	ID d012.s022.t005 has no synsets. Dropping.
WordNetError: WSDKey budget%1:21:03:: has no corresponding synset budget.n.03.
	ID d012.s023.t013 has no synsets. Dropping.
Dataset stats:
Train instances: 1270/1642 = 77.34%
Valid instances: 178/1642 = 10.84%
Test instances: 194/1642 = 11.81%


## Helper functions

In [4]:
def read_lines(file_path):
    with open(file_path, 'r', encoding='ISO-8859-1') as file_handler:
        return [sentence.rstrip() for sentence in file_handler.readlines()]

class Lemmatizer:
    def __init__(self):
        self.normalizer = nltk.stem.WordNetLemmatizer()
        self.tag_prefix_dict = {
            'J': nltk.corpus.wordnet.ADJ,
            'N': nltk.corpus.wordnet.NOUN,
            'V': nltk.corpus.wordnet.VERB,
            'R': nltk.corpus.wordnet.ADV
        }
    
    def __call__(self, document):
        tokens = nltk.word_tokenize(document)
        return [
            self.normalizer.lemmatize(token, pos=self.get_tag_class(tag))
            for token, tag in nltk.pos_tag(tokens)
        ]
    
    def get_tag_class(self, tag):
        prefix = tag[0].upper()
        return self.tag_prefix_dict.get(prefix, nltk.corpus.wordnet.NOUN)

class Stemmer:
    def __init__(self):
        self.normalizer = nltk.stem.PorterStemmer()
    
    def __call__(self, document):
        return [
            self.normalizer.stem(token)
            for token in nltk.word_tokenize(document)
        ]

def fit_vectorizer(X_data, tokenizer, stop_words, min_df):    
    vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        tokenizer=tokenizer,
        stop_words=stop_words,
        min_df=min_df
    )
    vectorizer.fit_transform(X_data)
    return vectorizer

@sklearn.utils.testing.ignore_warnings(category=sklearn.exceptions.ConvergenceWarning)
def random_search(models, search_params, n_datasets=1, n_models=1):
    data_sets = []
    results = [{} for i in range(n_datasets)]

    data_set_variations = [
        choose_random_params(search_params['data'])
        for i in range(n_datasets)
    ]
    model_variations = {
        model_name: [choose_random_params(search_params['model'][model_name]) for i in range(n_models)]
        for model_name in search_params['model'].keys()
    }
    
    for i in range(n_datasets):
        print(f'Data set variation {i+1}/{n_datasets}')

        data_params = data_set_variations[i]
        data_sets.append(data_params)

        print('\tFitting vectorizer...')
        vectorizer = fit_vectorizer(X_train_raw, **data_params)

        X_train = vectorizer.transform(X_train_raw)
        X_valid = vectorizer.transform(X_valid_raw)
        X_test = vectorizer.transform(X_test_raw)

        for model_name, model_class in models.items():
            for j in range(n_models):
                print(f'\t{model_name} {j+1}/{n_models}')

                model_params = model_variations[model_name][j]

                model = model_class(**model_params)

                model.fit(X_train, y_train)
                valid_predictions = model.predict(X_valid)
                test_predictions = model.predict(X_test)
                
                valid_accuracy = sklearn.metrics.accuracy_score(y_valid, valid_predictions)
                
                # This number is only looked at once at the very end when the best models have been chosen based on validation accuracy
                test_accuracy = sklearn.metrics.accuracy_score(y_test, test_predictions)
                test_confusion_matrix = sklearn.metrics.confusion_matrix(y_test, test_predictions)

                if results[i].get(model_name, None) is None:
                    results[i][model_name] = []

                results[i][model_name].append({
                    'model_params': model_params,
                    'valid_accuracy': valid_accuracy,
                    'test_accuracy': test_accuracy,
                    'test_confusion_matrix': test_confusion_matrix
                })
    
    return data_sets, results

def choose_random_params(parameters):
    return {
        name: np.random.choice(values)
        for name, values in parameters.items()
    }


## Models

In [5]:
class MostFrequentSense:
    def predict(self, instances):
        return [wn.synsets(instance.lemma)[0] for key, instance in instances.items()]

class NltkLesk:
    def predict(self, instances, use_pos):
        results = []
        for key, instance in instances.items():
            pos = instance.pos if use_pos else None
            results.append(nltk.wsd.lesk(instance.context, instance.lemma, pos=pos))
        return results

class YarowskyBootstrapping:
    """
    Yarowsky bootstrapping algorithm implementation
    
    Paper: https://www.aclweb.org/anthology/P95-1026.pdf

    Description of algorithm. For every ambiguous word:
        1. Gather all sentences that contain the ambiguous word as an initially untagged training set.
        2. Initialise a supervised classifier of n class outputs, where n is the amount of senses the word posesses in WordNet.
        3. For each sense of the ambiguous word, tag several corresponding examples using known labels, thus constructing the first seed set. Notes:
            * Yarowsky creates his seed set by tagging 2-15% of the ambiguous word examples with their true senses.
            * The rest of the untagged examples is called a residual.
        4. Train the supervised model on the seed set.
        5. Apply the algorithm to all the residual, only keeping high accuracy hits. Add the high accuracy hits to the seed sets.
            * The hyper-parameter threshold controls how certain the model needs to be to classify the word
        6. Repeat until num_iter is hit or the residual runs out.
    """
    def __init__(self, base_model, base_model_params, threshold, num_iter):
        self.models = {}
        self.base_model = base_model
        self.base_model_params = base_model_params
        self.threshold = threshold
        self.num_iter = num_iter
    
    def _construct_word_instances_dict(instances, synsets_lists):
        word_instances_dict = {}
        for i in range(len(instances)):
            instance = instances[i]
            synsets = synsets_lists[i]
            if not instance.lemma in word_instances_dict:
                word_instances_dict[instance.lemma] = []
            word_instances_dict[instance.lemma].append((instance, synsets))
        return word_instances_dict
    
    def train(self, instances, synsets_lists):
        pass
#         assert type(instances) == list
#         assert type(synsets_lists) == list
#         assert len(instances) == len(synsets_lists)
        
#         word_instances_dict = _construct_word_instances_dict(instances, synsets_lists)
#         self.models = {lemma: self.base_model(**self.base_model_params) for word_instances_dict.keys()}
        
#         for lemma, model in self.models.items():
#             word_instances_dict[lemma]
#             for synset in wn.synsets(lemma):
                
        
    
    def predict(self):
        pass

def calc_accuracy(truth, predictions):
    """
    truth: A list of lists of correct nltk.synsets
    predictions: A list of synsets predicted by the model to evaluate
    """
    assert len(truth) == len(predictions)
    hits = 0
    for i in range(len(predictions)):
        if predictions[i] in truth[i]:
            hits += 1
    return hits/len(truth)


### Most frequent sense

In [6]:
mfs_model = MostFrequentSense()
predictions = mfs_model.predict(valid_instances)
accuracy = calc_accuracy([v for k, v in valid_key.items()], predictions)
print(f'Most frequent sense valid accuracy: {accuracy}')

Most frequent sense valid accuracy: 0.33146067415730335


In [7]:
predictions = mfs_model.predict(test_instances)
accuracy = calc_accuracy([v for k, v in test_key.items()], predictions)
print(f'Most frequent sense test accuracy: {accuracy}')

Most frequent sense test accuracy: 0.3556701030927835


### nltk.wsd.lesk

In [13]:
from nltk.corpus import stopwords

params = {
    'remove_stopwords': [False, True, False, True],
    'use_pos' : [False, False, True, True]
}

nltk_lesk = NltkLesk()
for i in range(4):
    model_train_instances = copy.deepcopy(train_instances)
    
    remove_stopwords = params['remove_stopwords'][i]
    use_pos = params['use_pos'][i]
    
    if remove_stopwords:
        stopwords_list = stopwords.words('english')
        for key, instance in model_train_instances.items():
            for word in stopwords_list:
                if word in instance.context:
                    instance.context.remove(word)
    
    predictions = nltk_lesk.predict(model_train_instances, use_pos)
    
    print(f'remove_stopwords: {remove_stopwords}, use_pos: {use_pos}')
    print(f'\tAccuracy: {calc_accuracy([v for k, v in train_key.items()], predictions)}')


remove_stopwords: False, use_pos: False
	Accuracy: 0.39921259842519685
remove_stopwords: True, use_pos: False
	Accuracy: 0.4094488188976378
remove_stopwords: False, use_pos: True
	Accuracy: 0.4440944881889764
remove_stopwords: True, use_pos: True
	Accuracy: 0.494488188976378


In [12]:
print('nltk.wsd.lesk test accuracy:', end='\n\n')
remove_stopwords = False
use_pos = True



nltk.wsd.lesk test accuracy:



### Yarowsky's algorithm (bootstrapping)

In [None]:
# threshold + num_iter

params = {
    'remove_stopwords': [False, True, False, True],
    'use_pos' : [False, False, True, True]
}

for i in range(4):

In [80]:

np.arange(start=0.8, stop=1.01, step=0.05)

RuntimeError: The current Numpy installation ('E:\\Repos\\comp550\\venv\\lib\\site-packages\\numpy\\__init__.py') fails to pass a sanity check due to a bug in the windows runtime. See this issue for more information: https://tinyurl.com/y3dm3h86

In [None]:
search_params = {
    'data': {
        'tokenizer': [Lemmatizer(), Stemmer()],
        'stop_words': ['english', None],
        'min_df': [1, 2, 3] # Minimum token frequency
    },
    'model': {
        'logistic_regression': {
            'eta0': [1e-3, 1e-2, 1e-1], # learning rate
            'alpha': [1e-3, 1e-2, 1e-1], # regularization
            'max_iter': np.arange(start=1, stop=5), # epochs
            'random_state': [random_state]
        },
        'yarowsky_bootstrapping': {
            'base_model': sklearn.linear_model.SGDClassifier
            'base_model_params': {
                'kernel': ['linear'],
                'max_iter': np.arange(start=1, stop=5), # epochs
                'C': [1e-3, 1e-2, 1e-1], # L2 regularization
                'random_state': [random_state]
            },
            'yarowsky_params': {
                'threshold': np.arange(start=0.8, stop=1.01, step=0.05)
            }
        },
        'naive_bayes': {
            'alpha': np.arange(start=0.1, stop=1.1, step=0.1)
        },
        'random_forest': {
            'n_estimators': np.arange(start=10, stop=1000, step=10),
            'max_depth': np.append(np.array(None), np.arange(start=1, stop=5, step=2)),
            'random_state': [random_state]
        }
    }
}

models = {
    'yarowsky_bootstrapping': YarowskyBootstrapping,
    'naive_bayes': sklearn.naive_bayes.MultinomialNB
}

data_sets, results = random_search(models, search_params, n_datasets=3, n_models=4)