Implementation for Task 5. Only using a part of the corpus since the runtimes would be too long using the whole 100 million word dataset. Aim of the task is to classify whether an adjective/adverb-noun pair in a sentence is a metaphor or not. The procedure of task 5 is also used as a base for the later tasks.

In [3]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.bnc import BNCCorpusReader
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
from operator import itemgetter
import re

In [4]:
#only using folders A-C for now
reader = BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-C]/\w*/\w*\.xml')

In [5]:
'''Casting reader output to list. For some reason,
this makes all other processing to go tens of times faster.
The downside is that the casting takes significant amount of time,
but it has to be done only once.'''

#Runtime with CPU Ryzen 5 5600X ~12 minutes
sents = list(reader.tagged_sents())

In [6]:
#Runtime with CPU Ryzen 5 5600X ~12 minutes
words = list(reader.words())

In [7]:
#some preprocessing (removing special characters and numbers from the list of words and changing all the words into lower case)
#runtime would be about 1.5 hours if using the whole corpus, though probably (and hopefully) shorter on a better computer

wordlist = []
special_chars = ['(',')',',','"','.','!','?','-','\'','‘','’','—',':']
for w in words:
    if w not in special_chars and not w.isnumeric():
        wordlist.append(w.lower())

In [8]:
len(wordlist)

37525438

In [9]:
from nltk import FreqDist
freqDist = FreqDist(wordlist)

In [14]:
def find_pair(sentence):
    '''
    Finds adjective/adverb-noun part-of-speech in a given sentence using nltk part-of-speech tagging. 
    Returns only the first occurence of such pair in a sentence.
    '''
    pair = []
    tagged_words = nltk.pos_tag(sentence.split())
    adjectives = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    nouns = ['NN', 'NNS', 'NNPS', 'NNP']
    for i in range(len(tagged_words) - 1):
        word1_category = tagged_words[i][1]
        word2_category = tagged_words[i + 1][1]
        if word1_category in adjectives and word2_category in nouns:
            pair = [tagged_words[i][0].lower(), re.sub('\W+','', tagged_words[i + 1][0]).lower()]
            return pair
    return pair

def check_senses(pair):
    '''
    Given an adjective/adverb-noun pair checks that the adjective/adverb has more than one sense and the noun has an
    entry in WordNet.
    '''
    adj = pair[0]
    noun = pair[1]
    if len(wn.synsets(adj)) == 1:
        print('Adjective has only one sense!')
        return 2
    elif len(wn.synsets(noun)) == 0:
        print('Noun has no entry in WordNet!')
        return 3
    return True

def find_words_near(node):
    '''
    Finds nouns appearing next to a given node word by checking each sentence of the corpus individually.
    '''
    print('Looking for words appearing next to', node)
    words_near = []
    for sentence in sents:
        for i in range(len(sentence)):
            if sentence[i][0] == node:
                indexesToTry = [i - 1, i + 1]
                for index in indexesToTry:
                    if index >= 0 and index < len(sentence):
                        if sentence[index][1] and sentence[index][1] == 'SUBST':
                            words_near.append(sentence[index][0].lower())
    return words_near

def find_collocates(node, words_near):
    '''
    Finds all unique collocate nouns from a list of nouns that appear near the node word. A noun is considered a collocate
    when its mutual information to the node is greater or equal to 3. Only considers nouns that appear at least twice
    near the node word.
    '''
    print('Determining the collocates of', node)
    collocates = []
    checked = []
    freq_node = freqDist[node]
    for word in words_near:
        if word not in checked:
            checked.append(word)
            freq_near = words_near.count(word)
            if freq_near >= 2:
                freq_collocate = freqDist[word]
                mutual_information = calculate_mutual_information(freq_node, freq_collocate, freq_near)
                if mutual_information >= 3:
                    if (word, mutual_information) not in collocates:
                        collocates.append((word, mutual_information))
    return collocates

def calculate_mutual_information(freq_node, freq_collocate, freq_near):
    '''
    Calculates the mutual information between a node and a possible collocate using the expression (2) in
    the article https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0062343 by Neuman et al (2013).
    '''
    corpus_size = len(wordlist)
    span = 2 # span = 2 * lexical window
    mutual_information = np.log10((freq_near * corpus_size)/(freq_node * freq_collocate * span))/np.log10(2)
    return mutual_information

with open('WordNet2/Words.cat', 'r') as file:
    data = file.readlines()
    nouns = data[25040:94946]
    
#tidying the noun categorization data
tidy_nouns = []
for noun in nouns:
    noun = noun.replace(' (1)', '')
    noun = noun.replace('\t', '')
    noun = noun.replace('\n', '')
    noun = noun.lower()
    tidy_nouns.append(noun)

def find_classes(collocates):
    '''
    Classifies a list of nouns using WordStat. Returns a list of nouns (and their mutual information also given in
    the input list) that belong to a concrete class. The list is sorted by the mutual information value in ascending order.
    '''
    Lem = WordNetLemmatizer()

    current_class = ''
    classified_nouns = []
    
    #there should be 13 concrete classes out of the 25 noun classes
    concrete_classes = ['animal', 'artifact', 'body', 'event', 'food', 'group', 'location', 'object', 'person', 'possession',
                       'plant', 'shape', 'substance']

    for target in collocates:
        if tidy_nouns.count(target[0]) > 0:
            for noun in tidy_nouns:
                #changing the noun class (e.g. when NOUN.FOOD is encountered, class is FOOD until the next noun
                #class is encountered)
                if 'noun.' in noun:
                    noun_split = noun.split('.')
                    current_class = noun_split[1]
                elif noun == Lem.lemmatize(target[0]) and current_class in concrete_classes:
                    #only adding nouns that are not already in the list (some might be in more than one conrete class)
                    if (noun, target[1]) not in classified_nouns:
                        classified_nouns.append((noun, target[1]))
    #sorting by the mutual information value
    classified_nouns = sorted(classified_nouns, key=itemgetter(1))
    return classified_nouns

def calculate_compatibility(classified_nouns, node):
    '''
    Calulates the Wu and Palmer semantic similarity between a node noun and a list of nouns.
    '''
    compatible = []
    for word in classified_nouns:
        syn1 = wn.synsets(word[0])[0]
        syn2 = wn.synsets(node)[0]
        similarity = syn1.wup_similarity(syn2)
        if similarity >= 0.3:
            compatible.append(word[0])
    return compatible

In [15]:
def is_a_metaphor(sentence):
    '''
    Determines whether a sentence includes a type III metaphor (adjective/adverb-noun metaphor) by going through a set of steps.
    '''
    pair = find_pair(sentence)
    if not pair:
        print('No pair!')
        return
    print('Pair found: ', pair)
        
    adjective = pair[0]
    noun = pair[1]
    
    check = check_senses(pair)
    if check == 2:
        print(adjective, noun, 'is not a metaphore')
        return False
    elif check == 3:
        return 'Result is unknown'
    
    words_near = find_words_near(adjective)
    
    collocates = find_collocates(adjective, words_near)
    print('Found', len(collocates), 'unique collocates')
    
    if not collocates:
        print('No collocates found')
        return 'Result is unknown'

    classified_nouns = find_classes(collocates)
    print(len(classified_nouns), 'collocate words appear in concrete classes')
    
    if not classified_nouns:
        print(adjective, noun, 'is a metaphore')
        return True
    
    top_three = classified_nouns[-3:]
    print('The top three collocates are', top_three)
    
    compatible = calculate_compatibility(top_three, noun)
    print('Out of the top three collocates belonging to concrete classes,', len(compatible), 'are compatible with the noun')
    
    if compatible:
        print(adjective, noun, 'is not a metaphore')
        return False
    print(adjective, noun, 'is a metaphore')
    return True

Below are a few example sentences. Most of the test phrases are classified correctly, but the phrase 'dead center' is classified incorrectly.

In [16]:
is_a_metaphor('She is such a dramatic person!')

Pair found:  ['dramatic', 'person']
Looking for words appearing next to dramatic
Determining the collocates of dramatic
Found 141 unique collocates
57 collocate words appear in concrete classes
The top three collocates are [('skyline', 7.286312210033967), ('scenery', 8.366340591449612), ('turnaround', 8.73769155637653)]
Out of the top three collocates belonging to concrete classes, 2 are compatible with the noun
dramatic person is not a metaphore


False

In [17]:
is_a_metaphor('He has a green thumb.')

Pair found:  ['green', 'thumb']
Looking for words appearing next to green
Determining the collocates of green
Found 203 unique collocates
132 collocate words appear in concrete classes
The top three collocates are [('sward', 8.68512566908715), ('malachite', 9.872211221687644), ('baize', 9.944619717215694)]
Out of the top three collocates belonging to concrete classes, 0 are compatible with the noun
green thumb is a metaphore


True

In [18]:
is_a_metaphor('I have a curious cat who likes to get into trouble.')

Pair found:  ['curious', 'cat']
Looking for words appearing next to curious
Determining the collocates of curious
Found 53 unique collocates
25 collocate words appear in concrete classes
The top three collocates are [('traveller', 6.581917043277252), ('mixture', 7.392994339811771), ('coincidence', 8.600964498014095)]
Out of the top three collocates belonging to concrete classes, 1 are compatible with the noun
curious cat is not a metaphore


False

In [19]:
is_a_metaphor('They stood in the dead center of the room.')

Pair found:  ['dead', 'center']
Looking for words appearing next to dead
Determining the collocates of dead
Found 93 unique collocates
54 collocate words appear in concrete classes
The top three collocates are [('magpie', 8.682252979315605), ('aspidistra', 9.438486907186588), ('ringer', 10.356024746994615)]
Out of the top three collocates belonging to concrete classes, 2 are compatible with the noun
dead center is not a metaphore


False

In [20]:
is_a_metaphor('He has a cold heart.')

Pair found:  ['cold', 'heart']
Looking for words appearing next to cold
Determining the collocates of cold
Found 98 unique collocates
55 collocate words appear in concrete classes
The top three collocates are [('chisel', 7.92888218876678), ('lino', 8.117327278179891), ('compress', 8.422576631920624)]
Out of the top three collocates belonging to concrete classes, 0 are compatible with the noun
cold heart is a metaphore


True