Task 7. Instead of cheking the Wu & Palmer similarity like in Task 5, the compatibility between the node noun and the collocates (of the adjective) is checked using domain information. If at least one of the collocates belong to the same domain as the noun, they are compatible and the phrase is classified as not a metaphor. 

In [1]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.bnc import BNCCorpusReader
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
from operator import itemgetter
import re

In [2]:
#only using folders A-C for now
reader = BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-C]/\w*/\w*\.xml')
sents = reader.tagged_sents()
words = reader.words()

In [3]:
#some preprocessing (removing special characters and numbers from the list of words and changing all the words into lower case)
#runtime would be about 1.5 hours if using the whole corpus, though probably (and hopefully) shorter on a better computer

wordlist = []
special_chars = ['(',')',',','"','.','!','?','-','\'','‘','’','—',':']
for w in words:
    if w not in special_chars and not w.isnumeric():
        wordlist.append(w.lower())

In [4]:
len(wordlist)

37525438

In [5]:
def find_pair(sentence):
    '''
    Finds adjective/adverb-noun part-of-speech in a given sentence using nltk part-of-speech tagging. 
    Returns only the first occurence of such pair in a sentence.
    '''
    pair = []
    tagged_words = nltk.pos_tag(sentence.split())
    adjectives = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    nouns = ['NN', 'NNS', 'NNPS', 'NNP']
    for i in range(len(tagged_words) - 1):
        word1_category = tagged_words[i][1]
        word2_category = tagged_words[i + 1][1]
        if word1_category in adjectives and word2_category in nouns:
            pair = [tagged_words[i][0].lower(), re.sub('\W+','', tagged_words[i + 1][0]).lower()]
            return pair
    return pair

def check_senses(pair):
    '''
    Given an adjective/adverb-noun pair checks that the adjective/adverb has more than one sense and the noun has an
    entry in WordNet.
    '''
    adj = pair[0]
    noun = pair[1]
    if len(wn.synsets(adj)) == 1:
        print('Adjective has only one sense!')
        return False
    elif len(wn.synsets(noun)) == 0:
        print('Noun has no entry in WordNet!')
        return False
    return True

def find_words_near(node):
    '''
    Finds nouns appearing next to a given node word by checking each sentence of the corpus individually.
    '''
    print('Looking for words appearing next to', node)
    words_near = []
    for sentence in sents:
        for i in range(len(sentence)):
            if sentence[i][0] == node:
                indexesToTry = [i - 1, i + 1]
                for index in indexesToTry:
                    if index >= 0 and index < len(sentence):
                        if sentence[index][1] and sentence[index][1] == 'SUBST':
                            words_near.append(sentence[index][0].lower())
    return words_near

def find_collocates(node, words_near):
    '''
    Finds all unique collocate nouns from a list of nouns that appear near the node word. A noun is considered a collocate
    when its mutual information to the node is greater or equal to 3. Only considers nouns that appear at least twice
    near the node word.
    '''
    print('Determining the collocates of', node)
    collocates = []
    checked = []
    freq_node = wordlist.count(node)
    for word in words_near:
        if word not in checked:
            checked.append(word)
            freq_near = words_near.count(word)
            if freq_near >= 2:
                freq_collocate = wordlist.count(word)
                mutual_information = calculate_mutual_information(freq_node, freq_collocate, freq_near)
                if mutual_information >= 3:
                    if (word, mutual_information) not in collocates:
                        collocates.append((word, mutual_information))
    return collocates

def calculate_mutual_information(freq_node, freq_collocate, freq_near):
    '''
    Calculates the mutual information between a node and a possible collocate using the expression (2) in
    the article https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0062343 by Neuman et al (2013).
    '''
    corpus_size = len(wordlist)
    span = 3 #maybe?
    mutual_information = np.log10((freq_near * corpus_size)/(freq_node * freq_collocate * span))/np.log10(2)
    return mutual_information

with open('WordNet2/Words.cat', 'r') as file:
    data = file.readlines()
    nouns = data[25040:94946]
    
tidy_nouns = []
for noun in nouns:
    noun = noun.replace(' (1)', '')
    noun = noun.replace('\t', '')
    noun = noun.replace('\n', '')
    noun = noun.lower()
    tidy_nouns.append(noun)

def find_classes(collocates):
    '''
    Classifies a list of nouns using WordStat. Returns a list of nouns (and their mutual information also given in
    the input list) that belong to a concrete class. The list is sorted by the mutual information value in ascending order.
    '''
    Lem = WordNetLemmatizer()

    current_class = ''
    classified_nouns = []
    
    #there should be 13 concrete classes out of the 25 classes, now there is 12 and some might be incorrect :D
    concrete_classes = ['animal', 'artifact', 'body', 'event', 'food', 'location', 'object', 'person', 'possession',
                       'plant', 'shape', 'substance']

    for target in collocates:
        if tidy_nouns.count(target[0]) > 0:
            for noun in tidy_nouns:
                #changing the noun class (e.g. when NOUN.FOOD is encountered, class is FOOD until the next noun
                #class is encountered)
                if 'noun.' in noun:
                    noun_split = noun.split('.')
                    current_class = noun_split[1]
                elif noun == Lem.lemmatize(target[0]) and current_class in concrete_classes:
                    #only adding nouns that are not already in the list (some might be in more than one conrete class)
                    if (noun, target[1]) not in classified_nouns:
                        classified_nouns.append((noun, target[1]))
    #sorting by the mutual information value
    classified_nouns = sorted(classified_nouns, key=itemgetter(1))
    return classified_nouns

In [6]:
#Because the WordNet domain categorising uses version 2.0 (or 1.6) of WordNet and the current nltk version is 3.0,
#we need to download WordNet version 2 and use the noun.data file in the dict folder to get the correct offset ids for synsets.

with open('WordNet2.0/dict/noun.dat', 'r') as file:
    wn_data = file.readlines()[29:79719]

def find_offsets(node):
    '''
    Returns all possible WordNet offset ids for a given noun.
    '''
    offsets = []
    for line in wn_data:
        line = line.split()
        offset = line[0]
        pos = line[2]
        word = line[4]
        if word == node:
            offsets.append(offset)
    return offsets

In [7]:
#Domains can be downloaded from https://wndomains.fbk.eu/download.html


with open('Domains/domains_2', 'r') as file: #domains_2 = wn-domains-3.2-20070223
    domain_data = file.readlines()

def find_domains(target_offset):
    '''
    Returns the domains matching a given WordNet offset id.
    '''
    found_domains = []
    for line in domain_data:
        line = line.split('-')
        offset = line[0].lstrip("0")
        domains = line[1][2:-1]
        if offset == target_offset:
            domains = domains.split(' ')
            for domain in domains:
                found_domains.append(domain)
    return found_domains

In [8]:
def is_a_metaphor(sentence):
    '''
    Determines whether a sentence includes a type III metaphor (adjective/adverb-noun metaphor) by going through a set of steps.
    '''
    pair = find_pair(sentence)
    if not pair:
        print('No pair!')
        return
    print('Pair found: ', pair)
    
    check = check_senses(pair)
    if not check:
        print('Sense check failed!')
        return
    print('Sense check ok!')
    
    adjective = pair[0]
    noun = pair[1]
    
    words_near = find_words_near(adjective)
    print('Found', len(words_near), 'nouns near', adjective)
    
    collocates = find_collocates(adjective, words_near)
    print('Found', len(collocates), 'unique collocates')
    
    if not collocates:
        print('Undetermined, not enough collocates')
        return

    classified_nouns = find_classes(collocates)
    print(len(classified_nouns), 'collocate words in concrete classes')
    
    top_three = classified_nouns[-4:-1]
    print('The top three collocates are', top_three)
    
    if not classified_nouns:
        return 'Is a metaphor!'

    node_offsets = find_offsets(noun)
    for offset in node_offsets:
        node_domains = find_domains(offset.lstrip("0"))
    
    collocate_domains = []
    for collocate in top_three:
        offsets = find_offsets(collocate[0])
        for offset in offsets:
            collocate_domains.append(find_domains(offset.lstrip("0")))
            
    collocate_domains = [domain for domain in collocate_domains]
            
    for domain in node_domains:
        if any(domain in sublist for sublist in collocate_domains):
            print('At least one of the top three collocates and the node noun belong to the domain', domain)
            return 'Not a metaphor!'
    
    return 'Is a metaphor!'

In [9]:
is_a_metaphor('She is such a dramatic person!')

Pair found:  ['dramatic', 'person']
Sense check ok!
Looking for words appearing next to dramatic
Found 1299 nouns near dramatic
Determining the collocates of dramatic
Found 118 unique collocates
43 collocate words in concrete classes
The top three collocates are [('transformation', 6.50273567357617), ('skyline', 6.701349709312811), ('scenery', 7.781378090728456)]
At least one of the top three collocates and the node noun belong to the domain grammar


'Not a metaphor!'

In [10]:
is_a_metaphor('He has a green thumb.')

Pair found:  ['green', 'thumb']
Sense check ok!
Looking for words appearing next to green
Found 2329 nouns near green
Determining the collocates of green
Found 172 unique collocates
113 collocate words in concrete classes
The top three collocates are [('blob', 7.5282568204702836), ('sward', 8.100163168365993), ('malachite', 9.287248720966488)]


'Is a metaphor!'

In [11]:
is_a_metaphor('I have a curious cat who likes to get into trouble.')

Pair found:  ['curious', 'cat']
Sense check ok!
Looking for words appearing next to curious
Found 551 nouns near curious
Determining the collocates of curious
Found 38 unique collocates
16 collocate words in concrete classes
The top three collocates are [('blend', 5.802967630654725), ('traveller', 5.996954542556096), ('mixture', 6.8080318390906145)]
At least one of the top three collocates and the node noun belong to the domain factotum


'Not a metaphor!'

In [12]:
is_a_metaphor('They stood in the dead center of the room.')

Pair found:  ['dead', 'center']
Sense check ok!
Looking for words appearing next to dead
Found 1436 nouns near dead
Determining the collocates of dead
Found 68 unique collocates
41 collocate words in concrete classes
The top three collocates are [('nettle', 6.918619434687318), ('magpie', 8.097290478594447), ('aspidistra', 8.853524406465432)]


'Is a metaphor!'

In [14]:
is_a_metaphor('He has a cold heart.')

Pair found:  ['cold', 'heart']
Sense check ok!
Looking for words appearing next to cold
Found 1861 nouns near cold
Determining the collocates of cold
Found 86 unique collocates
48 collocate words in concrete classes
The top three collocates are [('sweat', 6.65484878398373), ('chisel', 7.3439196880456254), ('lino', 7.532364777458736)]


'Is a metaphor!'