Implementation for Task 7. Instead of cheking the Wu & Palmer similarity like in Task 5, the compatibility between the node noun and the collocates (of the adjective) is checked using domain information. If at least one of the collocates belong to the same domain as the noun, they are compatible and the phrase is classified as not a metaphor. 

In [7]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.bnc import BNCCorpusReader
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
from operator import itemgetter
import re
import pickle
from nltk import FreqDist

In [2]:
#only using folders A-C for now
#reader = BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-C]/\w*/\w*\.xml')

In [3]:
'''Casting reader output to list. For some reason,
this makes all other processing to go tens of times faster.
The downside is that the casting takes significant amount of time,
but it has to be done only once.'''

#Runtime with CPU Ryzen 5 5600X ~XX minutes
#sents = list(reader.tagged_sents())
#Get full corpus from local file 1min30s
with open('sentsFull.pkl', 'rb') as input:
    sents = pickle.load(input)

In [4]:
#Runtime with CPU Ryzen 5 5600X ~XX minutes
#words = list(reader.words())

with open('wordsFull.pkl', 'rb') as input:
    words = pickle.load(input)

In [5]:
#some preprocessing (removing special characters and numbers from the list of words and changing all the words into lower case)
#runtime would be about 1.5 hours if using the whole corpus, though probably (and hopefully) shorter on a better computer

wordlist = []
special_chars = ['(',')',',','"','.','!','?','-','\'','‘','’','—',':']
for w in words:
    if w not in special_chars and not w.isnumeric():
        wordlist.append(w.lower())

In [6]:
len(wordlist)

97858411

In [8]:

freqDist = FreqDist(wordlist)

In [9]:
def find_pair(sentence):
    '''
    Finds adjective/adverb-noun part-of-speech in a given sentence using nltk part-of-speech tagging. 
    Returns only the first occurence of such pair in a sentence.
    '''
    pair = []
    tagged_words = nltk.pos_tag(sentence.split())
    adjectives = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    nouns = ['NN', 'NNS', 'NNPS', 'NNP']
    for i in range(len(tagged_words) - 1):
        word1_category = tagged_words[i][1]
        word2_category = tagged_words[i + 1][1]
        if word1_category in adjectives and word2_category in nouns:
            pair = [tagged_words[i][0].lower(), re.sub('\W+','', tagged_words[i + 1][0]).lower()]
            return pair
    return pair

def check_senses(pair):
    '''
    Given an adjective/adverb-noun pair checks that the adjective/adverb has more than one sense and the noun has an
    entry in WordNet.
    '''
    adj = pair[0]
    noun = pair[1]
    if len(wn.synsets(adj)) == 1:
        print('Adjective has only one sense!')
        return 2
    elif len(wn.synsets(noun)) == 0:
        print('Noun has no entry in WordNet!')
        return 3
    return True

def find_words_near(node):
    '''
    Finds nouns appearing next to a given node word by checking each sentence of the corpus individually.
    '''
    print('Looking for words appearing next to', node)
    words_near = []
    for sentence in sents:
        for i in range(len(sentence)):
            if sentence[i][0] == node:
                indexesToTry = [i - 1, i + 1]
                for index in indexesToTry:
                    if index >= 0 and index < len(sentence):
                        if sentence[index][1] and sentence[index][1] == 'SUBST':
                            words_near.append(sentence[index][0].lower())
    return words_near

def find_collocates(node, words_near):
    '''
    Finds all unique collocate nouns from a list of nouns that appear near the node word. A noun is considered a collocate
    when its mutual information to the node is greater or equal to 3. Only considers nouns that appear at least twice
    near the node word.
    '''
    print('Determining the collocates of', node)
    collocates = []
    checked = []
    freq_node = freqDist[node]
    for word in words_near:
        if word not in checked:
            checked.append(word)
            freq_near = words_near.count(word)
            if freq_near >= 2:
                freq_collocate = freqDist[word]
                mutual_information = calculate_mutual_information(freq_node, freq_collocate, freq_near)
                if mutual_information >= 3:
                    if (word, mutual_information) not in collocates:
                        collocates.append((word, mutual_information))
    return collocates

def calculate_mutual_information(freq_node, freq_collocate, freq_near):
    '''
    Calculates the mutual information between a node and a possible collocate using the expression (2) in
    the article https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0062343 by Neuman et al (2013).
    '''
    corpus_size = len(wordlist)
    span = 2 # span = 2 * lexical window
    mutual_information = np.log10((freq_near * corpus_size)/(freq_node * freq_collocate * span))/np.log10(2)
    return mutual_information

with open('WordNet2/Words.cat', 'r') as file:
    data = file.readlines()
    nouns = data[25040:94946]
    
#tidying the noun categorization data
tidy_nouns = []
for noun in nouns:
    noun = noun.replace(' (1)', '')
    noun = noun.replace('\t', '')
    noun = noun.replace('\n', '')
    noun = noun.lower()
    tidy_nouns.append(noun)

def find_classes(collocates):
    '''
    Classifies a list of nouns using WordStat. Returns a list of nouns (and their mutual information also given in
    the input list) that belong to a concrete class. The list is sorted by the mutual information value in ascending order.
    '''
    Lem = WordNetLemmatizer()

    current_class = ''
    classified_nouns = []
    
    #there should be 13 concrete classes out of the 25 noun classes
    concrete_classes = ['animal', 'artifact', 'body', 'event', 'food', 'group', 'location', 'object', 'person', 'possession',
                       'plant', 'shape', 'substance']

    for target in collocates:
        if tidy_nouns.count(target[0]) > 0:
            for noun in tidy_nouns:
                #changing the noun class (e.g. when NOUN.FOOD is encountered, class is FOOD until the next noun
                #class is encountered)
                if 'noun.' in noun:
                    noun_split = noun.split('.')
                    current_class = noun_split[1]
                elif noun == Lem.lemmatize(target[0]) and current_class in concrete_classes:
                    #only adding nouns that are not already in the list (some might be in more than one conrete class)
                    if (noun, target[1]) not in classified_nouns:
                        classified_nouns.append((noun, target[1]))
    #sorting by the mutual information value
    classified_nouns = sorted(classified_nouns, key=itemgetter(1))
    return classified_nouns

In [10]:
#Because the WordNet domain categorising uses version 2.0 (or 1.6) of WordNet and the current nltk version is 3.0,
#we need to download WordNet version 2 and use the noun.data file in the dict folder to get the correct offset ids for synsets.

with open('WordNet2.0/dict/noun.dat', 'r') as file:
    wn_data = file.readlines()[29:79719]

def find_offsets(node):
    '''
    Returns all possible WordNet offset ids for a given noun.
    '''
    offsets = []
    for line in wn_data:
        line = line.split()
        offset = line[0]
        pos = line[2]
        word = line[4]
        if word == node:
            offsets.append(offset)
    return offsets

#Domains can be downloaded from https://wndomains.fbk.eu/download.html


with open('Domains/domains_2', 'r') as file: #domains_2 = wn-domains-3.2-20070223
    domain_data = file.readlines()

def find_domains(target_offset):
    '''
    Returns the domains matching a given WordNet offset id.
    '''
    found_domains = []
    for line in domain_data:
        line = line.split('-')
        offset = line[0].lstrip("0")
        domains = line[1][2:-1]
        if offset == target_offset:
            domains = domains.split(' ')
            for domain in domains:
                found_domains.append(domain)
    return found_domains

In [11]:
def is_a_metaphor(sentence):
    '''
    Determines whether a sentence includes a type III metaphor (adjective/adverb-noun metaphor) by going through a set of steps.
    '''
    pair = find_pair(sentence)
    if not pair:
        print('No pair!')
        return
    print('Pair found: ', pair)
        
    adjective = pair[0]
    noun = pair[1]
    
    check = check_senses(pair)
    if check == 2:
        print(adjective, noun, 'is not a metaphore')
        return False
    elif check == 3:
        return 'Result is unknown'
    
    words_near = find_words_near(adjective)
    #print('Found', len(words_near), 'nouns appearing next to', adjective)
    
    collocates = find_collocates(adjective, words_near)
    print('Found', len(collocates), 'unique collocates')
    
    if not collocates:
        print('No collocates found')
        return 'Result is unknown'

    classified_nouns = find_classes(collocates)
    print(len(classified_nouns), 'collocate words appear in concrete classes')
    
    if not classified_nouns:
        print(adjective, noun, 'is a metaphore')
        return True
    
    top_three = classified_nouns[-3:]
    print('The top three collocates are', top_three)

    node_domains = []
    node_offsets = find_offsets(noun)
    for offset in node_offsets:
        node_domains.append(find_domains(offset.lstrip("0")))

    #print(node_domains)
        
    node_domains = [domain for sublist in node_domains for domain in sublist]
    #print(node_domains)
    collocate_domains = []
    for collocate in top_three:
        offsets = find_offsets(collocate[0])
        for offset in offsets:
            collocate_domains.append(find_domains(offset.lstrip("0")))
            
    for domain in node_domains:
        if any(domain in sublist for sublist in collocate_domains):
            print('At least one of the top three collocates and the node noun belong to the domain', domain)
            print(adjective, noun, 'is not a metaphore')
            return False
    
    print(adjective, noun, 'is a metaphore')
    return True

Below are a few example sentences.

In [12]:
is_a_metaphor('She is such a dramatic person!')

Pair found:  ['dramatic', 'person']
Looking for words appearing next to dramatic
Determining the collocates of dramatic
Found 219 unique collocates
79 collocate words appear in concrete classes
The top three collocates are [('reversal', 8.170991748369083), ('backcloth', 8.496978059830797), ('turnaround', 8.548693096374954)]
[['biology', 'person'], ['person'], ['grammar']]
['biology', 'person', 'person', 'grammar']
dramatic person is a metaphore


True

In [63]:
is_a_metaphor('He has a green thumb.')

Pair found:  ['green', 'thumb']
Looking for words appearing next to green
Determining the collocates of green
Found 203 unique collocates
132 collocate words appear in concrete classes
The top three collocates are [('sward', 8.68512566908715), ('malachite', 9.872211221687644), ('baize', 9.944619717215694)]
green thumb is a metaphore


True

In [64]:
is_a_metaphor('I have a curious cat who likes to get into trouble.')

Pair found:  ['curious', 'cat']
Looking for words appearing next to curious
Determining the collocates of curious
Found 53 unique collocates
25 collocate words appear in concrete classes
The top three collocates are [('traveller', 6.581917043277252), ('mixture', 7.392994339811771), ('coincidence', 8.600964498014095)]
At least one of the top three collocates and the node noun belong to the domain factotum
curious cat is not a metaphore


False

In [65]:
is_a_metaphor('They stood in the dead center of the room.')

Pair found:  ['dead', 'center']
Looking for words appearing next to dead
Determining the collocates of dead
Found 93 unique collocates
54 collocate words appear in concrete classes
The top three collocates are [('magpie', 8.682252979315605), ('aspidistra', 9.438486907186588), ('ringer', 10.356024746994615)]
At least one of the top three collocates and the node noun belong to the domain factotum
dead center is not a metaphore


False

In [66]:
is_a_metaphor('He has a cold heart.')

Pair found:  ['cold', 'heart']
Looking for words appearing next to cold
Determining the collocates of cold
Found 98 unique collocates
55 collocate words appear in concrete classes
The top three collocates are [('chisel', 7.92888218876678), ('lino', 8.117327278179891), ('compress', 8.422576631920624)]
cold heart is a metaphore


True

In [19]:
'''
Formatting of the metaphor annotated corpus: http://aclweb.org/anthology/W/W17/W17-2201.pdf
Example: destroying alexandria . sunlight is silence @4@y
@-sign separates different fields
Sentece:                        destroying alexandria . sunlight is silence
Position of the head word       4 -> sunlight
Is the expression a metaphore?  y ()
'''
annotatedSentences = []
headWordPosition = []
isSentenceMetaphor = []
with open("type1_metaphor_annotated.txt") as textfile:
    for line in textfile: 
        line = line.split("@")
        if(line[2] != 's'): #Exclude sentences which humans failed to classify
            annotatedSentences.append(line[0].strip())
            headWordPosition.append(line[1].strip())
            isSentenceMetaphor.append(line[2].strip())

'''
The dataset is quite problematic because the text is so unclean:
(
the night is each man 's castle . @2@y
swelling lukewarm ; her mouth is water , @5@y
& yet the earth is divinity , the sky is divinity @4@n
"i am the resurrection and the life . " @1@n
"how is the dean ? " -- "he 's just alive . " @1@n <-- especially problematic, because quotation mark is connected to the word
)
If not cleaned, the some words are left unrecognized...
'''

'\nThe dataset is quite problematic because the text is so unclean:\n(\nthe night is each man \'s castle . @2@y\nswelling lukewarm ; her mouth is water , @5@y\n& yet the earth is divinity , the sky is divinity @4@n\n"i am the resurrection and the life . " @1@n\n"how is the dean ? " -- "he \'s just alive . " @1@n <-- especially problematic, because quotation mark is connected to the word\n)\nIf not cleaned, the some words are left unrecognized...\n'

In [14]:
#We'll do some simple cleaning
cleanAnnotatedSentences = [re.sub("[^0-9A-Za-z ]", "" , sent) for sent in annotatedSentences]
print(cleanAnnotatedSentences[0])
print(annotatedSentences[0])

poise is a club 
poise is a club .


In [None]:
predictions = []
for sentence in cleanAnnotatedSentences:
    if is_a_metaphor(sentence):
        predictions.append('y')
    else:
        predictions.append('n')

In [16]:
print(len(predictions))
print(len(isSentenceMetaphor))


719
719


In [17]:
PredictedPositives = predictions.count('y')
PredictedNegatives = predictions.count('n')
TruePositives = 0
FalsePositives = 0
TrueNegatives = 0
FalseNegatives = 0
Positives = isSentenceMetaphor.count('y') #Ground truth
Negatives = isSentenceMetaphor.count('n') #Ground truth

for i in range(len(isSentenceMetaphor)):
    if predictions[i] == 'y' and isSentenceMetaphor[i] == 'y':
        TruePositives +=1
    elif predictions[i] == 'y' and isSentenceMetaphor[i] == 'n':
        FalsePositives += 1
    elif predictions[i] == 'n' and isSentenceMetaphor[i] == 'n':
        TrueNegatives += 1
    elif predictions[i] == 'n' and isSentenceMetaphor[i] == 'y':
        FalseNegatives += 1

print("Positives: " + str(Positives))
print("Negatives: " + str(Negatives))
print("Predicted positives: " + str(PredictedPositives))
print("Predicted negatives: " + str(PredictedNegatives))
print("TP: " + str(TruePositives))
print("FP: " + str(FalsePositives))
print("TN: " + str(TrueNegatives))
print("FN: " + str(FalseNegatives))

Positives: 358
Negatives: 322
Predicted positives: 143
Predicted negatives: 576
TP: 69
FP: 59
TN: 263
FN: 289


In [18]:
accuracy = (TruePositives + TrueNegatives) / (Positives + Negatives)
print(accuracy)
precision = TruePositives / (TruePositives + FalsePositives)
print(precision)
recall = TruePositives / (TruePositives + FalseNegatives)
print(recall)

0.48823529411764705
0.5390625
0.19273743016759776
