Task 5 approach so far. Only using a part of the corpus since the runtimes would be too long using the whole 100 million word dataset. Aim of the task is to classify whether an adjective/adverb-noun pair in a sentence is a metaphor or not.

In [1]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.bnc import BNCCorpusReader
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
from operator import itemgetter
import re

In [12]:
#only using folders A-C for now
reader = BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-C]/\w*/\w*\.xml')
sents = reader.tagged_sents()
words = reader.words()

In [13]:
#some preprocessing (removing special characters and numbers from the list of words and changing all the words into lower case)
#runtime would be about 1.5 hours if using the whole corpus, though probably (and hopefully) shorter on a better computer

wordlist = []
special_chars = ['(',')',',','"','.','!','?','-','\'','‘','’','—',':']
for w in words:
    if w not in special_chars and not w.isnumeric():
        wordlist.append(w.lower())

In [14]:
len(wordlist)

37525438

In [31]:
def find_pair(sentence):
    '''
    Finds adjective/adverb-noun part-of-speech in a given sentence using nltk part-of-speech tagging. 
    Returns only the first occurence of such pair in a sentence.
    '''
    pair = []
    tagged_words = nltk.pos_tag(sentence.split())
    adjectives = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    nouns = ['NN', 'NNS', 'NNPS', 'NNP']
    for i in range(len(tagged_words) - 1):
        word1_category = tagged_words[i][1]
        word2_category = tagged_words[i + 1][1]
        if word1_category in adjectives and word2_category in nouns:
            pair = [tagged_words[i][0].lower(), re.sub('\W+','', tagged_words[i + 1][0]).lower()]
            return pair
    return pair

def check_senses(pair):
    '''
    Given an adjective/adverb-noun pair checks that the adjective/adverb has more than one sense and the noun has an
    entry in WordNet.
    '''
    adj = pair[0]
    noun = pair[1]
    if len(wn.synsets(adj)) == 1:
        print('Adjective has only one sense!')
        return False
    elif len(wn.synsets(noun)) == 0:
        print('Noun has no entry in WordNet!')
        return False
    return True

def find_words_near(node):
    '''
    Finds nouns appearing next to a given node word by checking each sentence of the corpus individually.
    '''
    print('Looking for words appearing next to', node)
    words_near = []
    for sentence in sents:
        for i in range(len(sentence)):
            if sentence[i][0] == node:
                indexesToTry = [i - 1, i + 1]
                for index in indexesToTry:
                    if index >= 0 and index < len(sentence):
                        if sentence[index][1] and sentence[index][1] == 'SUBST':
                            words_near.append(sentence[index][0].lower())
    return words_near

def find_collocates(node, words_near):
    '''
    Finds all unique collocate nouns from a list of nouns that appear near the node word. A noun is considered a collocate
    when its mutual information to the node is greater or equal to 3. Only considers nouns that appear at least twice
    near the node word.
    '''
    print('Determining the collocates of', node)
    collocates = []
    checked = []
    freq_node = wordlist.count(node)
    for word in words_near:
        if word not in checked:
            checked.append(word)
            freq_near = words_near.count(word)
            if freq_near >= 2:
                freq_collocate = wordlist.count(word)
                mutual_information = calculate_mutual_information(freq_node, freq_collocate, freq_near)
                if mutual_information >= 3:
                    if (word, mutual_information) not in collocates:
                        collocates.append((word, mutual_information))
    return collocates

def calculate_mutual_information(freq_node, freq_collocate, freq_near):
    '''
    Calculates the mutual information between a node and a possible collocate using the expression (2) in
    the article https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0062343 by Neuman et al (2013).
    '''
    corpus_size = len(wordlist)
    span = 3 #maybe?
    mutual_information = np.log10((freq_near * corpus_size)/(freq_node * freq_collocate * span))/np.log10(2)
    return mutual_information

with open('WordNet2/Words.cat', 'r') as file:
    data = file.readlines()
    nouns = data[25040:94946]
    
tidy_nouns = []
for noun in nouns:
    noun = noun.replace(' (1)', '')
    noun = noun.replace('\t', '')
    noun = noun.replace('\n', '')
    noun = noun.lower()
    tidy_nouns.append(noun)

def find_classes(collocates):
    '''
    Classifies a list of nouns using WordStat. Returns a list of nouns (and their mutual information also given in
    the input list) that belong to a concrete class. The list is sorted by the mutual information value in ascending order.
    '''
    Lem = WordNetLemmatizer()

    current_class = ''
    classified_nouns = []
    
    #there should be 13 concrete classes out of the 25 classes, now there is 12 and some might be incorrect :D
    concrete_classes = ['ANIMAL', 'ARTIFACT', 'BODY', 'EVENT', 'FOOD', 'LOCATION', 'OBJECT', 'PERSON', 'POSSESSION',
                       'PLANT', 'SHAPE', 'SUBSTANCE']

    for target in collocates:
        if tidy_nouns.count(target) > 0:
            for noun in tidy_nouns:
                #changing the noun class (e.g. when NOUN.FOOD is encountered, class is FOOD until the next noun
                #class is encountered) 
                if 'NOUN.' in noun:
                    noun_split = noun.split('.')
                    current_class = noun_split[1]
                elif noun == Lem.lemmatize(target[0]) and current_class in concrete_classes:
                    #only adding nouns that are not already in the list (some might be in more than one conrete class)
                    if (noun, target[1]) not in classified_nouns:
                        classified_nouns.append((noun.lower(), target[1]))
    #sorting by the mutual information value
    classified_nouns = sorted(classified_nouns, key=itemgetter(1))
    return classified_nouns

def calculate_compatibility(classified_nouns, node):
    '''
    Calulates the Wu and Palmer semantic similarity
    '''
    compatible = []
    for word in classified_nouns:
        syn1 = wn.synsets(word[0])[0]
        syn2 = wn.synsets(node)[0]
        similarity = syn1.wup_similarity(syn2)
        if similarity >= 0.3:
            compatible.append(word[0])
    return compatible

In [33]:
def is_a_metaphor(sentence):
    '''
    Determines whether a sentence includes a type III metaphor (adjective/adverb-noun metaphor) by going through a set of steps.
    '''
    pair = find_pair(sentence)
    if not pair:
        print('No pair!')
        return
    print('Pair found: ', pair)
    
    check = check_senses(pair)
    if not check:
        print('Sense check failed!')
        return
    print('Sense check ok!')
    
    adjective = pair[0]
    noun = pair[1]
    
    words_near = find_words_near(adjective)
    print('Found', len(words_near), 'nouns near', adjective)
    
    collocates = find_collocates(adjective, words_near)
    print('Found', len(collocates), 'unique collocates')
    
    if not collocates:
        print('Undetermined, not enough collocates')
        return
    
    classified_nouns = find_classes(collocates)
    print(len(classified_nouns), 'collocate words in concrete classes')
    
    if not classified_nouns:
        return 'Is a metaphor!'
    
    compatible = calculate_compatibility(classified_nouns[-4:-1], noun)
    print('Out of the top three collocates belonging to concrete classes,', len(compatible), 'are compatible with the noun')
    
    if compatible:
        return 'Not a metaphor!'
    return 'Is a metaphor!'

Below are a few example sentences, the counts in the first one are there just to check that the code was running okay and removed from the later test runs. Some phrases are classified correctly, some not. On average it takes about 15 to 20 minutes to run is_a_metaphor() for one sentence, but again the time might be considerably shorter on better computers.

In [21]:
is_a_metaphor('She is such a dramatic person!')

Pair found:  ['dramatic', 'person']
Sense check ok!
Looking for words appearing next to dramatic
0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
Found 1299 nouns near dramatic
Determining the collocates of dramatic
Found 118 unique collocates
66 collocate words in concrete classes
Out of the top three collocates belonging to concrete classes, 1 are compatible with the noun


'Not a metaphor!'

In [24]:
is_a_metaphor('He has a green thumb.')

Pair found:  ['green', 'thumb']
Sense check ok!
Looking for words appearing next to green
Found 2329 nouns near green
Determining the collocates of green
Found 172 unique collocates
152 collocate words in concrete classes
Out of the top three collocates belonging to concrete classes, 0 are compatible with the noun


'Is a metaphor!'

In [25]:
is_a_metaphor('I have a curious cat who likes to get into trouble.')

Pair found:  ['curious', 'cat']
Sense check ok!
Looking for words appearing next to curious
Found 551 nouns near curious
Determining the collocates of curious
Found 38 unique collocates
21 collocate words in concrete classes
Out of the top three collocates belonging to concrete classes, 0 are compatible with the noun


'Is a metaphor!'

In [26]:
is_a_metaphor('They stood in the dead center of the room.')

Pair found:  ['dead', 'center']
Sense check ok!
Looking for words appearing next to dead
Found 1436 nouns near dead
Determining the collocates of dead
Found 68 unique collocates
63 collocate words in concrete classes
Out of the top three collocates belonging to concrete classes, 2 are compatible with the noun


'Not a metaphor!'

In [36]:
is_a_metaphor('Do you have a colourful mind?')

Pair found:  ['colourful', 'mind']
Sense check ok!
Looking for words appearing next to colourful
Found 316 nouns near colourful
Determining the collocates of colourful
Found 36 unique collocates
0 collocate words in concrete classes


'Is a metaphor!'