In [None]:
import math
from nltk.corpus.reader.bnc import BNCCorpusReader
from nltk import FreqDist
import matplotlib.pyplot as plt
import re
import pickle
import nltk

In [None]:
#Data structure for storing info of a collocate word
class CollocateWord:
    '''A single word found next to the word of interest, with its counts and MI score.'''

    def __init__(self, collocate):
        #The collocate word itself
        self.collocate = collocate
        #Number of times the collocate was seen with the word of interest.
        #An instance is created on the first sighting, so counting starts at 1.
        self.collocateFrequency = 1
        #MI score; stays None until setMutualInformation is called
        self.mutualInformation = None
        #Count of the word in the whole corpus; stays None until setCorpusFrequency is called
        self.corpusFrequency = None
        #Whether the word stood before or after the parent word in a sentence (not used yet)
        self.position = None

    def getWord(self):
        '''Return the collocate word.'''
        return self.collocate

    def setCorpusFrequency(self, freq):
        '''Store the frequency of the collocate in the whole corpus.'''
        self.corpusFrequency = freq

    def setMutualInformation(self, MI):
        '''Store the mutual information score of the collocate.'''
        self.mutualInformation = MI

    def updateFrequency(self):
        '''Count one more sighting of the collocate next to the word of interest.'''
        self.collocateFrequency = self.collocateFrequency + 1

    def printMe(self):
        '''Print the word, both of its frequencies and its MI score, one per line.'''
        report = (
            "Collocate word: " + self.collocate,
            "Frequency with parent word: " + str(self.collocateFrequency),
            "Frequency in corpus: " + str(self.corpusFrequency),
            "MI score: " + str(self.mutualInformation),
        )
        for reportLine in report:
            print(reportLine)

In [None]:
#Data structure and logic for word under search and its collocates
class MutualInformationCalculator:
    '''Finds the collocates of one word in a POS-tagged corpus and scores them with mutual information.

    corpusSentences is iterated as sentences of (word, tag) tuples. A collocate is scored as
        MI = log2((collocateFrequency * sizeOfCorpus) / (frequency * corpusFrequency * span))
    where frequency is how often the word of interest was found by findCollocates and
    corpusFrequency is the collocate's count looked up from freqDistribution.
    '''

    #Positions, relative to the word of interest, that are searched for collocates.
    #The window is fixed to two tokens on each side and does NOT follow self.span;
    #span is only used as a normalising factor in the MI formula.
    WINDOW_OFFSETS = (-2, -1, 1, 2)
    #Only adjectives, adverbs and verbs are accepted as collocates
    COLLOCATE_TAGS = frozenset(('ADJ', 'ADV', 'VERB'))

    def __init__(self, word, corpusSentences, corpusWords, span, sizeOfCorpus, freqDistribution):
        self.word = word
        #Number of times the word of interest was found; updated by findCollocates
        self.frequency = 0
        #CollocateWord objects, in the order the collocates were first seen
        self.collocates = []
        self.corpusSentences = corpusSentences
        self.corpusWords = corpusWords
        self.span = span
        self.sizeOfCorpus = sizeOfCorpus
        self.freqDistribution = freqDistribution

    def findCollocates(self):
        '''Find all collocate words around the word of interest'''
        for sentence in self.corpusSentences:
            #Lowercase the words so the comparison with self.word ignores capitalisation
            loweredSentence = [(wordTuple[0].lower(), wordTuple[1]) for wordTuple in sentence]
            for index, wordTuple in enumerate(loweredSentence):
                if wordTuple[0] == self.word:
                    self.updateFrequency()
                    self.findSpanningCollocations(loweredSentence, index)

    def findSpanningCollocations(self, sentence, startingIndex):
        '''When the word of interest is found in a sentence, the sentence and the index are passed here.
        Every ADJ/ADV/VERB token inside the fixed window is passed on to updateCollocates.'''
        for offset in self.WINDOW_OFFSETS:
            index = startingIndex + offset
            #Skip window positions that fall outside of the sentence
            if 0 <= index < len(sentence):
                word, wordType = sentence[index]
                if wordType in self.COLLOCATE_TAGS:
                    self.updateCollocates(word)

    def updateCollocates(self, collocateWord):
        '''Checks if collocate is new to append it to list or to just update existing frequency'''
        for collocate in self.collocates:
            if collocate.getWord() == collocateWord:
                collocate.updateFrequency()
                return
        self.collocates.append(CollocateWord(collocateWord))

    #Used while searching collocate words.
    def updateFrequency(self):
        self.frequency += 1

    def calculateCollocateFrequencies(self):
        '''Set the frequency in corpus for every collocate word'''
        for collocate in self.collocates:
            collocate.setCorpusFrequency(self.freqDistribution[collocate.getWord()])

    def printMe(self):
        print("Word of interest: " + self.word)
        print("Frequncy in corpus: " + str(self.frequency))
        print("Collocate words found: " + str(len(self.collocates)))

    def printCollocates(self):
        for collocate in self.collocates:
            collocate.printMe()

    def getCollocates(self):
        return self.collocates

    def calculateMutualInformations(self):
        '''Calculate MI for all collocate words. Requires that frequencies are calculated before running this.

        A collocate whose score is undefined (corpus frequency missing or zero, word of interest never
        found, zero span, or a non-positive ratio) gets None as its score instead of raising.'''
        for collocate in self.collocates:
            corpusFrequency = collocate.corpusFrequency
            score = None
            #A missing or zero factor would make the denominator zero
            if corpusFrequency and self.frequency and self.span:
                ratio = (collocate.collocateFrequency * self.sizeOfCorpus) / (self.frequency * corpusFrequency * self.span)
                #log2 is only defined for positive values
                if ratio > 0:
                    score = math.log2(ratio)
            collocate.setMutualInformation(score)

    def calculateAll(self):
        '''Does all calculation steps in one command.'''
        self.findCollocates()
        self.calculateCollocateFrequencies()
        self.calculateMutualInformations()
        self.removeCollocatesBelowFrequency(2)
        self.removeCollocatesBelowMI(3)

    @staticmethod
    def _miAtLeast(collocate, limit):
        '''True when the collocate has a defined MI score that is >= limit.'''
        score = collocate.mutualInformation
        return score is not None and score >= limit

    def getCollacatesOverMiValue(self, limit):
        '''Return a list of collocates above wanted MI value'''
        return [collocate for collocate in self.collocates if self._miAtLeast(collocate, limit)]

    def getCollocatesOverFrequency(self, limit):
        '''Return a list of collocates seen at least limit times with the word of interest'''
        return [collocate for collocate in self.collocates if (collocate.collocateFrequency >= limit)]

    def removeCollocatesBelowFrequency(self, limitFrequency):
        '''Remove collocates from object whose frequency with word of interest is lower than given limit.'''
        self.collocates[:] = [collocate for collocate in self.collocates if (collocate.collocateFrequency >= limitFrequency)]

    def removeCollocatesBelowMI(self, limitMI):
        '''Remove collocates whose MI score is lower than given limit or undefined.'''
        self.collocates[:] = [collocate for collocate in self.collocates if self._miAtLeast(collocate, limitMI)]

    def sortCollocatesByFrequency(self):
        self.collocates.sort(key=lambda collocate: collocate.collocateFrequency, reverse=True)

    def sortCollocatesByMIscore(self):
        '''Sort collocates by descending MI score; collocates without a score go last.'''
        self.collocates.sort(
            key=lambda collocate: float('-inf') if collocate.mutualInformation is None else collocate.mutualInformation,
            reverse=True)

    def plotFrequencyFigure(self, xSize, ySize, miLim=3, fLim=2):
        '''Horizontal bar chart of the frequency of every stored collocate with the word of interest.
        miLim and fLim are only shown in the title, nothing is filtered here.'''
        words = [c.collocate for c in self.collocates]
        freqs = [c.collocateFrequency for c in self.collocates]
        fig, ax = plt.subplots(figsize = (xSize, ySize))
        #barh draws the values along the x axis, so that is the axis to label
        ax.set_xlabel('Mutual frequency', fontsize=15)
        ax.set_title('Frequency of adjacent words from \'' + self.word + '\' with MI score >= ' + str(miLim) + ' and mutual frequency >= ' + str(fLim))
        ax.barh(words, freqs)
        plt.show()

    def plotMiFigure(self, xSize, ySize, miLim=3, fLim=2):
        '''Sorts the collocates by MI score and draws a horizontal bar chart of the scores.
        miLim and fLim are only shown in the title, nothing is filtered here.'''
        self.sortCollocatesByMIscore()
        words = [c.collocate for c in self.collocates]
        mi_score = [c.mutualInformation for c in self.collocates]
        fig, ax = plt.subplots(figsize = (xSize, ySize))
        #barh draws the values along the x axis, so that is the axis to label
        ax.set_xlabel('Mutual Information score', fontsize=15)
        ax.set_title('Mutual information score over ' + str(miLim) + ' with a word \'' + self.word + '\' and mutual frequency >= ' + str(fLim))
        ax.barh(words, mi_score)
        plt.show()

    def plotPresentationFigures(self):
        '''Plots only 25 top MI collocates, removes others'''
        #Sort before truncating, otherwise the first 25 found would be kept instead of the top 25
        self.sortCollocatesByMIscore()
        self.collocates = self.collocates[:25]
        self.plotMiFigure(5,5)
        self.plotFrequencyFigure(5,5)

    

In [None]:
#Load the pickled corpus sentences.
#NOTE: pickle.load can run arbitrary code, only load pickle files you created yourself.
#The handle is not called 'input', that would hide the builtin input() for the rest of the notebook.
with open('sentsFull.pkl', 'rb') as sentencesFile:
    corpusSentences = pickle.load(sentencesFile)

In [None]:
#Load the pickled corpus words.
#NOTE: pickle.load can run arbitrary code, only load pickle files you created yourself.
#The handle is not called 'input', that would hide the builtin input() for the rest of the notebook.
with open('wordsFull.pkl', 'rb') as wordsFile:
    corpusWords = pickle.load(wordsFile)
#We want decapitalize words for easier handling
lower_words = [word.lower() for word in corpusWords]
#Number of word tokens in the corpus, used as the corpus size in the MI formula
sizeOfCorpus = len(lower_words)

In [None]:
#Calculate frequencies for all the words in corpus.
#Built from the lowercased words, so lookups must use lowercase words as well.
fdist = FreqDist(lower_words)

In [None]:
'''
Formatting of the metaphor annotated corpus: http://aclweb.org/anthology/W/W17/W17-2201.pdf
Example: destroying alexandria . sunlight is silence @4@y
@-sign separates different fields
Sentence:                       destroying alexandria . sunlight is silence
Position of the head word       4 -> sunlight
Is the expression a metaphor?   y (yes), n (no), s (skipped)
'''
#Parallel lists: entry i of each list belongs to the same annotated sentence
annotatedSentences = []
headWordPosition = []
isSentenceMetaphor = []
with open("type1_metaphor_annotated.txt") as textfile:
    for line in textfile:
        #Split from the right: the position and label are always the last two fields,
        #so an '@' inside the sentence text cannot shift them
        fields = line.rsplit("@", 2)
        if len(fields) != 3:
            continue #Blank or malformed line, nothing to read from it
        sentenceText, position, label = [field.strip() for field in fields]
        if label != 's': #Exclude sentences which humans failed to classify
            annotatedSentences.append(sentenceText)
            headWordPosition.append(position)
            isSentenceMetaphor.append(label)

'''
The dataset is quite problematic because the text is so unclean:
(
the night is each man 's castle . @2@y
swelling lukewarm ; her mouth is water , @5@y
& yet the earth is divinity , the sky is divinity @4@n
"i am the resurrection and the life . " @1@n
"how is the dean ? " -- "he 's just alive . " @1@n <-- especially problematic, because quotation mark is connected to the word
)
If all special characters are cleaned away, the head word
position tag may change, and we cannot reliably verify the algorithm.
If they are not cleaned, some words are left unrecognized...
'''

In [None]:
def getCollocatesAroundHead(sentence, index, span):
    '''Return the adjectives, adverbs and verbs that surround the head word of a sentence.

    sentence is a list of word tokens, index is the position of the head word in it.
    The sentence is POS-tagged with the universal tagset and the neighbours at distance
    1 .. ceil(span / 2) are checked on both sides, nearest first, left before right.
    Positions outside of the sentence and empty tokens are skipped.
    The tagged sentence and the result are printed.
    '''
    taggedSentence = nltk.pos_tag(sentence, tagset='universal')
    print(taggedSentence)
    wantedTags = {'ADJ', 'ADV', 'VERB'}
    collocates = []
    #span / 2 counted down in steps of one gives ceil(span / 2) rounds (none if span <= 0)
    for distance in range(1, math.ceil(span / 2) + 1):
        for neighbour in (index - distance, index + distance):
            #Strict upper bound: len(sentence) itself is not a valid position
            if 0 <= neighbour < len(taggedSentence):
                word, tag = taggedSentence[neighbour]
                #Tokens emptied by the cleaning step carry no information
                if word and tag in wantedTags:
                    collocates.append(sentence[neighbour])
    print("collocates around head in sentence:", collocates)

    return collocates


In [None]:
#Span given to MutualInformationCalculator (divisor in the MI formula)
#and to getCollocatesAroundHead (span / 2 neighbours are checked on each side of the head word)
span = 4

In [None]:


#Removes everything except ASCII letters from a token
nonLetterPattern = re.compile(r'[^a-zA-Z ]')
#A matching collocate with MI score below this makes the sentence a predicted metaphor
metaphorMiThreshold = 2.5
#Scored corpus collocates per head word ({collocate word: CollocateWord}).
#Scanning the corpus is expensive, so it is done only once for every distinct head word.
collocatesByHeadWord = {}

#One prediction per annotated sentence: 'y' (metaphor), 'n' (not a metaphor) or 'u' (unknown)
predictions = []
for annotatedSentence, position in zip(annotatedSentences, headWordPosition):
    #Positions in the annotation file start from 1
    headWordIndex = int(position) - 1

    #Tokens that become empty are kept on purpose, removing them would mess up the head word indexing
    sentence = [nonLetterPattern.sub('', word).strip() for word in annotatedSentence.split()]
    headWord = sentence[headWordIndex].lower()
    print(headWord)

    if headWord not in collocatesByHeadWord:
        calculator = MutualInformationCalculator(headWord, corpusSentences, lower_words, span, sizeOfCorpus, fdist)
        calculator.findCollocates()
        calculator.calculateCollocateFrequencies()
        calculator.calculateMutualInformations()
        #TODO: select frequency cutoff
        collocatesByHeadWord[headWord] = {c.getWord(): c for c in calculator.getCollocates()}
    corpusCollocates = collocatesByHeadWord[headWord]

    sentenceCollocates = getCollocatesAroundHead(sentence, headWordIndex, span)

    #Corpus collocates are lowercased, so compare in lowercase.
    #Collocates without a defined MI score cannot be judged and are left out.
    matchingCollocates = []
    for collocate in sentenceCollocates:
        corpCollocate = corpusCollocates.get(collocate.lower())
        if corpCollocate is not None and corpCollocate.mutualInformation is not None:
            matchingCollocates.append(corpCollocate)

    for c in matchingCollocates:
        print(c.collocate, c.mutualInformation)
    print('\n')

    if not matchingCollocates:
        #Nothing to base a decision on
        predictions.append('u')
    elif any(c.mutualInformation < metaphorMiThreshold for c in matchingCollocates):
        #At least one neighbour is an unusual companion for the head word
        predictions.append('y')
    else:
        predictions.append('n')



In [None]:
#Sanity check: there should be as many predictions as annotated labels
for labelList in (predictions, isSentenceMetaphor):
    print(len(labelList))

In [None]:
PredictedPositives = sum(1 for label in predictions if label == 'y')
PredictedNegatives = sum(1 for label in predictions if label == 'n')

#One counter for every (prediction, ground truth) pair of interest.
#Any other pair, e.g. an unknown ('u') prediction, is skipped.
outcomeCounts = {('y', 'y'): 0, ('y', 'n'): 0, ('n', 'n'): 0, ('n', 'y'): 0}
for sentenceIndex, groundTruth in enumerate(isSentenceMetaphor):
    outcome = (predictions[sentenceIndex], groundTruth)
    if outcome in outcomeCounts:
        outcomeCounts[outcome] += 1

TruePositives = outcomeCounts[('y', 'y')]
FalsePositives = outcomeCounts[('y', 'n')]
TrueNegatives = outcomeCounts[('n', 'n')]
FalseNegatives = outcomeCounts[('n', 'y')]

#Ground truth totals; sentences with an unknown prediction are not included
Positives = TruePositives + FalseNegatives
Negatives = TrueNegatives + FalsePositives

summaryRows = (
    ("Positives: ", Positives),
    ("Negatives: ", Negatives),
    ("Predicted positives: ", PredictedPositives),
    ("Predicted negatives: ", PredictedNegatives),
    ("TP: ", TruePositives),
    ("FP: ", FalsePositives),
    ("TN: ", TrueNegatives),
    ("FN: ", FalseNegatives),
)
for caption, count in summaryRows:
    print(caption + str(count))

In [None]:
def _safeRatio(numerator, denominator):
    '''Return numerator / denominator, or NaN when the denominator is zero (metric undefined).'''
    return numerator / denominator if denominator else float('nan')

#Share of classified sentences (unknown predictions excluded) that were classified correctly
accuracy = _safeRatio(TruePositives + TrueNegatives, Positives + Negatives)
print(accuracy)
#Share of predicted metaphors that really are metaphors
precision = _safeRatio(TruePositives, TruePositives + FalsePositives)
print(precision)
#Share of real metaphors (among classified sentences) that were found
recall = _safeRatio(TruePositives, TruePositives + FalseNegatives)
print(recall)