In [1]:
#Preprocessing
import pandas as pd
from datetime import datetime
import numpy as np
import string
from nltk.corpus import stopwords
import re

In [2]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_trf') #transformer model
nlp1 = spacy.load('en_core_web_lg') #less accuracy, but does not use CUDA or require GPU

In [3]:
from spacy.matcher import PhraseMatcher
pmatcher = PhraseMatcher(nlp1.vocab)

In [4]:
#NLP nltk
import nltk
from nltk.corpus import wordnet as wn
stop_words = list(stopwords.words("english"))

In [5]:
#NLP word embeddings
import gensim
from gensim.test.utils import datapath
from gensim import utils
import gensim.models
from gensim.models import Word2Vec

import gensim.downloader as api
dataset = api.load('text8')
wv = Word2Vec(dataset)

In [6]:
#Cosine computation
import scipy as sp
from scipy.spatial import distance

In [7]:
#Defining useful functions/classes
def clean_synonyms(synsets):
    cleaned_synonyms = []
    no_stop_words = []
    syn_in_wv = []

    
    sep = '.'
    for s in synsets:
        synonym = s.name()
        cleaned_synonyms.append(synonym.split(sep, 1)[0])
    
    for word in cleaned_synonyms:
        if word not in stop_words:
            no_stop_words.append(word)
        if word in wv.wv.index_to_key:
            syn_in_wv.append(word)
    
    return syn_in_wv

class Sentence:
    
    def __init__(self, text = ''):
        
        self.text = text
        
        lower = text.lower()
        
        self.lower = lower
        
        
        doc = nlp(lower)
        
        target = []
        context = []
                
        for token in doc:
            if token.pos_ == 'VERB':
                if token.text in wv.wv.index_to_key:
                    target.append(token.text)
                else:
                    pass
            else:
                if token.text in wv.wv.index_to_key:
                    context.append(token.text)
                else:
                    pass
                
        self.target = target
        self.context = context

In [19]:
def metaphor_checker2(sentence):

    lowered_sentence = sentence.lower()
    doc = nlp1(lowered_sentence)

    targets = []
    target_docs = []
    context = []
    contexts = []
    target_vecs = []
    chopped = sentence.lower().split(' ')
    all_candidates = []
    syn_vecs = []
    context_vec_means = []
    results = []
                
    for token in doc:
        if token.pos_ == 'VERB':
            if token.text in wv.wv.index_to_key:
                targets.append(token.text)
            else:
                pass
        else:
            if token.text in wv.wv.index_to_key:
                context.append(token.text)
            else:
                pass

    for t in targets:
        target_docs.append(nlp1(t))

    #pmatcher.remove('TARGETS')
    pmatcher.add('TARGETS', target_docs)

    matches = pmatcher(doc)

    for t in targets:
        if len(t) == 1:
            dummy = sentence.lower().split(' ')
            i = chopped.index(t)
            del dummy[i]
            contexts.append(dummy)
        else:
            i = targets.index(t)
            dummy = sentence.lower().split(' ')
            match = matches[i]
            start = match[1]
            end = match[2]
            del dummy[start: end]
            contexts.append(dummy)

    for i in range(0, len(targets)):
        target_word = targets[i]
        
        direct_syn = [] #list of synsets for one target word
        direct_syn.append(wn.synsets(target_word, pos = 'v'))
        
        for s in range(0, len(direct_syn)):
            candidate_words = []
            syn = direct_syn[s] #getting list of synsets for target
            for word in syn: #for each synset in the list of synsets
                lemmas = word.lemmas()
                for l in lemmas:
                    hyponyms = l.synset().hyponyms()
                    if len(hyponyms) > 0:
                        for h in hyponyms:
                            sep = '.'
                            name = h.name().split(sep, 1)[0]
                            if name in wv.wv.index_to_key:
                                if candidate_words.count(name) == 0:
                                    candidate_words.append(name)
                    else:
                        name = l.name().split(sep, 1)[0]
                        if name in wv.wv.index_to_key:
                            if candidate_words.count(name) == 0:
                                candidate_words.append(name)
        all_candidates.append(candidate_words)
    
    for ac in all_candidates:
        syn_vec = []
        if len(ac) > 0:
            for c in ac:
                if c in wv.wv.index_to_key:
                    syn_vec.append(wv.wv[c])
            syn_vecs.append(syn_vec)

    for c in contexts:
        c_vec_list = []
        for word in context:
            c_vec_list.append(wv.wv[word])
        context_mean = np.mean(c_vec_list, axis = 0)
        context_vec_means.append(context_mean)
    
    for i in range(0, len(targets)):
        target = targets[i]
        target_syns = syn_vecs[i]
        target_context = context_vec_means[i]
    
        comparison = []
        for s in target_syns:
            comparison.append(distance.cosine(s, target_context))
        
        comparison_array = np.array(comparison)
            
        best_fit_index = comparison.index(np.amax(comparison_array))
        best_fit_word = all_candidates[i][best_fit_index]
                    
        similarity = wv.wv.similarity(best_fit_word, target)
        
        if similarity > 0.45: #Different threshold?
            results.append([target, 'LITERAL', similarity, best_fit_word])
        else:
            results.append([target, 'METAPHORICAL', similarity, best_fit_word])

    return results

In [20]:
sentence = 'She ate the apple and devoured the novel.' #Expect: literal, metaphorical
metaphor_checker2(sentence)

[['ate', 'LITERAL', 0.4579959, 'forage'],
 ['devoured', 'METAPHORICAL', 0.09248138, 'consume']]

In [21]:
sentence1 = 'The bird flew like the wind and sang like an angel.' #Expect: metaphorical, literal
metaphor_checker2(sentence1)

[['flew', 'METAPHORICAL', 0.34615675, 'airlift'],
 ['sang', 'METAPHORICAL', 0.2641522, 'madrigal']]

In [22]:
sentence2 = 'The boy rang the bell and the clock struck noon.' #Expect: literal, metaphorical
metaphor_checker2(sentence2)

[['rang', 'LITERAL', 0.5495239, 'enclose'],
 ['struck', 'METAPHORICAL', 0.032689016, 'assume']]