In [13]:
import pandas as pd
import os
import gzip
import spacy as sp
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from math import log
from collections import defaultdict
from sklearn.cluster import DBSCAN
from math import log
from nltk.corpus import wordnet as wn
import nltk
from itertools import product
import pickle
import warnings
warnings.filterwarnings('ignore')

In [14]:
nlp = sp.load('en')

In [15]:
eval_data = pd.read_csv("eval-mod_ours.txt")

metaphor_words = []
metaphors = {}

for r in eval_data.iterrows():
    #tokens = nlp(r[1]["Metaphor"].decode('utf-8', errors='ignore').strip().lower())
    tokens = nlp(r[1]["Metaphor"].strip().lower())
    topic = tokens[0]
    vehicle = tokens[-1]
    if topic.lemma_ not in metaphor_words:
        metaphor_words.append(topic.lemma_)
    if vehicle.lemma_ not in metaphor_words:
        metaphor_words.append(vehicle.lemma_)
    if r[1]["Metaphor"] not in metaphors:
        metaphors[r[1]["Metaphor"]] = [topic.lemma_, vehicle.lemma_, 
                                       [(str(r[1]["Interpretation"]).lower(), r[1]["Freq"])],
                                       [str(r[1]["Interpretation"]).lower()]
                                      ]
    else:
        metaphors[r[1]["Metaphor"]][2].append((str(r[1]["Interpretation"]).lower(), r[1]["Freq"]))
        metaphors[r[1]["Metaphor"]][3].append(str(r[1]["Interpretation"]).lower())


In [17]:
class Lang:
    def __init__(self):
        self.vec = []
        self.word_count = 0
        self.ind2word = {}
        self.word2ind = {}
        for line in open("/Users/Kfir/Documents/corpora/glove.6B/glove.6B.300d.txt"):
            values = line.split(" ")
            v = []
            for i in range (1, len(values)):
                v.append(float(values[i]))
            self.vec.append(v)
            self.ind2word[self.word_count] = values[0]
            self.word2ind[values[0]] = self.word_count
            self.word_count += 1
    
    def get_vec(self, word):
        word = word.strip().lower()
        if word in self.word2ind:
            return self.vec[self.word2ind[word]]
        return None
            
lang = Lang()

In [43]:
class DepNode:
    def __init__(self, line):
        tokens = line.split('\t')
        self.id = int(tokens[0])
        self.text = tokens[1].lower()
        self.lemma = tokens[2].lower()
        self.pos = tokens[3]
        self.head = int(tokens[6])
        self.dep_type = tokens[7]
        self.children = []

class DepSentence:
    def __init__(self):
        self.nodes = []
        
    def add_word(self, line):
        node = DepNode(line)
        self.nodes.append(node)
        return node
        
    def rewire(self):
        for n in self.nodes:
            if n.dep_type != 'ROOT':
                self.nodes[n.head].children.append(n)
    

def is_valid_meaning(p):
    #if self.pos in ['JJ', 'JJS', 'RB', 'VBG']:
    #if p in ['JJ', 'JJS', 'VBG']:
    if p in ['VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'VB']:
        return True
    return False
    

def count_words(dep_corpus_folder):
    words = 0
    files = [f for f in os.listdir(dep_corpus_folder) if os.path.isfile(os.path.join(dep_corpus_folder, f))]
    for f in files:
        full_path = os.path.join(dep_corpus_folder, f)
        print("processing", full_path)
        sentence = None
        valid = False
        for line in gzip.open(full_path):
            if len(line.strip()) > 0:
                words += 0
                
    print(words)
        
def find_candidates_for(words, dep_corpus_folder):
    candidates = {}
    files = [f for f in os.listdir(dep_corpus_folder) if os.path.isfile(os.path.join(dep_corpus_folder, f))]
    for f in files:
        full_path = os.path.join(dep_corpus_folder, f)
        print("processing", full_path)
        sentence = None
        valid = False
        for line in gzip.open(full_path):
            line = line.decode('utf-8')
            line = line.strip()
            if len(line) > 0 and line[0] != '#':
                if sentence is None:
                    sentence = DepSentence()
                node = sentence.add_word(line)
                if node.text in words:
                    valid = True
            elif sentence is not None:
                if valid:
                    sentence.rewire()
                    for n in sentence.nodes:
                        if n.text in words:
                            if n.text not in candidates:
                                candidates[n.text] = Counter()
                            if n.dep_type != 'ROOT' and is_valid_meaning(sentence.nodes[n.head].pos):
                                candidates[n.text][sentence.nodes[n.head].text] += 1
                            for c in n.children:
                                if is_valid_meaning(c.pos):
                                    candidates[n.text][c.text] += 1
                sentence = None
    return candidates

In [44]:
candidates = find_candidates_for(metaphor_words, 'dep_corpus')

processing dep_corpus/part-m-00000.gz


In [24]:
pickle.dump(candidates, open( "candidates-verb-noun.p", "wb" ) )

In [17]:
print candidates['stir'].most_common(200)

[('using', 43), ('good', 42), ('quick', 37), ('trying', 26), ('sure', 26), ('smooth', 24), ('mixed', 23), ('causing', 21), ('low', 16), ('stirring', 16), ('crazy', 16), ('making', 15), ('baking', 14), ('scraping', 12), ('going', 11), ('little', 11), ('hot', 11), ('incorporated', 10), ('thick', 10), ('cold', 9), ('frying', 9), ('much', 9), ('serving', 7), ('slanting', 7), ('giving', 7), ('dry', 7), ('big', 6), ('mixing', 6), ('cool', 6), ('soft', 6), ('cooking', 6), ('necessary', 6), ('boiling', 6), ('warm', 6), ('adding', 5), ('careful', 5), ('easy', 5), ('brown', 5), ('large', 5), ('looking', 5), ('coated', 5), ('continuing', 5), ('simple', 5), ('beginning', 5), ('icing', 4), ('small', 4), ('simmering', 4), ('creating', 4), ('unhinged', 4), ('working', 4), ('few', 4), ('white', 4), ('bubbling', 4), ('thin', 4), ('stopping', 4), ('fried', 4), ('huge', 4), ('irrational', 4), ('important', 4), ('refrigerate', 4), ('starting', 4), ('mild', 3), ('blended', 3), ('stiff', 3), ('getting', 3),