In [1]:
import matplotlib
%matplotlib inline

# core libs
import os
import itertools
from collections import defaultdict, OrderedDict, Counter
import json
import yaml
import codecs
import re
from operator import itemgetter
import cPickle as pickle

from functools import wraps
from unidecode import unidecode

# numerical and ml libs
import numpy as np
from scipy.sparse import coo_matrix, hstack, csr_matrix
from nltk.corpus import wordnet as wn
import spacy
import sklearn
print("using sklearn vsersion -> %s" % sklearn.__version__)
from sklearn.metrics import f1_score, confusion_matrix
from sklearn import linear_model, datasets, svm, ensemble
from sklearn.utils.extmath import safe_sparse_dot
import matplotlib.pyplot as plt

# project files
from dfiner.ontonote.ontonotes_data import load_ontonotes, read_figer, GoldMentionView

from dfiner.utils import get_default_config, get_size, dump_pickle, load_pickle
from dfiner.datastructures.utils import print_view
from dfiner.annotators.hyp_pattern_annotator import HypPatternAnnotator
from dfiner.annotators.kb_bias_annotator import KBBiasTypeAnnotator
from dfiner.annotators.nsd_annotator import NSDView
from dfiner.annotators.fine_type_annotator import FineTypeView, SynsetFineTyper
from dfiner.annotators import get_non_default_annotator
import dfiner.ontonotes_annotation_extender as oae
from dfiner.topics import get_embedding_func, EmbeddingsType
from dfiner.classifier.lexicon import Lexicon
from dfiner.classifier.feature_function import DenseFeatureFunction, FeatureExtractor, FeatureFunction, FeatureStroage
import pandas as pd
# from sklearn.multiclass import OneVsRestClassifier
# import polylearn

FeatureStroage.set_cache_dir("/home/haowu4/.py_cache")
config = get_default_config()


using sklearn vsersion -> 0.18.1


In [2]:
%%time
nlp = spacy.load('en')

CPU times: user 2.54 s, sys: 136 ms, total: 2.68 s
Wall time: 2.68 s


In [3]:
# Loading resources.
common_cities_names_csv = set(pd.read_csv(config["common_us_city_path"], names=["Name", "State", "Pop"])["Name"].tolist())
common_cities_names = {x.strip() for x in common_cities_names_csv}

kba = KBBiasTypeAnnotator(config)


In [4]:
kba.surface_to_type_dist["University of Washington"]

{u'education.department': 0.1188118811881188,
 u'organization.educational_institution': 0.019801980198019802,
 u'organization.sports_team': 0.8613861386138614}

In [5]:
%%time
# train_docs = load_ontonotes(nlp, "/home/haowu4/data/simple_finer/NotAnnotated.name")
train_docs = load_ontonotes(nlp, "/home/haowu4/Downloads/non_nw_train_dev.name")
# _ = [annotator(doc) for doc in annotated_ontonote_docs for annotator in non_default_annotators[:2]]

test_docs = load_ontonotes(nlp, config['ontonotes_test_path'])
figer_docs = read_figer(nlp, config['figer_path'])
figer_gold_docs = read_figer(nlp, config['figer_gold'])
annotated_ontonote_docs = read_figer(nlp, "/home/haowu4/codes/dataless_finer/data/big_one.label")

print "loaded %d train docs" % len(train_docs)
print "loaded %d test docs" % len(test_docs)
print "loaded %d figer docs" % len(figer_docs)
print "loaded %d figer gold docs" % len(figer_gold_docs)
print "loaded %d figer gold docs" % len(annotated_ontonote_docs)

loaded 43193 train docs
loaded 9518 test docs
loaded 434 figer docs
loaded 434 figer gold docs
loaded 4061 figer gold docs
CPU times: user 57.5 s, sys: 236 ms, total: 57.8 s
Wall time: 57.8 s


In [6]:
print_view(annotated_ontonote_docs[0], "gold_mention_view")

/organization,/news_agency:Xinhua News Agency
/time:February 13
/location/country,/location:Beijing


In [7]:
def annotate_extend(docs):
    _ = [oae.add_titles(doc, oae.title_set) for doc in docs]
    _ = [oae.add_typexs('MEDICINE', doc, oae.symptom_alias_set.union(oae.drug_set.union(oae.treatment_set))) for doc in docs]
    _ = [oae.add_typexs('ANIMAL', doc, oae.animal_set) for doc in docs]
    _ = [oae.add_typexs('ROAD', doc, oae.road_set) for doc in docs]
    _ = [oae.fix_type1_to_type2(doc, "ORG", "FAC", oae.facility_as_org_trigger_words) for doc in docs]
    _ = [oae.fix_type1_to_type2(doc, "FAC", "ROAD", oae.road_set) for doc in docs]

In [8]:
%%time
annotate_extend(train_docs)
annotate_extend(test_docs)

CPU times: user 29.3 s, sys: 180 ms, total: 29.5 s
Wall time: 29.4 s


In [9]:
hyp_pattern_annotator = HypPatternAnnotator(nlp)

In [10]:
%%time
non_default_annotators = get_non_default_annotator(nlp, config)
# noun_and_rule_annotators = non_default_annotators[1:]

CPU times: user 1min 34s, sys: 1.78 s, total: 1min 35s
Wall time: 1min 35s


In [11]:
non_default_annotators

[<dfiner.annotators.hyp_pattern_annotator.HypPatternAnnotator at 0x7f9c24429ad0>,
 <dfiner.annotators.nsd_annotator.NounSenseAnnotator at 0x7f9c24429b90>,
 <dfiner.annotators.fine_type_annotator.RuleBasedFineTypeAnnotator at 0x7f9c24429bd0>]

In [12]:
%%time
# %%prun -s cumulative
_ = [annotator(doc) for doc in train_docs for annotator in non_default_annotators[:2]]
_ = [annotator(doc) for doc in test_docs for annotator in non_default_annotators[:2]]
_ = [annotator(doc) for doc in figer_docs for annotator in non_default_annotators[:2]]
_ = [annotator(doc) for doc in figer_gold_docs for annotator in non_default_annotators[:2]]
_ = [annotator(doc) for doc in annotated_ontonote_docs for annotator in non_default_annotators[:2]]

CPU times: user 2min 13s, sys: 424 ms, total: 2min 13s
Wall time: 2min 14s


In [13]:
# _ = [annotator(doc) for doc in annotated_ontonote_docs for annotator in non_default_annotators[:2]]

In [14]:
with open(config["ontonote_to_figer_map"]) as f_in:
    ontonotes_to_figer_course = yaml.load(f_in)

def type_func(mention_constituent):
    return ontonotes_to_figer_course[mention_constituent.name]

In [15]:
def slash_to_dot(slash_str):
    return ".".join(slash_str.split("/")[1:])

def get_figer_type_func(kba):
    def figer_type_func(mention_constituent):
        all_types = map(slash_to_dot, mention_constituent.name.split(","))
        try:
            course_types = list(set(map(kba.type_system.get_root, all_types)))
            if len(course_types) > 1:
                print "CAUTION: more than one course types -> (%s). Assigining (%s)." % (course_types, course_types[0])
                return course_types[0]
            return course_types[0]
        except:
            print "CAUTION: encoutered error while looking up (%s)" % (all_types)
            return None
    return figer_type_func

figer_type_func = get_figer_type_func(kba)

In [16]:
%%time
# we want full w2v
w2v_500k_pickle_path = config["embeddings_cache_path"]
with open(w2v_500k_pickle_path) as f_in:
    w2vdict = pickle.load(f_in)

CPU times: user 1.77 s, sys: 348 ms, total: 2.12 s
Wall time: 2.13 s


In [17]:
default_w2v_mean = np.mean(list(w2vdict.values()), axis=0)
default_w2v_zero = np.zeros(default_w2v_mean.shape)

In [18]:
# extended w2v with lowercase letters
extended_w2vdict = {}
for w in w2vdict:
    extended_w2vdict[w] = w2vdict[w]
    if w.lower() not in w2vdict:
        extended_w2vdict[w.lower()] = w2vdict[w]

print "# words in extended_w2vdict = %d" % len(extended_w2vdict)

# words in extended_w2vdict = 804253


In [19]:
lsi_embedding_func = get_embedding_func(config, EmbeddingsType.LSI, 100)
lda_embedding_func = get_embedding_func(config, EmbeddingsType.LDA, 50)

In [20]:

import os

class GazetteerReader():
    def __init__(self, base_folder, exclude = {}, exclude_begin = {}, exclude_end = {}):
        self.base_folder = base_folder
        gzs = defaultdict(set)
        start_counter = defaultdict(list)
        end_counter = defaultdict(list)
        
        for gz_entry in os.listdir(base_folder):
            if gz_entry in exclude:
                continue
            with open(os.path.join(base_folder,gz_entry)) as inp:
                for line in inp:
                    line = line.strip()
                    gzs[gz_entry].add(line)
                    ws = line.split()
                    if gz_entry not in exclude_begin:
                        start_counter[ws[0]].append(gz_entry)
                    if gz_entry not in exclude_end:
                        end_counter[ws[-1]].append(gz_entry)

        self.gzs = defaultdict(set)
        
        for gz_entry, gz_sets in gzs.iteritems():
            
            self.gzs[gz_entry] = gz_sets
            continue
            
            none_uniq = set()
            for gz_entry_2, gz_sets_2 in gzs.iteritems():
                if gz_entry == gz_entry_2:
                    continue
                for x in gz_sets.intersection(gz_sets_2):
                    none_uniq.add(x)
                    self.gzs["%s-and-%s" % (gz_entry,gz_entry_2)].add(x)
                    
            uniq_sets = gz_sets.difference(none_uniq)
#             uniq_sets = gz_sets.difference(none_uniq)
            self.gzs[gz_entry] = uniq_sets
            
        self.begins = {}
        self.ends = {}
        
        for w, obz in start_counter.iteritems():            
            target_, _ = Counter(obz).most_common(1)[0]
            self.begins[w] = target_
            
        
        for w, obz in end_counter.iteritems():            
            target_, _ = Counter(obz).most_common(1)[0]
            self.ends[w] = target_

            
    def found_exact_match(self, surface):
        ret = [a for a,b in self.gzs.iteritems() if surface in b or surface.lower() in b]
        return ret
        
    def begin_match(self, surface):
        if surface not in self.begins:
            return []
        ret = self.begins[surface]
        return [ret]
    
    def end_match(self, surface):
        if surface not in self.ends:
            return []

        ret = self.ends[surface]
        return [ret]
    

In [21]:
gaz_reader = GazetteerReader("/home/haowu4/data/gazetteers/", 
                             exclude={"all", "GPE.filter"},
                             exclude_begin={"loc", "org", "ORG.filter"},
                             exclude_end={"loc", "org", "ORG.filter"}
                            )
# gaz_reader.gzs['org'].add("Washington State")
# gaz_reader.gzs['org'].add("San Antonio")
# gaz_reader.gzs['org'].add("Utah")
# gaz_reader.gzs['org'].add("Cal")
# gaz_reader.gzs['org'].add("Rampage")


In [22]:
gaz_reader.end_match("Balch")

['per']

In [23]:
def word_shape_func(text):
    text = re.sub("[a-z]+", "a" ,text)
    text = re.sub("[A-Z]+", "A" ,text)
    text = re.sub("[0-9]+", "0" ,text)
    return text


def gazetteer_feature(gzr):
    @FeatureFunction("gazetteer_feature")
    def f(doc, mention):
        surface = doc[mention.start:mention.end].text.strip()
#         print(surface)
        matches = gzr.found_exact_match(surface)
        return matches

    f.__name__ = "gazetteer_feature"
    
    return f


def gazetteer_begin(gzr):
    @FeatureFunction("gazetteer_begin")
    def f(doc, mention):
        surface = doc[mention.start].text.strip()
#         print(surface)
        matches = gzr.begin_match(surface)
        return matches

    f.__name__ = "gazetteer_begin"
    
    return f

def gazetteer_end(gzr):
    @FeatureFunction("gazetteer_end")
    def f(doc, mention):
        surface = doc[mention.end-1].text.strip()
#         print(surface)
        matches = gzr.end_match(surface)
        return matches

    f.__name__ = "gazetteer_end"
    
    return f

def common_cities_feature(cities_names_set):
    @FeatureFunction("common_cities_feature")
    def f(doc, mention):
        surface = doc[mention.start:mention.end].text.strip()
#         print(surface)
        if surface in cities_names_set:
            return ["1True"]
        else:
            return ["2False"]

    
    f.__name__ = "common_cities_fn"
    
    return f


def ngram_before(n):
    @FeatureFunction("%dgram_before" % n)
    def f(doc, mention):
        start, end = mention.start, mention.end
        if start - n >= 0:
            return [doc[start-n:start].text]
        else:
            return []
    f.__name__ = "%dgram_before" % n
    return f


def ngram_after(n):
    @FeatureFunction("%dgram_after" % n)
    def f(doc, mention):
        start, end = mention.start, mention.end
        if end + n <= len(doc):
            return [doc[end:end+n].text]
        else:
            return []
    f.__name__ = "%dgram_after" % n
    return f


@FeatureFunction("dep_feature")
def mention_details(doc, mention):
    start, end = mention.start, mention.end
    heads = [token.head for token in doc[start:end]]
    deps = [list(token.children) for token in doc[start:end]]
    for token, head, children in zip(doc[start:end], heads, deps):
        if not (head.i >= start and head.i < end):            
            yield "<-%s- %s" % (token.dep_, head.lemma_)
            yield "<- %s" % (head.lemma_)
        for child in children:
            if not (child.i >= start and child.i < end):
                yield  "-%s-> %s" % (child.dep_,child.lemma_)
                yield  "-> %s" % (child.lemma_)


@FeatureFunction("dep_feature_len2")
def dep_feat_len2(doc, mention):
    start, end = mention.start, mention.end
    heads = [token.head for token in doc[start:end]]
    deps = [list(token.children) for token in doc[start:end]]
    for token, head, children in zip(doc[start:end], heads, deps):
        # len 2 deps through head
        if head.head != head and not (start <= head.head.i < end):
            yield "<-%s- %s <-%s- %s" % (token.dep_, head.lemma_, head.dep_, head.head.lemma_)
            yield "<- %s <- %s" % (head.lemma_, head.head.lemma_)
        for child in head.children:
            if child == token or start <= child.i < end:
                continue
            yield "<-%s- %s -%s-> %s" % (token.dep_, head.lemma_, child.dep_, child.lemma_)
            yield "<- %s -> %s" % (head.lemma_, child.lemma_)
        # len 2 deps through children
        for child in children:
            for grandchild in child.children:
                if start <= grandchild.i < end:
                    continue
                yield  "-%s-> %s -%s-> %s" % (child.dep_, child.lemma_, grandchild.dep_, grandchild.lemma_)
                yield  "-> %s -> %s" % (child.lemma_, grandchild.lemma_)


def word_before(position):
    @FeatureFunction("word_before_%d" % position)
    def f(doc, mention):
        start, end = mention.start, mention.end
        for i in range(max(start-position,0), start):
            yield doc[i].text
    return f


def word_before_loc(position):
    @FeatureFunction("word_before_loc_%d" % position)
    def f(doc, mention):
        start, end = mention.start, mention.end
        for i in range(max(start-position,0), start):
            yield "%d-%s" % (start - i,doc[i].text)
    return f


def word_before_lemma(position):
    @FeatureFunction("word_before_lemma_%d" % position)
    def f(doc, mention):
        start, end = mention.start, mention.end
        for i in range(max(start-position,0), start):
            yield doc[i].lemma_
    return f


def word_after(position):
    @FeatureFunction("word_after_%d" % position)
    def f(doc, mention):
        start, end = mention.start, mention.end
        for i in range(end, min(end+position,len(doc))):
            yield doc[i].text
    return f


def word_after_loc(position):
    @FeatureFunction("word_after_loc_%d" % position)
    def f(doc, mention):
        start, end = mention.start, mention.end
        for i in range(end, min(end+position,len(doc))):
            yield "%d-%s" % (i - end,doc[i].text)
    return f


def word_after_lemma(position):
    @FeatureFunction("word_after_lemma_%d" % position)
    def f(doc, mention):
        start, end = mention.start, mention.end
        for i in range(end, min(end+position,len(doc))):
            yield doc[i].lemma_
        
    return f


@FeatureFunction("wim_shape")
def word_shape_in_mention(doc, mention):
    start, end = mention.start, mention.end
    for token in doc[start:end]:
        yield word_shape_func(token.text)


@FeatureFunction("wim")
def word_in_mention(doc, mention):
    start, end = mention.start, mention.end
    for token in doc[start:end]:
        yield token.text


@FeatureFunction("wim_lemma")
def word_in_mention_lemma(doc, mention):
    start, end = mention.start, mention.end
    for token in doc[start:end]:
        yield token.lemma_
        

@FeatureFunction("wim_loc")
def word_in_mention_loc(doc, mention):
    start, end = mention.start, mention.end
    for i,x in enumerate(doc[start:end]):
        yield "f%d-%s" % (i,x.text)
        yield "b%d-%s" % ((end-start-1) - i,x.text)
        

@FeatureFunction("wim_loc_lemma")
def word_in_mention_loc_lemma(doc, mention):
    start, end = mention.start, mention.end
    for i,x in enumerate(doc[start:end]):
        x = x.lemma_
        yield "f%d-%s" % (i,x)
        yield "b%d-%s" % ((end-start-1) - i,x)
    

def wim_ngram(n=2):
    @FeatureFunction("wim_%dgram" % n)
    def f(doc, mention):
        start, end = mention.start, mention.end
        words = map(lambda token: token.text, doc[start:end])
        for ngram_tup in zip(*[words[i:] for i in xrange(n)]):
            yield "-".join(ngram_tup)
    return f
        

def wim_ngram_lemma(n=2):
    @FeatureFunction("wim_%dgram_lemma" % n)
    def f(doc, mention):
        start, end = mention.start, mention.end
        words = map(lambda token: token.lemma_, doc[start:end])
        for ngram_tup in zip(*[words[i:] for i in xrange(n)]):
            yield "-".join(ngram_tup)
    return f


@FeatureFunction("mention_shape")
def mention_shape(doc, mention):
    start, end = mention.start, mention.end
    t = " ".join([x.text for x in doc[start:end]])
    return [word_shape_func(t)]
        

@FeatureFunction("mention_length")
def mention_length(doc, mention):
    start, end = mention.start, mention.end
    return ["%d" % (end-start)]
        

@FeatureFunction("is_all_cap")
def is_all_cap(doc, mention):
    surface = doc[mention.start:mention.end].text.strip()
    for c in surface:
        if c.isalpha() and c.islower():
            return ["False"]
    return ["True"]


@FeatureFunction("has_non_alphanum")
def has_non_alphanum(doc, mention):
    start, end = mention.start, mention.end
    ret = set()
    surface = doc[mention.start:mention.end].text.strip()
    for w in doc[start:end]:
        for c in w.text:
            if not c.isalnum():
                ret.add("w=%s" % w) 
                ret.add( "c=%s" % c) 
#                 break
    return list(ret)


@FeatureFunction("one_word_endding")
def one_word_endding(doc, mention):
    start, end = mention.start, mention.end
    if start - end == 1:
        return [doc[end].text[-2:]]
    else:
        return []

    

@FeatureFunction("one_word_ge2_cap")
def one_word_ge2_cap(doc, mention):
    start, end = mention.start, mention.end
    if start - end == 1:
        w = doc[start].text
        if len([c for c in w if c.isupper()]) >= 2:
            return ["True"]
    else:
        return []

#     for c in surface:
#         if not c.isalnum():
#             yield c
            
#     for i in range(max(start-position,0), start):
#         yield "%d-%s" % (start - i,doc[i].text)

#     return ["True"]

@FeatureFunction("prefix")
def prefix(doc, mention):
    start, end = mention.start, mention.end
    for w in doc[start:end]:
        for i in range(3, min(5, len(w.text))):
            yield w.text[:i]
        
@FeatureFunction("length_ge")
def length_ge(doc, mention):
    start, end = mention.start, mention.end
    for i in range(1,6):
        if end - start >= i:
            yield "ge_%d" % i
        else:
            yield "le_%d" % i
            
@FeatureFunction("suffix")
def postfix(doc, mention):
    start, end = mention.start, mention.end
    for w in doc[start:end]:
        for i in range(3, min(5, len(w.text))):
            yield w.text[-i:]


# KB-Bias features
def kbbias(kbbias_annotator):
    @FeatureFunction("kbbias-new")
    def wrappee(doc, mention):
        surface = doc[mention.start:mention.end].text
        results = None
        if surface in kbbias_annotator.surface_to_type_dist:
            results = kbbias_annotator.surface_to_type_dist[surface]
        elif (surface[:4].lower() == 'the ') and \
              surface[4:] in kbbias_annotator.surface_to_type_dist:
            results = kbbias_annotator.surface_to_type_dist[surface[4:]]
        if results:
            return results.iteritems()
        else:
            return []
    return wrappee


def get_most_sim_from_gensim(gensim_w2v, query, topn):
    if query in gensim_w2v.cached_most_sim:
        return gensim_w2v.cached_most_sim[query]
    else:
        r = list(gensim_w2v.most_similar(query, topn=topn))
        gensim_w2v.cached_most_sim[query] = r
        return r

# KB-Bias features
def kbbias_approx(kbbias_annotator, gensim_word2vec, min_sim=0.5):
    @FeatureFunction("kbbias_approx")
    def wrappee(doc, mention):
        if mention.end - mention.start != 1:
            return []
        surface = doc[mention.start:mention.end].text
        results_dist = defaultdict(float)
        total_count = 0.0
        if surface not in gensim_word2vec.vocab:
            return []
        sims = get_most_sim_from_gensim(gensim_word2vec, surface, 4)
        for approx_word, sim_score in sims:
            results = None
            if sim_score < sim_score:
                continue
            if approx_word in kbbias_annotator.surface_to_type_dist:
                results = kbbias_annotator.surface_to_type_dist[approx_word]
            elif (approx_word[:4].lower() == 'the ') and \
                  approx_word[4:] in kbbias_annotator.surface_to_type_dist:
                results = kbbias_annotator.surface_to_type_dist[approx_word[4:]]
            if results:
                for k in results:
                    results_dist[k] += results[k]                   
                total_count += 1.0
        if total_count > 0:
            for k in results_dist:
                results_dist[k] = results_dist[k]   / total_count                 
            return results_dist.iteritems()
        else:
            return []
    
    return wrappee

@FeatureFunction('has_number_inside')
def has_number_inside(doc, mention):
    start, end = mention.start, mention.end
    ret = set()
    surface = doc[mention.start:mention.end].text.strip()
    for c in surface:
        if c.isdigit():
            return ["True"]
    return ["False"]
#     for w in doc[start:end]:
#         for c in w.text:
#             if not c.isalnum():
#                 ret.add("w=%s" % w) 
#                 ret.add( "c=%s" % c) 
# #                 break
#     return list(ret)


@FeatureFunction("in_quotes")
def in_quotes(doc, mention):
    
    start, end = mention.start, mention.end
    
    if any([unidecode(doc[i].text) == '"' for i in xrange(max(0, start-2), start+1)]) and \
       any([unidecode(doc[i].text) == '"' for i in xrange(end-1, min(end+2, len(doc)))]):
            return ["in-double-quotes"]
    
    if any([unidecode(doc[i].text) == "'" for i in xrange(max(0, start-2), start+1)]) and \
       any([unidecode(doc[i].text) == "'" for i in xrange(end-1, min(end+2, len(doc)))]):
            return ["in-single-quotes"]
    
    return []


synset_typer = SynsetFineTyper(config)
take_best_sense = True


@FeatureFunction("hyp_fine_types")
def hyp_fine_type_feats(doc, mention):
    hyp_view = doc.user_data[HypPatternAnnotator.HYP_VIEW]
    nsd_view = doc.user_data[NSDView.NSD_VIEW_NAME]

    fine_type_scores = defaultdict(float)
    hyp_pattern_fine_type_scores = defaultdict(float)
    for token_constituent in hyp_view.constituents:
        # mention covers this token
        if mention.start <= token_constituent.start < mention.end:
            if token_constituent.incoming_relations is None:
                continue
            for relation in token_constituent.incoming_relations:
                # checking if source is inside the mention. could remove it too
                hypernym_token_constituent = relation.source
                if mention.start <= hypernym_token_constituent.start < mention.end:
                    continue
                # print doc[hypernym_token_constituent.start:hypernym_token_constituent.end]
                for nsd_constituent in nsd_view.constituents:
                    if nsd_constituent.start <= hypernym_token_constituent.start < nsd_constituent.end:
                        # print nsd_constituent.label2score
                        if take_best_sense:
                            synset_offset_pos, score = max(nsd_constituent.label2score.items(), key=itemgetter(1))
                            for fine_type in synset_typer.get_fine_types(synset_offset_pos):
                                fine_type_scores[fine_type] += 1.
                                hyp_pattern_fine_type_scores[relation.relation_name +"=>" + fine_type] += 1.
                        else:
                            for synset_offset_pos, score in nsd_constituent.label2score.iteritems():
                                for fine_type in synset_typer.get_fine_types(synset_offset_pos):
                                    # print fine_type
                                    fine_type_scores[fine_type] += score
                                    hyp_pattern_fine_type_scores[relation.relation_name +"=>" + fine_type] += score
    return fine_type_scores.items() + hyp_pattern_fine_type_scores.items()


@FeatureFunction("bias")
def CONSTANT_BIAS(doc, mention):
    start, end = mention.start, mention.end
    return ["bias"]

In [24]:
def getOrDefault(m, k, d):
    if k in m:
        return m[k]
    else:
        return d


def topicSentence(sentEmbeddingFunc, feature_name, size):
    @DenseFeatureFunction(size)
    def topicSentence(doc, mention):
        tokenized_doc = [token.text for token in doc]
        return sentEmbeddingFunc(tokenized_doc)
    topicSentence.__name__ = feature_name
    return topicSentence

In [25]:
def get_ngrams_matches(token_sequence, ngram_set, max_n):
    """
    From the token_sequence return all ngrams of size <= max_n.
    Any ngram should be from the given ngram_set.
    Ensure all tokens in the token_sequence is part of one and exactly one ngram.
    """
    non_overlapping_matches = []
    l = len(token_sequence)
    max_n = min(l, max_n)
    for n in xrange(max_n, 0, -1):
        i = 0
        while i <= l-n:
            if any([(match[1][0]<=i<match[1][1] or match[1][0]<=i+n<match[1][1]) for match in non_overlapping_matches]):
                # overlapping with a higher ngram. skip.
                i += n
                continue
            s = "_".join(token_sequence[i:i+n])
            if s in ngram_set:
                non_overlapping_matches.append((s, (i, i+n)))
                i += n
                continue
            s_lower = s.lower()
            if s_lower in ngram_set:
                non_overlapping_matches.append((s_lower, (i, i+n)))
                i += n
                continue
            i += 1
    return non_overlapping_matches
                

def w2vMention(w2v_dict, default_w2v, max_n=3):
    @DenseFeatureFunction(300)
    def w2vMention(doc, mention):
        l = mention.end - mention.start
        if l == 0:
            print(doc)
            print("WARNING: The length of the mention is 0!")
        mention_tokens = [token.text for token in doc[mention.start:mention.end]]
        mean_vecs = [w2v_dict[match[0]] for match in get_ngrams_matches(mention_tokens, w2v_dict, max_n)]
        return np.mean(mean_vecs, axis=0) if len(mean_vecs) else default_w2v
    return w2vMention

def w2vMentionExactMatch(w2v_dict, default_w2v, max_n=3):
    @DenseFeatureFunction(300)
    def w2vMentionExactMatch(doc, mention):
        surface = doc[mention.start:mention.end].text.replace(" ","_")
        ret = w2v_dict[surface] if surface in w2v_dict else default_w2v
        return ret
    return w2vMentionExactMatch

In [26]:
text = "The “ working ” German shepherds and Belgian malinois " + \
                 "he cared for were either attack dogs or bomb or narcotics sniffers "+ \
                 ", he said , standing in his year-old Baxter Creek Veterinary Clinic . "
token_sequence = text.decode('utf-8').split()

In [27]:
get_ngrams_matches(token_sequence, extended_w2vdict, 2)

[(u'German_shepherds', (4, 6)),
 (u'belgian_malinois', (7, 9)),
 (u'he_said', (22, 24)),
 (u'in_his', (26, 28)),
 (u'Veterinary_Clinic', (31, 33)),
 (u'The', (0, 1)),
 (u'working', (2, 3)),
 (u'he', (9, 10)),
 (u'cared', (10, 11)),
 (u'for', (11, 12)),
 (u'were', (12, 13)),
 (u'either', (13, 14)),
 (u'attack', (14, 15)),
 (u'dogs', (15, 16)),
 (u'or', (16, 17)),
 (u'bomb', (17, 18)),
 (u'or', (18, 19)),
 (u'narcotics', (19, 20)),
 (u'sniffers', (20, 21)),
 (u'Baxter', (29, 30))]

In [28]:
doc = figer_gold_docs[1]
print doc
print ""
hyp_view = doc.user_data[HypPatternAnnotator.HYP_VIEW]
nsd_view = doc.user_data[NSDView.NSD_VIEW_NAME]
print_view(doc, HypPatternAnnotator.HYP_VIEW)

The AAAS is the largest scientific society in the world and publishes journals such as Science , and Science Translational Medicine . 

journals -> [ hearst1 -> Science, hearst1 -> Medicine, hearst1 -> Science ]
Science
AAAS -> [ hearst_rev_copular -> society ]
Science
Medicine
Translational -> [ hearst_ncompmod -> Science, hearst_ncompmod -> Medicine, hearst_ncompmod -> Science ]
society -> [ hearst_copular -> AAAS ]


In [29]:
type_lex.lexeme_to_index


NameError: name 'type_lex' is not defined

In [30]:
def build_corpora(docs, type_func, skip_none=True):
    ret = []
    for doc in docs:
        for m in doc.user_data["gold_mention_view"]:
            if skip_none and type_func(m) is None:
                continue
            ret.append((doc, m))
    return ret

def build_ys_from_gold(corpora, type_lex, type_func):
    ret = []
    missing = set()
    for doc, m in corpora:
        t = type_func(m)
        if t is None:
            raise ValueError("type is None")
        type_lex.see_lexeme(t)
        idx = type_lex.getOrNegOne(t)
        if idx == -1:
            missing.add(t)
        ret.append(idx)
    for t in missing:
        print("missing label %s" % t)
    return np.array(ret)

type_lex = Lexicon()
    
ontonote_train = build_corpora(train_docs, type_func)
Y_train = build_ys_from_gold(ontonote_train, type_lex, type_func)

ontonote_test = build_corpora(test_docs, type_func)
Y_test = build_ys_from_gold(ontonote_test, type_lex, type_func)

figer_test = build_corpora(figer_docs, figer_type_func)
Y_figer = build_ys_from_gold(figer_test, type_lex, figer_type_func)

figer_gold_test = build_corpora(figer_gold_docs, figer_type_func)
Y_figer_gold = build_ys_from_gold(figer_gold_test, type_lex, figer_type_func)

new_doc_test = build_corpora(annotated_ontonote_docs, figer_type_func)
Y_new_doc = build_ys_from_gold(new_doc_test, type_lex, figer_type_func)


# annotated_ontonote_docs
# annotated_ontonote_docs

CAUTION: more than one course types -> ([u'transportation.road', u'location']). Assigining (transportation.road).
CAUTION: more than one course types -> (['building', u'location']). Assigining (building).
CAUTION: more than one course types -> (['building', u'location']). Assigining (building).
CAUTION: more than one course types -> (['building', u'location']). Assigining (building).
CAUTION: more than one course types -> ([u'building', 'organization', u'location']). Assigining (building).
CAUTION: more than one course types -> ([u'transportation.road', u'location']). Assigining (transportation.road).
CAUTION: more than one course types -> ([u'transportation.road', u'location']). Assigining (transportation.road).
CAUTION: more than one course types -> ([u'transportation.road', u'location']). Assigining (transportation.road).
CAUTION: more than one course types -> ([u'building', u'location']). Assigining (building).
CAUTION: more than one course types -> ([u'organization', u'location'])

In [31]:
# features

# sparse feature functions

sparse_feat_funcs = [
#     CONSTANT_BIAS,
#     word_shape_in_mention,
#     word_in_mention, word_in_mention_lemma,
    is_all_cap, 
    has_number_inside,
#     length_ge,
#     has_non_alphanum,
#     one_word_endding,
    mention_details,
    word_in_mention_loc,
    word_in_mention_loc_lemma,
    wim_ngram(2), wim_ngram_lemma(2),
    wim_ngram(3), wim_ngram(3),
    kbbias(kba), 
#     one_word_ge2_cap,
#     common_cities_feature(common_cities_names),
    gazetteer_feature(gaz_reader),
    gazetteer_begin(gaz_reader),
    gazetteer_end(gaz_reader),
#     kbbias_approx(kba, gensim_word_vectors),
    
    hyp_fine_type_feats,
    in_quotes,
    ngram_before(3), #ngram_before(2),
    ngram_after(3), #ngram_after(2)
    dep_feat_len2,
    w2vMentionExactMatch(extended_w2vdict, default_w2v_zero, max_n=3),
#         w2vMention(extended_w2vdict, default_w2v_mean, max_n=3),
#     topicSentence(lda_embedding_func,"lda_embedding",50),
    topicSentence(lsi_embedding_func,"lsi_embedding",100),
                    ]
#             wim_ext, wim_ext_lemma,   
#             word_shape,
#             mention_length,
#             prefix,
#             postfix,
#             mention_pronoun_wh_dep
sparse_extractor= FeatureExtractor(sparse_feat_funcs)

In [32]:
%%time
%%prun -s cumulative

force_update = True
min_support = 3
sparse_extractor.build_lexicon(ontonote_train, min_support=min_support, force_update=force_update)
X_train = sparse_extractor.extract("ontonote-notannotated", ontonote_train, force_update=force_update)
X_test = sparse_extractor.extract("ontonote-test", ontonote_test, force_update=force_update)
X_figer = sparse_extractor.extract("figer-test", figer_test, force_update=force_update)
X_figer_gold = sparse_extractor.extract("figer-gold-test", figer_gold_test, force_update=force_update)

X_new_doc = sparse_extractor.extract("new-doc", new_doc_test, force_update=force_update)

feature_names = sparse_extractor.reverse_lexicon()

index_to_feat = {i:f for i, f in enumerate(feature_names)}
index_to_type = {i:t for t, i in type_lex.lexeme_to_index.iteritems()}

 CPU times: user 55.2 s, sys: 812 ms, total: 56 s
Wall time: 55.9 s


In [33]:
{a[:5] for a in feature_names}

{u'3gram',
 u'dep_f',
 'gazet',
 'has_n',
 'hyp_f',
 'in_qu',
 'is_al',
 u'kbbia',
 'topic',
 'w2vMe',
 u'wim_2',
 u'wim_3',
 u'wim_l'}

In [34]:
# list(kbbias_approx(kba, gensim_word_vectors)(figer_docs[0], figer_docs[0].user_data["gold_mention_view"].constituents[1]))

In [35]:
figer_docs[0].user_data["gold_mention_view"].constituents[1].start

23

In [36]:
print X_train.shape
print X_test.shape
print X_figer.shape
print X_figer_gold.shape
print X_new_doc.shape

(50693, 39799)
(15554, 39799)
(561, 39799)
(577, 39799)
(7587, 39799)


# Eval Stuff

In [37]:
metric_names = ["micro-f1", "macro-f1", "weighted-f1"]
averages = ["micro", "macro", "weighted"]

def print_global_metrics_header(metric_names):
    print("%-15s %10s %10s %10s\n" % tuple(["dataset"] + metric_names))
    
def print_global_metrics(dataset_name, ys_gold, ys_pred, classes, averages):
    print("%-15s %9.2f%% %9.2f%% %9.2f%%" % tuple([dataset_name] + 
                                             [100*f1_score(ys_gold, ys_pred, classes, average=avg)
                                              for avg in averages]))

# Linear Classifier

In [38]:
%%time



def get_model_of(model_name):
    if model_name == 0:
        logreg = linear_model.SGDClassifier(
                            loss='hinge',#'modified_huber',#'hinge',#perceptron',
#                             penalty="l2",
#                             alpha=1e-6,
#                             average=True,
#                             class_weight='balanced'
                            )
#         sklearn.ensemble.BaggingClassifier(base_estimator=logreg, n_estimators=20)
        return logreg

    if model_name == 1:
        logreg = linear_model.SGDClassifier(
                            loss='hinge',#'modified_huber',#'hinge',#perceptron',
#                             penalty="l2",
#                             alpha=1e-6,
                            average=True,
#                             class_weight='balanced'
                            )
        logreg = sklearn.ensemble.BaggingClassifier(base_estimator=logreg, n_estimators=20)
        return logreg
    
logreg = get_model_of(0)

logreg.fit(X_train, Y_train)
    



CPU times: user 11.5 s, sys: 124 ms, total: 11.6 s
Wall time: 4.39 s


In [39]:
y_train_pred = logreg.predict(X_train)
y_test_pred = logreg.predict(X_test)
y_figer_pred = logreg.predict(X_figer)
y_figer_gold_pred = logreg.predict(X_figer_gold)
y_new_doc_pred = logreg.predict(X_new_doc)
classes = sorted(index_to_type.keys())

averages = ["micro", "macro", "weighted"]
print_global_metrics_header(metric_names)
print_global_metrics("train", Y_train, y_train_pred, classes, averages)
print_global_metrics("test", Y_test, y_test_pred, classes, averages)
print_global_metrics("figer", Y_figer, y_figer_pred, classes, averages)
print_global_metrics("figer-gold", Y_figer_gold, y_figer_gold_pred, classes, averages)
print("\n")

train_f1 = f1_score(Y_train, y_train_pred, classes, average=None).tolist()
test_f1 = f1_score(Y_test, y_test_pred, classes, average=None).tolist()
figer_gold_f1 = f1_score(Y_figer_gold, y_figer_gold_pred, classes, average=None).tolist()

class_names = [None] * len(index_to_type)
for i in range(len(index_to_type)):
    class_names[i] = index_to_type[i]

print("%22s : %8s %8s %8s\n" % ("TYPE", "train_f1", "test_f1", "figer_f1"))
for type_name, train_type_f1, test_type_f1, figer_gold_type_f1 in \
    sorted([(index_to_type[index], train_f1[index], test_f1[index], figer_gold_f1[index])
            for index in index_to_type], key=lambda a:a[2], reverse=True):
    print("%22s : %8.3f %8.3f %8.3f" % (type_name, train_type_f1, test_type_f1, figer_gold_type_f1)) 

dataset           micro-f1   macro-f1 weighted-f1

train               98.01%     85.60%     98.00%
test                91.22%     70.83%     91.01%
figer               86.63%     41.90%     85.21%
figer-gold          89.60%     51.29%     88.27%


                  TYPE : train_f1  test_f1 figer_f1

                 title :    0.997    0.994    0.917
      finance.currency :    0.993    0.980    0.000
                  time :    0.993    0.980    0.929
          living_thing :    0.980    0.955    0.500
              location :    0.983    0.944    0.878
                 norpl :    0.981    0.943    0.909
              medicine :    0.999    0.896    0.667
                person :    0.984    0.889    0.962
          organization :    0.967    0.865    0.891
   transportation.road :    0.988    0.741    0.000
                   law :    0.957    0.606    0.667
               product :    0.935    0.586    0.000
                 event :    0.961    0.577    0.182
              building

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


# Creating Coarse Grain View, and annotating with fine-grain data

In [40]:
from dfiner.datastructures import View, Constituent
def add_pred_view(y_pred, docs_and_mentions, pred_viewname):
    for doc, _ in docs_and_mentions:
        if "OntonoteType" in doc.user_data:
            del doc.user_data['OntonoteType']

    assert len(y_pred) == len(docs_and_mentions)
    for pred, (doc, mention) in zip(y_pred, docs_and_mentions):
        user_data = doc.user_data
        if pred_viewname not in user_data:
            user_data[pred_viewname] = View()
        view = user_data[pred_viewname]
        label_name = type_lex.reverse_lex()[pred]
        c = Constituent(mention.start,
                        mention.end,
                        name=pred_viewname,
                        label2score={label_name: 1.0})
        view.add_constituent(c)
VIEW_NAME = "OntonoteType"
add_pred_view(y_figer_pred, figer_test, VIEW_NAME)
add_pred_view(y_figer_gold_pred, figer_gold_test, VIEW_NAME)
add_pred_view(y_new_doc_pred, new_doc_test, VIEW_NAME)


In [41]:
# from gensim.models.keyedvectors import KeyedVectors
# with open("/home/haowu4/data/simple_finer/GoogleNews-vectors-negative300.combined_500k.pkl") as input_fd:
#     gensim_word_vectors = pickle.load(input_fd)
# # gensim_word_vectors = KeyedVectors.load_word2vec_format("/home/haowu4/data/simple_finer/GoogleNews-vectors-negative300.combined_500k.txt", binary=False)

kbann = KBBiasTypeAnnotator(config, "OntonoteType")
config["kba"] = kbann


In [42]:
type_lex.lexeme_to_index

{'award': 15,
 'building': 10,
 'event': 5,
 'finance.currency': 9,
 'law': 11,
 'living_thing': 12,
 'location': 0,
 'medicine': 8,
 'norpl': 4,
 'organization': 6,
 'person': 2,
 'product': 14,
 u'software': 16,
 'time': 1,
 'title': 3,
 'transportation.road': 13,
 'work': 7}

In [46]:
from collections import defaultdict


def set_to_string(set_of_type):
    labels = ["/" + s.replace(".", "/") for s in set_of_type]
    return ",".join(sorted(labels))


def id(typs):
    return [x for source, x in typs]


def to_column_format(doc, use_views, type_map_function=id):
    ret = ""
    bios = defaultdict(lambda: "O")
    typs = defaultdict(set)
    for v, use_all in use_views:
        view = doc.user_data[v]
        for c in view.constituents:
            for i in range(c.start, c.end):
                if use_all and c.label2score:
                    for t in c.label2score:
                        typs[i].add((v, t))
                else:
                    # use best
                    if c.best_label_name:
                        typs[i].add((v, c.best_label_name))
                bios[i] = "I"
            bios[c.start] = "B"
    for i, token in enumerate(doc):
        w = token.text
        if bios[i] == "O":
            if len(typs[i]) == 0:
                inc = "%s\t%s\n" % (w, "O")
                ret += inc
            else:
                raise ValueError("O tag have types..")
        else:
            if len(typs[i]) > 0:
                types = type_map_function(typs[i])
                inc = "%s\t%s-%s\n" % (w, bios[i], set_to_string(types))
                ret += inc
            else:
                raise ValueError("B-I tag have no types..")
    return ret


In [68]:
def new_rule(typs):
    ret = []
    for source, x in typs:
        if source == "KBBiasType" and x == "organization.company":
#             pass
            continue
        if x  == "ethnicity":
            x = "people.ethnicity"
            ret.append("people.ethnicity")
    
        if x == "norpl":
            x = "people.ethnicity"
#             ret.append("people")
            ret.append("people.ethnicity")
#             ret.append("location")

        if x == "work":
#             ret.append("art")
            x = "art"
            
#         if x == "transportation.road":
#             ret.append("location")
        if x == "news_agency":
            ret.append("organization.company")
        if x == "building":
            ret.append("location")
        if x == "organization.sports_league":
            ret.append("organization.company")
            
        ret.append(x)
    return set(ret)


VIEW_NAME = "OntonoteType"
add_pred_view(y_figer_pred, figer_test, VIEW_NAME)
add_pred_view(y_figer_gold_pred, figer_gold_test, VIEW_NAME)
add_pred_view(y_new_doc_pred, new_doc_test, VIEW_NAME)

def write_col_format(filename, figer_docs, use_types = [("OntonoteType", False),
                         ("MRP-FINE", False),
                         ("KBBiasType", False),
                         ("GZFineType",False),
                         ("MentionEntail", False)]):
    counter = 0

    with codecs.open(filename, "w", "utf-8") as out:
        for doc in figer_docs:
            if "OntonoteType" not in doc.user_data:
                counter += 1
                doc.user_data["OntonoteType"] = View()
            kbann(doc)
            meann(doc)
            gz_fine_ann(doc)
            mrp_ann(doc)
            for a in non_default_annotators:
                a(doc)
#             use_types = [("OntonoteType", False),
#                          ("KBBiasType", False),
#                          ("GZFineType",False),
#                          ("MentionEntail", False)]
            for t, _ in use_types:
                if t not in doc.user_data:
                    doc.user_data[t] = View()
            s = to_column_format(doc, use_types, new_rule)
    #         s = to_column_format(doc, [("OntonoteType", False)])        
            out.write(s)
            out.write("\n")
    print("%d doc do not have coarse grain new view... " % counter)

write_col_format("/tmp/eval_out/newdoc_out", annotated_ontonote_docs)

ev_finer.eval_two_file( "/tmp/big_one.label",
    "/tmp/eval_out/newdoc_out")

# write_col_format("/home/haowu4/.py_cache/figer_gold_docs.out", figer_gold_docs)


Not found type organization or body_part
Not found type person or body_part
Not found type person or body_part
Not found type person or body_part
Not found type building or body_part
Not found type time or body_part
Not found type person or body_part
Not found type person or body_part
Not found type person or body_part
Not found type person or body_part
Not found type organization or body_part
Not found type organization or body_part
Not found type organization or body_part
Not found type person or body_part
Not found type organization or body_part
Not found type event or body_part
Not found type organization or body_part
Not found type organization or body_part
Not found type organization or body_part
Not found type organization or body_part
Not found type organization or body_part
Not found type building or body_part
Not found type organization or body_part
Not found type organization or body_part
Not found type location or body_part
Not found type person or body_part
Not found type 

In [None]:

write_col_format("/tmp/eval_out/newdoc_out_coarse", annotated_ontonote_docs, 
                             use_types = [
                              ("OntonoteType", False),
#                               ("MRP-FINE", False),                              
#                               ("KBBiasType", False),
#                               ("GZFineType",False),
#                               ("MentionEntail", False)
                             ])

ev_finer.eval_two_file( "/tmp/big_one.label",
    "/tmp/eval_out/newdoc_out_coarse")

In [None]:

write_col_format("/tmp/eval_out/organized/outs/figer.out", figer_docs)

write_col_format("/tmp/eval_out/figer_docs-noKB.out",
                 figer_docs,
                 use_types = [("OntonoteType", False),
                             ("MRP-FINE", False),                              
#                               ("KBBiasType", False),
#                               ("GZFineType",False),
                              ("MentionEntail", False)
                             ])

write_col_format("/tmp/eval_out/figer_docs-noHyp.out",
                 figer_docs,
                 use_types = [("OntonoteType", False),
                              ("MRP-FINE", False),                              
                              ("KBBiasType", False),
                              ("GZFineType",False),
#                               ("MentionEntail", False)
                             ])

write_col_format("/tmp/eval_out/figer_docs-noMenPt.out",
                 figer_docs,
                 use_types = [("OntonoteType", False),
#                                 ("MRP-FINE", False),                              
                              ("KBBiasType", False),
                              ("GZFineType",False),
                              ("MentionEntail", False)
                             ])


write_col_format("/tmp/eval_out/figer_docs-onlyKB.out",
                 figer_docs,
                 use_types = [("OntonoteType", False),
#                              ("MRP-FINE", False),                              
                              ("KBBiasType", False),
                              ("GZFineType",False),
#                               ("MentionEntail", False)
                             ])

write_col_format("/tmp/eval_out/figer_docs-onlyHyp.out",
                 figer_docs,
                 use_types = [("OntonoteType", False),
#                               ("MRP-FINE", False),                              
#                               ("KBBiasType", False),
#                               ("GZFineType",False),
                              ("MentionEntail", False)
                             ])

write_col_format("/tmp/eval_out/figer_docs-onlyMenPt.out",
                 figer_docs,
                 use_types = [("OntonoteType", False),
                                ("MRP-FINE", False),                              
#                               ("KBBiasType", False),
#                               ("GZFineType",False),
#                               ("MentionEntail", False)
                             ])

write_col_format("/tmp/eval_out/figer_docs-nofine.out",
                 figer_docs,
                 use_types = [("OntonoteType", False),
#                                 ("MRP-FINE", False),                              
#                               ("KBBiasType", False),
#                               ("GZFineType",False),
#                               ("MentionEntail", False)
                             ])

write_col_format("/tmp/eval_out/figer_docs-nocoarse.out",
                 figer_docs,
                 use_types = [#("OntonoteType", False),
                                ("MRP-FINE", False),                              
                              ("KBBiasType", False),
                              ("GZFineType",False),
                              ("MentionEntail", False)
                             ])


In [None]:
for c in figer_docs[0].user_data["fine_type_view"].constituents:
    print c.start, c.end, c.label2score

In [None]:

[figer_type_func(c) for c in figer_docs[0].user_data["gold_mention_view"].constituents]

In [None]:
GOLD_FIGER_PATH = "/home/haowu4/codes/dataless_finer/python/eval_output/organized/gold/figer.xiang.label"

ev_finer.eval_two_file( GOLD_FIGER_PATH,
    "/tmp/eval_out/figer.out")

ev_finer.eval_two_file( GOLD_FIGER_PATH,
    "/tmp/eval_out/figer_docs-noKB.out")

ev_finer.eval_two_file( GOLD_FIGER_PATH,
    "/tmp/eval_out/figer_docs-noHyp.out")

ev_finer.eval_two_file( GOLD_FIGER_PATH,
    "/tmp/eval_out/figer_docs-noMenPt.out")


ev_finer.eval_two_file( GOLD_FIGER_PATH,
    "/tmp/eval_out/figer_docs-onlyKB.out")

ev_finer.eval_two_file( GOLD_FIGER_PATH,
    "/tmp/eval_out/figer_docs-onlyHyp.out")

ev_finer.eval_two_file( GOLD_FIGER_PATH,
    "/tmp/eval_out/figer_docs-onlyMenPt.out")

ev_finer.eval_two_file( GOLD_FIGER_PATH,
    "/tmp/eval_out/figer_docs-nofine.out")

In [65]:
import dfiner.eval.eval as ev_finer
reload(ev_finer)

ev_finer.eval_two_file(config["figer_path"], "/tmp/eval_out/figer_docs.out")
print("----------------------------------------------------------------")
ev_finer.eval_two_file("/home/haowu4/codes/dataless_finer/python/eval_output/xiang_ren_figer_gold.label", "/tmp/eval_out/figer_docs.out")
print("----------------------------------------------------------------")


# ev_finer.eval_two_file("/home/haowu4/codes/dataless_finer/python/eval_output/xiang_ren_figer_gold.label",
#                        "/home/haowu4/codes/dataless_finer/python/eval_output/organized/abelation/figer_docs-noMenPt.out")
print("----------------------------------------------------------------")

# ev_finer.eval_two_file(config['figer_gold'], "/home/haowu4/.py_cache/figer_gold_docs.out")
# print("----------------------------------------------------------------")
# ev_finer.eval_two_file("/home/haowu4/.py_cache/figer_coarse_gold.out", "/home/haowu4/.py_cache/figer_coarse_theirs.out")

# print("----------------------------------------------------------------")
# ev_finer.eval_two_file("/home/haowu4/.py_cache/figer_coarse_gold.out", "/home/haowu4/.py_cache/figer_docs_coarse.out")



IOError: [Errno 2] No such file or directory: '/tmp/eval_out/figer_docs.out'

In [44]:
from gensim.models.keyedvectors import KeyedVectors
gensim_word_vectors = KeyedVectors.load_word2vec_format("/home/haowu4/data/simple_finer/GoogleNews-vectors-negative300.combined_500k.txt", binary=False)


In [48]:
#gensim_word_vectors.most_similar("New York")

In [50]:
#gensim_word_vectors.most_similar_cosmul("Zarowsky")

In [51]:
%%timeit

"A" in gensim_word_vectors

The slowest run took 33.32 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 301 ns per loop


In [52]:
len(extended_w2vdict)

804253

In [53]:
gensim_word_vectors.cached_most_sim = {}

In [54]:
kbann.surface_to_type_dist["NBA"]

{u'broadcast_program': 0.0017206521271561923,
 u'news_agency': 0.33268808878564976,
 u'organization.company': 0.3326450724824709,
 u'organization.sports_league': 0.33281713769518645,
 u'organization.sports_team': 4.30163031789048e-05,
 u'person.author': 8.60326063578096e-05}

In [55]:
from nltk.corpus import wordnet as wn
from dfiner.datastructures import View, Constituent
from dfiner.annotators.fine_type_annotator import SynsetFineTyper
from sklearn.metrics.pairwise import cosine_similarity

from dfiner.types.finer_type_system import FinerTypeSystem

from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

def best_k_label(label2score, k):
    min_v = sorted(label2score.values(), reverse=True)[k-1]
    return [x for x in label2score if label2score[x] >= min_v]


def embedding_of_phrase(gensim_w2v, doc, start, end):
    base_vec = np.zeros(300)
    counter = 0.0
    for w in doc[start: end]:
        w = w.text
        if w == "UW":
            w = "University"
        if w.lower() in stopWords:
            continue            
        if w in gensim_w2v:
            b = gensim_w2v[w]
            base_vec += b
            counter += 1.0
    return base_vec


class MentionEntailmentAnnotator(object):
    TYPE_NAME = "MentionEntail"
    def __init__(self,
                 config,
                 gensim_w2v,
                 min_cosine=0.4,
                 mention_view ="OntonoteType"
                 ):
        self.typer = SynsetFineTyper(config)
        self.min_cosine = min_cosine
        self.gensim_w2v = gensim_w2v
        self.type_system = FinerTypeSystem.load_type_system(config)
        self.mention_view = mention_view

    def __call__(self, doc):
        new_view = View()
        view = doc.user_data[self.mention_view]
        for constituent in view.constituents:
            start = constituent.start
            end = constituent.end
            types = set()
            max_sim = defaultdict(float)
            mention_vec = embedding_of_phrase(self.gensim_w2v, doc, start, end)
            mention_vec = embedding_of_phrase(self.gensim_w2v, doc, 0, len(doc))
            
            log_ = False
            for w in doc[max(start-1, 0): end]:
#                 print "Searching ... ", w
                
                if w.text == "UW":
                    text_ = "University"
                    log_ = True
                else:
                    text_ = w.text
                    
                for x in wn.synsets(text_):
                    if x.pos() == "n":
                        def_doc = nlp.make_doc(x.definition())
                        def_vec = embedding_of_phrase(self.gensim_w2v, def_doc, 0, len(def_doc))
                        sim_score = cosine_similarity(def_vec.reshape(1,-1), mention_vec.reshape(1,-1))[0,0]                        
                        tps_w = self.typer.get_fine_types("%d_n" % x.offset())
#                         if log_:
#                             print(tps_w)
                        for t in tps_w:
                            types.add(t)
                            max_sim[t] = max(max_sim[t], sim_score)
#                         print(x)
#                         print(x.definition())
#                         print(sim_score)
#                         print(types)
#                         print(" ")
            if len(constituent.label2score) == 0:
                continue
                
#             kk = max(self.trust_k, len(constituent.label2score))
#             print(kk)
#             ls = set(best_k_label(constituent.label2score, kk))
#             if log_:
#                 print(types)
#                 print(max_sim)
#             print("111",ls)
            ls = constituent.label2score.keys()
            mx_label2score = {}
#             print("\n"*3)
#             print("Fine types", types)
#             print("Coarse types", ls)
            for fine_type in types:
                for coarse_type in ls:
                    try:
#                         print("Checking [%s],[%s]" % (fine_type, coarse_type) )
                        check = self.type_system.a_belongs_to_b(fine_type, coarse_type)
#                         print check
                        if check:
#                             print("!!! Checking %s,%s" % (fine_type, coarse_type) )
                            if max_sim[fine_type] > self.min_cosine:
                                mx_label2score[fine_type] = max_sim[fine_type] 
                    except KeyError:
                        print("Not found type %s or %s" % (coarse_type, fine_type))
                        continue
#             print(mx_label2score)
#             if log_:
#                 print(mx_label2score)
#                 print(max_sim)
            if len(mx_label2score) > 0:
                c = Constituent(start,
                                end,
                                self.TYPE_NAME,
                                label2score=mx_label2score)
                new_view.add_constituent(c)
        doc.user_data[self.TYPE_NAME] = new_view

meann = MentionEntailmentAnnotator(config, gensim_word_vectors)

In [56]:
# kbann

In [57]:
# gensim_word_vectors.most_similar("Cougars")

In [58]:
# kbann.surface_to_type_dist[u'AAAS']

In [59]:
import codecs
import gzip
import json
from dfiner.datastructures import View, Constituent
from dfiner.types.finer_type_system import FinerTypeSystem
import pandas as pd


class KBBiasTypeAnnotator(object):

    TYPE_NAME = "KBBiasType"

    @staticmethod
    def load_surface_to_typedist(fname):
        ret = {}
        with gzip.open(fname, 'rb') as zf:
            reader = codecs.getreader("utf-8")
            contents = reader(zf)
            for line in contents:
                obj = json.loads(line)
                ret[obj['surface']] = obj["type_dist"]
        return ret

    def __init__(self,
                 config,
                 mention_view="coarse_type"):
        # surface_to_type_dist maps
        #       (surface, coarse type) => fine type to fine type.
        self.surface_to_type_dist = self.load_surface_to_typedist(
            config["mention_to_type_dist"])
        self.coarse_view_name = mention_view
        self.config = config
        self.type_system = FinerTypeSystem.load_type_system(config)
        # self.cities_names = set(pd.read_csv(config["common_us_city_path"], names=["Name", "State", "Pop"])["Name"].tolist())

    def __call__(self, doc):
        new_view = View()
        view = doc.user_data[self.coarse_view_name]
        for constituent in view.constituents:
            start = constituent.start
            end = constituent.end
            coarse_type = constituent.best_label_name
            surface = doc[start:end].text
            # if surface in self.cities_names:
            #         c = Constituent(start,
            #                         end,
            #                         self.TYPE_NAME,
            #                         label2score={"location.city": 1.0})
                    
                    # new_view.add_constituent(c)
            try:
                type_dist = self.surface_to_type_dist[surface]
                fine_type_name = self.pick_fine_type_or_none(type_dist,
                                                             coarse_type)
                if fine_type_name:
                    c = Constituent(start,
                                    end,
                                    self.TYPE_NAME,
                                    label2score={fine_type_name: 1.0})
                    new_view.add_constituent(c)
            except KeyError:
                continue

        doc.user_data[self.TYPE_NAME] = new_view

    def pick_fine_type_or_none(self, type_dist, coarse_type):
        consistent_types = {}
        max_prob = 0.0
        rescale = 0.0
        best_type = None
        for t in type_dist:
            if self.type_system.a_belongs_to_b(t, coarse_type):
                p = type_dist[t]
                rescale += p
                if p > max_prob:
                    max_prob = p
                    best_type = t
                consistent_types[t] = p

        if len(consistent_types) == 0:
            return None

        if len(consistent_types) == 1:
            if max_prob > 0.4:
                return best_type
            else:
                return None

        sorted_entry = sorted(consistent_types.keys(),
                              key=lambda x: consistent_types[x],
                              reverse=True)
        second_best_key = sorted_entry[1]

        if (max_prob - consistent_types[second_best_key]) / rescale > 0.8:
            return best_type

        best_key = sorted_entry[1]
        if best_key == "organization.company":
            if consistent_types[second_best_key] > 0.35:
                return second_best_key

        if second_best_key == "organization.company":
            return best_key


if __name__ == '__main__':
    pass

kbann = KBBiasTypeAnnotator(config, "OntonoteType")
config["kba"] = kbann
kbann.surface_to_type_dist["Xinhua News Agency"] = {'news_agency': 1.0}

In [66]:
import codecs
import gzip
import json
from dfiner.datastructures import View, Constituent
from dfiner.types.finer_type_system import FinerTypeSystem
import pandas as pd
import os

class GZFineTypeAnnotator(object):

    TYPE_NAME = "GZFineType"

    @staticmethod
    def load_gzs(base_folder, exclude = {}):
        surface_to_type = {}
        
        for gz_entry in os.listdir(base_folder):
            if gz_entry in exclude:
                continue
            with open(os.path.join(base_folder,gz_entry)) as inp:
                for line in inp:
                    line = line.strip()
                    if line not in surface_to_type:
                        surface_to_type[line] = gz_entry
                    else:
                        surface_to_type[line] = None
        return surface_to_type

    def __init__(self,
                 config,
                 mention_view="OntonoteType"):
        # surface_to_type_dist maps
        #       (surface, coarse type) => fine type to fine type.
        self.surface_to_type = self.load_gzs(
            config["fine_gz_base"])
        self.coarse_view_name = mention_view
        self.config = config
        self.type_system = FinerTypeSystem.load_type_system(config)
        # self.cities_names = set(pd.read_csv(config["common_us_city_path"], names=["Name", "State", "Pop"])["Name"].tolist())

    def __call__(self, doc):
        new_view = View()
        view = doc.user_data[self.coarse_view_name]
        for constituent in view.constituents:
            start = constituent.start
            end = constituent.end
            coarse_type = constituent.best_label_name
            surface = doc[start:end].text

            try:
                typ = self.surface_to_type[surface]
                if typ:
                    if self.type_system.a_belongs_to_b(typ, coarse_type):
                        c = Constituent(start,
                                        end,
                                        self.TYPE_NAME,
                                        label2score={typ: 1.0})
                        new_view.add_constituent(c)
            except KeyError:
                continue

        doc.user_data[self.TYPE_NAME] = new_view



if __name__ == '__main__':
    pass

config["fine_gz_base"] = "/home/haowu4/data/simple_finer/fine_type_gazetteers"
gz_fine_ann = GZFineTypeAnnotator(config)

In [67]:
gz_fine_ann.surface_to_type["NFL"]

'organization.sports_league'

In [62]:
import codecs
import gzip
import json
from dfiner.datastructures import View, Constituent
from dfiner.types.finer_type_system import FinerTypeSystem
import pandas as pd
import os
import cPickle as pickle
class MentionRegexPatternTypeAnnotator(object):

    TYPE_NAME = "MRP-FINE"

    @staticmethod
    def load_patterns(path):
        with open(path, "rb") as inp:
            return pickle.load(inp)
    
    @staticmethod
    def extract_all_pattern(tokens, ks = range(1,5)):
        for i, w in enumerate(tokens):
            word_before = i
            for k in ks:            
                word_after = len(tokens) - i - k
                if word_after < 0:
                    break
                yield (word_before,tuple(tokens[i:i+k])  , word_after)

    
    def __init__(self,
                 config,
                 mention_view="OntonoteType"):
        # surface_to_type_dist maps
        #       (surface, coarse type) => fine type to fine type.
        self.pattern_db = self.load_patterns(
            config["pattern_db_path"])
        self.coarse_view_name = mention_view
        self.config = config
        self.type_system = FinerTypeSystem.load_type_system(config)
        # self.cities_names = set(pd.read_csv(config["common_us_city_path"], names=["Name", "State", "Pop"])["Name"].tolist())

    def __call__(self, doc):
        new_view = View()
        view = doc.user_data[self.coarse_view_name]
        for constituent in view.constituents:
            start = constituent.start
            end = constituent.end
            coarse_type = constituent.best_label_name
            
            surface = [x.text for x in doc[start:end]]
            
            if len(surface) == 1 and surface[0] == "UW":
                surface = ["University", "of" ,"Washington"]
#             all_pat = extract_all_pattern(surface)

            candidate_types = defaultdict(float)

            for pat in self.extract_all_pattern(surface):
                if pat in self.pattern_db:
                    for candidate, sup in self.pattern_db[pat]:
                        try:
                            if self.type_system.a_belongs_to_b(candidate, coarse_type):
                                candidate_types[candidate] += sup
                        except:
                            continue

            if len(candidate_types) > 0:
#                 print(surface)
                c = Constituent(start,
                                end,
                                self.TYPE_NAME,
                                label2score=candidate_types)

                new_view.add_constituent(c)

        doc.user_data[self.TYPE_NAME] = new_view



if __name__ == '__main__':
    pass

config["pattern_db_path"] = "/tmp/pat_dump"

mrp_ann = MentionRegexPatternTypeAnnotator(config)


for i in range(5):
    for j in range(3):
        mrp_ann.pattern_db[(i,("Inc.",),j)].add(('organization.company', 0.014499239237447418))
        mrp_ann.pattern_db[(i,("LLC",),j)].add(('organization.company', 0.014499239237447418))
        mrp_ann.pattern_db[(i,("Co.",),j)].add(('organization.company', 0.014499239237447418))
        mrp_ann.pattern_db[(i,("AG",),j)].add(('organization.company', 0.014499239237447418))
        mrp_ann.pattern_db[(i,("GmbH",),j)].add(('organization.company', 0.014499239237447418))
        mrp_ann.pattern_db[(i,("Corp.",),j)].add(('organization.company', 0.014499239237447418))



In [63]:
for k in mrp_ann.extract_all_pattern(["Viatech", "Inc."]):
    print k
    if k in mrp_ann.pattern_db:
        for candidate, sup in mrp_ann.pattern_db[k]:
            print candidate

(0, ('Viatech',), 1)
(0, ('Viatech', 'Inc.'), 0)
(1, ('Inc.',), 0)
organization.company


NameError: name 'mrp_ann' is not defined