In [61]:
import os
from collections import defaultdict, OrderedDict
import json
import codecs
import numpy as np
from scipy.sparse import coo_matrix, hstack
from nltk.stem import WordNetLemmatizer
import re
import spacy
import cPickle as pickle
from unidecode import unidecode
from dfiner.utils import get_default_config
from dfiner.kb_bias.kb_bias_annotator import KBBiasTypeAnnotator

nlp = spacy.load('en')

class Mention(object):
    """A labelled token span inside one tokenised sentence.

    Besides the raw span, the constructor stores the span's surface tokens,
    a surface widened by one token on each side (clipped to the sentence),
    and a spaCy document built from the pre-tokenised sentence on which the
    tagger and the parser have been run.
    """

    def __init__(self, start, end, label, tokens, doc_id = ""):
        self.start = start
        self.end = end
        self.label = label
        self.tokens = tokens
        self.surface = [tokens[idx] for idx in range(start, end)]

        # Same span widened by one token per side, clipped to the sentence.
        widened_from = max(start - 1, 0)
        widened_to = min(end + 1, len(tokens))
        self.ext_surface = [tokens[idx] for idx in range(widened_from, widened_to)]

        self.doc_id = doc_id

        # Annotate the already-tokenised sentence with the module-level pipeline.
        parsed = nlp.tokenizer.tokens_from_list(tokens)
        nlp.tagger(parsed)
        nlp.parser(parsed)
        self.doc = parsed

    def __repr__(self):
        """Return the mention as an indented JSON object with a fixed key order."""
        fields = OrderedDict()
        fields["start"] = self.start
        fields["end"] = self.end
        fields["surface"] = self.surface
        fields["label"] = self.label
        fields["doc_id"] = self.doc_id
        fields["tokens"] = self.tokens
        return json.dumps(fields, indent=4)
        
def read_figer(file = "/home/haowu4/codes/dataless_finer/eval_figer/data/gold_cleaned_figer_test.label"):
    """Read a tab-separated ``word<TAB>label`` file and return its mentions.

    Sentences are separated by blank lines. Every sentence is handed to
    ``get_sentence`` and the mentions it returns are collected into one flat
    list.

    :param file: path of the UTF-8 encoded label file
    :return: flat list of the mentions found in all sentences
    """
    mentions = []
    tokens = []
    labels = []
    with codecs.open(file, "r", "utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Blank line: close the sentence collected so far, if any.
                if tokens:
                    mentions.extend(get_sentence(tokens, labels))
                    tokens = []
                    labels = []
                continue
            fields = line.split("\t")
            tokens.append(fields[0])
            labels.append(fields[1])

    if tokens:
        # The file did not end with a blank line. Flatten the last sentence's
        # mentions exactly like the ones above; appending the list itself
        # would leave a nested list among the mentions.
        mentions.extend(get_sentence(tokens, labels))
    return mentions

    
        
def get_sentence(tokens, labels):
    """Turn one sentence's per-token labels into a list of ``Mention`` objects.

    A label starting with "B" opens a new mention (closing any open one) and
    takes the text between the first and second "-" as the mention label.
    The label "O" closes an open mention. Any other label leaves the current
    state untouched. A mention still open at the end of the sentence is
    closed at ``len(labels)``.

    :param tokens: tokens of the sentence
    :param labels: one label string per token
    :return: list of mentions, in order of appearance
    """
    found = []
    open_start = None   # start index of the mention being built, if any
    open_label = ""
    for pos, (_, tag) in enumerate(zip(tokens, labels)):
        begins = tag.startswith("B")
        if (begins or tag == "O") and open_start is not None:
            found.append(Mention(open_start, pos, open_label, tokens))
            open_start = None
        if begins:
            open_start = pos
            open_label = tag.split("-")[1]

    if open_start is not None:
        found.append(Mention(open_start, len(labels), open_label, tokens))

    return found


def load_ontonotes(file):
    """Read one tab-separated column file and return its mentions.

    The first line of the file is skipped. On every other non-blank line the
    label is taken from column 0 and the word from column 5. Blank lines
    separate sentences; each sentence is handed to ``get_sentence`` and the
    mentions it returns are collected into one flat list.

    :param file: path of the UTF-8 encoded column file
    :return: flat list of the mentions found in all sentences
    """
    mentions = []
    tokens = []
    labels = []
    with codecs.open(file, "r", "utf-8") as handle:
        for line_no, line in enumerate(handle):
            if line_no < 1:
                # First line is not data.
                continue
            line = line.strip()
            if not line:
                # Blank line: close the sentence collected so far, if any.
                if tokens:
                    mentions.extend(get_sentence(tokens, labels))
                    tokens = []
                    labels = []
                continue
            fields = line.split("\t")
            tokens.append(fields[5])
            labels.append(fields[0])

    if tokens:
        # The file did not end with a blank line. Flatten the last sentence's
        # mentions like the ones above; appending the list itself would put a
        # list (which has no ``doc_id``) among the mentions.
        mentions.extend(get_sentence(tokens, labels))
    return mentions


def load_all_data(base_folder):
    """Load every file of a folder with ``load_ontonotes``.

    Each loaded mention gets the name of the file it came from as ``doc_id``.

    :param base_folder: directory whose entries are all read
    :return: list of mentions from all files
    """
    collected = []
    for name in os.listdir(base_folder):
        path = os.path.join(base_folder, name)
        for mention in load_ontonotes(path):
            mention.doc_id = name
            collected.append(mention)
    return collected

class Lexicon(object):
    """Maps feature strings to consecutive integer ids and counts them.

    ``m`` holds feature -> id, ``curr`` is the next free id, ``counter``
    counts how often each feature was seen and ``counter_per_type`` counts
    sightings per (feature, type) pair.
    """

    def __init__(self):
        self.counter = defaultdict(int)
        self.counter_per_type = defaultdict(lambda: defaultdict(int))
        self.m = {}
        self.curr = 0

    def see_feature(self, f, t = None):
        """Count one sighting of ``f`` (for type ``t`` if truthy); assign an id on first sight."""
        self.counter[f] += 1
        if t:
            per_type = self.counter_per_type[f]
            per_type[t] += 1
        if f in self.m:
            return
        self.m[f] = self.curr
        self.curr += 1

    def prune(self, min_support):
        """Rebuild the id map keeping only features seen more than ``min_support`` times."""
        kept = [feat for feat in self.counter if self.counter[feat] > min_support]
        self.m = {feat: idx for idx, feat in enumerate(kept)}
        self.curr = len(kept)

    def getOrNegOne(self, f):
        """Return the id of ``f``, or -1 when it has none."""
        return self.m.get(f, -1)

    def getOneHot(self, f):
        """Return a vector of length ``curr`` with 1.0 at the id of ``f`` (all zeros if unknown)."""
        vec = np.zeros((self.curr))
        idx = self.m.get(f)
        if idx is not None:
            vec[idx] = 1.0
        return vec
    
def loadW2V(w2v_file, allowed = None):
    """Load word vectors from a text file into a ``{word: np.array}`` dict.

    Each non-blank line must be ``word<TAB>v1 v2 ...`` with the components
    separated by single spaces. Lines that do not split into exactly two
    tab-separated fields are skipped and counted; the count is printed.

    :param w2v_file: path of the UTF-8 encoded vector file
    :param allowed: optional container of words to keep; ``None`` (the
        default) keeps every word
    :return: dict mapping word to its vector
    """
    ret = {}
    err = 0
    with codecs.open(w2v_file, "r", "utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            try:
                w, vec = line.split("\t")
            except ValueError:
                err += 1
                continue
            # ``allowed is None`` means "no filter". The previous test
            # (``is not None and ...``) silently dropped every word then.
            if allowed is None or w in allowed:
                ret[w] = np.array([float(v) for v in vec.split(" ")])
    print("%d line failed" % err)
    return ret

def generate_vecs(objs, 
                  typ_function,
                  feature_func,
                  dense_real_vec_features = None,
                  lex = None,
                  type_lex = None,
                  min_support = 7
                 ):
    """Build the design matrix and label vector for a list of mentions.

    :param objs: sized sequence of objects exposing ``doc``, ``start``, ``end``
    :param typ_function: maps an object to its label
    :param feature_func: list of callables ``ff(doc, start, end)`` that
        produce ``(feature_name, value)`` pairs
    :param dense_real_vec_features: optional list of callables ``dff(obj)``
        returning arrays; they are concatenated per object and appended as
        extra columns after the sparse features. ``None`` or an empty list
        means "no dense columns".
    :param lex: feature lexicon to reuse; when ``None`` a new one is built
        from ``objs`` and pruned with ``min_support``
    :param type_lex: label lexicon to reuse; when ``None`` a new one is
        built from ``objs``
    :param min_support: prune threshold used only when ``lex`` is built here
    :return: ``(lex, type_lex, X, y)`` with ``X`` a CSR matrix and ``y`` an
        array of label ids (-1 for labels missing from ``type_lex``)
    """
    if dense_real_vec_features is None:
        dense_real_vec_features = []

    len_x = len(objs)

    if lex is None:
        lex = Lexicon()
        for x in objs:
            for ff in feature_func:
                for k, v in ff(x.doc, x.start, x.end):
                    lex.see_feature(k, typ_function(x))
        lex.prune(min_support)

    if type_lex is None:
        type_lex = Lexicon()
        for x in objs:
            type_lex.see_feature(typ_function(x))

    row_ids = []
    col_ids = []
    vs = []
    dense_vecs = []
    ys = []

    for i, x in enumerate(objs):
        for ff in feature_func:
            for k, v in ff(x.doc, x.start, x.end):
                idx = lex.getOrNegOne(k)
                # Features missing from the lexicon are dropped.
                if idx > -1:
                    row_ids.append(i)
                    col_ids.append(idx)
                    vs.append(v)

        if dense_real_vec_features:
            dense_vecs.append(np.hstack([dff(x) for dff in dense_real_vec_features]))

        ys.append(type_lex.getOrNegOne(typ_function(x)))

    sp = (len_x, lex.curr)
    print(sp)
    xs = coo_matrix((vs, (row_ids, col_ids)), shape=sp)

    # Only touch the dense block when there is one: indexing or stacking an
    # empty list would raise.
    if dense_vecs:
        print("dense_vecs[0].shape", dense_vecs[0].shape)
        dense_vecs = np.vstack(dense_vecs)
        print("dense_vecs.shape", dense_vecs.shape)
        print("shapes : ", xs.shape, dense_vecs.shape, )
        # Wrap the dense block so that both operands of hstack are sparse.
        xs = hstack((xs, coo_matrix(dense_vecs)))
    return lex, type_lex, xs.tocsr(), np.asarray(ys)


In [62]:
def load_cache_or_none(filepath):
    """Return the object pickled at ``filepath``, or ``None``.

    The unconditional ``return None`` below switches caching off, so every
    caller currently recomputes its data; the code after it is not reached.
    """
    return None
    if not os.path.isfile(filepath):
        return None
    with open(filepath, 'rb') as cache_file:
        cached = pickle.load(cache_file)
    return cached


def save_cache(obj, filepath):
    """Pickle ``obj`` to ``filepath``.

    The unconditional ``return None`` below switches caching off, so nothing
    is written; the code after it is not reached.
    """
    return None
    with open(filepath, 'wb') as cache_file:
        pickle.dump(obj, cache_file, pickle.HIGHEST_PROTOCOL)


# --- Load the OntoNotes train+dev mentions (from cache when available) ---
# NOTE(review): load_cache_or_none / save_cache start with `return None`,
# so as written the data is always re-read from the column files.
train_mentions_cache_file = "./OntonoteNERCache/TrainAndDev.pkl"
train_mentions = load_cache_or_none(train_mentions_cache_file)
if train_mentions is None:
    train_mentions= load_all_data("/home/haowu4/data/ontonotes_ner/ColumnFormat/TrainAndDev/")
    save_cache(train_mentions, train_mentions_cache_file)


# --- Same for the OntoNotes test split ---
test_mentions_cache_file = "./OntonoteNERCache/Test.pkl"
test_mentions = load_cache_or_none(test_mentions_cache_file)
if test_mentions is None:
    test_mentions = load_all_data("/home/haowu4/data/ontonotes_ner/ColumnFormat/Test/")
    save_cache(test_mentions, test_mentions_cache_file)


# FIGER mentions, read from read_figer's default path.
figer_datas = read_figer()


# Collect every token that occurs in any sentence containing a mention, so
# that only the word vectors for those tokens need to be loaded.
VOCABS = set()
mention_counter = 0
# `mention` is a whole list of mentions here; `men` is a single mention.
for mention in [train_mentions,test_mentions, figer_datas]:
    for men in mention:
        mention_counter += 1
        for t in men.tokens:
            VOCABS.add(t)
print("%d mention loaded" % mention_counter)

print("%d word loaded" % len(VOCABS))

# Word vectors restricted to VOCABS (again cached in principle).
w2vdict_file_path = "./OntonoteNERCache/w2vdict.pkl"
w2vdict = load_cache_or_none(w2vdict_file_path)
if w2vdict is None:
    w2vdict=loadW2V("/home/haowu4/data/autoextend/GoogleNews-vectors-negative300.combined_500k.txt", VOCABS)
    save_cache(w2vdict, w2vdict_file_path)

print("%d words have w2v" % len(w2vdict))

# Two candidate stand-ins for words without a vector: the mean of all loaded
# vectors, and an all-zero vector of the same shape.
default_w2v_mean = np.mean(list(w2vdict.values()), axis=0)
default_w2v_zero = np.zeros(default_w2v_mean.shape)

159298 mention loaded
52467 word loaded
1 line failed
41459 words have w2v


In [4]:
# test_mentions[100]

In [196]:
# Define features:

# from nltk.stem.wordnet import WordNetLemmatizer
# lmtzr = WordNetLemmatizer().lemmatize


def word_shape_func(text):
    """Collapse ``text`` to its shape.

    Every run of lowercase ASCII letters becomes "a", every run of uppercase
    ASCII letters becomes "A" and every run of digits becomes "0"; all other
    characters are kept.
    """
    collapse_rules = (("[a-z]+", "a"), ("[A-Z]+", "A"), ("[0-9]+", "0"))
    for char_class, symbol in collapse_rules:
        text = re.sub(char_class, symbol, text)
    return text
    

class FeatureFunc(object):
    """Decorator that turns a feature generator into a binary feature function.

    The decorated function must return an iterable of feature values. The
    wrapper yields ``("<name>=<value>", 1.0)`` for each of them, where
    ``<name>`` is the decorator's name passed through ``unidecode``.
    """

    def __init__(self, name):
        self.name = name

    def __call__(self, original_func):
        def wrappee(*args, **kwargs):
            for feat in original_func(*args, **kwargs):
                key = "%s=%s" % (unidecode(self.name), feat)
                yield (key, 1.0)
        return wrappee

class RealFeatureFunc(object):
    """Decorator that turns a generator of ``(value, weight)`` pairs into a feature function.

    The wrapper yields ``("<name>=<value>", weight)`` for each pair, where
    ``<name>`` is the decorator's name passed through ``unidecode``.
    """

    def __init__(self, name):
        self.name = name

    def __call__(self, original_func):
        def wrappee(*args, **kwargs):
            for feat, weight in original_func(*args, **kwargs):
                key = "%s=%s" % (unidecode(self.name), feat)
                yield (key, weight)
        return wrappee

@FeatureFunc("dep_feature")
def mention_details(doc, start, end):
    """Yield dependency-edge features linking the mention to the rest of the sentence.

    For each mention token whose head lies outside ``[start, end)`` the head
    lemma is emitted with and without the dependency label; the same is done
    for each child of a mention token that lies outside the mention.
    """
    def outside(tok):
        return tok.i < start or tok.i >= end

    for token in doc[start:end]:
        governor = token.head
        if outside(governor):
            yield "<-%s- %s" % (token.dep_, governor.lemma_)
            yield "<- %s" % (governor.lemma_)
        for dependent in list(token.children):
            if outside(dependent):
                yield  "-%s-> %s" % (dependent.dep_, dependent.lemma_)
                yield  "-> %s" % (dependent.lemma_)

def word_before(pos):
    """Build a feature function yielding the text of up to ``pos`` tokens before the mention."""
    @FeatureFunc("word_before")
    def left_context(doc, start, end):
        first = max(start - pos, 0)
        for idx in range(first, start):
            token = doc[idx]
            yield token.text
    return left_context

def word_before_loc(pos):
    """Build a feature function yielding position-tagged words before the mention.

    Each of the up to ``pos`` tokens preceding the mention is emitted as
    ``"<distance>-<text>"``, where distance is ``start - i`` (1 for the token
    directly before the mention).
    """
    # Own feature name, mirroring word_after_loc; it used to reuse
    # "word_before", the name of the un-positioned feature.
    @FeatureFunc("word_before_loc")
    def f(doc, start, end):
        for i in range(max(start - pos, 0), start):
            yield "%d-%s" % (start - i, doc[i].text)
    return f

def word_before_lemma(pos):
    """Build a feature function yielding the lemma of up to ``pos`` tokens before the mention."""
    @FeatureFunc("word_before_lemma")
    def left_context_lemma(doc, start, end):
        first = max(start - pos, 0)
        for idx in range(first, start):
            token = doc[idx]
            yield token.lemma_
    return left_context_lemma


def word_after(pos):
    """Build a feature function yielding the text of up to ``pos`` tokens after the mention."""
    @FeatureFunc("word_after")
    def right_context(doc, start, end):
        stop = min(end + pos, len(doc))
        for idx in range(end, stop):
            token = doc[idx]
            yield token.text
    return right_context

def word_after_loc(pos):
    """Build a feature function yielding position-tagged words after the mention.

    Each of the up to ``pos`` tokens following the mention is emitted as
    ``"<offset>-<text>"``, where offset is ``i - end`` (0 for the token
    directly after the mention).
    """
    @FeatureFunc("word_after_loc")
    def right_context_loc(doc, start, end):
        stop = min(end + pos, len(doc))
        for idx in range(end, stop):
            offset = idx - end
            yield "%d-%s" % (offset, doc[idx].text)
    return right_context_loc


def word_after_lemma(pos):
    """Build a feature function yielding the lemma of up to ``pos`` tokens after the mention."""
    @FeatureFunc("word_after_lemma")
    def right_context_lemma(doc, start, end):
        stop = min(end + pos, len(doc))
        for idx in range(end, stop):
            token = doc[idx]
            yield token.lemma_
    return right_context_lemma

@FeatureFunc("wim")
def word_in_mention(doc, start, end):
    """Yield, for every mention token, its text followed by its word shape."""
    for token in doc[start:end]:
        surface = token.text
        yield surface
        yield word_shape_func(token.text)

@FeatureFunc("wim_lemma")
def word_in_mention_lemma(doc, start, end):
    """Yield the lemma of every mention token."""
    for token in doc[start:end]:
        lemma = token.lemma_
        yield lemma
        

@FeatureFunc("wim_loc")
def word_in_mention_loc(doc, start, end):
    """Yield every mention token tagged with its position.

    Each token is emitted twice: as ``f<k>-<text>`` with ``k`` counted from
    the front (0-based) and as ``b<k>-<text>`` with ``k`` the number of
    tokens from this one to the end of the mention (the last token gets 1).
    """
    span_len = end - start
    for offset, token in enumerate(doc[start:end]):
        from_end = span_len - offset
        yield "f%d-%s" % (offset, token.text)
        yield "b%d-%s" % (from_end, token.text)
        
@FeatureFunc("wim_loc_lemma")
def word_in_mention_loc_lemma(doc, start, end):
    """Yield every mention token's lemma tagged with its position.

    Each lemma is emitted as ``f<k>-<lemma>`` (0-based from the front) and as
    ``b<k>-<lemma>`` (tokens remaining to the end of the mention, last is 1).
    """
    span_len = end - start
    for offset, token in enumerate(doc[start:end]):
        lemma = token.lemma_
        from_end = span_len - offset
        yield "f%d-%s" % (offset, lemma)
        yield "b%d-%s" % (from_end, lemma)



# @FeatureFunc("wim_ext")
# def wim_ext(doc, start, end):
#     for x in mention.ext_surface:
#         yield x
#         yield word_shape_func(x)
    
# @FeatureFunc("wim_ext_lemma")
# def wim_ext_lemma(doc, start, end):
#     for x in mention.ext_surface:
#         x = lmtzr(x)
#         yield x
    
@FeatureFunc("wim_bigram")
def wim_bigram(doc, start, end):
    """Yield ``"<text>-<text>"`` for each pair of adjacent tokens inside the mention."""
    lefts = doc[start:end-1]
    rights = doc[start+1:end]
    for left, right in zip(lefts, rights):
        yield "%s-%s" % (left.text, right.text)
        
@FeatureFunc("wim_bigram_lemma")
def wim_bigram_lemma(doc, start, end):
    """Yield ``"<lemma>-<lemma>"`` for each pair of adjacent tokens inside the mention."""
    lemmas = [token.lemma_ for token in doc[start:end]]
    for idx in range(len(lemmas) - 1):
        yield "%s-%s" % (lemmas[idx], lemmas[idx + 1])


        
@FeatureFunc("word_shape")
def word_shape(doc, start, end):
    """Return a one-element list holding the shape of the space-joined mention text."""
    mention_text = " ".join(token.text for token in doc[start:end])
    shape = word_shape_func(mention_text)
    return [shape]
        
@FeatureFunc("length")
def mention_length(doc, start, end):
    """Return a one-element list holding the mention length in tokens, as a string."""
    n_tokens = end - start
    return ["%d" % n_tokens]
        
@FeatureFunc("prefix")
def prefix(doc, start, end):
    """Yield prefixes of the mention tokens.

    For each token the prefix lengths run over ``range(3, min(5, len(text)))``,
    i.e. length 3 for four-character words and lengths 3 and 4 for longer ones.
    """
    for token in doc[start:end]:
        text = token.text
        limit = min(5, len(text))
        for size in range(3, limit):
            yield text[:size]
        
@FeatureFunc("surfix")
def postfix(doc, start, end):
    """Yield suffixes of the mention tokens.

    For each token the suffix lengths run over ``range(3, min(5, len(text)))``,
    i.e. length 3 for four-character words and lengths 3 and 4 for longer ones.
    """
    for token in doc[start:end]:
        text = token.text
        limit = min(5, len(text))
        for size in range(3, limit):
            yield text[-size:]

@FeatureFunc("bias")
def CONSTANT_BIAS(doc, start, end):
    """Return the single always-present feature value "bias", whatever the mention."""
    always_on = "bias"
    return [always_on]


def kb_bias(kba):
    """Build a feature function from a surface -> type distribution.

    The mention text is looked up in ``kba.surface_totype_dist``. The
    weights of the returned types are summed per coarse type (the part of
    the type name before the first "."), and every coarse type whose sum
    exceeds 0.6 is yielded with value 1. Unknown surfaces yield nothing.
    """
    @RealFeatureFunc("KBBias")
    def wrapee(doc, start, end):
        surface = doc[start:end].text
        try:
            type_dist = kba.surface_totype_dist[surface]
        except KeyError:
            return

        coarse_mass = defaultdict(float)
        for type_name in type_dist:
            # split() returns the whole name when it contains no ".".
            coarse = type_name.split(".")[0]
            coarse_mass[coarse] += type_dist[type_name]

        for coarse in coarse_mass:
            if coarse_mass[coarse] > 0.6:
                yield (coarse, 1)
    return wrapee
# def gazzarteer(gas):
#     @FeatureFunc("gazzarteer")
#     def gazzarteer_wrappee(mention):
#         for gz_name in gas:
#             if mention.surface:
#                 return

def getOrDefault(m, k, d):
    """Return ``m[k]`` when ``k`` is in ``m``, otherwise ``d``."""
    return m[k] if k in m else d

def w2vBefore(ws, default_w2v, pos = 4):    
    """Build a dense feature function over the ``pos`` tokens before a mention.

    The returned function concatenates one vector per context position
    (zeros for positions before the sentence start, ``default_w2v`` for
    words missing from ``ws``) followed by the mean of those vectors.
    """
    def wrappee(mention):
        window = []
        for idx in range(mention.start - pos, mention.start):
            if idx >= 0:
                word = mention.tokens[idx]
                window.append(ws[word] if word in ws else default_w2v)
            else:
                window.append(np.zeros(default_w2v.shape))
        window.append(np.mean(window, axis=0))
        return np.hstack(window)
    return wrappee

def w2vAfrer(ws, default_w2v, pos = 4):    
    """Build a dense feature function over the ``pos`` tokens after a mention.

    The returned function concatenates one vector per context position
    (zeros for positions past the sentence end, ``default_w2v`` for words
    missing from ``ws``) followed by the mean of those vectors.
    """
    def wrappee(mention):
        window = []
        n_tokens = len(mention.tokens)
        for idx in range(mention.end, mention.end + pos):
            if idx < n_tokens:
                word = mention.tokens[idx]
                window.append(ws[word] if word in ws else default_w2v)
            else:
                window.append(np.zeros(default_w2v.shape))
        window.append(np.mean(window, axis=0))
        return np.hstack(window)

    return wrappee

def w2vMention(ws, default_w2v):    
    """Build a dense feature function averaging the vectors of the mention's own words.

    Words missing from ``ws`` contribute ``default_w2v``. A mention with an
    empty surface is printed before the mean is taken.
    """
    def wrappee(mention):
        surface = mention.surface
        if len(surface) == 0:
            print(mention)
        vectors = []
        for word in surface:
            vectors.append(ws[word] if word in ws else default_w2v)
        return np.mean(vectors, axis=0)
    return wrappee

In [197]:
# Longest dependency path (counted in edges between the two arguments) that
# clean_path still renders; longer paths produce no feature.
MAX_PATH_LEN = 4
# Edge direction codes understood by direction(): towards the head / away from it.
UP = 1
DOWN = 2

def shortest_path((x, y)):
    """ Returns the shortest dependency path from x to y

    Takes a single ``(x, y)`` tuple (Python 2 tuple parameter). Arguments
    that are not spaCy tokens are replaced by their ``.root`` for the tree
    walk, but the original objects are what is returned.

    :param x: x token
    :param y: y token
    :return: ``(x, hx, lch, hy, y)`` where ``lch`` is the lowest common head,
        ``hx`` the heads between x and ``lch`` (nearest to x first) and
        ``hy`` the heads between ``lch`` and y; ``None`` when one of the
        ``check_direction`` tests fires
    """

    x_token = x
    y_token = y
    if not isinstance(x_token, spacy.tokens.token.Token):
        x_token = x_token.root
    if not isinstance(y_token, spacy.tokens.token.Token):
        y_token = y_token.root

    # Get the path from the root to each of the tokens
    hx = heads(x_token)
    hy = heads(y_token)

    # Get the lowest common head
    # After the loop, i is the first index where the two chains differ, the
    # last shared index when one chain is a prefix of the other, or still -1
    # when at least one chain is empty.
    i = -1
    for i in xrange(min(len(hx), len(hy))):
        if hx[i] is not hy[i]:
            break

    if i == -1:
        # At least one of the tokens has no head chain (it is a root).
        lch_idx = 0
        if len(hy) > 0:
            lch = hy[lch_idx]
        elif len(hx) > 0:
            lch = hx[lch_idx]
        else:
            lch = None
    elif hx[i] == hy[i]:
        # Loop ended without a mismatch: the last compared head is shared.
        lch_idx = i
        lch = hx[lch_idx]
    else:
        # Mismatch at i: the common head is the one just above it.
        lch_idx = i-1
        lch = hx[lch_idx]

    # The path from x to the lowest common head
    hx = hx[lch_idx+1:]
    # NOTE(review): check_direction is not defined in the visible part of the
    # file; its exact contract should be confirmed where it lives.
    if lch and check_direction(lch, hx, lambda h: h.lefts):
        return None
    hx = hx[::-1]

    # The path from the lowest common head to y
    hy = hy[lch_idx+1:]
    if lch and check_direction(lch, hy, lambda h: h.rights):
        return None

    return (x, hx, lch, hy, y)


def shortest_path2((x, y)):
    """ Returns the shortest dependency path from x to y

    Variant of shortest_path that puts the tokens themselves at the end of
    their head chains and performs no direction check, so it always returns
    a tuple. Takes a single ``(x, y)`` tuple (Python 2 tuple parameter).

    :param x: x token
    :param y: y token
    :return: ``(x, hx, lch, hy, y)`` where ``lch`` is the lowest common head,
        ``hx`` the heads strictly between x and ``lch`` (nearest to x first)
        and ``hy`` the heads strictly between ``lch`` and y
    """

    x_token = x
    y_token = y
    if not isinstance(x_token, spacy.tokens.token.Token):
        x_token = x_token.root
    if not isinstance(y_token, spacy.tokens.token.Token):
        y_token = y_token.root

    # Get the path from the root to each of the tokens including the tokens
    hx = heads(x_token) + [x_token]
    hy = heads(y_token) + [y_token]

    # Get the lowest common head
    i = -1
    for i in xrange(min(len(hx), len(hy))):
        if hx[i] is not hy[i]:
            break

    # i cannot be -1 since the path should at least have the ROOT as the common ancestor
    # NOTE(review): if the two tokens hang under different roots, the loop
    # breaks at i == 0 and lch_idx becomes -1, so lch is hx[-1] (x itself).
    # Confirm whether callers can pass such pairs.
    if hx[i] == hy[i]:
        lch_idx = i
        lch = hx[lch_idx]
    else:
        lch_idx = i-1
        lch = hx[lch_idx]

    # The path from x to the lowest common head
    # (the trailing -1 drops the token itself, which was appended above)
    hx = hx[lch_idx+1:-1]
    hx = hx[::-1]

    # The path from the lowest common head to y
    hy = hy[lch_idx+1:-1]

    return (x, hx, lch, hy, y)


def heads(token):
    """
    Return the chain of heads above a token, ordered from the root down to
    the token's immediate head. A token that is its own head has no chain.
    :param token: the token to start from (excluded from the result)
    :return: list of head tokens, root first
    """
    chain = []
    node = token
    # Climb until a node that is its own head (the root) has been reached.
    while node.head is not node:
        node = node.head
        chain.append(node)
    chain.reverse()
    return chain


def direction(dir):
    """
    Render an edge direction code as a one-character marker.
    :param dir: UP or DOWN
    :return: '>' for UP (towards the head), '<' for DOWN (away from the
        head), None for any other value
    """
    if dir == UP:
        return '>'
    if dir == DOWN:
        return '<'
    return None


def token_to_string(token):
    """
    Lower-cased, whitespace-stripped text of a token; for anything that is
    not a single spaCy token, the same for each contained token, joined by
    single spaces.
    :param token: a spaCy token or an iterable of tokens
    :return: the string representation
    """
    if isinstance(token, spacy.tokens.token.Token):
        return token.string.strip().lower()
    return ' '.join(part.string.strip().lower() for part in token)


def token_to_lemma(token):
    """
    Lower-cased, whitespace-stripped lemma of a single spaCy token; anything
    else falls back to token_to_string (surface text, not lemmas).
    :param token: a spaCy token or an iterable of tokens
    :return: string representation of the token
    """
    if isinstance(token, spacy.tokens.token.Token):
        return token.lemma_.strip().lower()
    return token_to_string(token)


def clean_path((x, hx, lch, hy, y), entity_on_left=True, include_target_pos=False):
    """
    Filter out long paths and pretty print the short ones

    Generator. The first argument is the 5-tuple produced by shortest_path /
    shortest_path2 (Python 2 tuple parameter). When the path has at most
    MAX_PATH_LEN inner nodes, two strings are yielded: the lexicalized path
    (lemmas) and the unlexicalized path (POS tags). Longer paths yield nothing.

    :param entity_on_left: if True, x is rendered as 'ENT' and y as the
        target argument; otherwise y is 'ENT' and x the target argument
    :param include_target_pos: include the target's POS tag in its rendering
    :return: the string representation of the path
    """
    
    def argument_to_string(token, edge_name, include_pos=False):
        """
        Converts the argument token (X or Y) to an edge string representation
        :param token: the X or Y token
        :param edge_name: 'X' or 'Y'
        :return:
        """
        if not isinstance(token, spacy.tokens.token.Token):
            token = token.root

        # An empty dependency label is rendered as 'ROOT'.
        if include_pos:
            return '/'.join([edge_name, token.pos_, token.dep_ if token.dep_ != '' else 'ROOT'])
        else:
            return '/'.join([edge_name, token.dep_ if token.dep_ != '' else 'ROOT'])
    
    def edge_to_string(token, is_head=False, is_lexicalized=True):
        """
        Converts the token to an edge string representation
        :param token: the token
        :return: the edge string
        """
        t = token
        if not isinstance(token, spacy.tokens.token.Token):
            t = token.root

        # The head of the path is always labelled 'ROOT', whatever its own
        # dependency label is.
        if is_lexicalized:
            return '/'.join([token_to_lemma(token), t.dep_ if t.dep_ != '' and not is_head else 'ROOT'])
        else:
            return '/'.join([t.pos_, t.dep_ if t.dep_ != '' and not is_head else 'ROOT'])


    lch_lex_lst = []
    lch_pos_lst = []

    # X is the head
    if isinstance(x, spacy.tokens.token.Token) and lch == x:
        dir_x = ''
        dir_y = direction(DOWN)
    # Y is the head
    elif isinstance(y, spacy.tokens.token.Token) and lch == y:
        dir_x = direction(UP)
        dir_y = ''
    # X and Y are not heads
    else:
        lch_lex_lst = [edge_to_string(lch, is_head=True, is_lexicalized=True)] if lch else []
        lch_pos_lst = [edge_to_string(lch, is_head=True, is_lexicalized=False)] if lch else []
        dir_x = direction(UP)
        dir_y = direction(DOWN)

    # Inner nodes only: the heads on either side plus the common head when it
    # is neither x nor y.
    len_path = len(hx) + len(hy) + len(lch_lex_lst)

    if len_path <= MAX_PATH_LEN:
#     if True:
        mid_lex_path = (
            [edge_to_string(token, is_lexicalized=True) + direction(UP) for token in hx] +
            lch_lex_lst + 
            [direction(DOWN) + edge_to_string(token, is_lexicalized=True) for token in hy]
        )

        mid_pos_path = (
            [edge_to_string(token, is_lexicalized=False) + direction(UP) for token in hx] + 
            lch_pos_lst + 
            [direction(DOWN) + edge_to_string(token, is_lexicalized=False) for token in hy]
        )

        # The non-entity argument is named by its lemma, so it must expose
        # `lemma_` on this branch.
        if entity_on_left:
            yield '_'.join(['ENT' + dir_x] + mid_lex_path + 
                           [dir_y + argument_to_string(y, y.lemma_, include_pos=include_target_pos)])
            yield '_'.join(['ENT' + dir_x] + mid_pos_path + 
                           [dir_y + argument_to_string(y, y.lemma_, include_pos=include_target_pos)])
        else:
            yield '_'.join([argument_to_string(x, x.lemma_, include_pos=include_target_pos) + dir_x] + 
                           mid_lex_path + [dir_y + 'ENT'])
            yield '_'.join([argument_to_string(x, x.lemma_, include_pos=include_target_pos) + dir_x] + 
                           mid_pos_path + [dir_y + 'ENT'])
    else:
        # Path too long: end the generator without yielding anything. The
        # bare `yield` after `return` is never executed.
        return
        yield

@FeatureFunc("prp_wh_dep_feat")
def mention_pronoun_wh_dep(doc, start, end):
    """Yield dependency-path features between mention tokens and pronoun / wh-word tokens.

    Targets are all tokens of the document tagged "PRP", followed by all
    tokens tagged "WP". For every (mention token, target) pair the path from
    ``shortest_path2`` is rendered by ``clean_path``.
    """
    prp_tag = nlp.vocab.strings["PRP"]
    wh_tag = nlp.vocab.strings["WP"]
    targets = ([tok for tok in doc if tok.tag == prp_tag] +
               [tok for tok in doc if tok.tag == wh_tag])
    for mention_token in doc[start:end]:
        for target_token in targets:
            pair = (mention_token, target_token)
            for rendered in clean_path(shortest_path2(pair)):
                yield rendered

In [198]:
# list(mention_pronoun_wh_dep(ex.doc, 0, 1))

In [None]:

# Knowledge-base type annotator; it is passed to kb_bias(), which reads its
# `surface_totype_dist` mapping.
# NOTE(review): the meaning of the "OntonoteType" argument is defined in
# dfiner and is not visible here.
config = get_default_config()
kbann = KBBiasTypeAnnotator(config, "OntonoteType")


In [199]:
# Sparse feature functions used for every mention. Each entry is called as
# ff(doc, start, end) by generate_vecs; commented-out entries are disabled.
features = [CONSTANT_BIAS,
            mention_details,
            kb_bias(kbann),
#             word_before(3), word_before_lemma(3),
#             word_after(3), word_after_lemma(3),
            
            word_in_mention, word_in_mention_lemma,
            word_in_mention_loc, word_in_mention_loc_lemma,
            wim_bigram, wim_bigram_lemma,
            
#             wim_ext, wim_ext_lemma,
            
#             word_shape,
            
#             mention_length,
            
#             prefix,
#             postfix,
            
#             mention_pronoun_wh_dep
           ]

# Vector used for words without an embedding (mean of all loaded vectors).
default_w2v = default_w2v_mean

# Dense feature functions, each called as dff(mention). Only the mean
# embedding of the mention's own words is active.
dense_feature = [ #w2vBefore(w2vdict, default_w2v),
                  #w2vAfrer(w2vdict, default_w2v),
                  w2vMention(w2vdict, default_w2v)]

def typ_func(m):
    """Return the gold label of mention ``m``; used as the classification target."""
    gold_label = m.label
    return gold_label

print("generating train vectors")
# Training set: only mentions whose sentence has at least 5 tokens. This call
# also builds the feature lexicon and the label lexicon reused below.
# (Relies on Python 2 `filter` returning a list: generate_vecs calls len().)
lex, type_lex, xs_train, ys_train = generate_vecs(filter(lambda mention: len(mention.tokens)>=5,train_mentions),
                                      typ_func,
                                      features,
                                      dense_feature)

print("generating test vectors")
# Test set, encoded with the lexicons learned on the training set.
_,_, xs_test, ys_test = generate_vecs(test_mentions,
                            typ_func,
                            features,
                            dense_feature,
                            lex,
                            type_lex)

# _,_, xs_dev, ys_dev = generate_vecs(dev_mentions,
#                             typ_func,
#                             features,
#                             dense_feature,
#                             lex,
#                             type_lex)

# FIGER mentions: only the feature matrix is needed, so a constant dummy
# label ("LOC") is supplied and the resulting label vector is discarded.
_,_, xs_figer, _ = generate_vecs(figer_datas,
                            lambda m : "LOC",
                            features,
                            dense_feature,
                            lex,
                            type_lex)

generating train vectors
(140706, 34674)
('dense_vecs[0].shape', (300,))
('dense_vecs.shape', (140706, 300))
('shapes : ', (140706, 34674), (140706, 300))
generating test vectors
(17085, 34674)
('dense_vecs[0].shape', (300,))
('dense_vecs.shape', (17085, 300))
('shapes : ', (17085, 34674), (17085, 300))
(579, 34674)
('dense_vecs[0].shape', (300,))
('dense_vecs.shape', (579, 300))
('shapes : ', (579, 34674), (579, 300))


In [206]:
from sklearn.metrics import f1_score, confusion_matrix
from sklearn import linear_model, datasets, svm, ensemble

# logreg = ensemble.BaggingClassifier(base_estimator=linear_model.Perceptron(penalty="l2", alpha=1e-6), n_estimators=10, max_samples=.5)
# Despite the variable name this is not logistic regression: the loss is
# "perceptron", with L2 penalty, weight averaging and balanced class weights.
logreg = linear_model.SGDClassifier(loss = "perceptron",
                                    penalty="l2",
                                    alpha=1e-6,
                                    class_weight = "balanced",
                                    average= True)
logreg.fit(xs_train, ys_train)
# Predicted label ids for the OntoNotes test set and for the FIGER mentions.
y_pred = logreg.predict(xs_test)
y_pred_figer = logreg.predict(xs_figer)

    

In [201]:
def fast_gen_vec(m):
    """Encode a single mention with the already-built lexicons and return its feature matrix.

    A constant dummy label ("LOC") is supplied because only the features are
    needed; the label output of ``generate_vecs`` is discarded.
    """
    def dummy_label(_mention):
        return "LOC"

    _lex, _type_lex, matrix, _labels = generate_vecs([m],
                                                     dummy_label,
                                                     features,
                                                     dense_feature,
                                                     lex,
                                                     type_lex)
    return matrix

def generate_example(tokens):
    """Encode a whole token list as one unlabeled mention and return its feature matrix.

    The mention spans all of ``tokens``; a constant dummy label ("LOC") is
    supplied and the label output of ``generate_vecs`` is discarded.
    """
    def dummy_label(_mention):
        return "LOC"

    whole_span = Mention(0, len(tokens), "", tokens)
    _lex, _type_lex, matrix, _labels = generate_vecs([whole_span],
                                                     dummy_label,
                                                     features,
                                                     dense_feature,
                                                     lex,
                                                     type_lex)
    return matrix

# print("Binary f1  %.3f" % (f1_score(ys_test, y_pred)))

# Micro-averaged F1 over all classes on the OntoNotes test set.
f1 = f1_score(ys_test, y_pred, average="micro").tolist()
print(f1)
# Reverse label lexicon: label id -> label name.
rd = {type_lex.m[x]:x for x in type_lex.m}

# Label names ordered by label id.
class_names = [None] * len(rd)
for i in range(len(rd)):
    class_names[i] = rd[i]

# for k,v in sorted([(rd[k], f1[k]) for k in rd], key=lambda a:a[1], reverse=True):
#     print("%15s : %.3f" % (k,v)) 

0.933860111209


Binary f1  0.945
[0.9561104805145667, 0.962999185004075, 0.9865470852017937, 0.9855574812247255, 0.928506513177825, 0.676470588235294, 0.9652351738241309, 0.9710485133020343, 0.9, 0.6827309236947792, 0.7031700288184438, 0.9870129870129871, 0.5, 0.9977081741787623, 0.9535353535353536, 0.5730659025787965, 0.7912087912087913, 0.6949152542372882]
        PERCENT : 0.998
          MONEY : 0.987
        ORDINAL : 0.987
           DATE : 0.986
       CARDINAL : 0.971
           NORP : 0.965
            GPE : 0.963
         PERSON : 0.956
           TIME : 0.954
            ORG : 0.929
       QUANTITY : 0.900
            LOC : 0.791
    WORK_OF_ART : 0.703
            LAW : 0.695
          EVENT : 0.683
        PRODUCT : 0.676
            FAC : 0.573
       LANGUAGE : 0.500

In [202]:
# Walk through the misclassified test mentions whose gold label is "FAC",
# stopping after ten of them (the print that shows them is commented out).
counter = 0
for i,m in enumerate(test_mentions):
    if y_pred[i] == ys_test[i]:
        continue
    if m.label == "FAC":
#         print(rd[y_pred[i]],m)
        counter += 1
    if counter == 10:
        break

In [203]:
# Cross-tabulate the predicted OntoNotes type of every FIGER mention against
# its gold FIGER label(s).
# NOTE(review): strip_fine_type and onto2figer_tm are not defined in the
# visible part of the file; the comments below only describe how they are used.
matches =  defaultdict(int)             # (predicted, single gold fine type) -> count
alllabel_matches =  defaultdict(int)    # (predicted, sorted gold types joined by ",") -> count
coarselabel_matches =  defaultdict(int) # (mapped prediction, stripped gold type) -> count

coarselabel_match_examples = defaultdict(list)

match_examples =  defaultdict(list)
all_labels = 0
for i in range(len(figer_datas)):
    preded = rd[y_pred_figer[i]]
    stripped_type = strip_fine_type(figer_datas[i].label)
    fine_types = sorted(figer_datas[i].label.split(","))
    # Map the predicted OntoNotes type into the FIGER label space; NORP is
    # special-cased and unmapped types become "/X".
    if preded == "NORP":
        pd_mapped = "/norpl"
    else:
        pd_mapped = "/" + getOrDefault(onto2figer_tm, preded, "X")

    coarselabel_matches[(pd_mapped, stripped_type)] += 1    
    coarselabel_match_examples[(pd_mapped, stripped_type)].append(figer_datas[i])
    
    alllabel_matches[(preded,",".join(fine_types))] += 1    
    for l in fine_types:
        all_labels += 1
        matches[(preded, l)] += 1
        match_examples[(preded, l)].append(figer_datas[i])

internet.website is missing
internet.website is missing


In [207]:
figer_len = float(len(figer_datas))

# Running share of FIGER mentions covered by the rows printed so far.
err_pre = 0.0

# First the disagreements (mapped prediction != stripped gold type), most
# frequent first; the value printed afterwards is the overall error rate.
for k in sorted(coarselabel_matches.keys(), key=lambda x : coarselabel_matches[x], reverse=True):
    a, b =k
    if a != b:
        err_pre += coarselabel_matches[k]/figer_len
        print("%50s\t--\t%10d\t--\t%.2f" % (k, coarselabel_matches[k], coarselabel_matches[k]/figer_len))
print("%.3f" % err_pre)


# Then the agreements.
# NOTE(review): err_pre is not reset, so the second printed number is errors
# plus agreements (the total share), not the accuracy. Confirm this is intended.
for k in sorted(coarselabel_matches.keys(), key=lambda x : coarselabel_matches[x], reverse=True):
    a, b =k
    if a == b:
        err_pre += coarselabel_matches[k]/figer_len
        print("%50s\t--\t%10d\t--\t%.2f" % (k, coarselabel_matches[k], coarselabel_matches[k]/figer_len))
print("%.3f" % err_pre)

                    ('/product', u'/organization')	--	         9	--	0.02
                   ('/location', u'/organization')	--	         7	--	0.01
                     ('/organization', u'/person')	--	         7	--	0.01
                            ('/person', u'/title')	--	         6	--	0.01
                      ('/organization', u'/title')	--	         6	--	0.01
                   ('/organization', u'/location')	--	         6	--	0.01
                         ('/location', u'/person')	--	         3	--	0.01
                        ('/art', u'/organization')	--	         3	--	0.01
                   ('/organization', u'/building')	--	         3	--	0.01
                      ('/person', '/organization')	--	         3	--	0.01
                                 ('/art', '/work')	--	         3	--	0.01
                        ('/organization', '/work')	--	         3	--	0.01
                      ('/organization', u'/event')	--	         3	--	0.01
                   ('/building', u'/organization')	

In [208]:
# Print every (predicted type, full gold label set) pair, most frequent
# first, with its count and its share of all FIGER mentions.
figer_len = float(len(figer_datas))
for k in sorted(alllabel_matches.keys(), key=lambda x : alllabel_matches[x], reverse=True):
    print("%50s\t--\t%10d\t--\t%.2f" % (k, alllabel_matches[k], alllabel_matches[k]/figer_len))

                           (u'PERSON', u'/person')	--	       204	--	0.35
                        (u'ORG', u'/organization')	--	        51	--	0.09
(u'ORG', u'/organization,/organization/educational_institution')	--	        34	--	0.06
             (u'GPE', u'/location,/location/city')	--	        32	--	0.06
                               (u'DATE', u'/time')	--	        26	--	0.04
  (u'ORG', u'/organization,/organization/company')	--	        15	--	0.03
(u'ORG', u'/organization,/organization/sports_team')	--	        14	--	0.02
                   (u'NORP', u'/people/ethnicity')	--	        10	--	0.02
          (u'GPE', u'/location,/location/country')	--	        10	--	0.02
           (u'PERSON', u'/person,/person/athlete')	--	         9	--	0.02
     (u'ORG', u'/government_agency,/organization')	--	         8	--	0.01
(u'ORG', u'/organization,/organization/sports_league')	--	         8	--	0.01
         (u'GPE', u'/location,/location/province')	--	         8	--	0.01
                              (

In [108]:
def get_example(t):
    """Return the entry stored under key ``t`` in ``coarselabel_match_examples``.

    ``coarselabel_match_examples`` is a notebook-level container defined in
    another cell; a missing key raises whatever that container raises.
    """
    return coarselabel_match_examples[t]
kk=get_example((('ORG', '/person') ))

In [101]:
def generate_revert_map_from_lex(lex):
    """Invert ``lex.m`` into a list indexed by feature id.

    ``lex.m`` maps a name to an integer position; the returned list has one
    slot per entry of ``lex.m`` and holds, at each position, the name mapped
    to it.  Positions that no name maps to stay ``None``.
    """
    index_of = lex.m
    names_by_index = [None] * len(index_of)
    for name, position in index_of.items():
        names_by_index[position] = name
    return names_by_index

rlex = generate_revert_map_from_lex(lex)

def check_detail(example_, inv_type_lex, *kkks_):
    """Print a per-feature breakdown of the linear model's score for a mention.

    For every class index in ``kkks_`` this prints, in order: the class name,
    the decision score (sum of feature * weight products plus the intercept),
    the contribution of the 300 columns that follow the lexical features
    (the "w2v" part of the vector), and the named lexical features among the
    200 largest-magnitude contributions, positives first, then negatives.
    The mention itself is printed once at the end.

    Args:
        example_: the mention to explain; it is featurized with
            ``fast_gen_vec``.
        inv_type_lex: mapping from class index to class name.
        *kkks_: class indices (rows of ``logreg.coef_``) to report on.

    Relies on the notebook globals ``fast_gen_vec``, ``logreg``, ``lex`` and
    ``rlex``.  Returns nothing; everything goes to stdout.
    """
    xv_ = fast_gen_vec(example_)
    print("\n\n")
    xv_ = np.asarray(xv_.todense()).flatten()
    n_lexical = len(lex.m)

    for kkk_ in kkks_:
        print("for label %s :\n" % inv_type_lex[kkk_])
        coef_row = logreg.coef_[kkk_]
        prod_ = xv_ * coef_row
        print("Score : %.5f\n" % (np.sum(prod_) + logreg.intercept_[kkk_]))
        # Only the 200 largest contributions (by absolute value) are shown.
        inds = np.argsort(-np.abs(prod_))[:200]

        # Everything past the lexical columns is one 300-wide dense segment.
        weight_of_w2v = coef_row[n_lexical:].reshape((1, 300))
        fweight_of_w2v = xv_[n_lexical:].reshape((1, 300))
        w2v_scores = np.sum(weight_of_w2v * fweight_of_w2v, axis=1).tolist()

        words = ["MENTION-MEAN"]
        for word, score in zip(words, w2v_scores):
            print("%s --> %.3f" % (word, score))

        print("\n\n")

        # Dense columns have no name in rlex, so they are left out of the
        # per-feature listing.
        named = [(rlex[ind], prod_[ind]) for ind in inds if ind < len(rlex)]

        for fname, w_ in named:
            if w_ > 0:
                print("%-20s : %.4f" % (fname, w_))
        print("----" * 10)
        for fname, w_ in named:
            if w_ < 0:
                print("%-20s : %.4f" % (fname, w_))
        print("")
        print("=" * 60)
        print("")
    print(example_)

In [211]:

def generate_revert_map_from_lex(lex):
    """Build the index -> name list that reverses the ``lex.m`` mapping.

    The result has ``len(lex.m)`` slots; slot ``i`` holds the key of
    ``lex.m`` whose value is ``i`` (``None`` if there is no such key).
    """
    forward = lex.m
    reverse = [None] * len(forward)
    for feature_name, slot in forward.items():
        reverse[slot] = feature_name
    return reverse

rlex = generate_revert_map_from_lex(lex)

def check_detail(example_, inv_type_lex, *kkks_):
    """Print which lexical features push a mention towards or away from labels.

    For every class index in ``kkks_`` the feature vector of ``example_`` is
    multiplied element-wise with that class's weight row; among the 200
    largest-magnitude products, the ones that belong to a named lexical
    feature are printed, positives first and negatives after a dashed rule.
    The mention itself is printed once at the end.

    Args:
        example_: the mention to explain; it is featurized with
            ``fast_gen_vec``.
        inv_type_lex: mapping from class index to class name.
        *kkks_: class indices (rows of ``logreg.coef_``) to report on.

    Relies on the notebook globals ``fast_gen_vec``, ``logreg`` and ``rlex``.
    Returns nothing; everything goes to stdout.
    """
    xv_ = fast_gen_vec(example_)
    print("\n\n")
    # The dense form does not depend on the label, so build it once.
    dense_xv = np.asarray(xv_.todense()).flatten()

    for kkk_ in kkks_:
        print("for label %s :\n" % inv_type_lex[kkk_])
        prod_ = dense_xv * logreg.coef_[kkk_]
        inds = np.argsort(-np.abs(prod_))[:200]

        # Columns past the end of rlex have no name and are not listed.
        named = [(rlex[ind], prod_[ind]) for ind in inds if ind < len(rlex)]

        for fname, w_ in named:
            if w_ > 0:
                print("%-20s : %.4f" % (fname, w_))
        print("----" * 10)
        for fname, w_ in named:
            if w_ < 0:
                print("%-20s : %.4f" % (fname, w_))
        print("=" * 20)
        print("")
    print(example_)

In [212]:
type_lex.m

{u'CARDINAL': 3,
 u'DATE': 5,
 u'EVENT': 15,
 u'FAC': 8,
 u'GPE': 1,
 u'LANGUAGE': 16,
 u'LAW': 17,
 u'LOC': 11,
 u'MONEY': 10,
 u'NORP': 2,
 u'ORDINAL': 7,
 u'ORG': 4,
 u'PERCENT': 13,
 u'PERSON': 0,
 u'PRODUCT': 9,
 u'QUANTITY': 12,
 u'TIME': 6,
 u'WORK_OF_ART': 14}

In [213]:
coarselabel_match_examples[('/organization', '/location')]

[{
     "start": 22, 
     "end": 24, 
     "surface": [
         "Federal", 
         "Way"
     ], 
     "label": "/location,/location/city", 
     "doc_id": "", 
     "tokens": [
         "A", 
         "federal", 
         "grand", 
         "jury", 
         "has", 
         "indicted", 
         "eight", 
         "people", 
         "suspected", 
         "of", 
         "operating", 
         "a", 
         "human-trafficking", 
         "ring", 
         "for", 
         "interstate", 
         "prostitution", 
         "from", 
         "a", 
         "Korean", 
         "nightclub", 
         "in", 
         "Federal", 
         "Way", 
         "."
     ]
 }, {
     "start": 11, 
     "end": 13, 
     "surface": [
         "Federal", 
         "Way"
     ], 
     "label": "/location,/location/city", 
     "doc_id": "", 
     "tokens": [
         "Also", 
         "arrested", 
         "Thursday", 
         "were", 
         ":", 
         "Miyoung", 
         "Roberts", 
  

In [214]:
# Reverse of type_lex.m: class index -> label name.  ``items()`` is used
# instead of ``iteritems()`` so the cell runs on both Python 2 and Python 3.
inv_type_lex = {v: k for k, v in type_lex.m.items()}
type_lex.m

{u'CARDINAL': 3,
 u'DATE': 5,
 u'EVENT': 15,
 u'FAC': 8,
 u'GPE': 1,
 u'LANGUAGE': 16,
 u'LAW': 17,
 u'LOC': 11,
 u'MONEY': 10,
 u'NORP': 2,
 u'ORDINAL': 7,
 u'ORG': 4,
 u'PERCENT': 13,
 u'PERSON': 0,
 u'PRODUCT': 9,
 u'QUANTITY': 12,
 u'TIME': 6,
 u'WORK_OF_ART': 14}

In [215]:
check_detail(get_example((('/organization', u'/location')))[2], inv_type_lex, 11, 4)

(1, 34674)
('dense_vecs[0].shape', (300,))
('dense_vecs.shape', (1, 300))
('shapes : ', (1, 34674), (1, 300))



for label LOC :

dep_feature=<-pobj- of : 6.1723
dep_feature=<- of    : 4.1910
----------------------------------------
bias=bias            : -60.3990
KBBias=location      : -46.3647
wim=Aa               : -29.7549
wim=Federal          : -11.7027
wim_lemma=federal    : -11.7027
wim_loc=b2-Federal   : -3.0042
wim_loc_lemma=b2-federal : -3.0042
wim_loc_lemma=f0-federal : -1.3525
wim_loc=f0-Federal   : -1.3525

for label ORG :

dep_feature=<-pobj- of : 4.6822
wim=Federal          : 4.3280
wim_lemma=federal    : 2.9289
wim_loc_lemma=f0-federal : 2.8428
wim_loc=f0-Federal   : 2.1542
dep_feature=<- of    : 1.0424
wim=Aa               : 0.9941
----------------------------------------
bias=bias            : -20.3572
KBBias=location      : -14.8247
wim_lemma=way        : -5.7287
wim=Way              : -5.7287
wim_loc=b2-Federal   : -0.3150
wim_loc_lemma=b2-federal : -0.3150

{
    "

In [None]:
print(" ".join(get_example(('ORG', '/person'))[11].tokens))

In [None]:
lex.counter_per_type["wim_loc=f0-Mackenzie"]

In [None]:
print(" ".join(get_example(('ORG', '/person'))[11].tokens))

In [None]:
ex = get_example(('ORG', '/person'))[11]
shortest_path2((ex.doc[0], ex.doc[2]))

In [None]:
heads(ex.doc[2])

In [None]:
list(clean_path((shortest_path2((ex.doc[2], ex.doc[0]))), entity_on_left=False))

In [None]:
shortest_path((ex.doc[0], ex.doc[2]))

In [None]:
ex = get_example(('ORG', '/person'))[6]
def pronoun_feat(doc, start, end):
    """Print the positions of all tokens in ``doc`` whose tag is ``PRP``.

    The tag id is looked up in the string store of the notebook-level spaCy
    pipeline ``nlp``.  ``start`` and ``end`` are accepted but not used by
    the current body.  Nothing is returned.
    """
    prp_id = nlp.vocab.strings["PRP"]
    prps = [token.i for token in doc if token.tag == prp_id]

    print(prps)
pronoun_feat(ex.doc, 0, 1)

In [None]:
filter(lambda feat: feat.startswith("prp"), lex.m.keys())[:10]

In [None]:
# Take the second example stored under ('ORG', '/person') and print, one per
# line, every item that mention_details yields for its doc and span.
_m = get_example(('ORG', '/person'))[1]
for k in mention_details(_m.doc, _m.start, _m.end ):
    print(k)

In [None]:
bad_examples= get_example((('ORG', '/person') ))

In [216]:
import cPickle as pickle
from sklearn.externals import joblib
# Persist the fitted classifier together with the feature vocabulary (lex.m)
# and the label vocabulary (type_lex.m) that index its columns and rows.
joblib.dump(logreg, "/home/haowu4/data/ontonotes_model/logreg.pkl")
with open("/home/haowu4/data/ontonotes_model/lex.pkl", 'wb') as f_lex, \
        open("/home/haowu4/data/ontonotes_model/type_lex.pkl", 'wb') as f_type_lex:
    pickle.dump(lex.m, f_lex, pickle.HIGHEST_PROTOCOL)
    pickle.dump(type_lex.m, f_type_lex, pickle.HIGHEST_PROTOCOL)

# To restore the vocabularies later, open in binary mode and note that
# pickle.load takes only the file object:
# with open("/home/haowu4/data/ontonotes_model/lex.pkl", 'rb') as f_lex:
#     lex.m = pickle.load(f_lex)
# with open("/home/haowu4/data/ontonotes_model/type_lex.pkl", 'rb') as f_type_lex:
#     type_lex.m = pickle.load(f_type_lex)

In [34]:
sum(xs_figer[0])

<1x34950 sparse matrix of type '<type 'numpy.float64'>'
	with 304 stored elements in Compressed Sparse Row format>

In [35]:
figer_datas[1]

{
    "start": 7, 
    "end": 10, 
    "surface": [
        "Department", 
        "of", 
        "Chemistry"
    ], 
    "label": "/education/department,/organization", 
    "doc_id": "", 
    "tokens": [
        "A", 
        "handful", 
        "of", 
        "professors", 
        "in", 
        "the", 
        "UW", 
        "Department", 
        "of", 
        "Chemistry", 
        "are", 
        "being", 
        "recognized", 
        "by", 
        "the", 
        "American", 
        "Association", 
        "for", 
        "the", 
        "Advancement", 
        "of", 
        "Science", 
        "(", 
        "AAAS", 
        ")", 
        "for", 
        "their", 
        "efforts", 
        "and", 
        "contributions", 
        "to", 
        "the", 
        "scientific", 
        "community", 
        "."
    ]
}

In [36]:
xs_figer[1]

<1x34950 sparse matrix of type '<type 'numpy.float64'>'
	with 321 stored elements in Compressed Sparse Row format>

In [39]:
# Featurize the FIGER mentions; only the third value returned by
# generate_vecs is kept (as xs_figer), the other three are discarded.
# The label function returns the constant "LOC" for every mention.
# NOTE(review): presumably a placeholder because the labels are thrown away
# here -- confirm against generate_vecs.
_,_, xs_figer, _ = generate_vecs(figer_datas,
                            lambda m : "LOC",
                            features,
                            dense_feature,
                            lex,
                            type_lex)

(579, 34650)
('dense_vecs[0].shape', (300,))
('dense_vecs.shape', (579, 300))
('shapes : ', (579, 34650), (579, 300))


In [51]:
# Print the name of every lexical feature that is non-zero for FIGER
# example 1.  Column ids at or beyond len(lex.m) are skipped.
for v in xs_figer[1].nonzero()[1].tolist():
    if v >= len(lex.m):
        continue
    print(rf_map[v])

wim=Aa
wim_bigram=Department-of
wim=of
wim_bigram_lemma=department-of
wim_lemma=of
wim_loc_lemma=b3-department
wim=a
wim_loc_lemma=b2-of
wim_loc=b3-Department
wim_loc=b2-of
wim_lemma=department
dep_feature=-det-> the
wim_loc=f1-of
wim_loc_lemma=f0-department
bias=bias
dep_feature=<- in
wim_loc_lemma=f1-of
wim_loc=f0-Department
wim=Department
dep_feature=-> the
dep_feature=<-pobj- in


In [44]:
# Reverse of lex.m: feature index -> feature name.
rf_map = dict((lex.m[name], name) for name in lex.m)

In [46]:
rf_map[4297]

u'wim_bigram_lemma=department-of'

In [52]:
xs_figer[1].nonzero()[1]

array([ 1386,  2604,  2754,  4297,  5062,  5110,  8417,  9292, 15042,
       16190, 17257, 17805, 18688, 19246, 19955, 20837, 23396, 24856,
       29344, 33264, 33476, 34650, 34651, 34652, 34653, 34654, 34655,
       34656, 34657, 34658, 34659, 34660, 34661, 34662, 34663, 34664,
       34665, 34666, 34667, 34668, 34669, 34670, 34671, 34672, 34673,
       34674, 34675, 34676, 34677, 34678, 34679, 34680, 34681, 34682,
       34683, 34684, 34685, 34686, 34687, 34688, 34689, 34690, 34691,
       34692, 34693, 34694, 34695, 34696, 34697, 34698, 34699, 34700,
       34701, 34702, 34703, 34704, 34705, 34706, 34707, 34708, 34709,
       34710, 34711, 34712, 34713, 34714, 34715, 34716, 34717, 34718,
       34719, 34720, 34721, 34722, 34723, 34724, 34725, 34726, 34727,
       34728, 34729, 34730, 34731, 34732, 34733, 34734, 34735, 34736,
       34737, 34738, 34739, 34740, 34741, 34742, 34743, 34744, 34745,
       34746, 34747, 34748, 34749, 34750, 34751, 34752, 34753, 34754,
       34755, 34756,

In [60]:
print(logreg.predict(xs_figer[0]))
logreg.decision_function(xs_figer[0])[0]

[4]


array([-13.32179681, -25.25818249, -14.91354345, -22.44403821,
        13.14998444, -23.47352767, -30.6696133 ,  -6.89452493,
       -18.32095584, -16.20493598,  -7.23660526, -21.13938825,
       -26.65806334, -10.19096394, -31.52200585, -43.80509653,
       -22.84706278, -53.22492515])

In [54]:
dir(logreg)

['C',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__getstate__',
 '__hash__',
 '__init__',
 '__module__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_allocate_parameter_mem',
 '_estimator_type',
 '_expanded_class_weight',
 '_fit',
 '_fit_binary',
 '_fit_multiclass',
 '_get_learning_rate_type',
 '_get_loss_function',
 '_get_param_names',
 '_get_penalty_type',
 '_partial_fit',
 '_predict_proba_lr',
 '_validate_params',
 '_validate_sample_weight',
 'alpha',
 'average',
 'class_weight',
 'classes_',
 'coef_',
 'decision_function',
 'densify',
 'epsilon',
 'eta0',
 'fit',
 'fit_intercept',
 'fit_transform',
 'get_params',
 'intercept_',
 'l1_ratio',
 'learning_rate',
 'loss',
 'loss_function',
 'loss_functions',
 'n_ite

In [79]:
import sys
from dfiner.types.finer_type_system import FinerTypeSystem
from dfiner.utils import get_default_config
config = get_default_config()

tps = FinerTypeSystem.load_type_system(config)

def strip_fine_type(label):
    """Reduce a comma-separated label string to its distinct root types.

    Each component such as ``/location/city`` is rewritten to dotted form
    (``location.city``) and replaced by ``tps.get_root(...)`` of that name.
    The distinct roots, each prefixed with ``/``, are returned sorted and
    joined with commas.

    If any component is not known to the notebook-level type system ``tps``,
    a "<name> is missing" line is written to stderr and ``"O"`` is returned
    immediately.
    """
    coarse_types = set()
    for fine in label.split(","):
        # "/a/b" -> ".a.b" -> "a.b"
        dotted = fine.replace("/", ".")[1:]
        if not tps.has_type(dotted):
            sys.stderr.write("%s is missing\n" % dotted)
            return "O"
        coarse_types.add("/%s" % tps.get_root(dotted))
    return ",".join(sorted(coarse_types))


In [166]:
import yaml
# Load the OntoNotes-label -> FIGER-type mapping.  safe_load is enough for a
# plain mapping file and avoids the deprecated Loader-less yaml.load call.
with open("/home/haowu4/codes/dataless_finer/resources/ontonote_to_figer_type.yaml") as tm_file:
    onto2figer_tm = yaml.safe_load(tm_file)

In [167]:
getOrDefault(onto2figer_tm, "QUANTITY", "X")

'X'

In [168]:
onto2figer_tm["NORP"]

'people.ethnicity'

In [120]:
onto2figer_tm

In [192]:
kbann.surface_totype_dist["Supersonics"]

KeyError: 'Supersonics'