In [3]:
import os
from collections import defaultdict, OrderedDict
import json
import codecs
import numpy as np
from scipy.sparse import coo_matrix, hstack
from nltk.stem import WordNetLemmatizer
import re
import spacy

nlp = spacy.load('en')

class Mention(object):
    def __init__(self, start, end, label, tokens, doc_id = ""):
        self.start = start
        self.end = end
        self.label = label
        self.tokens = tokens
        self.surface = [tokens[i] for i in range(start,end)]
        self.ext_surface = [tokens[i] for i in range(max(start-1,0),min(end+1,len(tokens)))]
        self.doc_id = doc_id
        doc = nlp.tokenizer.tokens_from_list(tokens)
        nlp.tagger(doc)
        nlp.parser(doc)
        self.doc = doc
        
        
    def __repr__(self):
        o = [("start",self.start),
            ("end",self.end),
            ("surface",self.surface), 
            ("label",self.label),
            ("doc_id",self.doc_id),
            ("tokens",self.tokens)]
        
        return json.dumps(OrderedDict(o), indent=4)
        
def read_figer(file = "/home/haowu4/codes/dataless_finer/eval_figer/data/gold_cleaned_figer_test.label"):
    tokens = []
    labels = []
    sentences = []
    with codecs.open(file, "r", "utf-8") as input:
        for i, line in enumerate(input):
            line = line.strip()
            if len(line) == 0:
                # New sentence
                if len(tokens) == 0:
                    continue
                sent = get_sentence(tokens, labels)
                for s in sent:
                    sentences.append(s)
                tokens = []
                labels = []
                continue
            line = line.split("\t")
            word, label = line[0], line[1]
            tokens.append(word)
            labels.append(label)
            
    if len(tokens) == 0:
        return sentences

    sent = get_sentence(tokens, labels)
    sentences.append(sent)
    tokens = []
    labels = []
    return sentences

    
        
def get_sentence(tokens, labels):
    entities = []
    current_labels = ""
    current_start = 0
    in_mention = False
    for i, (t, lab_str) in enumerate(zip(tokens, labels)):
        if lab_str.startswith("B"):
            if in_mention:
                entities.append(Mention(current_start, i, current_labels, tokens))
                in_mention = False
            current_start = i
            current_labels = lab_str.split("-")[1]
            in_mention = True

        if lab_str == "O":
            if in_mention:
                entities.append(Mention(current_start, i, current_labels, tokens))
                in_mention = False
   
    if in_mention:
        entities.append(Mention(current_start, len(labels), current_labels, tokens))
    
    return entities


def load_ontonotes(file):
    tokens = []
    labels = []
    sentences = []
    with codecs.open(file, "r", "utf-8") as input:
        for i, line in enumerate(input):
            if i < 1:
                continue
            line = line.strip()
            if len(line) == 0:
                # New sentence
                if len(tokens) == 0:
                    continue
                sent = get_sentence(tokens, labels)
                for s in sent:
                    sentences.append(s)
                tokens = []
                labels = []
                continue
            line = line.split("\t")
            word, label = line[5], line[0]
            tokens.append(word)
            labels.append(label)
            
    if len(tokens) == 0:
        return sentences

    sent = get_sentence(tokens, labels)
    sentences.append(sent)
    tokens = []
    labels = []
    return sentences


def load_all_data(base_folder):
    ret = []
    for f in os.listdir(base_folder):
        fn = os.path.join(base_folder,f)
        ms = load_ontonotes(fn)
        for m in ms:
            m.doc_id = f
            ret.append(m)
    return ret

class Lexicon(object):
    
    def __init__(self):
        self.curr = 0
        self.m = {}
        self.counter = defaultdict(int)
        self.counter_per_type = defaultdict(lambda:defaultdict(int))
    
    def see_feature(self, f, t = None):
        self.counter[f] += 1
        if t:
            self.counter_per_type[f][t] += 1
        if f not in self.m:
            self.m[f] = self.curr
            self.curr += 1
            
    
    def prune(self, min_support):
        self.curr = 0
        self.m = {}
        for k in self.counter:
            if self.counter[k] > min_support:
                self.m[k] = self.curr
                self.curr += 1
    
    def getOrNegOne(self, f):
        if f in self.m:
            return self.m[f]
        else:
            return -1
        
    def getOneHot(self, f):
        ret = np.zeros((self.curr))
        if f in self.m:
            ret[self.m[f]] = 1.0
        return ret
    
def loadW2V(w2v_file, allowed = None):
    ret = {}
    err = 0
    with codecs.open(w2v_file, "r" , 'utf-8') as input:
        for line in input:
            line = line.strip()
            if len(line) == 0:
                continue
            try:
                w,vec = line.split("\t")
#                 w,vec = line[0], line[1:]
            except ValueError:
#                 print(line)
                err += 1
                continue
            if allowed is not None and w in allowed:
                vec = [float(v) for v in vec.split(" ")]
                ret[w] = np.array(vec)
    print("%d line failed" % err)
    return ret

def generate_vecs(objs, 
                  typ_function,
                  feature_func,
                  dense_real_vec_features = [],
                  lex = None,
                  type_lex = None
                 ):
    len_x = len(objs)
    
    if lex is None:
        lex = Lexicon()
        for x in objs:
            for ff in feature_func:
                for k,v in ff(x.doc, x.start, x.end):
                    lex.see_feature(k, typ_function(x))
        lex.prune(7)

    if type_lex is None:
        type_lex = Lexicon()
        for x in objs:
            y = typ_function(x)
            type_lex.see_feature(y)

    
    
    row_ids = []
    col_ids = []
    vs = []
    
    dense_vecs = []
    
    ys = []
    
    for i, x in enumerate(objs):
        for ff in feature_func:
            for k,v in ff(x.doc, x.start, x.end):
                idx = lex.getOrNegOne(k)
                if idx > -1:
                    row_ids.append(i)
                    col_ids.append(idx)
                    vs.append(v)
                    
        if len(dense_real_vec_features) > 0:
            ds = []
            for dff in dense_real_vec_features:
                v  = dff(x)
                ds.append(v)
#             print(len(ds))
            denv = np.hstack((ds))
            dense_vecs.append(denv)
            
        ys.append(type_lex.getOrNegOne(typ_function(x)))
        
    sp = (len_x, lex.curr)
    print(sp)
    xs = coo_matrix((vs, (row_ids, col_ids)), shape=sp)
    print("dense_vecs[0].shape", dense_vecs[0].shape)
    dense_vecs = np.vstack(dense_vecs)
    print("dense_vecs.shape", dense_vecs.shape)
    if len(dense_real_vec_features) > 0:
        print("shapes : ", xs.shape, dense_vecs.shape, )
        xs = hstack((xs, dense_vecs))
    return lex, type_lex, xs.tocsr(), np.asarray(ys)


In [4]:
train_mentions= load_all_data("/home/haowu4/data/ontonotes_ner/ColumnFormat/TrainAndDev/")
print(len(train_mentions))

# conll_train = load_all_data("/home/haowu4/data/conll_ner/")
# for x in conll_train:
#     train_mentions.append(x)

# dev_mentions= load_all_data("/home/haowu4/data/ontonotes_ner/ColumnFormat/Dev/")
test_mentions= load_all_data("/home/haowu4/data/ontonotes_ner/ColumnFormat/Test/")
figer_datas = read_figer()

VOCABS = set()
mention_counter = 0
for mention in [train_mentions,test_mentions, figer_datas]:
    for men in mention:
        mention_counter += 1
        for t in men.tokens:
            VOCABS.add(t)
print("%d mention loaded" % mention_counter)

print("%d word loaded" % len(VOCABS))

w2vdict=loadW2V("/home/haowu4/data/autoextend/GoogleNews-vectors-negative300.combined_500k.txt", VOCABS)
print("%d words have w2v" % len(w2vdict))

default_w2v_mean = np.mean(list(w2vdict.values()), axis=0)
default_w2v_zero = np.zeros(default_w2v_mean.shape)

141634
159298 mention loaded
52467 word loaded
1 line failed
41459 words have w2v


In [5]:
# import cPickle as pickle
# with open("/home/haowu4/data/ontonotes_model/w2vdict",'wb') as o:
#     pickle.dump(w2vdict,o)
    
# with open("/home/haowu4/data/ontonotes_model/mean_w2v",'wb') as o:
#     pickle.dump(default_w2v_mean,o)

In [None]:
# Define features:

# from nltk.stem.wordnet import WordNetLemmatizer
# lmtzr = WordNetLemmatizer().lemmatize


def word_shape_func(text):
    text = re.sub("[a-z]+", "a" ,text)
    text = re.sub("[A-Z]+", "A" ,text)
    text = re.sub("[0-9]+", "0" ,text)
    return text
    

class FeatureFunc(object):
    def __init__(self, name):
        self.name = name
        
    def __call__(self, original_func):
        decorator_self = self
        def wrappee( *args, **kwargs):
            for feat in original_func(*args,**kwargs):
                yield ("%s=%s" % (self.name, feat), 1.0) 
        return wrappee

class RealFeatureFunc(object):
    def __init__(self, name):
        self.name = name
        
    def __call__(self, original_func):
        decorator_self = self
        def wrappee( *args, **kwargs):
            for feat,v in original_func(*args,**kwargs):
                yield ("%s=%s" % (self.name, feat), v)
        return wrappee

@FeatureFunc("dep_feature")
def mention_details(doc, start, end):
    heads = [token.head for token in doc[start:end]]
    deps = [list(token.children) for token in doc[start:end]]
    for token, head, children in zip(doc[start:end], heads, deps):
        if not (head.i >= start and head.i < end):            
            yield "<-%s- %s" % (token.dep_, head.lemma_)
            yield "<-%s" % (head.lemma_)
        for child in children:
            if not (child.i >= start and child.i < end):
                yield  "-%s-> %s" % (child.dep_,child.lemma_)
                yield  "-> %s" % (child.lemma_)

def word_before(pos):
    @FeatureFunc("word_before")
    def f(doc, start, end):
        for i in range(max(start-pos,0), start):
            yield doc[i].text
#             yield word_shape_func(doc[i].text)
    return f

def word_before_loc(pos):
    @FeatureFunc("word_before")
    def f(doc, start, end):
        for i in range(max(start-pos,0), start):
            yield "%d-%s" % (start - i,doc[i].text)
    return f

def word_before_lemma(pos):
    @FeatureFunc("word_before_lemma")
    def f(doc, start, end):
        for i in range(max(start-pos,0), start):
            yield doc[i].lemma_
    return f


def word_after(pos):
    @FeatureFunc("word_after")
    def f(doc, start, end):
        for i in range(end, min(end+pos,len(doc))):
            yield doc[i].text
#             yield word_shape_func(doc[i].text)
    return f

def word_after_loc(pos):
    @FeatureFunc("word_after_loc")
    def f(doc, start, end):
        for i in range(end, min(end+pos,len(doc))):
            yield "%d-%s" % (i - end,doc[i].text)
    return f


def word_after_lemma(pos):
    @FeatureFunc("word_after_lemma")
    def f(doc, start, end):
        for i in range(end, min(end+pos,len(doc))):
            yield doc[i].lemma_
        
    return f

@FeatureFunc("wim")
def word_in_mention(doc, start, end):
    for x in doc[start:end]:
        yield x.text
#         yield word_shape_func(x.text)

@FeatureFunc("wim_lemma")
def word_in_mention_lemma(doc, start, end):
    for x in doc[start:end]:
        yield x.lemma_
        

@FeatureFunc("wim_loc")
def word_in_mention_loc(doc, start, end):
    for i,x in enumerate(doc[start:end]):
        yield "f%d-%s" % (i,x.text)
        yield "b%d-%s" % ((end-start) - i,x.text)
        
@FeatureFunc("wim_loc_lemma")
def word_in_mention_loc_lemma(doc, start, end):
    for i,x in enumerate(doc[start:end]):
        x = x.lemma_
        yield "f%d-%s" % (i,x)
        yield "b%d-%s" % ((end-start)-i,x)



# @FeatureFunc("wim_ext")
# def wim_ext(doc, start, end):
#     for x in mention.ext_surface:
#         yield x
#         yield word_shape_func(x)
    
# @FeatureFunc("wim_ext_lemma")
# def wim_ext_lemma(doc, start, end):
#     for x in mention.ext_surface:
#         x = lmtzr(x)
#         yield x
    
@FeatureFunc("wim_bigram")
def wim_bigram(doc, start, end):
    for i,x in zip(doc[start:end-1], doc[start+1:end]):
        yield "%s-%s" % (i.text, x.text)
        
@FeatureFunc("wim_bigram_lemma")
def wim_bigram_lemma(doc, start, end):
    lms = [x.lemma_ for x in doc[start:end]]
    for i,x in zip(lms[:-1], lms[1:]):
        yield "%s-%s" % (i,x)


        
@FeatureFunc("word_shape")
def word_shape(doc, start, end):
    t = " ".join([x.text for x in doc[start:end]])
    return [word_shape_func(t)]
        
@FeatureFunc("length")
def mention_length(doc, start, end):
    return ["%d" % (end-start)]
        
@FeatureFunc("prefix")
def prefix(doc, start, end):
    for w in doc[start:end]:
        for i in range(3, min(5, len(w.text))):
            yield w.text[:i]
        
@FeatureFunc("surfix")
def postfix(doc, start, end):
    for w in doc[start:end]:
        for i in range(3, min(5, len(w.text))):
            yield w.text[-i:]

@FeatureFunc("bias")
def CONSTANT_BIAS(doc, start, end):
    return ["bias"]

# def gazzarteer(gas):
#     @FeatureFunc("gazzarteer")
#     def gazzarteer_wrappee(mention):
#         for gz_name in gas:
#             if mention.surface:
#                 return

def getOrDefault(m, k, d):
    if k in m:
        return m[k]
    else:
        return d

def w2vBefore(ws, default_w2v, pos = 4):    
    def wrappee(mention):
        words = []
        for i in range(mention.start-pos, mention.start):
            if i < 0:
                words.append(np.zeros(default_w2v.shape))
            else:
                words.append(getOrDefault(ws,mention.tokens[i],default_w2v))
        words.append(np.mean(words, axis=0))
        return np.hstack(words)
    return wrappee

def w2vAfrer(ws, default_w2v, pos = 4):    
    def wrappee(mention):
        words = []
        for i in range(mention.end + 1, mention.end + pos + 1):
            if i >= len(mention.tokens):
                words.append(np.zeros(default_w2v.shape))
            else:
                words.append(getOrDefault(ws,mention.tokens[i],default_w2v))
        words.append(np.mean(words, axis=0))
        return np.hstack(words)

    return wrappee

def w2vMention(ws, default_w2v):    
    def wrappee(mention):
        if len(mention.surface) == 0:
            print(mention)
        ms = [getOrDefault(ws,w,default_w2v) for w in mention.surface]
        return np.mean(ms, axis=0)
    return wrappee

In [None]:
# train_mentions = [x for x in train_mentions if len(x.tokens) > 5]

features = [CONSTANT_BIAS,
            mention_details,
            word_before(2), word_before_lemma(2),
            word_after(2), word_after_lemma(2),
            word_in_mention, word_in_mention_lemma,
            word_in_mention_loc, word_in_mention_loc_lemma,
            wim_bigram, wim_bigram_lemma,
#             wim_ext, wim_ext_lemma,
            word_shape,
#             mention_length,
            prefix,
            postfix,
           ]

default_w2v = default_w2v_mean

dense_feature = [ #w2vBefore(w2vdict, default_w2v),
                  #w2vAfrer(w2vdict, default_w2v),
                  w2vMention(w2vdict, default_w2v)]

def typ_func(m):
    if m.label == "PER":
        m.label = "PERSON"
    return m.label

lex, type_lex, xs_train, ys_train = generate_vecs([x for x in train_mentions if len(x.tokens) >= 5],
                                      typ_func,
                                      features,
                                      dense_feature)

_,_, xs_test, ys_test = generate_vecs(test_mentions,
                            typ_func,
                            features,
                            dense_feature,
                            lex,
                            type_lex)

# _,_, xs_dev, ys_dev = generate_vecs(dev_mentions,
#                             typ_func,
#                             features,
#                             dense_feature,
#                             lex,
#                             type_lex)

_,_, xs_figer, _ = generate_vecs(figer_datas,
                            lambda m : "LOC",
                            features,
                            dense_feature,
                            lex,
                            type_lex)

In [None]:
from sklearn.metrics import f1_score, confusion_matrix
from sklearn import linear_model, datasets, svm, ensemble, naive_bayes

# logreg = ensemble.BaggingClassifier(base_estimator=linear_model.Perceptron())
logreg = linear_model.Perceptron(penalty="l2", alpha=1e-6)
logreg.fit(xs_train, ys_train)
y_pred = logreg.predict(xs_test)
y_pred_figer = logreg.predict(xs_figer)

    

In [None]:

def fast_gen_vec(m):
    _,_, e, _ = generate_vecs([m],
                            lambda m : "LOC",
                            features,
                            dense_feature,
                            lex,
                            type_lex)
    return e

def generate_example(tokens):
    m = Mention(0, len(tokens), "", tokens)
    _,_, e, _ = generate_vecs([m],
                            lambda m : "LOC",
                            features,
                            dense_feature,
                            lex,
                            type_lex)
    return e

print("Binary f1  %.3f" % (f1_score(ys_test, y_pred)))

f1 = f1_score(ys_test, y_pred, average=None).tolist()
print(f1)
rd = {type_lex.m[x]:x for x in type_lex.m}

class_names = [None] * len(rd)
for i in range(len(rd)):
    class_names[i] = rd[i]

for k,v in sorted([(rd[k], f1[k]) for k in rd], key=lambda a:a[1], reverse=True):
    print("%15s : %.3f" % (k,v)) 

Binary f1  0.945
[0.9561104805145667, 0.962999185004075, 0.9865470852017937, 0.9855574812247255, 0.928506513177825, 0.676470588235294, 0.9652351738241309, 0.9710485133020343, 0.9, 0.6827309236947792, 0.7031700288184438, 0.9870129870129871, 0.5, 0.9977081741787623, 0.9535353535353536, 0.5730659025787965, 0.7912087912087913, 0.6949152542372882]
        PERCENT : 0.998
          MONEY : 0.987
        ORDINAL : 0.987
           DATE : 0.986
       CARDINAL : 0.971
           NORP : 0.965
            GPE : 0.963
         PERSON : 0.956
           TIME : 0.954
            ORG : 0.929
       QUANTITY : 0.900
            LOC : 0.791
    WORK_OF_ART : 0.703
            LAW : 0.695
          EVENT : 0.683
        PRODUCT : 0.676
            FAC : 0.573
       LANGUAGE : 0.500

In [None]:
counter = 0
for i,m in enumerate(test_mentions):
    if y_pred[i] == ys_test[i]:
        continue
    if m.label == "FAC":
#         print(rd[y_pred[i]],m)
        counter += 1
    if counter == 10:
        break

In [None]:
matches =  defaultdict(int)
alllabel_matches =  defaultdict(int)

match_examples =  defaultdict(list)
all_labels = 0
for i in range(len(figer_datas)):
    preded = rd[y_pred_figer[i]]
    fine_types = sorted(figer_datas[i].label.split(","))
    alllabel_matches[(preded,",".join(fine_types))] += 1
    for l in fine_types:
        all_labels += 1
        matches[(preded, l)] += 1
        match_examples[(preded, l)].append(figer_datas[i])

In [None]:
figer_len = float(len(figer_datas))
for k in sorted(matches.keys(), key=lambda x : matches[x], reverse=True):
    print(k, "\t\t",matches[k],"\t\t",matches[k]/figer_len)

In [10]:
def get_example(t):
    return match_examples[t]
kk=get_example((('ORG', '/person') ))

In [11]:
# figer_len = float(len(figer_datas))
# for k in sorted(alllabel_matches.keys(), key=lambda x : alllabel_matches[x], reverse=True):
#     print(k, "\t\t",alllabel_matches[k],"\t\t",alllabel_matches[k]/figer_len)

In [12]:
# figer_datas[0]

In [13]:
# counter = 0
# ds = []
# import json
# for i,m in enumerate(figer_datas):
#     preded = rd[y_pred_figer[i]]
#     s = json.loads(figer_datas[i].__repr__())
#     s["predicted"] = preded
#     s = json.dumps(s)
#     ds.append(s)

In [14]:
# rd[logreg.predict(generate_example(["Hirsh"]))[0]]

In [15]:
# logreg.coef_

In [16]:
# logreg.coef_[4].shape

In [17]:
# np.asarray(xv_.todense()).flatten() * logreg.coef_[4]

In [17]:
pred_r_arr

array([7])

In [14]:
np.random.random((1, xdim))

array([[ 0.14990763,  0.04445518,  0.72226002, ...,  0.6039336 ,
         0.88440977,  0.87510776]])

In [2]:
from sklearn.externals import joblib
m = joblib.load("/home/haowu4/data/ontonotes_model/logreg.pkl")

  if __name__ == '__main__':


In [18]:
import numpy as np
ydim, xdim = m.coef_.shape
pred_r_arr = m.score(np.random.random((1, xdim)))

TypeError: score() takes at least 3 arguments (2 given)

In [19]:

def generate_revert_map_from_lex(lex):
    r = [None] * len(lex.m)
    for k in lex.m:
        r[lex.m[k]] = k
    return r

rlex = generate_revert_map_from_lex(lex)

def check_detail(example_, kkk_):
    xv_ = fast_gen_vec(example_)
    print("\n\n")
    xv_ = np.asarray(xv_.todense()).flatten()
    prod_ = xv_ * logreg.coef_[kkk_]
    print("Score : %.5f\n" % (np.sum(prod_) + logreg.intercept_[kkk_]))
    inds = np.argsort(-np.abs(prod_))[:200]
    
    weight_of_w2v = logreg.coef_[kkk_][len(lex.m):].reshape((11,300))
    fweight_of_w2v = xv_[len(lex.m):].reshape((11,300))
    
    w2v_scores = np.sum(weight_of_w2v * fweight_of_w2v, axis = 1).tolist()
    
    words = []
    for i in range(mention.start-pos, mention.start):
        if i < 0:
            words.append("-*-")
        else:
            words.append(example_.tokens[i])    
    for i in range(mention.end + 1, mention.end + pos + 1):
        if i >= len(mention.tokens):
            words.append(np.zeros(default_w2v.shape))
        else:
            words.append(getOrDefault(ws,mention.tokens[i],default_w2v))
    
    for ind in inds:
        if ind >= len(rlex):
            fname = "W2V"
            continue
        else:
            fname = rlex[ind]
        w_ = prod_[ind]
        if w_ > 0:
            print("%-20s : %.4f" % (fname, w_) )
    print( "----" * 10)
    for ind in inds:
        if ind >= len(rlex):
            fname = "W2V"
            continue
        else:
            fname = rlex[ind]
        w_ = prod_[ind]
        if w_ < 0:
            print("%-20s : %.4f" % (fname, w_) )
    print(example_) 

In [20]:
check_detail(get_example(('ORG', '/person'))[0],0)

(1, 67843)
dense_vecs[0].shape (300,)
dense_vecs.shape (1, 300)
shapes :  (1, 67843) (1, 300)



Score : -9.45392



ValueError: total size of new array must be unchanged

In [None]:
" ".join(get_example(('ORG', '/person'))[0].tokens)

In [None]:
lex.counter_per_type["dep_feature=-> 55"]

In [None]:
pc=0
for _ment in train_mentions:
#     if _ment.label == "PERSON":
    for k,v in mention_details(_ment.doc,_ment.start, _ment.end ):
        if k == "dep_feature=-> 55":
            print(_ment)
            pc += 1
    if pc == 10:
        break

In [None]:
pp = [x for x in train_mentions if len(x.tokens) == 5]

In [None]:
pp[:10]

In [None]:
_m = get_example(('ORG', '/person'))[1]
for k in mention_details(_m.doc, _m.start, _m.end ):
    print(k)

In [None]:
bad_examples= get_example((('ORG', '/person') ))

In [None]:
lex.counter["dep_feature=<-spend"]

In [None]:
lex.counter_per_type["dep_feature=<-spend"]

In [None]:
sortedfeatures = sorted(lex.counter.keys(), key=lambda x:lex.counter[x], reverse=True)

In [None]:
for x in sortedfeatures[:100]:
    print("%-28s : %d" % (x, lex.counter[x]))

In [None]:
type_lex.m

In [12]:
with open("/home/haowu4/data/ontonotes_model/w2vdict",'r') as inp:
    new_w2dict = pickle.load(inp)


In [14]:
new_w2dict["Hello"] - w2vdict["Hello"]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [6]:
import cPickle as pickle
with open("/home/haowu4/data/ontonotes_model/type_lex.pkl",'r') as inp:
    ty_lex_pk = pickle.load(inp)


In [7]:
ty_lex_pk

{u'CARDINAL': 8,
 u'DATE': 4,
 u'EVENT': 6,
 u'FAC': 14,
 u'GPE': 1,
 u'LANGUAGE': 16,
 u'LAW': 15,
 u'LOC': 10,
 u'MONEY': 13,
 u'NORP': 3,
 u'ORDINAL': 5,
 u'ORG': 7,
 u'PERCENT': 11,
 u'PERSON': 0,
 u'PRODUCT': 17,
 u'QUANTITY': 12,
 u'TIME': 2,
 u'WORK_OF_ART': 9}

In [None]:
l