In [176]:
import json
from collections import Counter, defaultdict, OrderedDict
import codecs
from nltk.util import ngrams
from nltk.corpus import stopwords
import pandas as pd
from tqdm import tqdm
pd.options.display.max_rows = 750
stop = set(stopwords.words('english'))

In [149]:
def get_all_examples():
    ret = set()
    with codecs.open("/home/haowu4/codes/dataless_finer/resources/type_to_alias.txt", "r", "utf-8") as inp:
        for oline in tqdm(inp):
            line = oline.strip()
            if len(line) == 0:
                continue
            ps = line.strip().split("\t")

            if len(ps) != 2:
                continue

            typ, mention = ps[0], ps[1]
                
            ret.add((typ, mention))
    return ret
all_examples = get_all_examples()

7558042it [00:25, 296906.02it/s]


In [150]:
type_mention_count = Counter([x for x,_ in all_examples])

In [151]:
def extract_all_pattern(tokens, ks):
    for i, w in enumerate(tokens):
        word_before = i


        for k in ks:            
            word_after = len(tokens) - i - k
            if word_after < 0:
                break
            yield (word_before,tuple(tokens[i:i+k])  , word_after)
                
# print list(extract_all_pattern("This is a example test".split(), [2,3,4]))         
                
def count_all_k_gram(all_examples, ks):

    patterns = defaultdict(Counter)
    for typ, mention in tqdm(all_examples):
        tokens = mention.split(" ")
        for pat in extract_all_pattern(tokens, ks):
            patterns[typ].update([pat])
    return patterns
        

In [152]:
tri_gram_pattern = count_all_k_gram(all_examples, range(2,5))

cleaned_tri_gram_pattern = {}

for typ, pat_counter in tri_gram_pattern.iteritems():
    cleaned_counts = {}
    for pat, c in pat_counter.iteritems():
        if c < 5:
            continue
        wb,ws,wa = pat
        if len(ws) == 0:
            continue
        cleaned_counts[pat] = c
        
    cleaned_tri_gram_pattern[typ] = Counter(cleaned_counts)
tri_gram_pattern = cleaned_tri_gram_pattern

100%|██████████| 5873473/5873473 [03:56<00:00, 24843.31it/s]


In [159]:
def get_pattern_db(top_n, min_support = 0):
    ret = []
    oc_in = defaultdict(set)
    for typ in tqdm(cleaned_tri_gram_pattern):
        for i, (p, c) in enumerate(cleaned_tri_gram_pattern[typ].most_common()):
            if i <= top_n and c >= min_support:
                word_before, pat_text, word_after = p
                pat_surface = " ".join(["X"] * word_before)
                pat_surface += " "
                pat_surface += " ".join(list(pat_text))
                pat_surface += " "
                pat_surface += " ".join(["X"] * word_after)
                pat_surface = pat_surface.strip()
                o = OrderedDict([
                    ("Type", typ),
                    ("Pattern", pat_surface),
                    ("Support", c)
                    ("Support%", c * 1.0/ type_mention_count[typ])
                ])
                oc_in[pat_surface].add(typ)
                ret.append(o)
                
    real_ret = []
    
    for x in ret:
        typ = x["Type"]
        pat_surface = x["Pattern"]
        c = x["Support"]
        o = OrderedDict([
            ("Type", typ),
            ("Pattern", pat_surface),
            ("Support", c),
            ("Support%", c * 1.0/ type_mention_count[typ]),            
            ("ConfusiongCount", len(oc_in[pat_surface])),
            ("ConfusedIn", ",".join(oc_in[pat_surface]))
        ])
        real_ret.append(o)


    return pd.DataFrame.from_dict(real_ret)

pat_db = get_pattern_db(100)

100%|██████████| 113/113 [00:02<00:00, 47.61it/s]


In [191]:
ENOUGH_SUPPPORT = pat_db["Support"] > 20

LOW_CONFUSION = pat_db["ConfusiongCount"] < 5

HAS_TYPE = pat_db["Type"] == "government.political_party"

pat_db[ENOUGH_SUPPPORT & LOW_CONFUSION & HAS_TYPE].sort_values(["Support"], ascending=[False])



Unnamed: 0,Type,Pattern,Support,Support%,ConfusiongCount,ConfusedIn
6359,government.political_party,X Party of X,469,0.060704,2,"organization,government.political_party"
6360,government.political_party,X X Party of X,212,0.02744,2,"organization,government.political_party"
6361,government.political_party,X Party of X X,141,0.01825,1,government.political_party
6362,government.political_party,Communist Party X X,109,0.014108,1,government.political_party
6363,government.political_party,Communist Party of X,105,0.01359,1,government.political_party
6364,government.political_party,X Democratic Party,101,0.013073,1,government.political_party
6365,government.political_party,X People's Party,81,0.010484,1,government.political_party
6366,government.political_party,X Party of X X X,76,0.009837,1,government.political_party
6367,government.political_party,X X Democratic Party,74,0.009578,1,government.political_party
6368,government.political_party,X Communist Party,69,0.008931,1,government.political_party


In [170]:
_=set(pat_db["Type"].tolist())

0.011041


In [186]:
pat_db.sort_values(["Support%"], ascending=[False])[750:1500]

Unnamed: 0,Type,Pattern,Support,Support%,ConfusiongCount,ConfusedIn
3789,train,LNER Class X,27,0.009398,1,train
3790,train,SNCF Class BB X,27,0.009398,1,train
3788,train,X Class BB X,27,0.009398,1,train
2625,building.power_station,X Thermal Power X,11,0.009362,1,building.power_station
1422,government_agency,X X Sheriff's Office,78,0.009331,2,"military,government_agency"
2128,religion.religion,X of the X,6,0.009317,30,"art,organization.company,event.military_confli..."
3088,transit,X X Transit System,11,0.009298,1,transit
5684,location.country,Grand Duchy of X,16,0.009243,1,location.country
5683,location.country,X Duchy of X,16,0.009243,1,location.country
5682,location.country,Grand Duchy X X,16,0.009243,1,location.country


In [192]:
def get_pattern_dump(top_n, min_support = 0.002, min_count = 50):
    ret = defaultdict(set)

    for typ in tqdm(cleaned_tri_gram_pattern):
        for i, (p, c) in enumerate(cleaned_tri_gram_pattern[typ].most_common()):
            if c < min_count:
                continue
            sup_prc = c * 1.0/ type_mention_count[typ]
            if i <= top_n and sup_prc >= min_support:
                ret[p].add((typ, sup_prc))                
    return ret

pat_dump = get_pattern_dump(100)

100%|██████████| 113/113 [00:00<00:00, 800.22it/s]


In [193]:
import cPickle as pickle
with open("/tmp/pat_dump", "wb") as out:
    pickle.dump(pat_dump, out)

In [185]:
pat_dump

defaultdict(set,
            {(0,
              (u'Action', u'of'),
              3): {(u'event.military_conflict', 0.014499239237447418)},
             (0,
              (u'Alabama', u'State'),
              2): {(u'transportation.road', 0.013068760865655536)},
             (0,
              (u'Alabama', u'State', u'Route'),
              1): {(u'transportation.road', 0.013068760865655536)},
             (0,
              (u'Alfa', u'Romeo'),
              1): {(u'product.car', 0.009750152346130409)},
             (0,
              (u'Alpha', u'Delta'),
              1): {(u'organization.fraternity_sorority', 0.01126126126126126)},
             (0,
              (u'Alpha', u'Sigma'),
              1): {(u'organization.fraternity_sorority', 0.01126126126126126)},
             (0,
              (u'Arkansas', u'Highway'),
              1): {(u'transportation.road', 0.011330255979857323)},
             (0,
              (u'Armstrong', u'Siddeley'),
              1): {(u'product.engine_dev