In [6]:
import codecs
from scipy import stats
from collections import defaultdict
import string
import re

import yaml
with open("/home/haowu4/codes/dataless_finer/resources/figer_hier.yaml") as input:
    types = yaml.load(input.read())
    
class FinerTypeSystem(object):
    def __init__(self, tree):
        self.tree = tree

    def is_coarse_type(self, node_name):
        return self.tree[node_name]["parent"] is None
    
    def parent_of(self, node_name):
        return self.tree[node_name]["parent"]

    def types(self):
        return self.tree
    
    def is_figer_type(self, node_name):
        return self.tree[node_name]["is_figer_type"]

    def a_belongs_to_b(self, fine, coarse):
        it = fine
        while self.tree[it]["parent"]:
            if self.tree[it]["parent"] == coarse:
                return True
            it = self.tree[it]["parent"]
        return False
    
    def has_type(self, t):
        return t in self.tree
    
    def get_path(self, node):
        path = []
        it = node
        while self.tree[it]["parent"]:
            path.append(self.tree[it]["parent"])
            it = self.tree[it]["parent"]
        return path
def unquotekey(key, encoding=None):
    """
    unquote a namespace key and turn it into a unicode string
    """

    valid_always = string.ascii_letters + string.digits

    output = []
    i = 0
    while i < len(key):
        if key[i] in valid_always:
            output.append(key[i])
            i += 1
        elif key[i] in '_-' and i != 0 and i != len(key):
            output.append(key[i])
            i += 1
        elif key[i] == '$' and i+4 < len(key):
            # may raise ValueError if there are invalid characters
            output.append(chr(int(key[i+1:i+5],16)))
            i += 5
        else:
            raise ValueError("unquote key saw invalid character '%s' at position %d" % (key[i], i))

    ustr = u''.join(output)
    
    if encoding is None:
        return ustr

    return ustr.encode(encoding)

system_print = print

printed = 0

def to_chunk(v, chunk_size = 0.05 ):
    v = float(v)
    return "%.2f" % (round(v/chunk_size) * chunk_size)


def load_redirect_map():
    redirects = {}
    with codecs.open("/home/haowu4/data/wikidump/redirect_graph_node.graph") as input:
        for line in input:
            line = line.strip()
            if "\t" in line:
                a,b = line.split("\t")
                a = a.replace(" ", "_")
                b = b.replace(" ", "_")
                redirects[a] = b
            else:
                redirects[line] = line
    return redirects


def load_mention_to_type():
    surface_to_map =defaultdict(set)

    with codecs.open("/home/haowu4/data/wikidump/wiki_title2figer_type.txt", "r", "utf-8") as input:
        for i,line in enumerate(input):
            line = line.strip()
            a,b = line.split("\t")
            a = unquotekey(a)
            surface_to_map[a].add(b)

    surface_to_type_name = {}

    for k in surface_to_map:
        surface_to_type_name[k] = frozenset(surface_to_map[k])
    return surface_to_type_name

def sprint(*args, **kwargs):
    global printed
    printed += 1
    if printed > 10:
        return
    print(*args, **kwargs)

def analysis_line(line):
    line = line.split("\t")
    surface = line[0]
    ks = []
    print(line)
    print(line)
    for i in range(1, int(len(line)/2)):
        k = int(line[i*2 +1])
        print(k)
        ks.append(k)
    print(sum(ks))

def entropy_score(k):
    stats.entropy(list(k["titles"].values()))

def max_dist(k):
    l = sorted(list(k["titles"].values()), reverse=True)
    if len(l) == 0:
        return 0.0
    
    if len(l) == 1:
        return l[0]
    else:
        return l[0] - l[1]
    
def print_mention_surface_dist(k):
    print("Surface: [%s]" % k["surface"])
    titles = k["titles"]
    print("Entropy : %.2f" % stats.entropy(list(k["titles"].values())))
    print("MaxDist : %.2f" % max_dist(k))
    for k in sorted(titles.keys(), key = lambda x : titles[x], reverse=True):
        print("\t %-50s: %.3f %%" % (k, titles[k] * 100 )) 

def read_mentions(redirects,mention_to_type, type_system):
    ds = []

    with codecs.open("/home/haowu4/data/wikidump/mention_surface_map.tsv", "r", "utf-8") as input:
        for line in input:
            oline = line
            line = line.split("\t")
            total = 0.0
            apperance_count = 0.0
            dist = defaultdict(float)
            for i in range(1, int(len(line)/2)):
                title = line[i*2].strip()
                count = float(line[i*2+1])    
                apperance_count += count

                try:
                    title = redirects[title]
                except KeyError:
                    title = title

                try:
                    title = mention_to_type[title]

                except KeyError:
                    title = ["UNK"]
                    
                for l in title:
                    if type_system.has_type(l) and not type_system.is_coarse_type(l):
                        total += count
                        dist[l] += count
    
            if apperance_count < 5:
                continue

            for title in dist:
                dist[title] = dist[title]/total
#                 vs.append(dist[title])

            ds.append({
                    "titles": dist,
                    "surface" :line[0],
                    "total" : total,
                    "apperance_count" : apperance_count,
                    "line": oline})
            
        return ds

        
def group_by_score(mentions, scoring_func, chunk = 0.05):
    ess = []
    value_to_collections = defaultdict(list)
    
    for m in mentions:
        score = scoring_func(m)
        chunk_id =  to_chunk(score, chunk_size=chunk)
        value_to_collections[chunk_id].append(m)
        ess.append(score)

    return value_to_collections, ess

In [7]:
finer_types = FinerTypeSystem(types)
redirects = load_redirect_map()
mention_to_type = load_mention_to_type()
mentions = read_mentions(redirects, mention_to_type, finer_types)


In [14]:
score_to_mention, scores = group_by_score(mentions, max_dist)

for s,length in [(a,len(score_to_mention[a])) for a in sorted(score_to_mention.keys(), key=lambda x :len(score_to_mention[x]), reverse=True)]:
    print("%s : %d" % (s, length))
    

0.00 : 960892
1.00 : 553576
0.95 : 4847
0.90 : 3865
0.05 : 3152
0.80 : 3096
0.85 : 3004
0.35 : 2987
0.10 : 2859
0.20 : 2399
0.50 : 2395
0.60 : 2354
0.75 : 2305
0.15 : 2267
0.65 : 2247
0.25 : 2228
0.70 : 2059
0.45 : 1837
0.55 : 1725
0.40 : 1717
0.30 : 1575


In [21]:
import random 

for k in score_to_mention:
    for s in score_to_mention[k]:
        if s['surface']  == "New Jersey Nets":
            if random.random() < 2.00001:
                print_mention_surface_dist(s)
                break

Surface: [New Jersey Nets]
Entropy : 0.69
MaxDist : 0.00
	 organization.sports_team                          : 50.000 %
	 person.actor                                      : 50.000 %


In [None]:
finer_types = FinerTypeSystem(types)

finer_types.a_belongs_to_b("building.theater", "location")

In [None]:
finer_types.get_path("building.theater")

{'art': {'is_figer_type': True, 'parent': 'work'},
 'art.film': {'is_figer_type': True, 'parent': 'work'},
 'astral_body': {'is_figer_type': True, 'parent': 'geography'},
 'award': {'is_figer_type': True, 'parent': None},
 'broadcast_program': {'is_figer_type': True, 'parent': 'work'},
 'building': {'is_figer_type': True, 'parent': None},
 'building.airport': {'is_figer_type': True, 'parent': 'building'},
 'building.dam': {'is_figer_type': True, 'parent': 'building'},
 'building.hospital': {'is_figer_type': True, 'parent': 'building'},
 'building.hotel': {'is_figer_type': True, 'parent': 'building'},
 'building.library': {'is_figer_type': True, 'parent': 'building'},
 'building.power_station': {'is_figer_type': True, 'parent': 'building'},
 'building.restaurant': {'is_figer_type': True, 'parent': 'building'},
 'building.sports_facility': {'is_figer_type': True, 'parent': 'building'},
 'building.theater': {'is_figer_type': True, 'parent': 'building'},
 'computer.programming_language': {

In [9]:
from sys import getsizeof

In [10]:
getsizeof(mentions)

2115952

In [11]:
getsizeof([x for x in range(10)])

192

In [17]:
def compact_mention(m):
    return {
#         "total_type" :  len(m["titles"])
        "total" : m["titles"],
        "surface" : m["surface"],
        "type_dist" : m["titles"]
    }

In [13]:
import json
with codecs.open("/home/haowu4/data/wikidump/mention_to_type_dist","w", "utf-8") as output:
    for m in mentions:
        if len(m["titles"]) == 0:
            continue
        m = compact_mention(m)
        output.write(json.dumps(m))
        output.write("\n")

In [15]:
compact_mention(mentions[1111])

{'surface': 'writer',
 'total': defaultdict(float,
             {'education.department': 0.1875,
              'language': 0.0625,
              'person.actor': 0.125,
              'person.artist': 0.0625,
              'person.author': 0.25,
              'person.musician': 0.3125}),
 'type_dist': defaultdict(float,
             {'education.department': 0.1875,
              'language': 0.0625,
              'person.actor': 0.125,
              'person.artist': 0.0625,
              'person.author': 0.25,
              'person.musician': 0.3125})}

In [14]:
import codecs
import gzip 

zf = gzip.open("/home/haowu4/codes/dataless_finer/resources/mention_to_type_dist.txt.gz", 'rb')
reader = codecs.getreader("utf-8")
contents = reader(zf)
for line in contents:
    print(line)
    break


{"surface": "Read more...", "type_dist": {"government_agency": 0.0018811136192626034, "person.politician": 0.07336343115124154, "organization.sports_team": 0.0071482317531978935, "person.athlete": 0.0417607223476298, "product.airplane": 0.0007524454477050414, "military": 0.004514672686230248, "organization.educational_institution": 0.006019563581640331, "building.sports_facility": 0.001128668171557562, "location.cemetery": 0.004138449962377728, "building.restaurant": 0.0003762227238525207, "product.computer": 0.0007524454477050414, "building.library": 0.0007524454477050414, "language": 0.008276899924755455, "location.province": 0.002257336343115124, "medicine.symptom": 0.001128668171557562, "person.coach": 0.003386004514672686, "play": 0.0003762227238525207, "person.monarch": 0.014296463506395787, "person.religious_leader": 0.007524454477050414, "art": 0.003386004514672686, "location.bridge": 0.003762227238525207, "location.body_of_water": 0.009781790820165538, "event.natural_disaster"

In [15]:
len(mentions)

1563386

In [18]:
import json
with codecs.open("/home/haowu4/data/wikidump/mention_to_type_dist_lg5.txt","w", "utf-8") as output:
    for m in mentions:
        if len(m["titles"]) == 0:
            continue
        m = compact_mention(m)
        output.write(json.dumps(m))
        output.write("\n")