Copyright 2017 Hussein S. Al-Olimat, hussein@knoesis.org

This software is released under the GNU Affero General Public License (AGPL) v3.0.

# Evaluate LNEx performance using the locations gold standard

In [None]:
# Install the third-party packages this notebook depends on (shapely is imported
# below; the others are presumably required by LNEx itself -- confirm).
# NOTE(review): versions are not pinned, so a fresh install may pull releases
# that differ from the ones this notebook was last run with.
!pip install PyHamcrest
!pip install wordsegment
!pip install shapely
!pip install nltk
!pip install elasticsearch
!pip install elasticsearch_dsl
!pip install geopy

In [1]:
import json, re, os
from shapely.geometry import MultiPoint
import operator
from collections import defaultdict, OrderedDict

import sys 
sys.path.append("LNEx")
import LNEx as lnex

In [2]:
def do_they_overlap(tub1, tub2):
    '''Check whether two substrings of the tweet overlap, based on their
    start and end offsets.

    Args:
        tub1: (start, end) offsets of the first substring.
        tub2: (start, end) offsets of the second substring.

    Returns:
        True when the two offset ranges share at least one position, with
        both ends compared inclusively (ranges that merely touch count as
        overlapping); False otherwise.
    '''

    # Always return a real boolean instead of falling through to None.
    return tub2[1] >= tub1[0] and tub1[1] >= tub2[0]

def read_annotations(filename, base_dir="_Data/Brat_Annotations"):
    '''Load an annotations JSON file and return its parsed content.

    Args:
        filename: name of the JSON file, relative to `base_dir`.
        base_dir: directory holding the annotation files; defaults to the
            "_Data/Brat_Annotations" folder relative to the working directory.

    Returns:
        The object decoded by `json.load` from the file.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the file does not contain valid JSON.
    '''

    path = os.path.join(base_dir, filename)

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def init_using_elasticindex(bb, cache, augmentType, dataset, capital_word_shape,
                            conn_string='localhost:9200', index_name="photon"):
    '''Configure the LNEx Elasticsearch connection and initialize LNEx.

    Args:
        bb: bounding box forwarded to `lnex.initialize` as its first argument.
        cache: forwarded to `lnex.initialize` as `cache`.
        augmentType: forwarded to `lnex.initialize` as `augmentType`.
        dataset: forwarded to `lnex.initialize` as `dataset_name`.
        capital_word_shape: forwarded to `lnex.initialize` unchanged.
        conn_string: Elasticsearch connection string passed to
            `lnex.elasticindex`; defaults to the local instance.
        index_name: Elasticsearch index name passed to `lnex.elasticindex`.

    Returns:
        Whatever `lnex.initialize` returns.
    '''
    lnex.elasticindex(conn_string=conn_string, index_name=index_name)

    return lnex.initialize(bb,
                           augmentType=augmentType,
                           cache=cache,
                           dataset_name=dataset,
                           capital_word_shape=capital_word_shape)

In [3]:
# Bounding box (four floats) for every supported dataset.
# NOTE(review): the values look like [min_lat, min_lon, max_lat, max_lon] --
# confirm the ordering against what lnex.initialize expects.
bbs = {
    "chennai": [12.74, 80.066986084, 13.2823848224, 80.3464508057],
    "louisiana": [29.4563, -93.3453, 31.4521, -89.5276],
    "houston": [29.4778611958, -95.975189209, 30.1463147381, -94.8889160156],
    "columbus": [39.808631, -83.2102799, 40.1572719, -82.7713781],
    "test": [41.6187434973, -83.7106928844, 41.6245055116, -83.7017216664],
}

# Dataset to evaluate; has to be one of the keys of `bbs`.
dataset = "houston"

geo_info = init_using_elasticindex(
    bb=bbs[dataset],
    cache=False,
    augmentType="HP",
    dataset=dataset,
    capital_word_shape=False,
)

Initializing LNEx ...
Done Initialization ...


In [70]:
# Annotation file of the selected dataset, e.g. "houston_annotations.json";
# read_annotations resolves it inside its annotations directory.
filename = dataset + "_annotations.json"
anns = read_annotations(filename)

In [71]:
# Running totals accumulated over all evaluated tweets.
TPs_count = FPs_count = FNs_count = overlaps_count = 0

# Frequency table for false negatives; not populated by the active code.
fns = defaultdict(int)

# Number of tweets visited so far (including skipped ones).
count = 0

# Extractions with fewer than two entries in res[3] vs. all extractions,
# plus a histogram of len(res[3]) for the remaining ones.
one_geolocation = all_geolocation = 0
geo_codes_length_dist = defaultdict(int)

# Surface form -> set of (tweet key, text with context, span) for each error.
FPs_set, FNs_set = defaultdict(set), defaultdict(set)

# Compare the LNEx extractions of every tweet with its gold annotations and
# accumulate true positives, false positives, false negatives and overlaps.
for key in anns:

    count += 1

    # skip the development set
    # NOTE(review): as written this skips the first 499 tweets of every
    # dataset except "houston" -- confirm that this is the intended split.
    if dataset != "houston" and count < 500:
        continue

    tweet_lns = set()   # gold annotations as ((start, end), type)
    lnex_lns = set()    # spans returned by LNEx
    tweet_text = ""

    for ann in anns[key]:
        if ann != "text":
            ln = anns[key][ann]

            tweet_lns.add(((int(ln['start_idx']), int(ln['end_idx'])),
                           ln['type']))
        else:
            tweet_text = anns[key][ann]

            r = lnex.extract(tweet_text)

            # how many are already disambiguated: an extraction whose res[3]
            # has fewer than two entries is counted as having one geolocation
            # (presumably res[3] holds the candidate geo-codes -- confirm).
            for res in r:
                if len(res[3]) < 2:
                    one_geolocation += 1
                else:
                    geo_codes_length_dist[len(res[3])] += 1

                all_geolocation += 1

            # x[1] of every extraction is used as its (start, end) span
            lnex_lns = {x[1] for x in r}

    # only "inLoc" annotations take part in the evaluation
    tweet_lns = {x[0] for x in tweet_lns if x[1] == "inLoc"}

    # True Positives: spans found in both sets with identical offsets
    TPs = tweet_lns.intersection(lnex_lns)
    TPs_count += len(TPs)

    # Left in both sets
    tweet_lns -= TPs
    lnex_lns -= TPs

    # Find overlapping LNs to be counted as 1/2 FPs and 1/2 FNs
    overlaps = set()
    for x in tweet_lns:
        for y in lnex_lns:
            if do_they_overlap(x, y):
                overlaps.add(x)
                overlaps.add(y)

    overlaps_count += len(overlaps)

    # remove the overlapping lns from lnex_lns and tweet_lns
    lnex_lns -= overlaps
    tweet_lns -= overlaps

    # False Positives: extracted spans with no exact or overlapping gold span
    FPs = lnex_lns - tweet_lns
    FPs_count += len(FPs)

    for x in FPs:
        # Clamp the context start at 0: a negative start would make the slice
        # count from the end of the tweet and yield a wrong (usually empty)
        # context for spans that begin at offset 0 or 1.
        context = tweet_text[max(0, x[0] - 2):x[1] + 2]
        FPs_set[tweet_text[x[0]:x[1]]].add((key, context, x))

    # False Negatives: gold spans with no exact or overlapping extraction
    FNs = tweet_lns - lnex_lns
    FNs_count += len(FNs)

    for x in FNs:
        context = tweet_text[max(0, x[0] - 2):x[1] + 2]
        FNs_set[tweet_text[x[0]:x[1]]].add((key, context, x))

# Since we add 2 lns (one from lnex_lns and one from tweet_lns) when they
# overlap, the equation for counting those as 1/2 FPs and 1/2 FNs is:
#     overlaps_count x
#         1/2 (since we count twice) x
#             1/2 (since we want 1/2 of all the errors made)
overlap_penalty = 0.5 * .5 * overlaps_count


def _ratio(numerator, denominator):
    '''Return numerator / denominator, or 0.0 when the denominator is zero.'''
    return numerator / denominator if denominator else 0.0


# Guarded divisions: with no evaluated tweets or no extractions the
# denominators are zero and a plain division would raise ZeroDivisionError.
Precision = _ratio(TPs_count, TPs_count + FPs_count + overlap_penalty)
Recall = _ratio(TPs_count, TPs_count + FNs_count + overlap_penalty)
F_Score = _ratio(2 * Precision * Recall, Precision + Recall)

# Share of extractions counted in one_geolocation
percentage_disambiguated = _ratio(one_geolocation, all_geolocation)

print("\t".join([dataset, str(Precision), str(Recall), str(F_Score)]))

#for x in FPs_set:
#    print (x, len(FPs_set[x]))
                                               
#for x in FNs_set:
#    print (x, len(FNs_set[x]))                                           

>>>> 1200 people i harris county have had to be rescued so far today due to houston flood houston via
[('1200', 0, 3), ('people', 5, 10), ('i', 12, 12), ('harris', 14, 19), ('county', 21, 26), ('have', 28, 31), ('had', 33, 35), ('to', 37, 38), ('be', 40, 41), ('rescued', 43, 49), ('so', 51, 52), ('far', 54, 56), ('today', 58, 62), ('due', 64, 66), ('to', 68, 69), ('houston', 72, 78), ('flood', 79, 83), ('houston', 86, 92), ('via', 118, 120)]
>>>> video : dozens of horses rescued from flood waters via houston flood
[('video', 0, 4), (':', 5, 5), ('dozens', 7, 12), ('of', 14, 15), ('horses', 17, 22), ('rescued', 24, 30), ('from', 32, 35), ('flood', 37, 41), ('waters', 43, 48), ('via', 74, 76), ('houston', 84, 90), ('flood', 100, 104)]
>>>> man found dead in submerged 18 wheeler in north houston via houston flood
[('man', 0, 2), ('found', 4, 8), ('dead', 10, 13), ('in', 15, 16), ('submerged', 18, 26), ('18', 28, 29), ('wheeler', 31, 37), ('in', 39, 40), ('north', 42, 46), ('houston', 48, 

In [68]:
lname = "Stuebner Airline"
# Use .get() instead of FNs_set[lname]: indexing a defaultdict inserts an
# empty entry for a name that was never recorded, polluting FNs_set.
lname_fns = FNs_set.get(lname, set())
lname_fns, len(lname_fns)

({('722190824599990300', 'f Stuebner Airline a', (54, 70))}, 1)