Copyright 2017 Hussein S. Al-Olimat, hussein@knoesis.org

This software is released under the GNU Affero General Public License (AGPL) v3.0 License.

# Evaluate LNEx performance using the locations gold standard

In [None]:
#!pip install PyHamcrest
#!pip install wordsegment
#!pip install shapely
#!pip install nltk
#!pip install elasticsearch
#!pip install elasticsearch_dsl
#!pip install geopy

In [None]:
import operator
import json, re, os
import pprint
pp = pprint.PrettyPrinter(indent=4)
from collections import defaultdict, OrderedDict

import sys 
sys.path.append("LNEx")
import LNEx as lnex

from shapely.geometry import MultiPoint

In [None]:
def do_they_overlap(tub1, tub2):
    '''Check whether two substrings of the tweet overlap based on their start
    and end offsets.

    Both arguments are ``(start, end)`` offset pairs. The comparison is
    inclusive on both ends, so two spans that merely touch at one offset
    (e.g. ``(0, 5)`` and ``(5, 8)``) are reported as overlapping.

    Returns ``True`` when the spans overlap and ``False`` otherwise.'''

    # Two intervals overlap exactly when each one starts no later than the
    # other one ends. Returning the expression directly yields an explicit
    # False (rather than an implicit None) for disjoint spans.
    return tub2[1] >= tub1[0] and tub1[1] >= tub2[0]

def read_annotations(filename):
    '''Load one annotation file from the ``_Data/Brat_Annotations`` folder.

    ``filename`` is the bare file name (for example
    ``"chennai_annotations.json"``); it is resolved relative to
    ``_Data/Brat_Annotations`` under the current working directory.

    Returns whatever object the JSON document decodes to.'''

    filename = os.path.join("_Data/Brat_Annotations", filename)

    # JSON is UTF-8 by specification; being explicit keeps non-ASCII tweet
    # text intact on platforms whose default locale encoding is not UTF-8.
    with open(filename, encoding="utf-8") as f:
        return json.load(f)

def init_using_elasticindex(bb, cache, augmentType, dataset, capital_word_shape):
    '''Configure LNEx to use the "photon" index on localhost:9200 and
    initialize it for the given bounding box.

    All arguments are forwarded to ``lnex.initialize``: ``bb`` positionally,
    ``dataset`` as ``dataset_name`` and the rest under their own names.

    Returns the value produced by ``lnex.initialize``.'''

    lnex.elasticindex(conn_string='localhost:9200', index_name="photon")

    # Collect the keyword options first so the initialize call stays short.
    init_options = {
        "augmentType": augmentType,
        "cache": cache,
        "dataset_name": dataset,
        "capital_word_shape": capital_word_shape,
    }
    return lnex.initialize(bb, **init_options)

def evaluate(anns, dataset_name=None):
    '''Score the spans extracted by LNEx against the gold annotations.

    For every tweet the gold ``inLoc`` spans are compared with the offsets
    returned by ``lnex.extract``:

    * identical offsets are true positives;
    * a gold span and an extracted span that overlap (see
      ``do_they_overlap``) are both collected as "overlaps"; every collected
      span adds 0.25 to the precision and to the recall denominators, so one
      overlapping pair costs half a false positive and half a false
      negative;
    * remaining extracted spans are false positives and remaining gold
      spans are false negatives.

    Args:
        anns: mapping of tweet key to a dict. The ``"text"`` entry holds the
            tweet text; every other entry is an annotation dict with
            ``'start_idx'``, ``'end_idx'`` and ``'type'``.
        dataset_name: name of the dataset being scored. For any dataset
            other than ``"houston"`` the first 499 tweets are skipped as the
            development set. When omitted, the module-level ``dataset``
            variable is used, which is how existing callers pass it.

    Returns:
        dict with the keys ``"precision"``, ``"recall"`` and ``"f-score"``.
        A ratio whose denominator is zero (nothing was scored) is reported
        as ``0.0`` instead of raising ``ZeroDivisionError``.'''

    if dataset_name is None:
        # Backward compatibility: the driver loop exposes the current
        # dataset through a notebook-level global named ``dataset``.
        dataset_name = dataset

    def _ratio(numerator, denominator):
        # Guarded division: an empty evaluation yields 0.0, not a crash.
        return numerator / denominator if denominator else 0.0

    TPs_count = 0
    FPs_count = 0
    FNs_count = 0
    overlaps_count = 0

    # Disambiguation statistics (diagnostic only, not part of the result).
    one_geolocation = 0
    all_geolocation = 0
    geo_codes_length_dist = defaultdict(int)

    # Error inventories keyed by the offending surface text; useful for
    # manual error analysis (e.g. print them just before returning).
    FPs_set = defaultdict(set)
    FNs_set = defaultdict(set)

    for count, key in enumerate(anns, start=1):

        # skip the development set
        if dataset_name != "houston" and count < 500:
            continue

        tweet_lns = set()
        lnex_lns = set()
        tweet_text = ""

        for ann, value in anns[key].items():
            if ann != "text":
                tweet_lns.add(((int(value['start_idx']), int(value['end_idx'])),
                               value['type']))
                continue

            tweet_text = value
            r = lnex.extract(tweet_text)

            # how many are already disambiguated
            for res in r:
                if len(res[3]) < 2:
                    one_geolocation += 1
                else:
                    geo_codes_length_dist[len(res[3])] += 1
                all_geolocation += 1

            lnex_lns = {x[1] for x in r}

        # Only inLoc annotations take part in the evaluation.
        tweet_lns = {span for span, ln_type in tweet_lns if ln_type == "inLoc"}

        # True Positives: identical offsets on both sides.
        TPs = tweet_lns & lnex_lns
        TPs_count += len(TPs)

        # Left in both sets
        tweet_lns -= TPs
        lnex_lns -= TPs

        # Overlapping LNs are counted as 1/2 FP and 1/2 FN per pair.
        overlaps = set()
        for x in tweet_lns:
            for y in lnex_lns:
                if do_they_overlap(x, y):
                    overlaps.add(x)
                    overlaps.add(y)

        overlaps_count += len(overlaps)

        # remove the overlapping lns from lnex_lns and tweet_lns
        lnex_lns -= overlaps
        tweet_lns -= overlaps

        # False Positives: extracted spans with no gold counterpart.
        FPs = lnex_lns - tweet_lns
        FPs_count += len(FPs)

        for x in FPs:
            # max(0, ...) keeps the two-character left context from wrapping
            # around to the end of the tweet when the span starts at 0 or 1.
            context = tweet_text[max(0, x[0] - 2):x[1] + 2]
            FPs_set[tweet_text[x[0]:x[1]]].add((key, context, x))

        # False Negatives: gold spans LNEx did not produce.
        FNs = tweet_lns - lnex_lns
        FNs_count += len(FNs)

        for x in FNs:
            context = tweet_text[max(0, x[0] - 2):x[1] + 2]
            FNs_set[tweet_text[x[0]:x[1]]].add((key, context, x))

    # Both members of an overlapping pair were added to ``overlaps``, so the
    # penalty per collected span is:
    #     1/2 (since we count twice) x 1/2 (half of the error) = 0.25
    overlap_penalty = 0.5 * .5 * overlaps_count

    Precision = _ratio(TPs_count, TPs_count + FPs_count + overlap_penalty)
    Recall = _ratio(TPs_count, TPs_count + FNs_count + overlap_penalty)
    F_Score = _ratio(2 * Precision * Recall, Precision + Recall)

    # Diagnostic only: share of extractions that had fewer than two
    # candidate geo entries. Not returned.
    percentage_disambiguated = _ratio(one_geolocation, all_geolocation)

    return {"precision": Precision,
            "recall": Recall,
            "f-score": F_Score}

In [None]:
# Bounding box coordinates (four floats) for each evaluation dataset.
bbs = {
    "chennai": [12.74, 80.066986084, 13.2823848224, 80.3464508057],
    "louisiana": [29.4563, -93.3453, 31.4521, -89.5276],
    "houston": [29.4778611958, -95.975189209, 30.1463147381, -94.8889160156],
}

# Augmentation modes evaluated in this run ("FULL" is commented out here).
augmentTypes = ["FILTER", "HP", "NA"]

for augmentType in augmentTypes:
    print(augmentType)

    # Per-dataset scores for the current augmentation mode.
    results = {}
    for dataset in bbs:
        print(dataset)

        # (Re)initialize LNEx for this dataset's bounding box.
        geo_info = init_using_elasticindex(
            bbs[dataset],
            cache=False,
            augmentType=augmentType,
            dataset=dataset,
            capital_word_shape=False,
        )

        filename = dataset + "_annotations.json"
        anns = read_annotations(filename)

        # evaluate() reads the current ``dataset`` loop variable as a global.
        results[dataset] = evaluate(anns)

    # The average is computed before the "average-f" key is inserted, so it
    # only covers the per-dataset entries.
    results["average-f"] = (
        sum(scores["f-score"] for scores in results.values()) / len(results)
    )
    pp.pprint(results)
    print("#" * 50)

## FULL

{   'average-f': 0.8282360007690093,
    'chennai': {   'f-score': 0.8900994789199431,
                   'precision': 0.9470766129032258,
                   'recall': 0.8395889186773905},
    'houston': {   'f-score': 0.7369206180074587,
                   'precision': 0.848068669527897,
                   'recall': 0.651530852567122},
    'louisiana': {   'f-score': 0.857687905379626,
                     'precision': 0.895617529880478,
                     'recall': 0.8228404099560761}}

## FILTER

{   'average-f': 0.8359112258688132,
    'chennai': {   'f-score': 0.8766069926709119,
                   'precision': 0.9362248171435904,
                   'recall': 0.8241274144357845},
    'houston': {   'f-score': 0.765867567852994,
                   'precision': 0.8945355865301591,
                   'recall': 0.6695595974517589},
    'louisiana': {   'f-score': 0.8652591170825337,
                     'precision': 0.9147727272727273,
                     'recall': 0.820830298616169}}

## HP

{   'average-f': 0.836498466629482,
    'chennai': {   'f-score': 0.8732905724313204,
                   'precision': 0.942159550855203,
                   'recall': 0.8138039923311153},
    'houston': {   'f-score': 0.768642447418738,
                   'precision': 0.904952476238119,
                   'recall': 0.6680206794682423},
    'louisiana': {   'f-score': 0.8675623800383877,
                     'precision': 0.9186991869918699,
                     'recall': 0.8218181818181818}}

## NA

{   'average-f': 0.7123791000175698,
    'chennai': {   'f-score': 0.8141632837167342,
                   'precision': 0.811965811965812,
                   'recall': 0.8163726820443238},
    'houston': {   'f-score': 0.6648441771459813,
                   'precision': 0.6485333333333333,
                   'recall': 0.6819966348850253},
    'louisiana': {   'f-score': 0.658129839189994,
                     'precision': 0.554718875502008,
                     'recall': 0.8089311859443631}}