Copyright 2017 Hussein S. Al-Olimat, hussein@knoesis.org

This software is released under the GNU Affero General Public License (AGPL) v3.0.

# Evaluate LNEx performance using the locations gold standard

In [None]:
!pip install PyHamcrest
!pip install wordsegment
!pip install shapely
!pip install nltk
!pip install elasticsearch
!pip install elasticsearch_dsl
!pip install geopy

In [22]:
import json, re, os
from shapely.geometry import MultiPoint
import operator
from collections import defaultdict, OrderedDict

import sys 
sys.path.append("LNEx")
import LNEx as lnex

In [23]:
def do_they_overlap(tub1, tub2):
    '''Check whether two substrings of the tweet overlap, based on their
    start and end offsets.

    Args:
        tub1: (start, end) offsets of the first substring.
        tub2: (start, end) offsets of the second substring.

    Returns:
        True when the two spans overlap, False otherwise. Both bounds are
        compared inclusively, so spans that merely touch (the end of one
        equals the start of the other) are reported as overlapping.
    '''

    # Return the comparison itself so that the non-overlapping case yields an
    # explicit False instead of an implicit None.
    return tub2[1] >= tub1[0] and tub1[1] >= tub2[0]

def read_annotations(filename):
    '''Load an annotations JSON file from the _Data/Brat_Annotations folder.

    Args:
        filename: name (or relative path) of the JSON file inside
            "_Data/Brat_Annotations".

    Returns:
        The parsed JSON content.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    '''

    filename = os.path.join("_Data/Brat_Annotations", filename)

    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's default locale encoding, which would garble non-ASCII tweets.
    with open(filename, encoding="utf-8") as f:
        return json.load(f)

def init_using_elasticindex(bb, cache, augmentType, dataset, capital_word_shape,
                            conn_string='localhost:9200', index_name="photon"):
    '''Point LNEx at an Elasticsearch index and initialize it for one area.

    Args:
        bb: bounding box handed to lnex.initialize as its first argument.
        cache: forwarded to lnex.initialize as `cache`.
        augmentType: forwarded to lnex.initialize as `augmentType`.
        dataset: forwarded to lnex.initialize as `dataset_name`.
        capital_word_shape: forwarded to lnex.initialize as
            `capital_word_shape`.
        conn_string: Elasticsearch connection string passed to
            lnex.elasticindex; defaults to the local instance used before.
        index_name: name of the index passed to lnex.elasticindex; defaults
            to "photon".

    Returns:
        Whatever lnex.initialize returns (stored by the caller as geo_info).
    '''
    # The index connection has to be configured before LNEx is initialized.
    lnex.elasticindex(conn_string=conn_string, index_name=index_name)

    geo_info = lnex.initialize(bb,
                               augmentType=augmentType,
                               cache=cache,
                               dataset_name=dataset,
                               capital_word_shape=capital_word_shape)
    return geo_info

In [24]:
# Bounding boxes of the evaluation areas, keyed by dataset name.
# NOTE(review): the values look like [min_lat, min_lon, max_lat, max_lon] —
# confirm against what lnex.initialize expects.
bbs = {
    "chennai": [12.74, 80.066986084, 13.2823848224, 80.3464508057],
    "louisiana": [29.4563, -93.3453, 31.4521, -89.5276],
    "houston": [29.4778611958, -95.975189209, 30.1463147381, -94.8889160156],
    "columbus": [39.808631, -83.2102799, 40.1572719, -82.7713781],
    "test": [41.6187434973, -83.7106928844, 41.6245055116, -83.7017216664],
}

# Dataset evaluated by the rest of the notebook; must be a key of bbs.
dataset = "louisiana"

# Initialize LNEx for the selected area.
geo_info = init_using_elasticindex(
    bbs[dataset],
    cache=False,
    augmentType="HP",
    dataset=dataset,
    capital_word_shape=False,
)

Initializing LNEx ...
Done Initialization ...


In [25]:
# Gold-standard file of the selected dataset, e.g. "Louisiana_annotations.json";
# read_annotations() resolves it inside the _Data/Brat_Annotations folder.
filename = dataset.title() + "_annotations.json"
anns = read_annotations(filename)

In [26]:
# Tweets whose 1-based position in the annotations is below this cutoff are
# treated as the development set and left out of the evaluation.
# NOTE(review): `count < 500` skips the first 499 tweets — confirm whether the
# development set is meant to hold 499 or 500 tweets.
DEV_SET_CUTOFF = 500

TPs_count = 0
FPs_count = 0
FNs_count = 0
overlaps_count = 0

# Not filled in this cell; the name is kept in case other cells rely on it.
fns = defaultdict(int)

count = 0
one_geolocation = 0
all_geolocation = 0
geo_codes_length_dist = defaultdict(int)

# The dev/test split depends on the order in which the tweets appear in anns.
for count, key in enumerate(anns, start=1):

    # skip the development set (the houston dataset is evaluated in full)
    if dataset != "houston" and count < DEV_SET_CUTOFF:
        continue

    tweet_lns = set()
    lnex_lns = set()
    tweet_text = ""

    for ann_key, ann_value in anns[key].items():
        if ann_key != "text":
            # gold annotation: remember it as ((start, end), type)
            tweet_lns.add(((int(ann_value['start_idx']),
                            int(ann_value['end_idx'])),
                           ann_value['type']))
        else:
            tweet_text = ann_value

            r = lnex.extract(tweet_text)

            # how many are already disambiguated +++++++++++++++++++++++
            # res[3] is presumably the list of candidate geo entries of an
            # extracted name (TODO confirm); fewer than two entries is
            # counted as "already disambiguated".
            for res in r:
                if len(res[3]) < 2:
                    one_geolocation += 1
                else:
                    geo_codes_length_dist[len(res[3])] += 1

                all_geolocation += 1
            # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

            # (start, end) offsets of the names extracted by LNEx
            lnex_lns = set([x[1] for x in r])

    # only the "inLoc" gold annotations have to be found by LNEx
    tweet_lns = set([x[0] for x in tweet_lns if x[1] == "inLoc"])

    # True Positives: exact offset matches +++++++++++++++++++++++++++++
    TPs = tweet_lns.intersection(lnex_lns)
    TPs_count += len(TPs)

    # Left in both sets ++++++++++++++++++++++++++++++++++++++++++++++++
    tweet_lns -= TPs
    lnex_lns -= TPs

    # Find Overlapping LNs to be counted as 1/2 FPs and 1/2 FNs ++++++++
    overlaps = set()
    for x in tweet_lns:
        for y in lnex_lns:
            if do_they_overlap(x, y):
                overlaps.add(x)
                overlaps.add(y)

    overlaps_count += len(overlaps)

    # remove the overlapping lns from lnex_lns and tweet_lns
    lnex_lns -= overlaps
    tweet_lns -= overlaps

    # False Positives ++++++++++++++++++++++++++++++++++++++++++++++++++
    # lnex_lns = all - (TPs and overlaps and !inLoc)
    FPs = lnex_lns - tweet_lns
    FPs_count += len(FPs)

    if len(FPs) > 0:
        print("FPs >>>>>", key, tweet_text, [tweet_text[x[0]:x[1]] for x in FPs])

    # False Negatives ++++++++++++++++++++++++++++++++++++++++++++++++++
    FNs = tweet_lns - lnex_lns
    FNs_count += len(FNs)

    if len(FNs) > 0:
        print("FNs >>>>>", key, tweet_text, [tweet_text[x[0]:x[1]] for x in FNs])

# Since we add 2 lns, one from lnex_lns and one from tweet_lns, whenever they
# overlap, the weight for counting those as 1/2 FPs and 1/2 FNs is:
#     overlaps_count x
#         1/2 (since we count twice) x
#             1/2 (since we want 1/2 of all the errors made)
precision_denominator = TPs_count + FPs_count + 0.5 * .5 * overlaps_count
recall_denominator = TPs_count + FNs_count + 0.5 * .5 * overlaps_count

# Fall back to 0.0 instead of raising ZeroDivisionError when nothing was
# evaluated or nothing was matched.
Precision = TPs_count / precision_denominator if precision_denominator else 0.0
Recall = TPs_count / recall_denominator if recall_denominator else 0.0
F_Score = ((2 * Precision * Recall) / (Precision + Recall)
           if (Precision + Recall) else 0.0)

percentage_disambiguated = (one_geolocation / all_geolocation
                            if all_geolocation else 0.0)

print ("\t".join([dataset, str(Precision), str(Recall), str(F_Score)]))

FPs >>>>> 766735160813301760 In the Wake of Devastating Flooding in Louisiana, Groups Urge Obama Administration to Cancel Upcoming Gulf Dr... https://t.co/NpUdMyZU3I ['Gulf Dr']
FPs >>>>> 766748622394241024 How does Louisiana's flooding compare to Katrina?: The images of families stranded on island-roofs and surrounded https://t.co/rjpWRvOdiy ['Katrina']
FNs >>>>> 764486275940265984 RT @Dave_Nussbaum: Ongoing heavy rain & #flooding from the Northshore to #BatonRouge will continue for the rest of today. #lawx @WWLTV http ['la', 'Northshore']
FNs >>>>> 766492905502638080 Shelf Cloud in West Monroe, La. 8/18/16 @weatherchannel @bbcweather #Louisiana https://t.co/q43DvZw1cA ['La', 'West Monroe']
FNs >>>>> 766805655692779520 Praying for my fam down in Louisiana right now. #forthechurch #Louisiana  #LouisianaFlood @forthechurch https://t.co/gK62CpygcQ ['Louisiana']
FPs >>>>> 766669470911770628 Mississippi medical center receives record obesity grant: The University of Mississippi Medical ...

FNs >>>>> 766755631705444352 #politics #Usa Obama will visit Louisiana next week: The area has been damaged by catastrophic flooding over ... https://t.co/DlhdNnEBll ['Usa']
FPs >>>>> 766720631505379328 Strong T-Storm Near Fort Adams or 14 Miles E of Simmesport Moving NE At 15 MPH. Locations Impacted Include... #lawx https://t.co/ZKAIWHMors ['Fort Adams']
FNs >>>>> 766720631505379328 Strong T-Storm Near Fort Adams or 14 Miles E of Simmesport Moving NE At 15 MPH. Locations Impacted Include... #lawx https://t.co/ZKAIWHMors ['la']
FPs >>>>> 764487607736598528 South Louisiana is flooding and family is being affected. We're going to travel home from Texas to help :( people are needing to be rescued ['Texas']
FNs >>>>> 766617637493305344 Could Obama and Hillary at least have the common decency to ACT like they care?!  #LaFlooding ['La']
FNs >>>>> 766596237822799872 I give you... The new #Louisiana "open floor plan" bleached and move-in ready!  #laflood #BatonRouge https://t.co/cG18lxHlEX ['l

FNs >>>>> 766659440984547329 https://t.co/RmpuAOP9ln.  Where is #CrookedHillary and her #CrookedfFoundation  while #LaFlooding and Americans in Crisis! ['La']
FPs >>>>> 766737343621443584 Louisiana Flooding COULD Have Been Katrina 2.0.But, Obama Was ... https://t.co/8nZI0VQ5nm via @Bipartisanism https://t.co/BAOFfPQGCU ['Katrina']
FNs >>>>> 766730236851519488 It's 8x in one year a 500-year rainfall has hammered US as #climate makes extreme weather in #Louisiana more common. https://t.co/oPT8qjxiZl ['US']
FNs >>>>> 766643909317500928 RT @Dave_Nussbaum: A small shower with heavy rain is over Downtown #NewOrleans. #lawx @WWLTV https://t.co/n2hu2zbTyv ['la']
FPs >>>>> 766527131207634944 Mississippi State redshirt corner Chris Stamps is ready to roll this season.: "This is the best ... https://t.co/Z61KujStdh #mississippi ['Mississippi', 'mississippi']
FNs >>>>> 766671214395940864 The LBA is collecting gift cards to @Walmart @Lowes & @HomeDepot for Louisiana flooding victims. Learn more: ht

FNs >>>>> 766472458702458880 RT @ScarboroughMike: Celtic Studios helping anyone they can to help #Louisiana people in need. Great vid @MichaelPapajohn https://t.co/0FBU ['Celtic Studios']
FPs >>>>> 766777507374698496 Police charge man, woman after a night in the pickup in the Mississippi River: An Ottawa man and... https://t.co/gWEBbsi2Ng #mississippi ['mississippi']
FPs >>>>> 766822989895704576 Reading @bowie_tori Twitter feed inspires. Olympian breakout star from Sand Hill, an "unincorporated" #Mississippi town. ['Mississippi']
FPs >>>>> 764480397581377536 RT @LindaDono: At least 2 dead in southern Louisiana, Mississippi flooding https://t.co/SVyyC8DTFr via @theadvertiser https://t.co/WTZD6JoJ ['Mississippi']
FNs >>>>> 766838310677327873 @NOLAnews features @CristoReyBR during #LAFlooding - https://t.co/HZzGi9YNze ['LA']
FPs >>>>> 766640741011460097 Clouds forming-lawn a rain forest-color of car unknown-besides that it's #TGIF, #Louisiana #SECFootball soon! ['unknown']
FNs >>>>> 76667

FPs >>>>> 766472356411891713 I've seen in 5 places Deez Nuts is beating Jill Stein in a Texas POTUS poll.   <NEWLINE>  <NEWLINE> Meanwhile, Louisiana flooding info is buried on news sites. ['Texas']
FPs >>>>> 766464810540118016 #jobs Flood Demo Team Member PALA Group Inc Norco LA: Bachelors degree in a field related to natu... https://t.co/CKpYVtpNNl #Louisiana ['Norco']
FNs >>>>> 766464810540118016 #jobs Flood Demo Team Member PALA Group Inc Norco LA: Bachelors degree in a field related to natu... https://t.co/CKpYVtpNNl #Louisiana ['LA']
FPs >>>>> 766622043483824130 Scores of Louisiana schools remain closed after flooding via @educationweek https://t.co/hRTO61sMWj #edchat #suptchat #education ['Louisiana']
FPs >>>>> 766690452481404928 Hard to believe we set up temporary law office here for Katrina. Louisiana flooding: One family, 13 homes destroyed https://t.co/TGE4SSDTCV ['Katrina']
FPs >>>>> 766474419870978048 So this river flood water just moving from one parish to the next floodi

FPs >>>>> 766771237360140292 Trump Visits Flooding In Louisiana, Clinton Says State Needs No Distractions https://t.co/JWWWawshHx via @NwoReport ['Clinton']
FNs >>>>> 766741296476254208 Maybe Hillary will go down to #Louisiana if they put up some rails and place down some stools to lean on. #LousianaFlood ['Lousiana']
FPs >>>>> 766729820059205632 Ways to help the folks in #Louisiana #LouisianaFlood ~ a request from #HillaryClinton  https://t.co/OdDxkZvk4C ['Clinton']
FNs >>>>> 766729820059205632 Ways to help the folks in #Louisiana #LouisianaFlood ~ a request from #HillaryClinton  https://t.co/OdDxkZvk4C ['Louisiana']
FPs >>>>> 766676403899072512 HE'S GOING https://t.co/hC7ivEGywb via @AllenWest NOT #Obama nor #Clinton #Louisiana doesn't matter 2 them ['Clinton']
FNs >>>>> 766656709783916544 Louisiana devastated by flooding. It's worse than whatever you've seen. Titletown Brewing offers way to help. <NEWLINE> https://t.co/TmjgpH9ShQ ['Titletown Brewing']
FNs >>>>> 766641165181423617 RT

FPs >>>>> 766860629621735424 RT @ROaRR4Hillary: Closely monitoring the flooding in Louisiana & Mississippi. The @RedCross is... https://t.co/ZGGBET5m7k by @cscstars via ['Mississippi']
FNs >>>>> 764484888930103296 @ReggieChatman The water is over Nicholson and Highland right there. Highland has been shut down by @La_DOTD. #lawx ['Highland', 'Highland', 'la']
FNs >>>>> 766752406050746368 Schein hotline is open to support those affected by flooding inLouisiana https://t.co/0Px8tGGczr ['Louisiana']
FPs >>>>> 766740151724892160 #LouisianaFlood Hillary not visiting b/c you don't need distractions. #ClintonFoundation only 10% to charity Give it to #Louisiana victims. ['Clinton']
FPs >>>>> 766455686125514752 People are really trying to draw an equivalence between the flooding in Louisiana and Katrina? ['Katrina']
FPs >>>>> 766645401252401152 Obama Ripped Bush For Katrina, Absent During Louisiana Flooding https://t.co/bPE8ZEfmsU <NEWLINE> He's to busy golfing & keeping Malia out of jail. ['Kat