# Lab 5: GDELT (Geohash)

This lab explores calculating the distance between events and a point of interest using great-circle calculations and geohashing. To begin, we will pull 30 days' worth of event data from the GDELT website.

## Task 0

In [268]:
import datetime
import re 
from tzwhere import tzwhere
import pandas as pd
import numpy as np
import pytz
import gdelt

# tzwhere lookup object intended for time normalization.
# NOTE(review): tz1 is not referenced by any cell visible in this notebook — confirm it is still needed.
tz1 = tzwhere.tzwhere(forceTZ=True)
# Instantiate the gdeltPyR object used for searches.
gd = gdelt.gdelt()

# Event data for the hard-coded window 1 Oct 2017 - 31 Oct 2017 (a fixed range, not a
# rolling "last 30 days"). The distance functions defined below read this frame as a global.
# NOTE(review): presumably normcols=True is what yields the lowercase column names
# (eventrootcode, actiongeolat, ...) used below — confirm against the gdeltPyR docs.
last_30 = gd.Search(date=['2017 Oct 1','2017 Oct 31'],normcols=True,coverage=False)



GDELT does not have a url for date time 20171001234500


GDELT did not return data for date time 20171001234500


GDELT does not have a url for date time 20171030234500


GDELT did not return data for date time 20171030234500



## Task 1

After importing 30 days' worth of event data, we will write a function that returns the top ten events of the specified type(s) that occurred within a specified radius of a given lat/long location. This function will calculate distance using great-circle distances.

In [193]:
from haversine import haversine
#x=(lat,long), y=radius, z=[event codes], t=number of closest events we wish to return
def GreatCircleEvents(x,y,z,t):
    """Return the ``t`` events closest to a point, ranked by great-circle distance.

    Reads the module-level ``last_30`` frame, keeps the rows whose
    ``eventrootcode`` matches one of ``z``, and measures each row's
    (``actiongeolat``, ``actiongeolong``) against ``x`` with ``haversine``.

    Parameters
    ----------
    x : tuple
        ``(lat, long)`` of the point of interest.
    y : number
        Search radius, compared directly with the value ``haversine`` returns.
        NOTE(review): presumably kilometres (the haversine default) — confirm.
    z : iterable
        Event root codes to keep; each one is converted with ``str`` before
        matching.
    t : int
        Maximum number of events to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``t`` rows indexed by ``globaleventid``, nearest first. Empty when
        no event matches the codes and the radius.
    """
    # Turn the requested codes into strings so they can be matched against the column.
    codes = [str(code) for code in z]
    # Subset the data by the desired event codes (given by z).
    data = pd.DataFrame(last_30[last_30['eventrootcode'].isin(codes)])
    # Walk the three needed columns in lockstep instead of building a row
    # Series with .iloc several times per event.
    nearby = []
    for event_id, lat, lng in zip(data['globaleventid'], data['actiongeolat'], data['actiongeolong']):
        dist = haversine(x, (lat, lng))
        # Keep only the events that fall within the radius (y).
        if dist <= y:
            nearby.append((dist, event_id))
    # list.sort is stable, so events at the same distance keep their original order.
    nearby.sort(key=lambda pair: pair[0])
    ids = [event_id for _, event_id in nearby[:t]]
    # Index by event id so the rows can be pulled out in distance order.
    data = data.set_index('globaleventid')
    return pd.DataFrame(data.loc[ids])

# Proof of concept: up to 10 closest events with root codes 13, 14, 18, 19 or 20
# within a radius of 300 around (lat 15.05, long 1.82).
GreatCircleEvents((15.05,1.82),300,[13,14,18,19,20],10)  


Unnamed: 0_level_0,sqldate,monthyear,year,fractiondate,actor1code,actor1name,actor1countrycode,actor1knowngroupcode,actor1ethniccode,actor1religion1code,...,actiongeotype,actiongeofullname,actiongeocountrycode,actiongeoadm1code,actiongeoadm2code,actiongeolat,actiongeolong,actiongeofeatureid,dateadded,sourceurl
globaleventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
695524173,20171005,201710,2017,2017.7534,UAF,MILITANT,,,,,...,4,"Tongo Tongo, Tillabé, Niger",NG,NG09,22583,15.0522,1.8318,-1087028,20171005234500,http://in.reuters.com/article/niger-usa/french...
695524175,20171005,201710,2017,2017.7534,UAF,MILITANT,,,,,...,4,"Tongo Tongo, Tillabé, Niger",NG,NG09,22583,15.0522,1.8318,-1087028,20171005234500,http://in.reuters.com/article/niger-usa/french...
695523865,20171005,201710,2017,2017.7534,NGA,NIGERIA,NGA,,,,...,4,"Tillabery, Tillabé, Niger",NG,NG09,22583,14.2117,1.45306,-1086768,20171005234500,https://www.crisisgroup.org/africa/west-africa...
699403935,20171019,201710,2017,2017.7918,NER,NIGER,NER,,,,...,4,"Tillaberi, Tillabé, Niger",NG,NG09,22583,14.2117,1.45306,-1086768,20171019234500,http://www.france24.com/en/20171020-pentagon-o...
695523832,20171005,201710,2017,2017.7534,NERCOP,NIGER,NER,,,,...,4,"Koutoukale, Tillabé, Niger",NG,NG09,22579,13.6951,1.742,-1083605,20171005234500,https://www.crisisgroup.org/africa/west-africa...
695523834,20171005,201710,2017,2017.7534,NERCOP,NIGER,NER,,,,...,4,"Koutoukale, Tillabé, Niger",NG,NG09,22579,13.6951,1.742,-1083605,20171005234500,https://www.crisisgroup.org/africa/west-africa...
695523796,20171005,201710,2017,2017.7534,MOSRAD,ISLAMIC,,,,MOS,...,4,"Niamey, Niamey, Niger",NG,NG08,22581,13.5167,2.11667,-1084956,20171005234500,http://www.sandiegouniontribune.com/news/natio...
697422773,20171012,201710,2017,2017.7726,,,,,,,...,4,"Niamey, Niamey, Niger",NG,NG08,22581,13.5167,2.11667,-1084956,20171012234500,http://wfxl.com/news/state-news/flags-to-be-lo...
699726429,20171020,201710,2017,2017.7945,CVL,VILLAGE,,,,,...,4,"Niamey, Niamey, Niger",NG,NG08,22581,13.5167,2.11667,-1084956,20171020234500,http://www.newsweek.com/who-killed-4-us-soldie...
700437358,20171023,201710,2017,2017.8027,,,,,,,...,4,"Niamey, Niamey, Niger",NG,NG08,22581,13.5167,2.11667,-1084956,20171023234500,http://dailycaller.com/2017/10/23/dunford-reve...


## Task 2

Now we will write another similar function, but calculate distances with geohashing instead.

In [270]:
import pygeohash as pgh
#x=(lat,long), y=radius, z=[event codes], t=number of closest events we wish to return
def GeoHashEvents(x,y,z,t):
    """Return the ``t`` events closest to a point, ranked by geohash distance.

    Reads the module-level ``last_30`` frame, keeps the rows whose
    ``eventrootcode`` matches one of ``z``, encodes ``x`` and every event
    location as a precision-8 geohash, and ranks events by
    ``pgh.geohash_approximate_distance`` divided by 1000.

    NOTE(review): the sample run in this notebook returns fewer rows than
    GreatCircleEvents for identical arguments, so the approximate distance is
    evidently coarser than a haversine distance — check the pygeohash
    documentation before relying on the radius.

    Parameters
    ----------
    x : tuple
        ``(lat, long)`` of the point of interest.
    y : number
        Search radius, compared with the approximate distance after the
        division by 1000.
        NOTE(review): presumably that division converts metres to kilometres — confirm.
    z : iterable
        Event root codes to keep; each one is converted with ``str`` before
        matching.
    t : int
        Maximum number of events to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``t`` rows indexed by ``globaleventid``, nearest first. Empty when
        no event matches the codes and the radius.
    """
    # Turn the requested codes into strings so they can be matched against the column.
    codes = [str(code) for code in z]
    # Subset the data by the desired event codes (given by z).
    data = pd.DataFrame(last_30[last_30['eventrootcode'].isin(codes)])
    # An event without coordinates cannot be placed inside a radius, so drop it
    # rather than hashing NaN. This matches GreatCircleEvents, where a NaN
    # distance fails the `dist <= y` test.
    # NOTE(review): how pgh.encode treats NaN input is not visible here.
    data = data.dropna(subset=['actiongeolat', 'actiongeolong'])
    # Encoded location of the point of interest (x).
    location = pgh.encode(x[0], x[1], precision=8)
    # Walk the three needed columns in lockstep instead of building a row
    # Series with .iloc several times per event.
    nearby = []
    for event_id, lat, lng in zip(data['globaleventid'], data['actiongeolat'], data['actiongeolong']):
        # Encoded location of this event.
        data_point = pgh.encode(lat, lng, precision=8)
        dist = pgh.geohash_approximate_distance(location, data_point)/1000
        # Keep only the events that fall within the radius (y).
        if dist <= y:
            nearby.append((dist, event_id))
    # list.sort is stable, so events at the same distance keep their original order.
    nearby.sort(key=lambda pair: pair[0])
    ids = [event_id for _, event_id in nearby[:t]]
    # Index by event id so the rows can be pulled out in distance order.
    data = data.set_index('globaleventid')
    return pd.DataFrame(data.loc[ids])


# Proof of concept: same arguments as the GreatCircleEvents call above, so the two
# result tables can be compared directly.
GeoHashEvents((15.05,1.82),300,[13,14,18,19,20],10) 

Unnamed: 0_level_0,sqldate,monthyear,year,fractiondate,actor1code,actor1name,actor1countrycode,actor1knowngroupcode,actor1ethniccode,actor1religion1code,...,actiongeotype,actiongeofullname,actiongeocountrycode,actiongeoadm1code,actiongeoadm2code,actiongeolat,actiongeolong,actiongeofeatureid,dateadded,sourceurl
globaleventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
695524173,20171005,201710,2017,2017.7534,UAF,MILITANT,,,,,...,4,"Tongo Tongo, Tillabé, Niger",NG,NG09,22583,15.0522,1.8318,-1087028,20171005234500,http://in.reuters.com/article/niger-usa/french...
695524175,20171005,201710,2017,2017.7534,UAF,MILITANT,,,,,...,4,"Tongo Tongo, Tillabé, Niger",NG,NG09,22583,15.0522,1.8318,-1087028,20171005234500,http://in.reuters.com/article/niger-usa/french...
695523865,20171005,201710,2017,2017.7534,NGA,NIGERIA,NGA,,,,...,4,"Tillabery, Tillabé, Niger",NG,NG09,22583,14.2117,1.45306,-1086768,20171005234500,https://www.crisisgroup.org/africa/west-africa...
699403935,20171019,201710,2017,2017.7918,NER,NIGER,NER,,,,...,4,"Tillaberi, Tillabé, Niger",NG,NG09,22583,14.2117,1.45306,-1086768,20171019234500,http://www.france24.com/en/20171020-pentagon-o...


## Task 3

Now let's compare the run times of the great-circle and geohash calculations. We will start by amending the functions that we created above so that we do not take into account the time it takes to build the final data structure.

In [195]:
#shortened code
def GC_time(x,y,z):
    """Timing harness for the great-circle search; returns nothing.

    Runs the same filtering as GreatCircleEvents up to the radius subset:
    selects rows of the global ``last_30`` whose ``eventrootcode`` is in ``z``,
    measures each against ``x`` with ``haversine``, and keeps those at distance
    ``<= y``. The final subset is built and then discarded.

    x : (lat, long) of the point of interest
    y : search radius
    z : event root codes (converted with ``str`` before matching)

    NOTE(review): every row is read through repeated ``.iloc`` lookups, which
    may account for much of the measured time — confirm by profiling.
    """
    # string form of the requested event codes
    wanted_codes = [str(item) for item in z]
    # rows matching the requested event codes
    subset = pd.DataFrame(last_30[last_30['eventrootcode'].isin(wanted_codes)])
    in_range_ids = []
    ranked = []
    row_count = len(subset)
    for pos in range(row_count):
        event_point = (subset.iloc[pos]['actiongeolat'], subset.iloc[pos]['actiongeolong'])
        distance = haversine(x, event_point)
        # record the event only when it lies inside the radius
        if distance <= y:
            in_range_ids.append(subset.iloc[pos]['globaleventid'])
            # pair each id with its distance
            ranked.append((distance, subset.iloc[pos]['globaleventid']))
    # narrow the frame to the events inside the radius (result is not returned)
    subset = pd.DataFrame(subset[subset['globaleventid'].isin(in_range_ids)])

#shortened code
def GH_time(x,y,z):
    """Timing harness for the geohash search; returns nothing.

    Runs the same filtering as GeoHashEvents up to the radius subset: selects
    rows of the global ``last_30`` whose ``eventrootcode`` is in ``z``, encodes
    ``x`` and each event location as a precision-8 geohash, and keeps events
    whose ``pgh.geohash_approximate_distance`` divided by 1000 is ``<= y``.
    The final subset is built and then discarded.

    x : (lat, long) of the point of interest
    y : search radius
    z : event root codes (converted with ``str`` before matching)

    NOTE(review): every row is read through repeated ``.iloc`` lookups, which
    may account for much of the measured time — confirm by profiling.
    """
    # string form of the requested event codes
    wanted_codes = [str(item) for item in z]
    # rows matching the requested event codes
    subset = pd.DataFrame(last_30[last_30['eventrootcode'].isin(wanted_codes)])
    in_range_ids = []
    ranked = []
    # geohash of the point of interest
    origin_hash = pgh.encode(x[0], x[1], precision=8)
    row_count = len(subset)
    for pos in range(row_count):
        # geohash of this event's location
        event_hash = pgh.encode(subset.iloc[pos]['actiongeolat'], subset.iloc[pos]['actiongeolong'], precision=8)
        distance = pgh.geohash_approximate_distance(origin_hash, event_hash)/1000
        # record the event only when it lies inside the radius
        if distance <= y:
            in_range_ids.append(subset.iloc[pos]['globaleventid'])
            # pair each id with its distance
            ranked.append((distance, subset.iloc[pos]['globaleventid']))
    # narrow the frame to the events inside the radius (result is not returned)
    subset = pd.DataFrame(subset[subset['globaleventid'].isin(in_range_ids)])


Now let's find the run times for both functions:

In [196]:
import timeit

# Read the timer immediately before the single timed call.
start = timeit.default_timer()

# Great-circle variant, same point, radius and event codes as the Task 1 proof-of-concept call.
GC_time((15.05,1.82),300,[13,14,18,19,20])

# Read the timer again once the call has returned.
stop = timeit.default_timer()

# Elapsed time of this one run — a single measurement, not an average over repeats.
print(stop - start)

2.4423691279953346


In [197]:
# Read the timer immediately before the single timed call (reuses the names from the previous cell).
start = timeit.default_timer()

# Geohash variant, same point, radius and event codes as the great-circle timing.
GH_time((15.05,1.82),300,[13,14,18,19,20])

# Read the timer again once the call has returned.
stop = timeit.default_timer()

# Elapsed time of this one run — a single measurement, not an average over repeats.
print(stop - start)

2.5568057630443946


Both approaches are similar in terms of computation time. Geohashing, as it is implemented above, takes slightly longer to complete. It should be noted that there are other ways to implement geohashing that can significantly improve computation time, such as using geohashes with a tree-like structure.

## Task 4

We will write another function that creates an interactive map and report from the dataframe returned by the GreatCircleEvents function.

In [259]:
#return a report of all incidents within a 250 km radius
import folium
#dataframe=greatcircle definition, lat=point of interest lat, long= point of interest long
def Map_Report(dataframe,lat,long,zoom_start=9):
    """Build an interactive folium map with one marker per event.

    Parameters
    ----------
    dataframe : pandas.DataFrame (or anything ``pd.DataFrame`` accepts)
        Events to plot. The columns ``actiongeolat``, ``actiongeolong`` and
        ``actor1name`` are read.
    lat, long : number
        Coordinates the map is centred on (the point of interest).
    zoom_start : int, optional
        Initial zoom level passed to ``folium.Map``; defaults to 9, the value
        this function previously hard-coded.

    Returns
    -------
    folium.Map
        Map with a marker at each event location; the marker popup shows the
        event's ``actor1name``, or ``'Unknown actor'`` when the name is missing.
    """
    info = pd.DataFrame(dataframe)
    # A marker needs a concrete position, so rows without coordinates are skipped.
    info = info.dropna(subset=['actiongeolat', 'actiongeolong'])
    # Marker positions as [lat, long] pairs, and the matching actor names.
    locationlist = info[['actiongeolat', 'actiongeolong']].values.tolist()
    nameslist = info['actor1name'].values.tolist()
    # Create the map centred on the point of interest.
    maps = folium.Map(location=[lat, long], zoom_start=zoom_start)
    # Add one marker per event, labelled with the actor name.
    for location, name in zip(locationlist, nameslist):
        # Avoid showing the literal text "nan" for events with no actor name.
        label = 'Unknown actor' if pd.isnull(name) else str(name)
        folium.Marker(location=location, popup=label).add_to(maps)
    return maps

In [260]:
# Create the report table: up to 50 closest events with root codes 13, 14, 18, 19 or 20
# within a radius of 250 around (lat 40.785091, long -73.968285).
# NOTE: `data` is a notebook-level name that the map cell below reads.
data=GreatCircleEvents((40.785091, -73.968285),250,[13,14,18,19,20],50)
# Display the table as the cell output.
data

Unnamed: 0_level_0,sqldate,monthyear,year,fractiondate,actor1code,actor1name,actor1countrycode,actor1knowngroupcode,actor1ethniccode,actor1religion1code,...,actiongeotype,actiongeofullname,actiongeocountrycode,actiongeoadm1code,actiongeoadm2code,actiongeolat,actiongeolong,actiongeofeatureid,dateadded,sourceurl
globaleventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
694900385,20171003,201710,2017,2017.7479,USA,UNITED STATES,USA,,,,...,3,"Manhattan, New York, United States",US,USNY,NY061,40.7834,-73.9662,971945,20171003234500,http://gothamist.com/2017/10/03/columbia_sexua...
694900409,20171003,201710,2017,2017.7479,USA,CHICAGO,USA,,,,...,3,"Manhattan, New York, United States",US,USNY,NY061,40.7834,-73.9662,971945,20171003234500,http://www.policemag.com/channel/patrol/news/2...
695206075,20171004,201710,2017,2017.7507,LEG,LEGISLATOR,,,,,...,3,"Manhattan, New York, United States",US,USNY,NY061,40.7834,-73.9662,971945,20171004234500,https://www.wsj.com/articles/beware-of-hurrica...
695206110,20171004,201710,2017,2017.7507,LEG,LEGISLATOR,,,,,...,3,"Manhattan, New York, United States",US,USNY,NY061,40.7834,-73.9662,971945,20171004234500,https://www.wsj.com/articles/beware-of-hurrica...
696468451,20171009,201710,2017,2017.7644,COP,POLICE,,,,,...,3,"Manhattan, New York, United States",US,USNY,NY061,40.7834,-73.9662,971945,20171009234500,http://kansaspublicradio.org/kpr-news/topeka-p...
698422428,20171016,201710,2017,2017.7836,MED,WRITER,,,,,...,3,"Manhattan, New York, United States",US,USNY,NY061,40.7834,-73.9662,971945,20171016234500,https://townhall.com/tipsheet/micahrate/2017/1...
700121539,20171022,201710,2017,2017.8,COP,POLICE,,,,,...,3,"Manhattan, New York, United States",US,USNY,NY061,40.7834,-73.9662,971945,20171022234500,http://www.ny1.com/nyc/all-boroughs/news/2017/...
700121554,20171022,201710,2017,2017.8,COP,POLICE,,,,,...,3,"Manhattan, New York, United States",US,USNY,NY061,40.7834,-73.9662,971945,20171022234500,http://www.ny1.com/nyc/all-boroughs/news/2017/...
702773285,20171031,201710,2017,2017.8247,COP,POLICE,,,,,...,3,"Manhattan, New York, United States",US,USNY,NY061,40.7834,-73.9662,971945,20171031234500,http://www.startribune.com/the-latest-police-s...
695806905,20171006,201710,2017,2017.7562,COP,POLICE,,,,,...,3,"Upper West Side, New York, United States",US,USNY,,40.787,-73.9754,2062672,20171006234500,http://nicholasstixuncensored.blogspot.com/201...


In [261]:
# Create the map from the report table, centred on (lat 40.8, long -73.9).
Map_Report(data,40.8,-73.9)