In [1]:
import pickle
import json
import numpy as np
import pandas as pd
import math
import urllib
import requests

from bokeh.plotting import figure, show, output_notebook
from bokeh.tile_providers import CARTODBPOSITRON

In [2]:
def fix_JSON(json_message=None):
    '''Parse a JSON string, blanking out offending characters until it decodes.

    Each time decoding fails, the character at the position reported by the
    decoder is replaced with a single space and decoding is retried.

    Parameters
    ----------
    json_message : str
        The (possibly malformed) JSON text.

    Returns
    -------
    The decoded JSON value (a dict for a JSON object).

    Raises
    ------
    json.JSONDecodeError (a ValueError subclass)
        When the text cannot be repaired, i.e. the reported error position is
        past the end of the text (empty or truncated input) or already holds
        a space.
    TypeError
        When json_message is not something json.loads accepts (e.g. None).
    '''
    try:
        return json.loads(json_message)
    except json.JSONDecodeError as first_error:
        error = first_error

    # Work on a mutable list of characters and loop instead of recursing, so a
    # message with many bad characters cannot exhaust the recursion limit.
    chars = list(json_message)
    while True:
        # Use the decoder's own position rather than parsing it out of the
        # human-readable error message.
        idx_to_replace = error.pos
        # Nothing left to blank out: re-raise instead of failing with an
        # IndexError or looping forever on the same position.
        if idx_to_replace >= len(chars) or chars[idx_to_replace] == ' ':
            raise error
        # Remove the offending character:
        chars[idx_to_replace] = ' '
        try:
            return json.loads(''.join(chars))
        except json.JSONDecodeError as next_error:
            error = next_error

In [3]:
def get_lat_long(bbox):
    '''Return the centre of a bounding box as a (latitude, longitude) tuple.

    Parameters
    ----------
    bbox : dict or None
        Bounding-box dict whose 'coordinates' entry holds a list of rings;
        only the first ring is used. In the sample record shown in this
        notebook each point is ordered [longitude, latitude].

    Returns
    -------
    tuple
        (mean of column 1, mean of column 0) of the first ring, i.e.
        (latitude, longitude) for [longitude, latitude] points.
        (None, None) when bbox is None or does not have the expected shape.
    '''
    try:
        ring = np.array(bbox['coordinates'][0])
        longitude = np.average(ring[:, 0])
        latitude = np.average(ring[:, 1])
    except (TypeError, KeyError, IndexError, ValueError):
        # Missing or malformed bounding box: report "no location" instead of
        # crashing the caller's loop.
        return None, None
    return latitude, longitude

In [4]:
def get_location_data(x, y, timeout=30):
    '''
    Look up the Census geography details for a given x,y location.

    Example request:
    https://geocoding.geo.census.gov/geocoder/geographies/coordinates?x=-111.739979&y=33.381074&benchmark=Public_AR_Current&vintage=Current_Current&format=json

    Parameters
    ----------
    x : float
        Sent as the `x` query parameter (longitude in the example above).
    y : float
        Sent as the `y` query parameter (latitude in the example above).
    timeout : float, optional
        Seconds to wait for the API before giving up.

    Returns
    -------
    dict
        The 'result' object of the API response, or an empty dict when the
        coordinates are missing, the request fails, the status is not 200,
        or the body cannot be decoded. An empty dict is safe to pass to
        dict.update().
    '''
    # No coordinates means nothing to look up; skip the network call.
    if x is None or y is None:
        return {}

    base_url = 'https://geocoding.geo.census.gov/geocoder/geographies/coordinates'
    search_term = {
        'format': 'json',
        'vintage': 'Current_Current',
        'benchmark': 'Public_AR_Current',
        'x': x,
        'y': y,
    }

    try:
        # Let requests build and encode the query string.
        response = requests.get(base_url, params=search_term, timeout=timeout)
    except requests.RequestException as e:
        print("Unable to fetch data from API,", e)
        return {}

    if response.status_code != 200:
        #Some error handling
        print("Unable to fetch data from API, Reponse = ",response.status_code)
        return {}

    try:
        res = json.loads(response.content)
    except Exception as e:
        print("Error for content,",e)
        return {}

    # Guard against a body that is valid JSON but lacks a 'result' object.
    result = res.get('result') if isinstance(res, dict) else None
    return result if isinstance(result, dict) else {}

## Processing file : neighborhoods.json

In [5]:
# Raw string so the Windows backslashes are taken literally; in a normal string
# literal `\D`, `\T`, `\C` and `\%` are invalid escape sequences.
# `%s` is the placeholder for the file name.
BASE_PATH = r'E:\Datasets\TWITTER_SOCG\CONTROL\%s'
file_name = 'neighborhoods.json'
file = BASE_PATH % file_name

In [8]:
# Accumulator for the neighborhood records.
neigh = list()

In [9]:
# Start from an empty list so re-running this cell does not append duplicates.
neigh = []
# NOTE(review): assumes the file is UTF-8 encoded JSON lines — confirm for this dataset.
with open(file, encoding='utf-8') as f:
    for line in f:
        # Blank lines carry no record and cannot be repaired by fix_JSON.
        if not line.strip():
            continue
        tmp_all = fix_JSON(line)
        # Records without a bounding box get no coordinates.
        bbox = tmp_all.get('bounding_box')
        lat, lon = get_lat_long(bbox) if bbox else (None, None)
        tmp_all['lat'] = lat
        tmp_all['lon'] = lon
        # Only query the geocoder when there is a location to look up
        # (x = longitude, y = latitude).
        if lat is not None and lon is not None:
            counties = get_location_data(lon, lat)
            tmp_all.update(counties)
        neigh.append(tmp_all)

In [11]:
## Writing the collected neighborhood records to neighborhoods.pkl
with open(BASE_PATH % 'neighborhoods.pkl', mode='wb') as handle:
    pickle.dump(neigh, handle, pickle.HIGHEST_PROTOCOL)

## Processing file : points.json

In [12]:
# Re-point `file` at the points dataset in the same directory.
file_name = 'points.json'
file = BASE_PATH % file_name

In [13]:
# Accumulator for the point records.
points = list()

In [14]:
# Start from an empty list so re-running this cell does not append duplicates.
points = []
# NOTE(review): assumes the file is UTF-8 encoded JSON lines — confirm for this dataset.
with open(file, encoding='utf-8') as f:
    for line in f:
        # Blank lines carry no record and cannot be repaired by fix_JSON.
        if not line.strip():
            continue
        tmp_all = fix_JSON(line)
        # Records without a bounding box get no coordinates.
        bbox = tmp_all.get('bounding_box')
        lat, lon = get_lat_long(bbox) if bbox else (None, None)
        tmp_all['lat'] = lat
        tmp_all['lon'] = lon
        # Only query the geocoder when there is a location to look up
        # (x = longitude, y = latitude).
        if lat is not None and lon is not None:
            counties = get_location_data(lon, lat)
            tmp_all.update(counties)
        points.append(tmp_all)

In [15]:
## Writing the collected point records to points.pkl
with open(BASE_PATH % 'points.pkl', mode='wb') as handle:
    pickle.dump(points, handle, pickle.HIGHEST_PROTOCOL)

## Processing : missing cities

In [17]:
# Display the first point record to check the fields merged in from the geocoder.
points[0]

{'attributes': {},
 'bounding_box': {'coordinates': [[[-93.23625, 44.98042],
    [-93.23625, 44.98042],
    [-93.23625, 44.98042],
    [-93.23625, 44.98042]]],
  'type': 'Polygon'},
 'contained_within': [],
 'country': 'United States',
 'country_code': 'US',
 'full_name': 'Loring Bar & Restaurant',
 'geographies': {'2010 Census Blocks': [{'AREALAND': 15790,
    'AREAWATER': 0,
    'BASENAME': '2005',
    'BLKGRP': '2',
    'BLOCK': '2005',
    'CENTLAT': '+44.9802734',
    'CENTLON': '-093.2368042',
    'COUNTY': '053',
    'FUNCSTAT': 'S',
    'GEOID': '270531039002005',
    'INTPTLAT': '+44.9802734',
    'INTPTLON': '-093.2368042',
    'LSADC': 'BK',
    'LWBLKTYP': 'L',
    'MTFCC': 'G5040',
    'NAME': 'Block 2005',
    'OBJECTID': 7084032,
    'OID': 210404046013537,
    'STATE': '27',
    'STGEOMETRY.AREA': 31560.307,
    'STGEOMETRY.LEN': 710.1859,
    'SUFFIX': '',
    'TRACT': '103900'}],
  'Census Tracts': [{'AREALAND': 269683,
    'AREAWATER': 0,
    'BASENAME': '1039',
    

In [19]:
## Loading the previously saved missing-city records from missing_cities.pkl
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(BASE_PATH % 'missing_cities.pkl', mode='rb') as handle:
    missing_cities_county = pickle.load(handle)

In [21]:
# Enrich every missing-city entry in place with the geocoder result for its
# stored 'lat'/'lon' (x = longitude, y = latitude).
for m, m_city in missing_cities_county.items():
    lat, lon = m_city['lat'], m_city['lon']
    counties = get_location_data(lon, lat)
    m_city.update(counties)

In [22]:
## Writing the enriched missing-city records to missing_cities_county.pkl
with open(BASE_PATH % 'missing_cities_county.pkl', mode='wb') as handle:
    pickle.dump(missing_cities_county, handle, pickle.HIGHEST_PROTOCOL)