# Parse Locations
Purpose is to find locations from tweets and label them.<br>
By: Jonathan Lo<br>
Date: 5/22/23

## Overhead

In [2]:
# Imports
import spacy
from pymongo import MongoClient
from json import load
from pprint import pprint
from geopy.geocoders import GeoNames
import requests as r
from time import sleep

In [3]:
# Obtain secrets information
secrets_data = load(open("secrets.json", "r"))
mongo_user = secrets_data['MongoUser']
mongo_pass = secrets_data['MongoPass']
geo_user = secrets_data['GeoUser']
twitter_bearer = secrets_data['TwitterBearer']

# Initialize Geolocater
geolocator = GeoNames(username = geo_user)

# Connect to Atlas
connection_string = f"mongodb+srv://{mongo_user}:{mongo_pass}@dsc104-final-project.6oeuizv.mongodb.net/"
client = MongoClient(connection_string)
db = client.tweets
tweets = db.tweets

# Load Spacy
nlp = spacy.load("en_core_web_sm")

## Process (GeoLocator)

In [15]:
# Query for tweet coords
query = {
    "tweet_coord": {
        "$exists": True,
        "$ne": [0, 0]
    }
}
projection = {
    "_id": 0,
    "tweet_id": 1,
    "tweet_coord": 1
}
results = list(tweets.find(query, projection))

In [16]:
results

[{'tweet_id': 569942903683813376, 'tweet_coord': [36.08457854, -115.13780136]},
 {'tweet_id': 569304436612444160, 'tweet_coord': [38.9128188, -77.00798226]},
 {'tweet_id': 569244860286263297, 'tweet_coord': [32.8437698, -96.84928399]},
 {'tweet_id': 568815689684815873, 'tweet_coord': [37.77465018, -122.44032176]},
 {'tweet_id': 568577243200376833, 'tweet_coord': [33.94696831, -118.40747994]},
 {'tweet_id': 570124596180955136, 'tweet_coord': [33.94540417, -118.4062472]},
 {'tweet_id': 570088404156698625, 'tweet_coord': [33.94209449, -118.40410103]},
 {'tweet_id': 569847920192655361, 'tweet_coord': [26.074379, -80.1416831]},
 {'tweet_id': 569083230361673728, 'tweet_coord': [26.06726717, -80.14433663]},
 {'tweet_id': 570264145116819457, 'tweet_coord': [40.74804263, -73.99295302]},
 {'tweet_id': 569967019958730753, 'tweet_coord': [40.6413712, -73.78311558]},
 {'tweet_id': 569881548515708928, 'tweet_coord': [37.79374402, -122.39327564]},
 {'tweet_id': 569666477265018881, 'tweet_coord': [51.

In [47]:
# Hard cap at 1k/hr => 24k/day

def find_location(coords: tuple):
    """ Takes in coordinates and locates the city
    """
    location = geolocator.reverse(coords, exactly_one=True)
    if location:
        data = location.raw
        return data['name'], data['adminName1']
    return ""

# Find location for tweets
for tweet_obj in results:
    tweet_obj['location'] = find_location(tweet_obj['tweet_coord'])

GeocoderQuotaExceeded: the hourly limit of 1000 credits for jonathanlo411 has been exceeded. Please throttle your requests or use the commercial service.

In [48]:
pprint(results)

[{'location': ('Paradise', 'Nevada'),
  'reverse_geocode': 'Paradise, NV',
  'tweet_coord': [36.08457854, -115.13780136],
  'tweet_id': 569942903683813376},
 {'location': ('Youngsborough (historical)', 'Washington, D.C.'),
  'reverse_geocode': 'Washington, DC',
  'tweet_coord': [38.9128188, -77.00798226],
  'tweet_id': 569304436612444160},
 {'location': ('Oldham', 'Texas'),
  'reverse_geocode': 'Dallas, TX',
  'tweet_coord': [32.8437698, -96.84928399],
  'tweet_id': 569244860286263297},
 {'location': ('North of the Panhandle', 'California'),
  'reverse_geocode': 'San Francisco, CA',
  'tweet_coord': [37.77465018, -122.44032176],
  'tweet_id': 568815689684815873},
 {'location': ('Westchester', 'California'),
  'reverse_geocode': 'Los Angeles, CA',
  'tweet_coord': [33.94696831, -118.40747994],
  'tweet_id': 568577243200376833},
 {'location': ('Westchester', 'California'),
  'reverse_geocode': 'Los Angeles, CA',
  'tweet_coord': [33.94540417, -118.4062472],
  'tweet_id': 5701245961809551

## Process (Spacy.io)

In [11]:
# Query for tweet locations (ambigous)
query = {
    "tweet_location": {
        "$exists": True,
        "$ne": [0, 0]
    }
}
projection = {
    "_id": 0,
    "tweet_id": 1,
    "tweet_location": 1
}
results = list(tweets.find(query, projection))

In [13]:
def extract_locations(text):
    doc = nlp(text)
    locations = []
    # Looking for geopolitcal entities
    for entity in doc.ents:
        if entity.label_ == "GPE": 
            locations.append(entity.text)
    return locations

# Query for locations
for tweet_obj in results:
    locations = extract_locations(tweet_obj['tweet_location'])
    tweet_obj['location_label'] = locations

In [14]:
pprint(results)

[{'location_label': [],
  'tweet_id': 570289724453216256,
  'tweet_location': 'NYC'},
 {'location_label': [],
  'tweet_id': 570277724385734656,
  'tweet_location': 'west covina'},
 {'location_label': [],
  'tweet_id': 570038941497192448,
  'tweet_location': "i'm creating a monster "},
 {'location_label': ['San Francisco'],
  'tweet_id': 570035876845084672,
  'tweet_location': 'San Francisco, CA'},
 {'location_label': ['Los Angeles'],
  'tweet_id': 570012257549070337,
  'tweet_location': 'Los Angeles'},
 {'location_label': ['Oakland'],
  'tweet_id': 570010571707256832,
  'tweet_location': 'Oakland via Midwest '},
 {'location_label': ['new york', 'new york'],
  'tweet_id': 569996412286582784,
  'tweet_location': 'new york, new york'},
 {'location_label': [],
  'tweet_id': 569996245462159361,
  'tweet_location': 'brooklyn, Ny'},
 {'location_label': ['USA'],
  'tweet_id': 569982307634794497,
  'tweet_location': 'USA'},
 {'location_label': ['New York'],
  'tweet_id': 569972508499283968,
  '

## Process (Twitter Reverse Geocode)

In [4]:
# Query for tweet coords
query = {
    "tweet_coord": {
        "$exists": True,
        "$ne": [0, 0]
    }
}
projection = {
    "_id": 0,
    "tweet_id": 1,
    "tweet_coord": 1
}
results = list(tweets.find(query, projection))

In [5]:
# 15/15min => 60/hr => 1440/day

def request_rgeocode(latitude, longitude):
    """ Makes a request to Twitter's Reverse GeoCode API
    """
    url = "https://api.twitter.com/1.1/geo/reverse_geocode.json"
    params = {
        "lat": latitude,
        "long": longitude,
        "granularity": "city"
    }
    auth = {"Authorization": f"Bearer {twitter_bearer}"}
    res = r.get(url, params = params, headers = auth)
    return res.json()

for tweet_obj in results:
    # Set timeout
    coords = tweet_obj['tweet_coord']
    raw_json = request_rgeocode(coords[0], coords[1])
    
    # If issues encountered
    if 'errors' in raw_json:
        print(f"Error encountered\nResponse:{raw_json}")
    
    # Saving
    try:
        tweet_obj['reverse_geocode'] = raw_json['result']['places'][0]['full_name']
    except Exception as e:
        print(f"Error encountered during query!\nHead:{tweet_obj}\nError:{e.message}")

KeyboardInterrupt: 

In [45]:
results

[{'tweet_id': 569942903683813376,
  'tweet_coord': [36.08457854, -115.13780136],
  'reverse_geocode': 'Paradise, NV'},
 {'tweet_id': 569304436612444160,
  'tweet_coord': [38.9128188, -77.00798226],
  'reverse_geocode': 'Washington, DC'},
 {'tweet_id': 569244860286263297,
  'tweet_coord': [32.8437698, -96.84928399],
  'reverse_geocode': 'Dallas, TX'},
 {'tweet_id': 568815689684815873,
  'tweet_coord': [37.77465018, -122.44032176],
  'reverse_geocode': 'San Francisco, CA'},
 {'tweet_id': 568577243200376833,
  'tweet_coord': [33.94696831, -118.40747994],
  'reverse_geocode': 'Los Angeles, CA'},
 {'tweet_id': 570124596180955136,
  'tweet_coord': [33.94540417, -118.4062472],
  'reverse_geocode': 'Los Angeles, CA'},
 {'tweet_id': 570088404156698625,
  'tweet_coord': [33.94209449, -118.40410103],
  'reverse_geocode': 'Los Angeles, CA'},
 {'tweet_id': 569847920192655361,
  'tweet_coord': [26.074379, -80.1416831],
  'reverse_geocode': 'Dania Beach, FL'},
 {'tweet_id': 569083230361673728,
  'twe