In [104]:
import pandas as pd
import geopandas as gpd
import geopandas.tools
from geopy.geocoders import Nominatim
import geopy
from geopandas.tools import geocode
from geopy.extra.rate_limiter import RateLimiter
from difflib import SequenceMatcher

import numpy as np

In [88]:
missing_stations = ['Crystal City Metro / 18th & Bell St',
 '21st & M St NW',
 'Eastern Market Metro / Pennsylvania Ave & 7th St SE',
 'Connecticut Ave & Newark St NW / Cleveland Park',
 '18th & Eads St.',
 '19th & L St NW',
 '23rd & Crystal Dr',
 'Aurora Hills Community Ctr/18th & Hayes St',
 'S Joyce & Army Navy Dr',
 'Georgia Ave and Fairmont St NW',
 '20th & Crystal Dr',
 'S Glebe & Potomac Ave',
 'USDA / 12th & Independence Ave SW',
 '27th & Crystal Dr',
 'Pentagon City Metro / 12th & S Hayes St',
 '12th & Army Navy Dr',
 '26th & S Clark St',
 '15th & Crystal Dr',
 'Eads & 22nd St S',
 '1st & N St  SE',
 'Lynn & 19th St North',
 'N Rhodes & 16th St N',
 'Rosslyn Metro / Wilson Blvd & Ft Myer Dr',
 'Wilson Blvd & Franklin Rd',
 '11th & H St NE']

In [56]:
missing_stations

['Crystal City Metro / 18th & Bell St',
 '21st & M St NW',
 'Eastern Market Metro / Pennsylvania Ave & 7th St SE',
 'Connecticut Ave & Newark St NW / Cleveland Park',
 '18th & Eads St.',
 '19th & L St NW',
 '23rd & Crystal Dr',
 'Aurora Hills Community Ctr/18th & Hayes St',
 'S Joyce & Army Navy Dr',
 'Georgia Ave and Fairmont St NW',
 '20th & Crystal Dr',
 'S Glebe & Potomac Ave',
 'USDA / 12th & Independence Ave SW',
 '27th & Crystal Dr',
 'Pentagon City Metro / 12th & S Hayes St',
 '12th & Army Navy Dr',
 '26th & S Clark St',
 '15th & Crystal Dr',
 'Eads & 22nd St S',
 '1st & N St  SE',
 'Lynn & 19th St North',
 'N Rhodes & 16th St N',
 'Rosslyn Metro / Wilson Blvd & Ft Myer Dr',
 'Wilson Blvd & Franklin Rd',
 '11th & H St NE']

In [133]:
missing = pd.DataFrame(missing_stations)
missing = missing.rename(columns = {0: 'station'})

In [134]:
missing.head(10)

Unnamed: 0,station
0,Crystal City Metro / 18th & Bell St
1,21st & M St NW
2,Eastern Market Metro / Pennsylvania Ave & 7th ...
3,Connecticut Ave & Newark St NW / Cleveland Park
4,18th & Eads St.
5,19th & L St NW
6,23rd & Crystal Dr
7,Aurora Hills Community Ctr/18th & Hayes St
8,S Joyce & Army Navy Dr
9,Georgia Ave and Fairmont St NW


In [135]:
missing.head()

Unnamed: 0,station
0,Crystal City Metro / 18th & Bell St
1,21st & M St NW
2,Eastern Market Metro / Pennsylvania Ave & 7th ...
3,Connecticut Ave & Newark St NW / Cleveland Park
4,18th & Eads St.


In [136]:
station_loc = pd.read_csv('station_loc.csv')

In [137]:
station_loc.head()

Unnamed: 0.1,Unnamed: 0,name,lat,lon,region_id,region
0,0,Eads St & 15th St S,38.858971,-77.05323,41,"Arlington, VA"
1,1,Crystal Dr & 20th St S,38.856425,-77.049232,41,"Arlington, VA"
2,2,Crystal Dr & 15th St S,38.861056,-77.049417,41,"Arlington, VA"
3,3,Aurora Hills Cmty Ctr / 18th St & S Hayes St,38.857866,-77.05949,41,"Arlington, VA"
4,4,Pentagon City Metro / 12th St & S Hayes St,38.862303,-77.059936,41,"Arlington, VA"


In [138]:
# If the address in "missing" is very close to the address in "station_loc", then use the region name from "station_loc"

for i in range(len(missing)):
    for j in range (len(station_loc)):
        if (SequenceMatcher(None, missing.loc[i,'station'], station_loc.loc[j, 'name'])).ratio()>= 0.9:
            #missing.loc[i,'alt_station'] = station_loc.loc[j,'name']
            missing.loc[i,'region'] = station_loc.loc[j,'region']

            

In [139]:
missing

Unnamed: 0,station,region
0,Crystal City Metro / 18th & Bell St,"Arlington, VA"
1,21st & M St NW,"Washington, DC"
2,Eastern Market Metro / Pennsylvania Ave & 7th ...,"Washington, DC"
3,Connecticut Ave & Newark St NW / Cleveland Park,
4,18th & Eads St.,
5,19th & L St NW,"Washington, DC"
6,23rd & Crystal Dr,
7,Aurora Hills Community Ctr/18th & Hayes St,
8,S Joyce & Army Navy Dr,
9,Georgia Ave and Fairmont St NW,"Washington, DC"


In [140]:
missing['region'] = missing['region'].fillna(0)

In [141]:
# Initialise 'lat' and 'lon' columns
missing['lat'] = np.zeros((len(missing),1))
missing['lon'] = np.zeros((len(missing),1))

In [142]:
geolocator = Nominatim(user_agent="bike_search")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

In [143]:
# Add a space at the beginning of the "region" entries + make sure format of "region" column is "string"
missing['region'] = " " + missing['region'].astype(str)

In [145]:
for i in range(len(missing)):

    try:
        if missing.loc[i, 'region'] == 0:           # If the "region" is missing, then do a bare geocode search
            dummy_lat = geocode(missing.loc[i,'station'], timeout = 15).latitude
            dummy_lon = geocode(missing.loc[i,'station'], timeout = 15).longitude

            if (dummy_lon > -78 and dummy_lon <-76) and (dummy_lat > 38.5 and dummy_lat<39.5):
                missing.loc[i, 'lat'] = dummy_lat
                missing.loc[i, 'lon'] = dummy_lon



        else:                                       # If "region" name is available, add it to the geocode search
            dummy_lat = geocode(missing.loc[i,'station'] + missing.loc[i, 'region'], timeout = 15).latitude
            dummy_lon = geocode(missing.loc[i,'station'] + missing.loc[i, 'region'], timeout = 15).longitude

            if (dummy_lon > -79 and dummy_lon <-76) and (dummy_lat > 38 and dummy_lat<40):
                missing.loc[i, 'lat'] = dummy_lat
                missing.loc[i, 'lon'] = dummy_lon



    except AttributeError:
        pass




In [146]:
missing

Unnamed: 0,station,region,lat,lon
0,Crystal City Metro / 18th & Bell St,"Arlington, VA",0.0,0.0
1,21st & M St NW,"Washington, DC",38.905107,-77.057402
2,Eastern Market Metro / Pennsylvania Ave & 7th ...,"Washington, DC",38.884056,-76.995262
3,Connecticut Ave & Newark St NW / Cleveland Park,0,0.0,0.0
4,18th & Eads St.,0,0.0,0.0
5,19th & L St NW,"Washington, DC",38.903799,-77.053958
6,23rd & Crystal Dr,0,38.853166,-77.050493
7,Aurora Hills Community Ctr/18th & Hayes St,0,0.0,0.0
8,S Joyce & Army Navy Dr,0,0.0,0.0
9,Georgia Ave and Fairmont St NW,"Washington, DC",38.9249,-77.0222


In [149]:
string1 = 'abcde/fghi'
string1.split('/')[0], string1.split('/')[1]

('abcde', 'fghi')

In [150]:
for i in range(len(missing)):

    try:
        if missing.loc[i, 'lat'] == 0 and missing.loc[i, 'region'] != 0:
            dummy_lat = geocode(missing.loc[i,'station'], timeout = 15).latitude
            dummy_lon = geocode(missing.loc[i,'station'], timeout = 15).longitude

            if (dummy_lon > -78 and dummy_lon <-76) and (dummy_lat > 38.5 and dummy_lat<39.5):
                missing.loc[i, 'lat'] = dummy_lat
                missing.loc[i, 'lon'] = dummy_lon
            
    

    except AttributeError:
        pass
            


In [151]:
missing

Unnamed: 0,station,region,lat,lon
0,Crystal City Metro / 18th & Bell St,"Arlington, VA",0.0,0.0
1,21st & M St NW,"Washington, DC",38.905107,-77.057402
2,Eastern Market Metro / Pennsylvania Ave & 7th ...,"Washington, DC",38.884056,-76.995262
3,Connecticut Ave & Newark St NW / Cleveland Park,0,38.934267,-77.057979
4,18th & Eads St.,0,0.0,0.0
5,19th & L St NW,"Washington, DC",38.903799,-77.053958
6,23rd & Crystal Dr,0,38.853166,-77.050493
7,Aurora Hills Community Ctr/18th & Hayes St,0,38.857874,-77.059492
8,S Joyce & Army Navy Dr,0,38.86571,-77.061773
9,Georgia Ave and Fairmont St NW,"Washington, DC",38.9249,-77.0222


In [152]:
for i in range(len(missing)):

    try:
        if "/" in missing.loc[i, 'station']:
            dummy_string = missing.loc[i, 'station'].split('/')[0]          # Check part that comes before separator '/'
            
            dummy_lat = geocode(dummy_string, timeout = 15).latitude
            dummy_lon = geocode(dummy_string, timeout = 15).longitude

            if (dummy_lon > -78 and dummy_lon <-76) and (dummy_lat > 38.5 and dummy_lat<39.5):
                missing.loc[i, 'lat'] = dummy_lat
                missing.loc[i, 'lon'] = dummy_lon
    

    except AttributeError:
        pass
            
            
            

In [153]:
missing

Unnamed: 0,station,region,lat,lon
0,Crystal City Metro / 18th & Bell St,"Arlington, VA",38.857756,-77.051196
1,21st & M St NW,"Washington, DC",38.905107,-77.057402
2,Eastern Market Metro / Pennsylvania Ave & 7th ...,"Washington, DC",38.884056,-76.995262
3,Connecticut Ave & Newark St NW / Cleveland Park,0,38.934267,-77.057979
4,18th & Eads St.,0,0.0,0.0
5,19th & L St NW,"Washington, DC",38.903799,-77.053958
6,23rd & Crystal Dr,0,38.853166,-77.050493
7,Aurora Hills Community Ctr/18th & Hayes St,0,38.857792,-77.059103
8,S Joyce & Army Navy Dr,0,38.86571,-77.061773
9,Georgia Ave and Fairmont St NW,"Washington, DC",38.9249,-77.0222


In [154]:
missing_2011 = missing.copy(deep = True)

In [156]:
missing_2011.to_csv('cleaned_data/missing_2011.csv', index = False)

In [62]:
geo = geocode(missing['station'], provider='nominatim', user_agent = "bike_search", timeout = 15)

TypeError: geocode() got an unexpected keyword argument 'provider'

In [23]:
geo

Unnamed: 0,geometry,address
0,GEOMETRYCOLLECTION EMPTY,
1,POINT (-77.05740 38.90511),"M St NW, Peter Square, Georgetown, Washington,..."
2,POINT (-76.99526 38.88406),Eastern Market Metro/Pennsylvania Ave & 7th St...
3,POINT (-77.05798 38.93427),Connecticut Ave and Newark St NW / Cleveland P...
4,POINT (-90.05047 30.01950),"Eads Street, Lake Oaks, New Orleans, Orleans P..."
5,POINT (-77.05396 38.90380),"L St NW, West End, Washington, District of Col..."
6,POINT (-77.04969 38.85303),"23rd and Crystal Dr, South Crystal Drive, Nati..."
7,POINT (-77.05949 38.85787),"Aurora Hills Community Ctr/18th and Hayes St, ..."
8,POINT (-77.06177 38.86571),"Army Navy Dr at S Joyce St, Army Navy Drive, P..."
9,POINT (-77.02220 38.92490),"Georgia Ave and Fairmont St NW, 2400, 6th Stre..."
