In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [33]:
#load the data
violations_df = pd.read_csv('../health_inspect_cleaned.csv', index_col=0) # file is in GitHub 
violations_df["address2"] = violations_df["address"].map(str) + ", NY"

In [None]:
#This script converts the adress column of the health_inspect.csv file into
#X Coordinate (State Plane); Geo validated, X coordinate of the incident location
#Y Coordinate (State Plane); Geo validated, Y coordinate of the incident location.

from geopy.geocoders import Nominatim
geolocator = Nominatim() 

#run the first part
from tqdm import tqdm 
import time 

# Create two lists for the loop results to be placed
lat = []
lon = []

# For each row in a varible,
for row in tqdm(violations_df['address2']): 
    time.sleep(1) 
    # Try to,
    try: 
        location = geolocator.geocode(row)
        lat.append(location.latitude)
        lon.append(location.longitude)        
    # But if you get an error
    except:
        # append a missing value to lat
        lat.append(np.NaN)
        # append a missing value to lon
        lon.append(np.NaN)

# Create two new columns from lat and lon
violations_df['latitude'] = lat
violations_df['longitude'] = lon 

#save the data
violations_df.to_csv('violations_converted.csv')

In [3]:
#install previously converted data; note that Nominatim failed to convert almost half of the  
#entries, below we are rerunning those failed ones in sections (because it takes long term  
#to run each chunk)
violations_converted = pd.read_csv('violations_converted.csv', index_col=0) # file is in GitHub 
failed_entries=pd.isnull(violations_converted['latitude'])  
fe=violations_converted[failed_entries]

In [4]:
#divide the failed entries into chunks, around 11000 rows weren't converted
#first chunk
fe_p1=fe[:5000]  
#second chunk
fe_p2=fe[5000:]

In [5]:
#This script converts the adress column of the health_inspect.csv file into
#X Coordinate (State Plane); Geo validated, X coordinate of the incident location
#Y Coordinate (State Plane); Geo validated, Y coordinate of the incident location.

from geopy.geocoders import Nominatim
geolocator = Nominatim() 

#run the first part
from tqdm import tqdm 
import time 

# Create two lists for the loop results to be placed
lat = []
lon = []

# For each row in a varible,
for row in tqdm(fe_p1['address2']): 
    time.sleep(1) 
    # Try to,
    try: 
        location = geolocator.geocode(row)
        lat.append(location.latitude)
        lon.append(location.longitude)        
    # But if you get an error
    except:
        # append a missing value to lat
        lat.append(np.NaN)
        # append a missing value to lon
        lon.append(np.NaN)

# Create two new columns from lat and lon
fe_p1['latitude'] = lat
fe_p1['longitude'] = lon


100%|██████████| 5000/5000 [2:08:12<00:00,  1.50s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
#number of nans
sum(pd.isnull(fe_p1['latitude']))

#save 
fe_p1.to_csv('violations_converted_fe_p1.csv')

In [13]:
#This script converts the adress column of the health_inspect.csv file into
#X Coordinate (State Plane); Geo validated, X coordinate of the incident location
#Y Coordinate (State Plane); Geo validated, Y coordinate of the incident location.

from geopy.geocoders import Nominatim
geolocator = Nominatim() 

#run the first part
from tqdm import tqdm 
import time 

# Create two lists for the loop results to be placed
lat = []
lon = []

# For each row in a varible,
for row in tqdm(fe_p2['address2']): 
    time.sleep(1) 
    # Try to,
    try: 
        location = geolocator.geocode(row)
        lat.append(location.latitude)
        lon.append(location.longitude)        
    # But if you get an error
    except:
        # append a missing value to lat
        lat.append(np.NaN)
        # append a missing value to lon
        lon.append(np.NaN)

# Create two new columns from lat and lon
fe_p2['latitude'] = lat
fe_p2['longitude'] = lon

100%|██████████| 6261/6261 [2:45:31<00:00,  1.53s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [15]:
#number of nans
sum(pd.isnull(fe_p2['latitude']))

#seperate the nans and converted ones 
fe_p2.to_csv('violations_converted_fe_p2.csv')

### Prepare the failed data for google api

In [2]:
#combine failed and converted entries here   
#reload the data 

first_run = pd.read_csv('violations_converted.csv', index_col=0) # file is in GitHub 
fe_p1 = pd.read_csv('violations_converted_fe_p1.csv', index_col=0) # file is in GitHub 
fe_p2 = pd.read_csv('violations_converted_fe_p2.csv', index_col=0) # file is in GitHub 

#combine successfully converted entries and nans
failed_entries=pd.isnull(first_run['latitude']) 
failed_entries2=pd.isnull(fe_p1['latitude'])  
failed_entries3=pd.isnull(fe_p2['latitude'])  

c1=first_run[~failed_entries]#main converted 
c2=fe_p1[~failed_entries2]#converted from fe_p1 
c3=fe_p2[~failed_entries3]#converted from fe_p2  

frames = [c1, c2, c3]
converted_nominatim = pd.concat(frames)
 
#failed entries 
f1=fe_p1[failed_entries2]#failed from fe_p1 
f2=fe_p2[failed_entries3]#failed from fe_p2  

frames2 = [f1, f2]
for_google_api = pd.concat(frames2)

In [3]:
#divide the failed entries into chunks of 2500
#first chunk
g1=for_google_api[:2500]  
#second chunk
g2=for_google_api[2500:]  

### Use Google API for entries which returned Null

In [19]:
from geopy.geocoders import GoogleV3
       
geolocator = GoogleV3() 

from tqdm import tqdm 
import time 

# Create two lists for the loop results to be placed
lat = []
lon = []

# For each row in a varible,
for row in tqdm(g1['address2']): 
    # Try to,
    try: 
        location = geolocator.geocode(row)
        lat.append(location.latitude)
        lon.append(location.longitude)        
    # But if you get an error
    except:
        # append a missing value to lat
        lat.append(np.NaN)
        # append a missing value to lon
        lon.append(np.NaN)

# Create two new columns from lat and lon
g1['latitude'] = lat
g1['longitude'] = lon


100%|██████████| 2500/2500 [38:52<00:00,  1.51it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [21]:
#number of nans
sum(pd.isnull(g1['latitude']))

#seperate the nans and converted ones 
g1.to_csv('g1.csv')

In [4]:
from geopy.geocoders import GoogleV3
       
geolocator = GoogleV3() 

from tqdm import tqdm 
import time 

# Create two lists for the loop results to be placed
lat = []
lon = []

# For each row in a varible,
for row in tqdm(g2['address2']): 
    # Try to,
    try: 
        location = geolocator.geocode(row)
        lat.append(location.latitude)
        lon.append(location.longitude)        
    # But if you get an error
    except:
        # append a missing value to lat
        lat.append(np.NaN)
        # append a missing value to lon
        lon.append(np.NaN)

# Create two new columns from lat and lon
g2['latitude'] = lat
g2['longitude'] = lon


100%|██████████| 1803/1803 [19:34<00:00,  1.58it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
#number of nans
sum(pd.isnull(g2['latitude']))

#len(g2)
#seperate the nans and converted ones 
g2.to_csv('g2.csv')


In [14]:
#delete this later 
g1 = pd.read_csv('g1.csv', index_col=0) # file is in GitHub 
g2 = pd.read_csv('g2.csv', index_col=0) # file is in GitHub 

In [29]:
gcomb = pd.concat([g1,g2]) 
g_fail=gcomb[pd.isnull(gcomb['latitude']) ]#failed from fe_p1 
g_pass=gcomb[~pd.isnull(gcomb['latitude']) ]#failed from fe_p1 

sum(pd.isnull(g_fail['latitude']))

339

In [18]:
from geopy.geocoders import GoogleV3
       
geolocator = GoogleV3() 

from tqdm import tqdm 
import time 

# Create two lists for the loop results to be placed
lat = []
lon = []

# For each row in a varible,
for row in tqdm(g_fail['address2']): 
    # Try to,
    try: 
        location = geolocator.geocode(row)
        lat.append(location.latitude)
        lon.append(location.longitude)        
    # But if you get an error
    except:
        # append a missing value to lat
        lat.append(np.NaN)
        # append a missing value to lon
        lon.append(np.NaN)

# Create two new columns from lat and lon
g_fail['latitude'] = lat
g_fail['longitude'] = lon


100%|██████████| 339/339 [04:04<00:00,  1.39it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [27]:
g_fail2=g_fail[pd.isnull(g_fail['latitude']) ]#failed from fe_p1  
g_pass2=g_fail[~pd.isnull(g_fail['latitude']) ]#failed from fe_p1 

#sum(pd.isnull(g_fail['latitude'])) 


In [24]:
from geopy.geocoders import GoogleV3
       
geolocator = GoogleV3() 

from tqdm import tqdm 
import time 

# Create two lists for the loop results to be placed
lat = []
lon = []

# For each row in a varible,
for row in tqdm(g_fail2['address2']): 
    # Try to,
    try: 
        location = geolocator.geocode(row)
        lat.append(location.latitude)
        lon.append(location.longitude)        
    # But if you get an error
    except:
        # append a missing value to lat
        lat.append(np.NaN)
        # append a missing value to lon
        lon.append(np.NaN)

# Create two new columns from lat and lon
g_fail2['latitude'] = lat
g_fail2['longitude'] = lon

100%|██████████| 33/33 [00:12<00:00,  1.93it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [32]:
#combine datasets 
result = pd.concat([converted_nominatim, g_pass,g_fail2, g_pass2]) 
len(result)
sum(pd.isnull(result['latitude'])) 


33

In [33]:
#save the data 
result.to_csv('health_inspect_cleaned_loc_added.csv')