In [1]:
# Dependencies
import requests
import json
import pandas as pd
import time
import io
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [2]:
# Sample Geo locator code commented out

# geolocator = Nominatim(user_agent="geoapiExercises")

# latitude = "32.322"
# longitude = "65.1190"
 
# location = geolocator.reverse(latitude+","+longitude)
 
# Display
# print(location)

In [3]:
# Source for the CSV Global Power Plant Database v1.3.0
# https://datasets.wri.org/dataset/globalpowerplantdatabase

# The path to our power plant CSV file
file = "Resources/global_ppdb.csv"

# Read our power plant data into pandas
power_df = pd.read_csv(file, low_memory=False)

In [4]:
power_us_df = power_df.loc[power_df['country_long'] == 'United States of America'].reset_index(drop=True)
power_us_df

Unnamed: 0,country,country_long,name,gppd_idnr,capacity_mw,latitude,longitude,primary_fuel,other_fuel1,other_fuel2,...,estimated_generation_gwh_2013,estimated_generation_gwh_2014,estimated_generation_gwh_2015,estimated_generation_gwh_2016,estimated_generation_gwh_2017,estimated_generation_note_2013,estimated_generation_note_2014,estimated_generation_note_2015,estimated_generation_note_2016,estimated_generation_note_2017
0,USA,United States of America,100 Brook Hill Drive Solar,USA0063292,2.0,41.0930,-73.9828,Solar,,,...,,,,,3.25,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1
1,USA,United States of America,1025 Traveller Solar LLC,USA0062660,5.0,35.4273,-79.1263,Solar,,,...,,,,,8.14,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1
2,USA,United States of America,1047 Little Mountain Solar LLC,USA0062661,3.0,36.1971,-80.8067,Solar,,,...,,,,,4.88,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1
3,USA,United States of America,12 Applegate Solar LLC,USA0059371,1.9,40.2003,-74.5761,Solar,,,...,2.92,2.94,3.06,2.85,2.61,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1
4,USA,United States of America,126 Grove Solar LLC,USA0060858,2.0,42.0761,-71.4227,Solar,,,...,3.01,2.98,3.11,2.98,2.64,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9828,USA,United States of America,Zion Landfill Gas to Energy Facility,USA0056871,7.0,42.4803,-87.8861,Waste,,,...,,,,,1.26,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1
9829,USA,United States of America,Zorn,USA0001368,18.0,38.2803,-85.7023,Gas,,,...,,,,,63.47,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1
9830,USA,United States of America,Zotos International WPGF,USA0057648,3.4,42.8869,-76.9683,Wind,,,...,8.31,8.46,6.57,6.52,6.81,WIND-V1,WIND-V1,WIND-V1,WIND-V1,WIND-V1
9831,USA,United States of America,Zumbro Community Solar Garden,USA0061574,1.0,44.3195,-92.6703,Solar,,,...,,,,,1.62,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1


In [5]:
# Checking for any efficiencies gained from repeated values - none
power_us_df[['latitude', 'longitude']].nunique()

latitude     9271
longitude    9519
dtype: int64

In [6]:
# Separted calls to Nominatimin in batches

# power_us_batch_df = power_us_df.iloc[:2000, :]    
# power_us_batch_df = power_us_df.iloc[2000:4000, :]   
# power_us_batch_df = power_us_df.iloc[4000:6000, :]   
# power_us_batch_df = power_us_df.iloc[6000:8000, :]  
power_us_batch_df = power_us_df.iloc[8000:, :]
power_us_batch_df

Unnamed: 0,country,country_long,name,gppd_idnr,capacity_mw,latitude,longitude,primary_fuel,other_fuel1,other_fuel2,...,estimated_generation_gwh_2013,estimated_generation_gwh_2014,estimated_generation_gwh_2015,estimated_generation_gwh_2016,estimated_generation_gwh_2017,estimated_generation_note_2013,estimated_generation_note_2014,estimated_generation_note_2015,estimated_generation_note_2016,estimated_generation_note_2017
8000,USA,United States of America,Solar Star California II LLC,USA0057933,2.0,38.5164,-121.4753,Solar,,,...,4.16,3.58,4.20,3.70,3.40,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1
8001,USA,United States of America,Solar Star New Jersey NJ LLC,USA0058402,1.5,40.4015,-74.6697,Solar,,,...,2.37,2.27,2.49,2.32,2.05,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1
8002,USA,United States of America,Solar Star North Carolina II LLC,USA0057435,6.5,36.4250,-77.0640,Solar,,,...,10.73,10.67,11.40,10.82,10.87,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1
8003,USA,United States of America,Solar Star Oregon II,USA0061048,56.3,44.1843,-120.9180,Solar,,,...,,,,,91.74,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1
8004,USA,United States of America,Solar Star Palo Alto I LLC,USA0062732,1.3,37.4110,-122.1489,Solar,,,...,,,,,2.11,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9828,USA,United States of America,Zion Landfill Gas to Energy Facility,USA0056871,7.0,42.4803,-87.8861,Waste,,,...,,,,,1.26,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1
9829,USA,United States of America,Zorn,USA0001368,18.0,38.2803,-85.7023,Gas,,,...,,,,,63.47,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1
9830,USA,United States of America,Zotos International WPGF,USA0057648,3.4,42.8869,-76.9683,Wind,,,...,8.31,8.46,6.57,6.52,6.81,WIND-V1,WIND-V1,WIND-V1,WIND-V1,WIND-V1
9831,USA,United States of America,Zumbro Community Solar Garden,USA0061574,1.0,44.3195,-92.6703,Solar,,,...,,,,,1.62,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1


In [7]:
# The function city_state_county(row) takes a row of the DataFrame as input and extracts
# the coordinates of the location from the latitude and longitude columns. 
# The geolocator.reverse() method is used to reverse geocode the coordinates and obtain
# the location information as a JSON-like dictionary in the address variable.
# The function then extracts the city, state, town, county, and zip code information
# from the address dictionary using the get() method and assigns them to the respective columns
# of the DataFrame row. The function also includes a time.sleep() call with an argument of 1
# to pause the execution for 1 second between each row to avoid overloading the geocoding service with too many requests.

# Finally, the city_state_county() function is applied to each row of the DataFrame using
# the apply() method with the axis=1 parameter to apply the function row-wise. 
# The function returns the modified row, which is assigned back to the original DataFrame.

# Get county, state and zip code for all the power stations in the US
geolocator = Nominatim(user_agent="world_power")
# geocode = RateLimiter(geolocator.geocode, min_delay_seconds=10) # not working

def city_state_county(row):
    coord = f"{row['latitude']}, {row['longitude']}"
    location = geolocator.reverse(coord, exactly_one=True)
    address = location.raw['address']
    city = address.get('city', '')
    state = address.get('state', '')
    town = address.get('town', '')
    county = address.get('county', '')
    zipcode = address.get('postcode', '')
    row['city'] = city
    row['town'] = town
    row['state'] = state
    row['county'] = county
    row['postcode'] = zipcode
    time.sleep(1)
    return row


power_us_b4_df = power_us_batch_df.apply(city_state_county, axis=1 )


In [8]:
power_us_b4_df

Unnamed: 0,country,country_long,name,gppd_idnr,capacity_mw,latitude,longitude,primary_fuel,other_fuel1,other_fuel2,...,estimated_generation_note_2013,estimated_generation_note_2014,estimated_generation_note_2015,estimated_generation_note_2016,estimated_generation_note_2017,city,town,state,county,postcode
8000,USA,United States of America,Solar Star California II LLC,USA0057933,2.0,38.5164,-121.4753,Solar,,,...,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1,Sacramento,,California,Sacramento County,95820
8001,USA,United States of America,Solar Star New Jersey NJ LLC,USA0058402,1.5,40.4015,-74.6697,Solar,,,...,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1,,Montgomery Township,New Jersey,Somerset County,08558
8002,USA,United States of America,Solar Star North Carolina II LLC,USA0057435,6.5,36.4250,-77.0640,Solar,,,...,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1,SOLAR-V1,,,North Carolina,Hertford County,27855
8003,USA,United States of America,Solar Star Oregon II,USA0061048,56.3,44.1843,-120.9180,Solar,,,...,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1,,,Oregon,Crook County,
8004,USA,United States of America,Solar Star Palo Alto I LLC,USA0062732,1.3,37.4110,-122.1489,Solar,,,...,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1,Palo Alto,,California,Santa Clara County,94306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9828,USA,United States of America,Zion Landfill Gas to Energy Facility,USA0056871,7.0,42.4803,-87.8861,Waste,,,...,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1,,Zion,Illinois,Lake County,60096
9829,USA,United States of America,Zorn,USA0001368,18.0,38.2803,-85.7023,Gas,,,...,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1,Louisville,,Kentucky,Jefferson County,40207
9830,USA,United States of America,Zotos International WPGF,USA0057648,3.4,42.8869,-76.9683,Wind,,,...,WIND-V1,WIND-V1,WIND-V1,WIND-V1,WIND-V1,,City of Geneva,New York,Ontario County,14456
9831,USA,United States of America,Zumbro Community Solar Garden,USA0061574,1.0,44.3195,-92.6703,Solar,,,...,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,NO-ESTIMATION,CAPACITY-FACTOR-V1,,,Minnesota,Goodhue County,55992


In [10]:
# The path to our CSV file
file = "Output/power_us_comp_df.csv"

# Read our power plant data into pandas that we save after every succesful batch pull to concat with current batch pull
rec_final_df = pd.read_csv(file)
rec_final_df

Unnamed: 0.1,Unnamed: 0,primary_fuel,state,county,postcode
0,0,Solar,New York,Rockland County,10994.0
1,1,Solar,North Carolina,Lee County,27332.0
2,2,Solar,North Carolina,Yadkin County,28642.0
3,3,Solar,New Jersey,Mercer County,8691.0
4,4,Solar,Massachusetts,Norfolk County,2038.0
...,...,...,...,...,...
7995,7995,Solar,California,San Bernardino County,92403.0
7996,7996,Solar,California,Los Angeles County,
7997,7997,Solar,California,Kern County,93560.0
7998,7998,Solar,California,CAL Fire Kern County,


In [11]:
# Concat all batch DFs together
comp_power_df = [rec_final_df, power_us_b4_df]
final_power_df = pd.concat(comp_power_df)

final_power_df

Unnamed: 0.1,Unnamed: 0,primary_fuel,state,county,postcode
0,0.0,Solar,New York,Rockland County,10994
1,1.0,Solar,North Carolina,Lee County,27332
2,2.0,Solar,North Carolina,Yadkin County,28642
3,3.0,Solar,New Jersey,Mercer County,8691
4,4.0,Solar,Massachusetts,Norfolk County,2038
...,...,...,...,...,...
9828,,Waste,Illinois,Lake County,60096
9829,,Gas,Kentucky,Jefferson County,40207
9830,,Wind,New York,Ontario County,14456
9831,,Solar,Minnesota,Goodhue County,55992


In [12]:
# Remove unwanted columns
final_power_df = final_power_df[['primary_fuel', 'state', 'county', 'postcode']]
final_power_df

Unnamed: 0,primary_fuel,state,county,postcode
0,Solar,New York,Rockland County,10994
1,Solar,North Carolina,Lee County,27332
2,Solar,North Carolina,Yadkin County,28642
3,Solar,New Jersey,Mercer County,8691
4,Solar,Massachusetts,Norfolk County,2038
...,...,...,...,...
9828,Waste,Illinois,Lake County,60096
9829,Gas,Kentucky,Jefferson County,40207
9830,Wind,New York,Ontario County,14456
9831,Solar,Minnesota,Goodhue County,55992


In [13]:
# Save DF to CSV file
final_power_df.to_csv("Output/power_us_comp_df.csv")