## Fetch Weather Data: API Exploration
### *This notebook fetches weather data from two sources: NOAA and Open Weather*

#### NOAA API
The documentation for the web API for NOAA climate data can be found [here](https://www.ncdc.noaa.gov/cdo-web/webservices/v2).

In [None]:
import json
import os
import warnings

import pandas as pd
import requests
from pyproj import Proj
from shapely.geometry import shape
# The NOAA CDO token is a credential, so it is read from the NOAA_CDO_TOKEN
# environment variable instead of being committed with the notebook.
# NOTE(review): the token that used to be hardcoded here should be revoked.
headers = {"token": os.environ.get("NOAA_CDO_TOKEN", "")}
if not headers["token"]:
    warnings.warn("NOAA_CDO_TOKEN is not set; requests to the NOAA API will be sent with an empty token.")

#### All Available Datasets

In [None]:
# List every dataset exposed by the NOAA CDO API.
url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/datasets"
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url=url, headers=headers, timeout=30)
# Fail here on an HTTP error instead of later, when the payload is indexed.
response.raise_for_status()
noaa_all_datasets_json = response.json()
print(noaa_all_datasets_json["metadata"])
noaa_all_datasets_df = pd.DataFrame(noaa_all_datasets_json['results'])
noaa_all_datasets_df

#### Daily Summaries Dataset

* For our purposes, we will be working with daily summaries data.
* Fetch all information about the GHCND, Daily Summaries dataset specifically.

In [None]:
# Request the metadata of the GHCND dataset only.
url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/datasets/GHCND"
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url=url, headers=headers, timeout=30)
# Fail here on an HTTP error instead of storing an error payload.
response.raise_for_status()
noaa_daily_summaries_json = response.json()

In [None]:
# Show the parsed GHCND response fetched in the previous cell.
print(noaa_daily_summaries_json)

#### Datatype Filter

In [None]:
# Query the datasets endpoint filtered by the TOBS datatype id.
url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/datasets?datatypeid=TOBS"
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url=url, headers=headers, timeout=30)
# Fail here on an HTTP error instead of storing an error payload.
response.raise_for_status()
noaa_tobs_json = response.json()

In [None]:
# Display the parsed response of the TOBS-filtered datasets query.
noaa_tobs_json

#### Set of Stations

In [None]:
# Query the datasets endpoint for three specific COOP stations.
url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/datasets?stationid=COOP:310090&stationid=COOP:310184&stationid=COOP:310212"
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
# The payload is printed as-is because inspecting it is the point of this cell.
response = requests.get(url=url, headers=headers, timeout=30)
noaa_stations_json = response.json()
print(noaa_stations_json)

Note: The request above currently returns no results because no data is available for the listed stations.

#### Data Categories

In [None]:
# List the available data categories (up to 100 per the limit parameter).
url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/datacategories?limit=100"
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url=url, headers=headers, timeout=30)
# Fail here on an HTTP error instead of later, when the payload is indexed.
response.raise_for_status()
noaa_data_categories_json = response.json()
print(noaa_data_categories_json["metadata"])
noaa_data_categories_df = pd.DataFrame(noaa_data_categories_json['results'])
print(noaa_data_categories_df.shape)
noaa_data_categories_df

#### WIND Datacategory

* We will be taking the WIND data category for our analysis in the preliminary phase.
* Fetch all information about the WIND data category specifically.

In [None]:
# Request the metadata of the WIND data category only.
url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/datacategories/WIND"
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url=url, headers=headers, timeout=30)
# Fail here on an HTTP error instead of storing an error payload.
response.raise_for_status()
noaa_wind_json = response.json()

In [None]:
# Show the parsed WIND data category response fetched in the previous cell.
print(noaa_wind_json)

#### Datatypes

In [None]:
# List datatypes; no limit parameter is sent, so the API's default page size applies.
url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/datatypes"
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url=url, headers=headers, timeout=30)
# Fail here on an HTTP error instead of later, when the payload is indexed.
response.raise_for_status()
noaa_data_types_json = response.json()
print(noaa_data_types_json["metadata"])
noaa_data_types_df = pd.DataFrame(noaa_data_types_json['results'])
print(noaa_data_types_df.shape)
noaa_data_types_df

#### Datatype in Wind Category

In [None]:
# List the datatypes that belong to the WIND data category (up to 56 per the limit parameter).
url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/datatypes?datacategoryid=WIND&limit=56"
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url=url, headers=headers, timeout=30)
# Fail here on an HTTP error instead of later, when the payload is indexed.
response.raise_for_status()
noaa_wind_data_types_json = response.json()
print(noaa_wind_data_types_json["metadata"])
noaa_wind_data_types_df = pd.DataFrame(noaa_wind_data_types_json['results'])
print(noaa_wind_data_types_df.shape)
noaa_wind_data_types_df

#### Locations

In [None]:
# List the available location categories.
url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/locationcategories"
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url=url, headers=headers, timeout=30)
# Fail here on an HTTP error instead of later, when the payload is indexed.
response.raise_for_status()
noaa_location_categories_json = response.json()
print(noaa_location_categories_json["metadata"])
noaa_location_categories_df = pd.DataFrame(noaa_location_categories_json['results'])
print(noaa_location_categories_df.shape)
noaa_location_categories_df

#### Location Category: County Level Information

In [None]:
# Request the metadata of the CNTY location category only.
url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/locationcategories/CNTY"
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url=url, headers=headers, timeout=30)
# Fail here on an HTTP error instead of storing an error payload.
response.raise_for_status()
noaa_cnty_json = response.json()

In [None]:
# Show the parsed CNTY location category response fetched in the previous cell.
print(noaa_cnty_json)

#### Available Locations for Daily Summaries Data

In [None]:
# List the locations that have data in the GHCND dataset.
url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/locations?datasetid=GHCND"
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url=url, headers=headers, timeout=30)
# Fail here on an HTTP error instead of later, when the payload is indexed.
response.raise_for_status()
noaa_daily_summaries_locations_json = response.json()
print(noaa_daily_summaries_locations_json["metadata"])
noaa_daily_summaries_locations_df = pd.DataFrame(noaa_daily_summaries_locations_json['results'])
print(noaa_daily_summaries_locations_df.shape)
noaa_daily_summaries_locations_df

#### Available Stations

In [None]:
# Stations with GHCND wind data since 2000-01-01 inside a fixed bounding box.
# The extent values are sent in the order lat_lo, long_lo, lat_hi, long_hi.
# NOTE(review): the variable names below are the same ones used by the
# locations query earlier in the notebook, so this cell overwrites those
# results with station data. Consider renaming if both are needed.
url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/stations?datasetid=GHCND&datacategoryid=WIND&limit=1000&startdate=2000-01-01&extent=44.3506,-122.3201,46.492,-122.1787"
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url=url, headers=headers, timeout=30)
# Fail here on an HTTP error instead of later, when the payload is indexed.
response.raise_for_status()
noaa_daily_summaries_locations_json = response.json()
print(noaa_daily_summaries_locations_json["metadata"])
noaa_daily_summaries_locations_df = pd.DataFrame(noaa_daily_summaries_locations_json['results'])
print(noaa_daily_summaries_locations_df.shape)
noaa_daily_summaries_locations_df

#### Getting Stations for General Electric Project

In [None]:
import numpy as np
import time

In [None]:
def get_weather_stations(lat_center, long_center, square_diagonal, top_n = 5, max_queries = 50):
    """Find NOAA GHCND wind stations around a point, widening the search until enough are found.

    A square bounding box centred on (lat_center, long_center) is sent to the
    NOAA ``stations`` endpoint (GHCND dataset, WIND data category, start date
    2000-01-01). While fewer than ``top_n`` stations come back, the diagonal is
    multiplied by 1.2 and the query is repeated.

    Parameters
    ----------
    lat_center, long_center : float
        Centre of the search box, in decimal degrees.
    square_diagonal : float
        Initial diagonal of the search box, in degrees.
    top_n : int, default 5
        Minimum number of stations required before the search stops.
    max_queries : int, default 50
        Upper bound on the number of API requests. Without it the search would
        never terminate when the API keeps answering with an empty or error
        payload.

    Returns
    -------
    pandas.DataFrame
        The stations returned by the last query, plus the columns
        ``square_diagonal`` (final diagonal), ``query_count`` (number of
        requests made) and ``final_area`` (area of the final box after
        projection).

    Raises
    ------
    RuntimeError
        If ``max_queries`` requests did not yield at least ``top_n`` stations.
    """
    base_url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/stations?datasetid=GHCND&datacategoryid=WIND&limit=1000&startdate=2000-01-01&extent="
    for count in range(1, max_queries + 1):
        # Half the side length of the square whose diagonal is square_diagonal.
        epsilon = round(square_diagonal/np.sqrt(2)/2, 4)
        # The extent is sent in the order lat_lo, long_lo, lat_hi, long_hi.
        lat_lo = round(lat_center - epsilon, 4)
        long_lo = round(long_center - epsilon, 4)
        lat_hi = round(lat_center + epsilon, 4)
        long_hi = round(long_center + epsilon, 4)
        url = base_url + ",".join(str(bound) for bound in (lat_lo, long_lo, lat_hi, long_hi))
        # The timeout keeps a stalled connection from blocking the search forever.
        response = requests.get(url=url, headers=headers, timeout=30)
        all_stations = response.json()
        # Pause after every request (presumably to respect the API rate limit).
        time.sleep(0.25)
        # A payload without usable 'results' is treated as "no stations found".
        try:
            all_stations_results = pd.DataFrame(all_stations['results'])
        except (KeyError, TypeError, ValueError):
            all_stations_results = pd.DataFrame()
        try:
            all_stations_meta = all_stations['metadata']
        except (KeyError, TypeError):
            all_stations_meta = None
        if all_stations_results.shape[0] >= top_n:
            break
        # Not enough stations yet: widen the box for the next query.
        square_diagonal = square_diagonal*1.2
    else:
        raise RuntimeError(
            "Found fewer than {} stations around ({}, {}) after {} queries".format(
                top_n, lat_center, long_center, max_queries
            )
        )

    print('Summary of Request: ')
    print(all_stations_meta)
    print('- . - . - . -')

    # Area of the final search box: project its four corners with an Albers
    # equal-area projection and measure the resulting polygon.
    # NOTE(review): the projection is centred on lon_0=-106.55 / lat_0=39.0;
    # units are presumably square metres - confirm.
    corners = [
        (long_center + epsilon, lat_center + epsilon),
        (long_center + epsilon, lat_center - epsilon),
        (long_center - epsilon, lat_center - epsilon),
        (long_center - epsilon, lat_center + epsilon),
    ]
    lon, lat = zip(*corners)
    pa = Proj("+proj=aea +lat_1=37.0 +lat_2=41.0 +lat_0=39.0 +lon_0=-106.55")
    projected_x, projected_y = pa(lon, lat)
    # Materialise the ring: a bare zip object is a one-shot iterator, not a sequence.
    cop = {"type": "Polygon", "coordinates": [list(zip(projected_x, projected_y))]}
    final_area = shape(cop).area

    all_stations_results['square_diagonal'] = square_diagonal
    all_stations_results['query_count'] = count
    all_stations_results['final_area'] = final_area
    return all_stations_results

In [None]:
#get_weather_stations(lat_center, long_center, square_diagonal = 0.2, top_n = 5)

#### Query Stations for GE Projects

* This block computes the center latitude and longitude of each General Electric project as the mean of its turbine coordinates in the USGS wind turbine database CSV.

In [None]:
# Load the turbine table and keep only the rows manufactured by "GE Wind".
usgs_data = pd.read_csv('./uswtdbCSV/uswtdb_v1_1_20180710.csv')
ge_turbine_mask = usgs_data["t_manu"] == "GE Wind"
usgs_ge_data = usgs_data[ge_turbine_mask]

# Mean xlong / ylat per project; the pivot has one column per project, so it
# is transposed to get one row per project.
project_means = pd.pivot_table(
    usgs_ge_data,
    values=["xlong", "ylat"],
    columns="p_name",
    aggfunc="mean",
)
ge_projects_df = project_means.transpose().reset_index()
ge_projects_df.columns = ['p_name', 'center_long', 'center_lat']
ge_projects_df.head()

* All GE Projects: Range of the turbine locations

In [None]:
# Load the turbine table and keep only the rows manufactured by "GE Wind".
usgs_data = pd.read_csv('./uswtdbCSV/uswtdb_v1_1_20180710.csv')
ge_turbine_mask = usgs_data["t_manu"] == "GE Wind"
usgs_ge_data = usgs_data[ge_turbine_mask]

# Drop two specific turbine records.
# NOTE(review): presumably outliers with bad coordinates - confirm why these case_ids are excluded.
excluded_case_ids = [3064510, 3064731]
usgs_ge_data = usgs_ge_data[~usgs_ge_data["case_id"].isin(excluded_case_ids)]

# Per-project min / max / mean of longitude and latitude, flattened to one
# level of column names.
coordinate_stats = usgs_ge_data.groupby("p_name")[["xlong", "ylat"]].agg(["min", "max", "mean"])
ge_by_project_df = coordinate_stats.reset_index()
ge_by_project_df.columns = ["p_name", "long_min", "long_max", "long_mean", "lat_min", "lat_max", "lat_mean"]

# Extent of each project in degrees, widest longitude range first.
ge_by_project_df["long_range"] = ge_by_project_df["long_max"] - ge_by_project_df["long_min"]
ge_by_project_df["lat_range"] = ge_by_project_df["lat_max"] - ge_by_project_df["lat_min"]
ge_by_project_df = ge_by_project_df.sort_values(by="long_range", axis=0, ascending=False)
ge_by_project_df.head()

#### Fetch the weather stations for the GE Project

In [None]:
# Accumulator for the stations selected for every project; kept in its own
# cell so that re-running the fetch loop does not reset it.
all_project_stations = pd.DataFrame()

In [None]:
# For every GE project, query the stations around its mean coordinates and
# keep the three closest ones.
# Set resume_from to the 1-based position of the first project to process
# when continuing an interrupted run; earlier projects are skipped.
resume_from = 1
for count, (_, row) in enumerate(ge_by_project_df.iterrows(), start=1):
    if count < resume_from:
        continue
    try:
        print(count)
        print(row['p_name'])
        lat_center = row['lat_mean']
        long_center = row['long_mean']
        project_stations = get_weather_stations(lat_center, long_center, square_diagonal = 0.2, top_n = 5)
        project_stations['query_long'] = long_center
        project_stations['query_lat'] = lat_center
        # NOTE(review): this is a plain Euclidean distance in degrees, which
        # treats a degree of longitude like a degree of latitude. It is only
        # used to rank stations here - confirm that is accurate enough.
        project_stations['dist_from_center'] = np.sqrt(
            (project_stations['query_lat'] - project_stations['latitude'])**2
            + (project_stations['query_long'] - project_stations['longitude'])**2
        )
        project_stations = project_stations.sort_values('dist_from_center').reset_index()
        # .loc is label-inclusive, so 0:2 keeps the three nearest stations;
        # .copy() avoids assigning a column on a slice of another frame.
        project_stations = project_stations.loc[0:2].copy()
        project_stations['p_name'] = row['p_name']
        # Concatenate inside the loop so partial results survive an interrupted
        # run; pd.concat replaces DataFrame.append, which newer pandas removed.
        all_project_stations = pd.concat([all_project_stations, project_stations])
    except Exception as error:
        # Report the real failure and move on to the next project. Unlike a
        # bare except, this lets KeyboardInterrupt stop the loop.
        print('Skipping project', row['p_name'], '-', type(error).__name__ + ':', error)

In [None]:
# Make sure the output directory exists so the fetched results are not lost
# to a missing-directory error, then write them to CSV.
os.makedirs('./data', exist_ok=True)
all_project_stations.to_csv('./data/01_all_project_stations.csv')