In [1]:
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandasql as ps
import glob
from dateutil import parser
from datetime import timedelta, date
import requests
import json

In [2]:
mesowest_api_key = "KzxxYow297burLSv2E54TEQhhrx7NQCfWR7"

In [3]:
start_date = date(2016, 1, 1)
end_date = date(2016, 12, 31)

In [4]:
 metrics = ["air_temp_set_1", "altimeter_set_1", "wind_speed_set_1", "relative_humidity_set_1", "precip_accum_24_hour_set_1"]

In [5]:
#--------------------READING AND PRE-PROCESSING DAILY AQS AND WEATHER DATA-------------------
#--------------------------------------------------------------------------------------------

#data = pd.read_csv("data/daily_aqi_by_county_2016.csv")
#states = pd.read_csv("data/States.csv")

# for i in range(2015, 2019):
#     data = pd.concat([data, pd.read_csv("data/daily_aqi_by_county_" + str(i) + ".zip")])    

In [6]:
counties_abbrev = pd.read_csv("counties_abbrev_Tess_939-999.csv")


In [7]:
counties_abbrev.columns = ["OrigIndex", "State", "County", "Abbreviation"]

counties_abbrev["County"] = counties_abbrev["County"].apply(lambda x: x.replace(" ", "%"))

In [8]:
# query = """SELECT DISTINCT `State Name`, `county Name` FROM data"""

# counties = ps.sqldf(query=query)

# counties.columns = ["State", "County"]

# counties_abbrev = pd.merge(counties, states, how="left", on="State")

counties_abbrev["State"] = counties_abbrev["State"].apply(str.strip)
counties_abbrev["County"] = counties_abbrev["County"].apply(str.strip)

counties_abbrev["is_city"] = 0

In [9]:
counties_abbrev

Unnamed: 0,OrigIndex,State,County,Abbreviation,is_city
0,939,Virginia,Frederick,VA,0
1,940,Virginia,Fredericksburg%City,VA,0
2,941,Virginia,Giles,VA,0
3,942,Virginia,Hampton%City,VA,0
4,943,Virginia,Hanover,VA,0
5,944,Virginia,Henrico,VA,0
6,945,Virginia,Hopewell%City,VA,0
7,946,Virginia,King%William,VA,0
8,947,Virginia,Loudoun,VA,0
9,948,Virginia,Lynchburg%City,VA,0


In [10]:
# Helper functions
def get_token(api_key):
    response = requests.get("https://api.mesowest.net/v2/auth?apikey=" + api_key)
    return json.loads(response.content.decode("latin1"))["TOKEN"]

def daterange(start_date, end_date):
    for n in range(int ((end_date - start_date).days)):
        yield start_date + timedelta(n)
        
def get_avg_filt(dic, var):
    if isinstance(dic[var], dict):
        return dic[var]["average"]
    else:
        return np.nan
    
def get_avg(response, var):
    if "STATION" in response.keys():
        stations = response["STATION"]
        N = len(stations)
        if stations:
            return np.nanmean([get_avg_filt(y, var) for y in [x["STATISTICS"] for x in stations if "STATISTICS" in x.keys()] if var in y.keys()])
        else:
            return np.nan
    else:
        print("no STATION in keys")
        return np.nan
    
def run_api_query(url):
    t1 = time()
    response = requests.get(url)
    t2 = time()
    resp_json = json.loads(response.text)
    t3 = time()
#     print("Time for quering data from URL: {}".format(t2-t1))
#     print("Time for loading into json: {}".format(t3-t2))
    return resp_json

In [11]:
def run_mesowest_api(state_abbrev, county, year, month, day, token):
    date1 = str(year) + str(month).zfill(2) + str(day).zfill(2)
    date2 = str(year) + str(month).zfill(2) + str(day).zfill(2)
    is_city = counties_abbrev.loc[(counties_abbrev['County'] == county) & (counties_abbrev['Abbreviation'] == state_abbrev),"is_city"].values[0]
#     print(is_city)
    if(is_city):
        print("This county is actually a city. Querying by city...")
        url = "http://api.mesowest.net/v2/stations/statistics?state=" + state_abbrev + "&city=" + county + "&start=" + date1 + "0000&end=" + date2 + "0000&obtimezone=local&token=" + token + "&type=average"        
    else:
        url = "http://api.mesowest.net/v2/stations/statistics?state=" + state_abbrev + "&county=" + county + "&start=" + date1 + "0000&end=" + date2 + "0000&obtimezone=local&token=" + token + "&type=average"
    
    resp_json = run_api_query(url)
    
    if "STATION" in resp_json.keys():
        if resp_json["STATION"]:
            pass
        else:
            counties_abbrev.loc[(counties_abbrev['County'] == county) & (counties_abbrev['Abbreviation'] == state_abbrev),"is_city"].values[0] = 1
            print("No data found by county. Querying by city...")
            url = "http://api.mesowest.net/v2/stations/statistics?state=" + state_abbrev + "&city=" + county + "&start=" + date1 + "0000&end=" + date2 + "0000&obtimezone=local&token=" + token + "&type=average"        
            resp_json = run_api_query(url)

    return resp_json   

In [12]:
def get_county_weather_data(state, county, start_date, end_date, *argv):
    weather = dict()
    for arg in argv:
        weather[arg] = []
        
    dates = []   
    token = get_token(mesowest_api_key)
    
    for dt in daterange(start_date, end_date):
        response = run_mesowest_api(state, county, dt.year, dt.month, dt.day, token)
        dates.append(dt)
        t1 = time()
        for var in argv:            
            weather[var] += [get_avg(response, var)]
        t2 = time()
#         print("Finished pulling data for date {}. \nTime taken: {}".format(dt,t2-t1))
        
    weather_df = pd.DataFrame(weather)
    weather_df["Date"] = dates
    weather_df["State"] = state
    weather_df["County"] = county
    
    return weather_df

In [13]:
temp = pd.DataFrame()
for i in range(len(counties_abbrev)):  
    county_name = counties_abbrev["County"][i]
    if isinstance(counties_abbrev["Abbreviation"][i], str) & isinstance(counties_abbrev["County"][i], str):
        print("\nGetting weather data for county {} - {}".format(counties_abbrev["OrigIndex"][i], county_name))  
        temp_upd = get_county_weather_data(counties_abbrev["Abbreviation"][i], counties_abbrev["County"][i].replace(" ", "%"), start_date, end_date, "air_temp_set_1", "altimeter_set_1", "wind_speed_set_1", "relative_humidity_set_1", "precip_accum_24_hour_set_1")
        temp = pd.concat([temp, temp_upd])
    else:
        print("\nSkipping county {} - {}".format(i,county_name))
        continue


Getting weather data for county 939 - Frederick
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No dat



No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data fo

No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data fo

No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...
No data found by county. Querying by city...


ConnectionError: HTTPConnectionPool(host='api.mesowest.net', port=80): Max retries exceeded with url: /v2/stations/statistics?state=VA&county=Fredericksburg%25City&start=201608060000&end=201608060000&obtimezone=local&token=3335f29b2e314228b95f493aec8d4fb4&type=average (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x119cf7390>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [None]:
temp.reset_index(drop=True,inplace=True)
# temp
temp.to_csv("weather_data_Tess_939-999.csv", index=None,header=True)

In [None]:
counties_abbrev = pd.read_csv("counties_abbrev_Tess_1000s.csv")

In [None]:
temp = pd.DataFrame()
for i in range(len(counties_abbrev)):  
    county_name = counties_abbrev["County"][i]
    if isinstance(counties_abbrev["Abbreviation"][i], str) & isinstance(counties_abbrev["County"][i], str):
        print("\nGetting weather data for county {} - {}".format(counties_abbrev["OrigIndex"][i], county_name))  
        temp_upd = get_county_weather_data(counties_abbrev["Abbreviation"][i], counties_abbrev["County"][i].replace(" ", "%"), start_date, end_date, "air_temp_set_1", "altimeter_set_1", "wind_speed_set_1", "relative_humidity_set_1", "precip_accum_24_hour_set_1")
        temp = pd.concat([temp, temp_upd])
    else:
        print("\nSkipping county {} - {}".format(i,county_name))
        continue

In [None]:
temp.reset_index(drop=True,inplace=True)
# temp
temp.to_csv("weather_data_Tess_1000s.csv", index=None,header=True)