# Weather Data Export

Station metadata:

   - name, ID, stid: identifiers for the weather station
   - elevation, latitude, longitude: location of the weather station

Observations:

   - date_time
   - air_temp (Celsius)
   - precip_accum (millimeters)
   - relative_humidity (%)
   - wind_speed (m/s)
   - wind_gust (m/s)


In [1]:
import pandas as pd
import numpy as np
import json
import math
import time
import datetime
from dateutil import rrule
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from urllib.request import urlopen
from pandas.io.json import json_normalize

In [2]:
# Directory that receives the exported CSV files (path ends with a slash so
# file names can be appended directly).
outputPath = "E:/GitHub/w210_Wildfire/data/export/"

# Time-series endpoint with the fixed query parameters; the station ID and
# the start/end timestamps are appended per request.
# NOTE(review): the API token is embedded in this URL -- consider loading it
# from the environment instead of keeping it in the notebook.
baseurl = "https://api.synopticdata.com/v2/stations/timeseries?state=ca&vars=air_temp,wind_speed,wind_gust,pressure,relative_humidity,precip_accum&token=3126cda0bfe5490f91911a15826bbf3b"

# Flattened column names: station metadata first, then the observation series.
# NOTE(review): not referenced by the other cells visible here.
obsvars = [
    'NAME',
    'ID',
    'STID',
    'ELEVATION',
    'LATITUDE',
    'LONGITUDE',
    'OBSERVATIONS.date_time',
    'OBSERVATIONS.air_temp_set_1',
    'OBSERVATIONS.precip_accum_set_1',
    'OBSERVATIONS.relative_humidity_set_1',
    'OBSERVATIONS.wind_speed_set_1',
    'OBSERVATIONS.wind_gust_set_1',
]

In [3]:
# Download the station metadata listing (query asks for state=ca,
# status=active, 2016-01-01 through 2018-12-31) and keep the station IDs.
# NOTE(review): the API token is embedded in this URL -- consider loading it
# from the environment instead of keeping it in the notebook.
metadataurl = "https://api.synopticdata.com/v2/stations/metadata?state=ca&status=active&start=201601010000&end=201812310000&token=3126cda0bfe5490f91911a15826bbf3b"

# The context manager closes the HTTP connection as soon as the body is read.
with urlopen(metadataurl) as response:
    json_meta_data = response.read().decode('utf-8', 'replace')
metadata = json.loads(json_meta_data)

# One row per station; STID is the identifier used in the time-series requests.
station_metadata = json_normalize(metadata['STATION'])
stationids = station_metadata['STID']


In [4]:
# Display a small slice of the station IDs as a sanity check on the metadata download
stationids[5:12]

5     KBIH
6     KBLH
7     KBLU
8     KBUR
9     KBYS
10    KCCR
11    KCEC
Name: STID, dtype: object

In [5]:
# Function to extract observations from single station JSON and return daily summary data

def extract_observations(obs):
    """Summarise one station's raw observations into one row per calendar day.

    Parameters
    ----------
    obs : pandas.DataFrame
        Flattened station record; only the first row is read. It must provide
        ``NAME``, ``ID``, ``STID``, ``ELEVATION``, ``LATITUDE``, ``LONGITUDE``
        and ``OBSERVATIONS.date_time`` (a sequence of timestamps). The
        ``OBSERVATIONS.<variable>_set_1`` columns for ``air_temp``,
        ``precip_accum``, ``relative_humidity``, ``wind_speed`` and
        ``wind_gust`` are optional; a missing column is treated as all-NaN.

    Returns
    -------
    pandas.DataFrame
        One row per date with max/min/mean of air temperature, relative
        humidity and wind speed, the max of precipitation accumulation and
        wind gust, followed by the station metadata columns ``name``, ``ID``,
        ``stid``, ``elevation``, ``latitude`` and ``longitude``.
        The date column is named ``date_`` (trailing underscore) because the
        two-level column labels are flattened with ``"_".join``; the name is
        kept so existing exports stay compatible.
    """
    available = set(obs.columns)
    row = obs.iloc[0]
    timestamps = row['OBSERVATIONS.date_time']
    dates = [pd.to_datetime(ts).date() for ts in timestamps]
    n_obs = len(timestamps)

    def _readings(variable):
        # Return the variable's series as float64, or all-NaN when the
        # station does not report it. None entries become NaN.
        key = 'OBSERVATIONS.' + variable + '_set_1'
        if key not in available:
            return np.full(n_obs, np.nan)
        return np.asarray(row[key], dtype=np.float64)

    # Build the per-observation frame directly with float columns
    full_obs_df = pd.DataFrame({
        'date': dates,
        'air_temp': _readings('air_temp'),
        'precip_accum': _readings('precip_accum'),
        'relative_humidity': _readings('relative_humidity'),
        'wind_speed': _readings('wind_speed'),
        'wind_gust': _readings('wind_gust'),
    })

    # Group by date and create daily summary features
    obs_df = full_obs_df.groupby('date', as_index=False).agg({
        'air_temp': ['max', 'min', 'mean'],
        'precip_accum': ['max'],
        'relative_humidity': ['max', 'min', 'mean'],
        'wind_speed': ['max', 'min', 'mean'],
        'wind_gust': ['max'],
    })
    # Flatten (variable, statistic) labels, e.g. ('air_temp', 'max') -> 'air_temp_max'
    obs_df.columns = ["_".join(parts) for parts in obs_df.columns]

    # Add station metadata (same value on every daily row)
    obs_df['name'] = row['NAME']
    obs_df['ID'] = row['ID']
    obs_df['stid'] = row['STID']
    obs_df['elevation'] = np.float64(row['ELEVATION'])
    obs_df['latitude'] = np.float64(row['LATITUDE'])
    obs_df['longitude'] = np.float64(row['LONGITUDE'])

    return obs_df
    

In [15]:
# Download window; both strings are passed verbatim as the API's
# start/end query parameters and strstart also prefixes the output file names.
strstart = "201609010000"
strend = "201612312359"

# Daily summaries per station, concatenated once after the loop
# (cheaper than growing a DataFrame inside the loop).
station_frames = []

# Process one positional batch of station IDs per run
for stid in stationids[4000:5000]:

    # API call for stationid using start and end day
    url = baseurl + "&stid=" + stid + "&start=" + strstart + "&end=" + strend

    # Try the request up to 10 times, pausing 10 s after each failure.
    # The body is read inside the `with` so the connection is always closed
    # and a failure while reading is retried as well.
    for _ in range(10):
        try:
            with urlopen(url) as response:
                json_data = response.read().decode('utf-8', 'replace')
        except Exception:
            print("Retry connection: " + stid)
            time.sleep(10)
        else:
            break
    else:
        # Every attempt failed: skip this station instead of falling through
        # and parsing a response left over from an earlier request.
        print("Failed connection: " + stid)
        continue

    data = json.loads(json_data)

    # obtain station observations from JSON; a payload without a usable
    # 'STATION' entry means the API returned no data for this station
    try:
        station = json_normalize(data['STATION'], errors='ignore')
    except (KeyError, TypeError, ValueError):
        print("No data for " + stid + " on " + strstart)
        continue

    if station.empty:
        continue

    observations = extract_observations(station)
    # dump station data to csv
    filename = strstart + "_" + stid + ".csv"
    observations.to_csv(outputPath + filename, header=True, index=False)
    station_frames.append(observations)

# Combine all stations; pd.concat rejects an empty list, so fall back to an
# empty frame when nothing was downloaded.
if station_frames:
    weather = pd.concat(station_frames, ignore_index=True)
else:
    weather = pd.DataFrame()

# full dump of all stations for this download window
filename = strstart + "_all" + ".csv"
weather.to_csv(outputPath + filename, header=True, index=False)

Retry connection: AV447
Retry connection: PG115
Retry connection: SE082
Retry connection: F3842
Retry connection: SE095
Retry connection: PG148
Retry connection: PG164
Retry connection: F4319
Retry connection: F4326
Retry connection: PG202
Retry connection: XOMC1
Retry connection: SFEC1
Retry connection: TCICA
Retry connection: PG218
Retry connection: PG244
Retry connection: SE177
Retry connection: PG278
Retry connection: SE195
Retry connection: PG253
Retry connection: SE196
Retry connection: SE198
Retry connection: DLLC1
Retry connection: PG334
Retry connection: SE232
Retry connection: PG340
Retry connection: PG325
Retry connection: PG333
Retry connection: F5340
Retry connection: PG324
Retry connection: A2722
Retry connection: A3581
Retry connection: PG354
Retry connection: PG328
No data for SE264 on 201609010000
No data for SE265 on 201609010000
No data for SE266 on 201609010000
No data for PG376 on 201609010000
Retry connection: PG395
Retry connection: SE287
Retry connection: SE293


In [8]:
# Spot-check the station ID stored at position 1000
stationids[1000]

'BGPC1'

In [13]:
# Number of station IDs returned by the metadata query (upper bound for the batch slices)
len(stationids)

4778