## Weather Augmentation

In [1]:
import datetime as dt, json, math, matplotlib.pyplot as plt, meteostat, \
       multiprocessing, numpy as np, os, pandas as pd, regex as re, requests
from datetime  import datetime, timedelta
from functools import reduce
from joblib    import Parallel, delayed
from LatLon23  import LatLon, Latitude, Longitude
from meteostat import Point, Daily, Stations
from noaa_sdk  import NOAA
from tqdm.auto import tqdm

tqdm.pandas()
# use regex to find latest iteration of data file
def latest_file(filename, prefix_path = 'Data/New Data', filetype = 'csv'):
    file_re = r'^' + filename + '_\d{4}\-\d{2}\-\d{2}\.' + filetype + '$'
    data_file_path = list(sorted([i for i in os.listdir(prefix_path) if re.match(file_re, i)]))[-1]
    return prefix_path + '/' + data_file_path
output_str = 'Data/New Data/{}_{}.csv'
curr_date  = datetime.now().strftime('%Y-%m-%d')

data      = pd.read_csv(latest_file('final_data'), low_memory = False).set_index('id')
zipcodes  = pd.read_csv(latest_file('zipcodes'), low_memory = False).iloc[:, 1:]
# station codes
# noaa_data = pd.read_csv('ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.csv', low_memory = False)

In [2]:
def fix_dates(row):
    try:
        # coords = [float(row['init_coords'].lat.to_string('%D')), \
        #           float(row['init_coords'].lon.to_string('%D'))]
        coords = [float(i) for i in row['init_coords'].split(', ')]
    except:
        try:
            # coords = [float(row['final_coords'].lat.to_string('%D')), \
            #           float(row['final_coords'].lon.to_string('%D'))]
            coords = [float(i) for i in row['final_coords'].split(', ')]
        except:
            try:
                # coords = zipcodes.query('zip == {}'.format(row['zip']))[['lat', 'lng']]\
                #                  .values[0].tolist()
                coords = zipcodes.query('zip == {}'.format(row['zip']))[['lat', 'lng']]\
                                 .values[0].tolist()
            except:
                coords = [np.nan, np.nan]
    try:
        try:
            date = datetime.strptime(row['datetime_found'], '%Y-%m-%d %H:%M:%S.%f')
        except:
            date = datetime.strptime(row['datetime_found'], '%Y-%m-%d %H:%M:%S')
    except:
        try:
            try:
                date = datetime.strptime(row['datetime_notif'], '%Y-%m-%d %H:%M:%S.%f')
            except:
                date = datetime.strptime(row['datetime_notif'], '%Y-%m-%d %H:%M:%S')
        except:
            try:
                date = datetime.strptime(row['init_datetime'], '%Y-%m-%d %H:%M:%S.%f')
            except:
                date = datetime.strptime(row['init_datetime'], '%Y-%m-%d %H:%M:%S')
    date_s = date - timedelta(hours = 12)
    date_e = date + timedelta(hours = 12)
    year   = date.year
    month  = date.month
    ## calculate season as categorical variable
    ## from 'https://stackoverflow.com/questions/16139306/determine-season-given-timestamp-in-python-using-datetime'
    # var is day out of 365
    day_of_yr = date.timetuple().tm_yday
    spring = range(80,  172)
    summer = range(172, 264)
    fall   = range(264, 355)
    # winter would require two range, have it classified with 'else'
    if day_of_yr in spring:
        season = 'spring'
    elif day_of_yr in summer:
        season = 'spring'
    elif day_of_yr in fall:
        season = 'fall'
    else:
        season = 'winter'
    return [date_s, date, date_e, year, month, season, *coords, row['zip'], row['state']]

In [3]:
dates_and_coords = data[[i for i in data.columns if 'time' in i]].dropna(thresh = 1)\
                        .join(data[['init_coords', 'final_coords', 'zip']].join(data[['state']])\
                        .replace({'zip': {0: np.nan}}).dropna(thresh = 1), how = 'inner')
    
dc_cols = ['date_start', 'date', 'date_end', 'year', \
           'month', 'season', 'lat', 'lon', 'zip', 'state']
dates_and_coords = dates_and_coords.progress_apply(lambda x: fix_dates(x), axis = 1)
dates_and_coords = pd.DataFrame.from_dict(dict(zip(dates_and_coords.index, \
                                                   dates_and_coords.values)))\
                               .T.set_axis(dc_cols, axis = 1)
dates_and_coords.loc[dates_and_coords['lat'] > 50, 'lat'] = dates_and_coords.loc[dates_and_coords['lat'] > 50, 'lat'] / 10
dates_and_coords = dates_and_coords[~((dates_and_coords['lat'].isna() & \
                                       dates_and_coords['lon'].isna()) | \
                                      (dates_and_coords['zip'].isna()))]
dates_and_coords['station'] = dates_and_coords.progress_apply(lambda x: Stations()\
                                                             .nearby(x['lat'], x['lon'])\
                                                             .fetch(1).index[0], axis = 1)
dates_and_coords.to_csv(output_str.format('dates_and_coords', curr_date))
dates_and_coords.head(10).style.background_gradient()

  0%|          | 0/8351 [00:00<?, ?it/s]

  0%|          | 0/6006 [00:00<?, ?it/s]

Unnamed: 0,date_start,date,date_end,year,month,season,lat,lon,zip,state,station
AZ00000,2005-07-03 07:05:00,2005-07-03 19:05:00,2005-07-04 07:05:00,2005,7,spring,32.7113,-109.58411,85546.0,AZ,KSAD0
AZ00001,2005-07-04 04:30:00,2005-07-04 16:30:00,2005-07-05 04:30:00,2005,7,spring,31.434933,-110.232333,85635.0,AZ,KOLS0
AZ00002,2005-07-01 10:00:00,2005-07-01 22:00:00,2005-07-02 10:00:00,2005,7,spring,33.971817,-109.095033,86503.0,AZ,KSJN0
AZ00003,2005-07-03 08:30:00,2005-07-03 20:30:00,2005-07-04 08:30:00,2005,7,spring,33.8212,-109.149183,86503.0,AZ,KSJN0
AZ00004,2005-07-08 01:00:00,2005-07-08 13:00:00,2005-07-09 01:00:00,2005,7,spring,34.458783,-111.250883,85541.0,AZ,KPAN0
AZ00005,2005-07-11 01:00:00,2005-07-11 13:00:00,2005-07-12 01:00:00,2005,7,spring,35.326533,-111.688433,86004.0,AZ,72376
AZ00006,2005-07-16 19:00:00,2005-07-17 07:00:00,2005-07-17 19:00:00,2005,7,spring,32.738133,-109.569217,85546.0,AZ,KSAD0
AZ00007,2005-07-23 13:00:00,2005-07-24 01:00:00,2005-07-24 13:00:00,2005,7,spring,31.70055,-109.498167,85635.0,AZ,KDUG0
AZ00008,2005-07-24 03:00:00,2005-07-24 15:00:00,2005-07-25 03:00:00,2005,7,spring,31.9024,-109.278467,85635.0,AZ,KDUG0
AZ00009,2005-07-14 06:00:00,2005-07-14 18:00:00,2005-07-15 06:00:00,2005,7,spring,34.65,-112.5,86314.0,AZ,KPRC0


In [8]:
dates_and_coords = pd.read_csv(latest_file('dates_and_coords'),   low_memory = False)\
                     .rename(columns = {'Unnamed: 0': 'id'}).set_index('id')
meteostat_data   = pd.read_csv(latest_file('fixed_weather_data'), low_memory = False)\
                     .rename(columns = {'Unnamed: 0': 'id'})\
                     .set_index('id').dropna(thresh = 3, axis = 0)
def mt_meteostat(list_of_values):
    # structure list as [idx, lat, lon, date_start (dts), date_end (dte)]
    idx = list_of_values[0]
    lat = list_of_values[1]
    lon = list_of_values[2]
    dts = list_of_values[3]
    dte = list_of_values[4]
    try:
        weather = Daily(Point(lat, lon), dts, dte).fetch().values.flatten().tolist()
    except:
        weather = [np.nan] * 10
    return [idx, *weather][0:11]

weather_col_names = ['id', 'temp_avg', 'temp_min', 'temp_max', 'rain', \
                     'snow_depth', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun']
meteostat_mt      = [[idx, row['lat'], row['lon'], row['date_start'], row['date_end']] for \
                      idx, row in dates_and_coords.loc[list(set(dates_and_coords.index) - set(meteostat_data.index))].iterrows()]
meteostat_mt      = Parallel(n_jobs = 8)(delayed(mt_meteostat)(row) for row in tqdm(meteostat_mt))
meteostat_mt      = pd.DataFrame(meteostat_mt, columns = weather_col_names).set_index('id')
meteostat_mt['rain']       = meteostat_mt['rain'].fillna(0)
meteostat_mt['snow_depth'] = meteostat_mt['snow_depth'].fillna(0)
meteostat_data    = pd.concat([meteostat_data, meteostat_mt[weather_col_names[1:6]]], axis = 0)#.dropna(thresh = 3)
meteostat_data.loc[(~(meteostat_data['temp_max'] == 0) & ~(meteostat_data['temp_min'] == 0) & ~(meteostat_data['temp_avg'] == 0))]\
              .to_csv(output_str.format('fixed_meteostat_data', curr_date))
meteostat_data.head(10).style.background_gradient()

  0%|          | 0/1800 [00:00<?, ?it/s]

Unnamed: 0_level_0,temp_max,temp_min,temp_avg,rain,snow_depth
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AZ00000,41.1,17.8,41.1,0.0,0.0
AZ00001,38.3,21.7,38.3,0.0,0.0
AZ00002,37.2,11.7,37.2,0.0,0.0
AZ00003,37.2,13.3,37.2,0.0,0.0
AZ00005,41.1,11.7,41.1,0.0,0.0
AZ00006,13.9,-12.8,13.9,0.0,0.0
AZ00007,33.3,9.4,22.5,0.0,0.0
AZ00008,42.2,26.1,42.2,0.0,0.0
AZ00009,12.8,-11.7,12.8,0.0,35.6
AZ00010,32.8,19.4,32.8,2.5,0.0


In [18]:
%%html
<style>
table {float:left}
</style>

From [Meteostat API documentation](https://dev.meteostat.net/python/daily.html#data-structure)

| Column | Description | Type |
|---|:--|---|
| station | The Meteostat ID of the weather station (only if query refers to multiple stations) | String |
| time | The date | Datetime64 |
| tavg | The average air temperature in °C | Float64 |
| tmin | The minimum air temperature in °C | Float64 |
| tmax | The maximum air temperature in °C | Float64 |
| prcp | The daily precipitation total in mm | Float64 |
| snow | The snow depth in mm | Float64 |
| wdir | The average wind direction in degrees (°) | Float64 |
| wspd | The average wind speed in km/h | Float64 |
| wpgt | The peak wind gust in km/h | Float64 |
| pres | The average sea-level air pressure in hPa | Float64 |
| tsun | The daily sunshine total in minutes (m) | Float64 |dates_and_coords.loc[list(set(dates_and_coords.index) - set(meteostat_data.index))].iterrows():

In [15]:
# choose btwn longer/more recent of two: meteostat or combined data
dates_and_coords = pd.read_csv(latest_file('dates_and_coords'), low_memory = False)\
                     .rename(columns = {'Unnamed: 0': 'id'}).set_index('id')
meteostat_data   = pd.read_csv(latest_file('fixed_meteostat_data'), \
                               low_memory = False).set_index('id')
noaa_weather_df  = pd.read_csv(latest_file('noaa_weather'), \
                               low_memory = False).set_index('id')
headers = {0: {'token': 'LOBpbjvKjTrjNLKLHLxXOuAEpHjufMnL'},
           1: {'token': 'nCsRKOeVJRlsKSsPBgUnhzQXhKPjnQfg'},
           2: {'token': 'dGzICFqVeEdsLkSjgpoAePDawRDUREvn'},
           3: {'token': 'pVyuHAdYUTDlxSEgZAiXgbldynkhyOxn'}}
def hist_noaa_data(zipc, start_date, end_date, token):
    url_format = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&locationid=ZIP:{0:.0f}&startdate={1:}&enddate={2:}"
    # use strftime to automatically pull strings from timestamp
    try:
        url = url_format.format(zipc, start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d'))
    except:
        s_date = datetime.strptime(start_date, '%Y-%m-%d %H:%M:%S.%f')\
                         .strftime('%Y-%m-%d')
        e_date = datetime.strptime(end_date,   '%Y-%m-%d %H:%M:%S.%f')\
                         .strftime('%Y-%m-%d')
        url = url_format.format(zipc, s_date, e_date)
    r = requests.get(url, headers = token)
    d = json.loads(r.text)
    try:
        return d['results']
    except:
        return ':('
    
with open('Data/New Data/fixed_noaa_weather_data.json') as weather_json:
    noaa_weather_data = json.load(weather_json)
count_idx = 0
for idx, row in dates_and_coords.loc[list(set(dates_and_coords.index) - set(meteostat_data.index))].iterrows():
    count_idx += 1
    noaa_weather_data[idx] = hist_noaa_data(row['zip'], row['date_start'], row['date_end'], \
                                            token = headers[np.random.randint(0, 3)])
    if count_idx % 25 == 0:
        print('{} weather observations'.format(count_idx))

index_failed = []
for idx in noaa_weather_data.keys():
    if noaa_weather_data[idx] == ':(':
        index_failed.append(idx)
    else:
        wdf = pd.DataFrame(noaa_weather_data[idx])[['datatype', 'value']]
        wdf['id'] = idx
        for var in set([i['datatype'] for i in noaa_weather_data[idx]]):
            wdf.loc[wdf['datatype'] == var, 'datatype'] = [j + '-' + str(num) for [num, j] in \
                                                           enumerate(wdf.loc[wdf['datatype'] == var, 'datatype'])]
        noaa_weather_df = pd.concat([noaa_weather_df, wdf])
noaa_weather_df = noaa_weather_df.loc[[i for i in noaa_weather_df.index if not isinstance(i, int)]].sort_index()[meteostat_data.columns]
noaa_weather_df.head(10).style.background_gradient()

Unnamed: 0,temp_max,temp_min,temp_avg,rain,snow_depth
AZ00004,41.1,11.7,41.1,0.0,0.0
AZ00047,30.6,12.8,30.6,0.0,0.0
AZ00049,23.9,5.0,23.9,0.0,0.0
AZ00062,24.4,8.3,24.4,0.0,0.0
AZ00071,26.1,2.8,26.1,0.0,0.0
AZ00100,21.1,0.0,21.1,0.0,0.0
AZ00128,21.7,-0.6,21.7,0.0,0.0
AZ00133,2.8,-12.2,2.8,0.0,0.0
AZ00140,21.1,-5.6,21.1,0.0,0.0
AZ00145,29.4,1.1,29.4,0.0,0.0


In [16]:
noaa_weather_df

Unnamed: 0,temp_max,temp_min,temp_avg,rain,snow_depth
AZ00004,41.1,11.7,41.1,0.0,0.0
AZ00047,30.6,12.8,30.6,0.0,0.0
AZ00049,23.9,5.0,23.9,0.0,0.0
AZ00062,24.4,8.3,24.4,0.0,0.0
AZ00071,26.1,2.8,26.1,0.0,0.0
...,...,...,...,...,...
UT00014,11.7,-6.7,11.7,30.5,139.7
UT00056,3.9,-8.3,3.9,0.0,5.0
UT00156,,,,0.0,109.2
UT00161,,,,0.0,109.2


In [18]:
pd.concat([meteostat_data, noaa_weather_df.loc[list(set(noaa_weather_df.index) - set(meteostat_data.index))]], axis = 0).dropna(thresh = 3)

Unnamed: 0,temp_max,temp_min,temp_avg,rain,snow_depth
AZ00000,41.1,17.8,41.1,0.0,0.0
AZ00001,38.3,21.7,38.3,0.0,0.0
AZ00002,37.2,11.7,37.2,0.0,0.0
AZ00003,37.2,13.3,37.2,0.0,0.0
AZ00005,41.1,11.7,41.1,0.0,0.0
...,...,...,...,...,...
OR00031,11.1,1.7,11.1,3.0,0.0
TN00144,32.2,17.8,32.2,21.3,0.0
OR00390,21.1,0.0,21.1,0.0,0.0
OR00918,11.7,0.0,11.7,3.6,0.0


In [8]:
noaa_weather_df = noaa_weather_df.pivot(index = 'id', columns = 'datatype', values = 'value').dropna(axis = 0, how = 'all')
def mean_list(input_list):
    if len(input_list) == 0:
        m = np.nan
    else:
        m = sum(input_list) / max(len(input_list), 1)
    return m

for idx, row in noaa_weather_df.iterrows():
    try:
        noaa_weather_df.loc[idx, 'temp_max'] = max([val / 10 for idx, val in row.dropna().iteritems() if idx.startswith('TMAX')])
    except:
        noaa_weather_df.loc[idx, 'temp_max'] = np.nan
    try:
        noaa_weather_df.loc[idx, 'temp_min'] = min([val / 10 for idx, val in row.dropna().iteritems() if idx.startswith('TMIN')])
    except:
        noaa_weather_df.loc[idx, 'temp_min'] = np.nan
    try:
        noaa_weather_df.loc[idx, 'temp_avg'] = max([val / 10 for idx, val in row.dropna().iteritems() if idx.startswith('TMAX')])
    except:
        noaa_weather_df.loc[idx, 'temp_avg'] = np.nan
    
    
    try:
        noaa_weather_df.loc[idx, 'rain']       = sum([val / 10 for idx, val in row.dropna().iteritems() if idx.startswith('PRCP')])
    except:
        noaa_weather_df.loc[idx, 'rain']       = np.nan
    try:
        noaa_weather_df.loc[idx, 'snow_fall']  = sum([val / 10 for idx, val in row.dropna().iteritems() if idx.startswith('SNOW')])
    except:
        noaa_weather_df.loc[idx, 'snow_fall']  = np.nan
    try:
        noaa_weather_df.loc[idx, 'snow_depth'] = sum([val / 10 for idx, val in row.dropna().iteritems() if idx.startswith('SNWD')])
    except:
        noaa_weather_df.loc[idx, 'snow_depth'] = np.nan
        
noaa_weather_df.columns.name = None
noaa_weather_df = noaa_weather_df[['temp_max', 'temp_min', 'temp_avg', 'rain', 'snow_depth']]
noaa_weather_df.to_csv(output_str.format('noaa_weather', curr_date))

NameError: name 'noaa_weather_df' is not defined

In [6]:
dates_and_coords = pd.read_csv(latest_file('dates_and_coords'), low_memory = False)\
                     .rename(columns = {'Unnamed: 0': 'id'}).set_index('id')
meteostat_data   = pd.read_csv(latest_file('fixed_meteostat_data'), 
                               low_memory = False).set_index('id')
noaa_list = [[idx, row['zip'], row['date_start'], row['date_end']] for idx, row in \
              dates_and_coords.loc[list(set(dates_and_coords.index) - set(meteostat_data.index))].iterrows()]
# number of observations to attempt at a time
num_noaa  = 100
with open('Data/New Data/fixed_noaa_weather_data.json') as weather_json:
    noaa_weather_data = json.load(weather_json)
def hist_noaa_data(zipc, start_date, end_date, token):
    url_format = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&locationid=ZIP:{0:.0f}&startdate={1:}&enddate={2:}"
    # use strftime to automatically pull strings from timestamp
    try:
        url = url_format.format(zipc, start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d'))
    except:
        s_date = datetime.strptime(start_date, '%Y-%m-%d %H:%M:%S.%f')\
                         .strftime('%Y-%m-%d')
        e_date = datetime.strptime(end_date,   '%Y-%m-%d %H:%M:%S.%f')\
                         .strftime('%Y-%m-%d')
        url = url_format.format(zipc, s_date, e_date)
    r = requests.get(url, headers = token)
    d = json.loads(r.text)
    try:
        return d['results']
    except:
        return ':('
def pull_noaa_data(list_of_values, weather_dict = noaa_weather_data):
    # structure list as [idx, zip (zipc), date_start (dts), date_end (dte)]
    idx  = list_of_values[0]
    zipc = list_of_values[1]
    dts  = list_of_values[2]
    dte  = list_of_values[3]
    headers = {0: {'token': 'LOBpbjvKjTrjNLKLHLxXOuAEpHjufMnL'},
               1: {'token': 'nCsRKOeVJRlsKSsPBgUnhzQXhKPjnQfg'},
               2: {'token': 'dGzICFqVeEdsLkSjgpoAePDawRDUREvn'},
               3: {'token': 'pVyuHAdYUTDlxSEgZAiXgbldynkhyOxn'}}
    # randomly select an API key
    weather_dict[idx] = hist_noaa_data(zipc, dts, dte, token = headers[np.random.randint(0, 3)])

# noaa_mt = Parallel(n_jobs = 8)(delayed(pull_noaa_data)(row) for row in tqdm(noaa_list[:num_noaa]))
with open('Data/New Data/fixed_noaa_weather_data.json', 'w') as fp:
    json.dump(noaa_weather_data, fp)

index_failed = []
for idx in noaa_weather_data.keys():
    if noaa_weather_data[idx] == ':(':
        index_failed.append(idx)
    else:
        wdf = pd.DataFrame(noaa_weather_data[idx])[['datatype', 'value']]
        wdf['id'] = idx
        for var in set([i['datatype'] for i in noaa_weather_data[idx]]):
            wdf.loc[wdf['datatype'] == var, 'datatype'] = [j + '-' + str(num) for [num, j] in \
                                                           enumerate(wdf.loc[wdf['datatype'] == var, 'datatype'])]
        noaa_weather_df = pd.concat([noaa_weather_df, wdf])
noaa_weather_df = noaa_weather_df.loc[[i for i in noaa_weather_df.index if not isinstance(i, int)]].sort_index()[meteostat_data.columns]
noaa_weather_df.head(10).style.background_gradient()

  0%|          | 0/3 [00:00<?, ?it/s]

NameError: name 'noaa_weather_df' is not defined

In [25]:
combined_weather_data = pd.concat([meteostat_data, noaa_weather_df.loc[list(set(noaa_weather_df.index) - set(meteostat_data.index))]], axis = 0)

combined_weather_data['rain']       = combined_weather_data['rain'].fillna(0)
combined_weather_data['snow_depth'] = combined_weather_data['snow_depth'].fillna(0)
combined_weather_data.to_csv(output_str.format('fixed_weather_data', curr_date))
combined_weather_data.head(10).style.background_gradient()

Unnamed: 0,temp_max,temp_min,temp_avg,rain,snow_depth
AZ00000,41.1,17.8,41.1,0.0,0.0
AZ00001,38.3,21.7,38.3,0.0,0.0
AZ00002,37.2,11.7,37.2,0.0,0.0
AZ00003,37.2,13.3,37.2,0.0,0.0
AZ00005,41.1,11.7,41.1,0.0,0.0
AZ00006,13.9,-12.8,13.9,0.0,0.0
AZ00007,33.3,9.4,22.5,0.0,0.0
AZ00008,42.2,26.1,42.2,0.0,0.0
AZ00009,12.8,-11.7,12.8,0.0,35.6
AZ00010,32.8,19.4,32.8,2.5,0.0


In [26]:
combined_weather_data

Unnamed: 0,temp_max,temp_min,temp_avg,rain,snow_depth
AZ00000,41.1,17.8,41.1,0.0,0.0
AZ00001,38.3,21.7,38.3,0.0,0.0
AZ00002,37.2,11.7,37.2,0.0,0.0
AZ00003,37.2,13.3,37.2,0.0,0.0
AZ00005,41.1,11.7,41.1,0.0,0.0
...,...,...,...,...,...
OR00314,,,,0.0,0.0
AZ00134,,,,0.0,0.0
OR00390,21.1,0.0,21.1,0.0,0.0
OR00918,11.7,0.0,11.7,3.6,0.0


### Pulling weather data:
* https://github.com/meteostat/meteostat-python
    * [Alternate bulk data download](https://dev.meteostat.net/bulk/)
    * [Guide here](https://meteostat.net/en/blog/obtain-weather-data-any-location-python)
* https://openweathermap.org/history
    * Limit of 60 API calls per minute
* Use NOAA API from previous group
    * [Guide here](https://stackoverflow.com/questions/18828890/how-to-use-the-noaa-api-to-query-past-weather-data-for-a-given-set-of-coordinate)
    * [Another guide](https://grantwinney.com/what-is-noaa-api/)
    * https://github.com/paulokuong/noaa
    * https://www.ncdc.noaa.gov/cdo-web/webservices/v2#datasets
        * Limit of 1000 API calls per day
        * [Request a token here](https://www.ncdc.noaa.gov/cdo-web/token)
        * [API documentation here](https://www.ncei.noaa.gov/support/access-data-service-api-user-documentation)
        * Download bulk with `pd.read_csv('ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.csv')`

### Previous group's weather function:

```
def getData(startdate,enddate, zipcode):
    try: 
        url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&locationid=ZIP:"+str(zipcode)+"&startdate="+startdate+"&enddate="+enddate
        r = requests.get(url, headers=headers)
    #load the api response as a json
        d = json.loads(r.text)
        max_temps = [item["value"]/10 for item in d['results'] if item['datatype']=='TMAX']
        min_temps = [item["value"]/10 for item in d['results'] if item['datatype']=='TMIN']
        air_temps = [item["value"]/10 for item in d['results'] if item['datatype']=='TEMP']
        return max_temps, min_temps, air_temps
    except: 
        return None, None
```

### To-Do:
* imputation: (e.g. impute age: `data['age'].mean()`