In [None]:
# This notebook builds an entire year's worth of half-hourly historical weather data for a given city.
# It is set up for Beijing; null values are cleaned, and times are in local time unless otherwise specified.

In [616]:
import ast
import datetime
import json
import time
import urllib2
from datetime import date, timedelta as td

import numpy as np
import pandas as pd
import pytz

In [None]:
# positions of the fields inside each (year, month, day) tuple produced by gdates
YEAR, MONTH, DAY = 0, 1, 2
# wunderground API key; it is spliced into the request URL by get_one_day
API_KEY = ''

In [720]:
# call the wunderground history api for the observations of a single day
# @param year=string, month=string, day=string -- concatenated as-is into the url, e.g. '2013', '01', '09'
# @param location=string -- query path placed after '/q/' in the url; defaults to 'Beijing/Beijing'
# @return the 'observations' entry found under 'history' in the parsed JSON response
# @raise KeyError if the response carries no 'history' / 'observations' entry
def get_one_day(year, month, day, location='Beijing/Beijing'):
    url = ('http://api.wunderground.com/api/' + API_KEY + '/history_' + year + month + day
           + '/geolookup/q/' + location + '.json')
    response = urllib2.urlopen(url)
    try:
        parsed_json = json.load(response)
    finally:
        # always release the connection, even when the body is not valid JSON
        response.close()
    return parsed_json['history']['observations']

In [None]:
# by their own admission, the API at wunderground is a bit wonky
# in some cases, duplicate historical records are returned for the same moment in time
# those duplicates carry a metar string that begins with 'AAXX', so keep only the other rows
filter_out = df.metar.apply(
    lambda metar: not metar.startswith('AAXX')
)
df = df[filter_out]

In [596]:
# wunderground uses variations on -9999 to indicate NA values
# swap every cell holding '-9999', '-9999.0' or '-999' for NaN; all other cells pass through
df = df.applymap(
    lambda cell: np.nan if cell in ('-9999', '-9999.0', '-999') else cell
)

In [602]:
# a quick analysis of the data indicates that a value of '0' for wind direction is also a NaN value
# every other wind direction value is kept unchanged
df.wdird = df.wdird.apply(
    lambda direction: direction if direction != '0' else np.nan
)

In [730]:
# get a list of three-tuples (year, month, day), one per calendar day, both end points included
# @param start-year, month, day; end-year, month, day = ints (numeric strings such as '01' are coerced with int())
# @return [(year = string, month = string, day = string), ...] e.g. ('2010', '01', '09')
#         the list is empty when the end date lies before the start date
def gdates(start_year_int, start_month_int, start_day_int, end_year_int, end_month_int, end_day_int):
    # int() lets callers pass either ints or zero-padded strings
    d1 = date(int(start_year_int), int(start_month_int), int(start_day_int))
    d2 = date(int(end_year_int), int(end_month_int), int(end_day_int))
    delta = d2 - d1
    date_list = []
    for i in range(delta.days + 1):
        val = d1 + td(days=i)
        # isoformat() yields 'YYYY-MM-DD', so splitting on '-' gives the zero-padded parts
        year, month, day = val.isoformat().split('-')
        date_list.append((year, month, day))
    return date_list

In [566]:
# function make_date
# add a 'local_datetime' column built from the year/mon/mday/hour/min fields of the 'date' column
# entries of 'date' may be dicts or the string repr of a dict (parsed with ast.literal_eval)
# @param df = pandas.DataFrame with a 'date' column
# @return null *** modifies the dataframe in place (only 'local_datetime' is added)
def make_date(df):
    # keep the parsed dicts in a local Series: assigning to df.temp_date would set an
    # attribute on the DataFrame object instead of creating a column
    parsed = df['date'].apply(lambda obj: obj if isinstance(obj, dict) else ast.literal_eval(obj))
    df['local_datetime'] = parsed.apply(
        lambda obj: datetime.datetime(int(obj['year']), int(obj['mon']), int(obj['mday']),
                                      hour=int(obj['hour']), minute=int(obj['min'])))

In [652]:
# helper _replace_sentinel_nulls
# swap wunderground's missing-value strings ('-9999', '-9999.0', '-999') for NaN
# NOTE(review): one_single_day used to call replace_nulls(), which is not defined anywhere
# in the visible notebook; this helper applies the same sentinel list as the standalone
# clean-up cell -- confirm that this matches the intended replace_nulls
# @param frame = pandas.DataFrame
# @return a new pandas.DataFrame with the sentinel cells replaced
def _replace_sentinel_nulls(frame):
    return frame.applymap(lambda x: np.nan if x in ('-9999', '-9999.0', '-999') else x)


# function one_single_day
# get a fully cleaned day of meteorological values
# @param year=string, month=string, day=string -- e.g. '2013', '01', '02'
# @return pandas.DataFrame, one row per observation that survives the 'AAXX' filter
#         (nominally 48 half-hourly rows -- TODO confirm against real responses)
def one_single_day(year, month, day):
    # get day
    dframe = pd.DataFrame(get_one_day(year, month, day))
    # add time field
    dframe['datetime'] = dframe.date.apply(
        lambda obj: datetime.datetime(int(obj['year']), int(obj['mon']), int(obj['mday']),
                                      hour=int(obj['hour']), minute=int(obj['min'])))
    # replace some nulls globally
    dframe = _replace_sentinel_nulls(dframe)
    # replace other nulls in wind direction; read from this day's frame, not the global df
    dframe.wdird = dframe.wdird.apply(lambda x: np.nan if x == '0' else x)
    # get rid of bad api calls
    rv = dframe.metar.apply(lambda x: not x.startswith('AAXX'))
    dframe = dframe[rv]
    # reset index -- necessary when removing whole rows of values
    dframe = dframe.reset_index(drop=True)
    return dframe

In [653]:
# example day
# day2 = one_single_day('2013', '01', '02')

In [655]:
# write df to csv
# day2.to_csv('day2.csv')

In [727]:
# example two_days
# two_days = pd.concat([one_single_day('2013', '01', '01'), one_single_day('2013', '01', '02')], ignore_index=True)

In [4]:
# example: build an entire year 2014 - 2015 (includes jan 1st, 2015)

# one cleaned frame per day is collected here and concatenated once after the loop;
# if the loop is interrupted, the days fetched so far are still available in this list
daily_frames = []

# one iteration for every day of the year; gdates takes ints and yields string tuples
for date_ in gdates(2014, 1, 1, 2015, 1, 1):
    # fetch and clean the day
    daily_frames.append(one_single_day(date_[YEAR], date_[MONTH], date_[DAY]))
    # just so we know what's going on
    print(date_)
    # for the free wunderground API plan, you are only permitted 10 calls / min
    # 60 / 7 = ~8 calls per minute
    time.sleep(7)

# concatenate all days, ignoring each day's own index
empty = pd.concat(daily_frames, ignore_index=True)