## 采集天气数据
这个 ipython notebook 遍历纽约州各个区域所关联的全部气象站，从 Weather Underground 的历史数据页面（DailyHistory 的 CSV 导出）逐日下载历史天气观测数据。

In [2]:
import pandas as pd
import numpy as np
import time
import random
import pickle
import os

还记得上一个 ipython notebook 中生成的区域与气象站的关联文件（weather_dict.pkl）吧？这里把它加载进来使用。

In [3]:
# Load the zone -> station mapping from the local pickle file; the context
# manager closes the file handle as soon as the data has been read.
# NOTE: pickle can execute arbitrary code on load - only open trusted files.
with open('weather_dict.pkl', 'rb') as fh:
    weather_dict = pickle.load(fh)

In [4]:
weather_dict

{'NORTH': ('kjfk', 'NYC', 'NYC'),
 'MILLWD': ('krme', 'Mohawk Valley', 'Utica'),
 'GENESE': ('kroc', 'Genese', 'Rochester'),
 'MHK VL': ('kjfk', 'Long Island', 'NYC'),
 'HUD VL': ('kpou', 'Hudson Valley', 'Poughkeepsie'),
 'CENTRL': ('ksyr', 'Central', 'Syracuse'),
 'LONGIL': ('kbuf', 'West', 'Buffalo'),
 'N.Y.C._LONGIL': ('klga', 'Millwood', 'Yonkers'),
 'WEST': ('kpbg', 'North', 'Plattsburgh'),
 'DUNWOD': ('klga', 'Dunwoodie', 'Yonkers'),
 'CAPITL': ('kalb', 'Capital', 'Albany')}

In [5]:
# The first element of every mapping value is the station (airport) code.
airports = [station_info[0] for station_info in weather_dict.values()]

In [6]:
# De-duplicate: several zones share the same station.
airports = [*set(airports)]

In [7]:
airports

['kpbg', 'krme', 'kroc', 'ksyr', 'kbuf', 'kpou', 'kjfk', 'klga', 'kalb']

In [8]:
# One entry per calendar day over the whole download window (both ends inclusive).
dates = pd.date_range(start='2001-05-01', end='2016-03-11', freq='D')

In [10]:
def write_daily_weather_data(airport, dates, pause=3):
    '''Download daily weather observations for one station, one CSV per day.

    For every entry in ``dates`` the Weather Underground ``DailyHistory``
    CSV export for ``airport`` is read, lightly cleaned and written to
    ``../data/wunderground/<airport>/<YYYYMMDD>.csv``.

    Cleaning steps: column names are lower-cased and stripped of spaces,
    ``<br/>`` and ``/``; ``dateutc`` is parsed into timestamps; the text
    placeholders ``'-'`` (gust speed) and ``'Calm'`` (wind speed) become 0.

    According to the original notes the written files carry these columns
    (NOTE(review): not verified against the endpoint; the unit suffixes in
    this list and in the code below do not fully agree):

    timeest | temperaturef | dewpointf | humidity | sealevelpressurein | visibilitymph | winddirection | windspeedkmh | gustspeedmph

        | precipitationmm | events | conditions | winddirdegrees | dateutc

    Args:
        airport: station code used in the URL and in the output path,
            e.g. ``'kjfk'``.
        dates: iterable of date-like objects exposing ``year``, ``month``,
            ``day`` and ``date()`` (for example a pandas ``DatetimeIndex``).
        pause: seconds to wait after every request, successful or not, so
            the remote server is not hammered. Defaults to 3.

    Returns:
        None. Progress and failures are reported with ``print``.
    '''
    out_dir = '../data/wunderground/' + airport + '/'
    url_template = ('https://www.wunderground.com/history/airport/'
                    '{0}/{1}/{2}/{3}/DailyHistory.html?format=1')
    # (column, placeholder) pairs: the placeholder text is replaced by 0.
    # NOTE(review): read_csv normally parses the text 'NaN' into a real
    # missing value, so the two 'NaN' replacements are probably no-ops;
    # switch to fillna(0) if zero-filling is what is actually wanted.
    placeholders = (
        ('gustspeedkmh', '-'),
        ('windspeedkmh', 'Calm'),
        ('precipitationmm', 'NaN'),
        ('events', 'NaN'),
    )

    for d in dates:
        try:
            df0 = pd.read_csv(url_template.format(airport, d.year, d.month, d.day))

            df0.columns = [col.lower().replace(' ', '').replace('<br/>', '').replace('/', '')
                           for col in df0.columns]
            df0['dateutc'] = df0['dateutc'].apply(
                lambda x: pd.to_datetime(x.replace('<br />', '')))

            # Only touch columns that are present, so a differently named
            # optional column does not discard the whole day.
            for column, placeholder in placeholders:
                if column in df0.columns:
                    df0[column] = df0[column].replace(placeholder, 0)

            filepath = out_dir + str(d.date()).replace('-', '') + '.csv'
            print (filepath)
            # Created lazily so nothing is written when there is no data.
            os.makedirs(out_dir, exist_ok=True)
            df0.to_csv(filepath, index=False)

            # Sanity check: stop the crawl if the timestamps did not parse.
            if not isinstance(df0['dateutc'].iloc[0], pd.Timestamp):
                print ("Something is wrong")
                break
        except Exception as exc:
            # Exception (not a bare except) keeps Ctrl-C working during the
            # long crawl; the reason is printed so failures can be diagnosed.
            print ("date ", d, " can't be downloaded!", repr(exc))
        finally:
            # Throttle every request, including failed ones.
            time.sleep(pause)

    print ("Files for %s have been written" % airport)
    return

遍历气象站，导出天气文件

In [None]:
# Download the whole date window for each distinct station in turn.
for station in airports:
    write_daily_weather_data(station, dates)

date  2001-05-01 00:00:00  can't be downloaded!
date  2001-05-02 00:00:00  can't be downloaded!
date  2001-05-03 00:00:00  can't be downloaded!
date  2001-05-04 00:00:00  can't be downloaded!
date  2001-05-05 00:00:00  can't be downloaded!
date  2001-05-06 00:00:00  can't be downloaded!
date  2001-05-07 00:00:00  can't be downloaded!
date  2001-05-08 00:00:00  can't be downloaded!
date  2001-05-09 00:00:00  can't be downloaded!
date  2001-05-10 00:00:00  can't be downloaded!
date  2001-05-11 00:00:00  can't be downloaded!
date  2001-05-12 00:00:00  can't be downloaded!
date  2001-05-13 00:00:00  can't be downloaded!
date  2001-05-14 00:00:00  can't be downloaded!
date  2001-05-15 00:00:00  can't be downloaded!
date  2001-05-16 00:00:00  can't be downloaded!
date  2001-05-17 00:00:00  can't be downloaded!
date  2001-05-18 00:00:00  can't be downloaded!
date  2001-05-19 00:00:00  can't be downloaded!
date  2001-05-20 00:00:00  can't be downloaded!
date  2001-05-21 00:00:00  can't be down

date  2001-10-19 00:00:00  can't be downloaded!
date  2001-10-20 00:00:00  can't be downloaded!
date  2001-10-21 00:00:00  can't be downloaded!
date  2001-10-22 00:00:00  can't be downloaded!
date  2001-10-23 00:00:00  can't be downloaded!
date  2001-10-24 00:00:00  can't be downloaded!
date  2001-10-25 00:00:00  can't be downloaded!
date  2001-10-26 00:00:00  can't be downloaded!
date  2001-10-27 00:00:00  can't be downloaded!
date  2001-10-28 00:00:00  can't be downloaded!
date  2001-10-29 00:00:00  can't be downloaded!
date  2001-10-30 00:00:00  can't be downloaded!
date  2001-10-31 00:00:00  can't be downloaded!
date  2001-11-01 00:00:00  can't be downloaded!
date  2001-11-02 00:00:00  can't be downloaded!
date  2001-11-03 00:00:00  can't be downloaded!
date  2001-11-04 00:00:00  can't be downloaded!
date  2001-11-05 00:00:00  can't be downloaded!
date  2001-11-06 00:00:00  can't be downloaded!
date  2001-11-07 00:00:00  can't be downloaded!
date  2001-11-08 00:00:00  can't be down

date  2002-04-08 00:00:00  can't be downloaded!
date  2002-04-09 00:00:00  can't be downloaded!
date  2002-04-10 00:00:00  can't be downloaded!
date  2002-04-11 00:00:00  can't be downloaded!
date  2002-04-12 00:00:00  can't be downloaded!
date  2002-04-13 00:00:00  can't be downloaded!
date  2002-04-14 00:00:00  can't be downloaded!
date  2002-04-15 00:00:00  can't be downloaded!
date  2002-04-16 00:00:00  can't be downloaded!
date  2002-04-17 00:00:00  can't be downloaded!
date  2002-04-18 00:00:00  can't be downloaded!
date  2002-04-19 00:00:00  can't be downloaded!
date  2002-04-20 00:00:00  can't be downloaded!
date  2002-04-21 00:00:00  can't be downloaded!
date  2002-04-22 00:00:00  can't be downloaded!
date  2002-04-23 00:00:00  can't be downloaded!
date  2002-04-24 00:00:00  can't be downloaded!
date  2002-04-25 00:00:00  can't be downloaded!
date  2002-04-26 00:00:00  can't be downloaded!
date  2002-04-27 00:00:00  can't be downloaded!
date  2002-04-28 00:00:00  can't be down

date  2002-09-26 00:00:00  can't be downloaded!
date  2002-09-27 00:00:00  can't be downloaded!
date  2002-09-28 00:00:00  can't be downloaded!
date  2002-09-29 00:00:00  can't be downloaded!
date  2002-09-30 00:00:00  can't be downloaded!
date  2002-10-01 00:00:00  can't be downloaded!
date  2002-10-02 00:00:00  can't be downloaded!
date  2002-10-03 00:00:00  can't be downloaded!
date  2002-10-04 00:00:00  can't be downloaded!
date  2002-10-05 00:00:00  can't be downloaded!
date  2002-10-06 00:00:00  can't be downloaded!
date  2002-10-07 00:00:00  can't be downloaded!
date  2002-10-08 00:00:00  can't be downloaded!
date  2002-10-09 00:00:00  can't be downloaded!
date  2002-10-10 00:00:00  can't be downloaded!
date  2002-10-11 00:00:00  can't be downloaded!
date  2002-10-12 00:00:00  can't be downloaded!
date  2002-10-13 00:00:00  can't be downloaded!
date  2002-10-14 00:00:00  can't be downloaded!
date  2002-10-15 00:00:00  can't be downloaded!
date  2002-10-16 00:00:00  can't be down

date  2003-03-16 00:00:00  can't be downloaded!
date  2003-03-17 00:00:00  can't be downloaded!
date  2003-03-18 00:00:00  can't be downloaded!
date  2003-03-19 00:00:00  can't be downloaded!
date  2003-03-20 00:00:00  can't be downloaded!
date  2003-03-21 00:00:00  can't be downloaded!
date  2003-03-22 00:00:00  can't be downloaded!
date  2003-03-23 00:00:00  can't be downloaded!
date  2003-03-24 00:00:00  can't be downloaded!
date  2003-03-25 00:00:00  can't be downloaded!
date  2003-03-26 00:00:00  can't be downloaded!
date  2003-03-27 00:00:00  can't be downloaded!
date  2003-03-28 00:00:00  can't be downloaded!
date  2003-03-29 00:00:00  can't be downloaded!
date  2003-03-30 00:00:00  can't be downloaded!
date  2003-03-31 00:00:00  can't be downloaded!
date  2003-04-01 00:00:00  can't be downloaded!
date  2003-04-02 00:00:00  can't be downloaded!
date  2003-04-03 00:00:00  can't be downloaded!
date  2003-04-04 00:00:00  can't be downloaded!
date  2003-04-05 00:00:00  can't be down

date  2003-09-03 00:00:00  can't be downloaded!
date  2003-09-04 00:00:00  can't be downloaded!
date  2003-09-05 00:00:00  can't be downloaded!
date  2003-09-06 00:00:00  can't be downloaded!
date  2003-09-07 00:00:00  can't be downloaded!
date  2003-09-08 00:00:00  can't be downloaded!
date  2003-09-09 00:00:00  can't be downloaded!
date  2003-09-10 00:00:00  can't be downloaded!
date  2003-09-11 00:00:00  can't be downloaded!
date  2003-09-12 00:00:00  can't be downloaded!
date  2003-09-13 00:00:00  can't be downloaded!
date  2003-09-14 00:00:00  can't be downloaded!
date  2003-09-15 00:00:00  can't be downloaded!
date  2003-09-16 00:00:00  can't be downloaded!
date  2003-09-17 00:00:00  can't be downloaded!
date  2003-09-18 00:00:00  can't be downloaded!
date  2003-09-19 00:00:00  can't be downloaded!
date  2003-09-20 00:00:00  can't be downloaded!
date  2003-09-21 00:00:00  can't be downloaded!
date  2003-09-22 00:00:00  can't be downloaded!
date  2003-09-23 00:00:00  can't be down

date  2004-02-21 00:00:00  can't be downloaded!
date  2004-02-22 00:00:00  can't be downloaded!
date  2004-02-23 00:00:00  can't be downloaded!
date  2004-02-24 00:00:00  can't be downloaded!
date  2004-02-25 00:00:00  can't be downloaded!
date  2004-02-26 00:00:00  can't be downloaded!
date  2004-02-27 00:00:00  can't be downloaded!
date  2004-02-28 00:00:00  can't be downloaded!
date  2004-02-29 00:00:00  can't be downloaded!
date  2004-03-01 00:00:00  can't be downloaded!
date  2004-03-02 00:00:00  can't be downloaded!
date  2004-03-03 00:00:00  can't be downloaded!
date  2004-03-04 00:00:00  can't be downloaded!
date  2004-03-05 00:00:00  can't be downloaded!
date  2004-03-06 00:00:00  can't be downloaded!
date  2004-03-07 00:00:00  can't be downloaded!
date  2004-03-08 00:00:00  can't be downloaded!
date  2004-03-09 00:00:00  can't be downloaded!
date  2004-03-10 00:00:00  can't be downloaded!
date  2004-03-11 00:00:00  can't be downloaded!
date  2004-03-12 00:00:00  can't be down

date  2004-08-10 00:00:00  can't be downloaded!
date  2004-08-11 00:00:00  can't be downloaded!
date  2004-08-12 00:00:00  can't be downloaded!
date  2004-08-13 00:00:00  can't be downloaded!
date  2004-08-14 00:00:00  can't be downloaded!
date  2004-08-15 00:00:00  can't be downloaded!
date  2004-08-16 00:00:00  can't be downloaded!
date  2004-08-17 00:00:00  can't be downloaded!
date  2004-08-18 00:00:00  can't be downloaded!
date  2004-08-19 00:00:00  can't be downloaded!
date  2004-08-20 00:00:00  can't be downloaded!
date  2004-08-21 00:00:00  can't be downloaded!
date  2004-08-22 00:00:00  can't be downloaded!
date  2004-08-23 00:00:00  can't be downloaded!
date  2004-08-24 00:00:00  can't be downloaded!
date  2004-08-25 00:00:00  can't be downloaded!
date  2004-08-26 00:00:00  can't be downloaded!
date  2004-08-27 00:00:00  can't be downloaded!
date  2004-08-28 00:00:00  can't be downloaded!
date  2004-08-29 00:00:00  can't be downloaded!
date  2004-08-30 00:00:00  can't be down

date  2005-01-28 00:00:00  can't be downloaded!
date  2005-01-29 00:00:00  can't be downloaded!
date  2005-01-30 00:00:00  can't be downloaded!
date  2005-01-31 00:00:00  can't be downloaded!
date  2005-02-01 00:00:00  can't be downloaded!
date  2005-02-02 00:00:00  can't be downloaded!
date  2005-02-03 00:00:00  can't be downloaded!
date  2005-02-04 00:00:00  can't be downloaded!
date  2005-02-05 00:00:00  can't be downloaded!
date  2005-02-06 00:00:00  can't be downloaded!
date  2005-02-07 00:00:00  can't be downloaded!
date  2005-02-08 00:00:00  can't be downloaded!
date  2005-02-09 00:00:00  can't be downloaded!
date  2005-02-10 00:00:00  can't be downloaded!
date  2005-02-11 00:00:00  can't be downloaded!
date  2005-02-12 00:00:00  can't be downloaded!
date  2005-02-13 00:00:00  can't be downloaded!
date  2005-02-14 00:00:00  can't be downloaded!
date  2005-02-15 00:00:00  can't be downloaded!
date  2005-02-16 00:00:00  can't be downloaded!
date  2005-02-17 00:00:00  can't be down

date  2005-07-18 00:00:00  can't be downloaded!
date  2005-07-19 00:00:00  can't be downloaded!
date  2005-07-20 00:00:00  can't be downloaded!
date  2005-07-21 00:00:00  can't be downloaded!
date  2005-07-22 00:00:00  can't be downloaded!
date  2005-07-23 00:00:00  can't be downloaded!
date  2005-07-24 00:00:00  can't be downloaded!
date  2005-07-25 00:00:00  can't be downloaded!
date  2005-07-26 00:00:00  can't be downloaded!
date  2005-07-27 00:00:00  can't be downloaded!
date  2005-07-28 00:00:00  can't be downloaded!
date  2005-07-29 00:00:00  can't be downloaded!
date  2005-07-30 00:00:00  can't be downloaded!
date  2005-07-31 00:00:00  can't be downloaded!
date  2005-08-01 00:00:00  can't be downloaded!
date  2005-08-02 00:00:00  can't be downloaded!
date  2005-08-03 00:00:00  can't be downloaded!
date  2005-08-04 00:00:00  can't be downloaded!
date  2005-08-05 00:00:00  can't be downloaded!
date  2005-08-06 00:00:00  can't be downloaded!
date  2005-08-07 00:00:00  can't be down

date  2006-01-05 00:00:00  can't be downloaded!
date  2006-01-06 00:00:00  can't be downloaded!
date  2006-01-07 00:00:00  can't be downloaded!
date  2006-01-08 00:00:00  can't be downloaded!
date  2006-01-09 00:00:00  can't be downloaded!
date  2006-01-10 00:00:00  can't be downloaded!
date  2006-01-11 00:00:00  can't be downloaded!
date  2006-01-12 00:00:00  can't be downloaded!
date  2006-01-13 00:00:00  can't be downloaded!
date  2006-01-14 00:00:00  can't be downloaded!
date  2006-01-15 00:00:00  can't be downloaded!
date  2006-01-16 00:00:00  can't be downloaded!
date  2006-01-17 00:00:00  can't be downloaded!
date  2006-01-18 00:00:00  can't be downloaded!
date  2006-01-19 00:00:00  can't be downloaded!
date  2006-01-20 00:00:00  can't be downloaded!
date  2006-01-21 00:00:00  can't be downloaded!
date  2006-01-22 00:00:00  can't be downloaded!
date  2006-01-23 00:00:00  can't be downloaded!
date  2006-01-24 00:00:00  can't be downloaded!
date  2006-01-25 00:00:00  can't be down

date  2006-06-25 00:00:00  can't be downloaded!
date  2006-06-26 00:00:00  can't be downloaded!
date  2006-06-27 00:00:00  can't be downloaded!
date  2006-06-28 00:00:00  can't be downloaded!
date  2006-06-29 00:00:00  can't be downloaded!
date  2006-06-30 00:00:00  can't be downloaded!
date  2006-07-01 00:00:00  can't be downloaded!
date  2006-07-02 00:00:00  can't be downloaded!
date  2006-07-03 00:00:00  can't be downloaded!
date  2006-07-04 00:00:00  can't be downloaded!
date  2006-07-05 00:00:00  can't be downloaded!
date  2006-07-06 00:00:00  can't be downloaded!
date  2006-07-07 00:00:00  can't be downloaded!
date  2006-07-08 00:00:00  can't be downloaded!
date  2006-07-09 00:00:00  can't be downloaded!
date  2006-07-10 00:00:00  can't be downloaded!
date  2006-07-11 00:00:00  can't be downloaded!
date  2006-07-12 00:00:00  can't be downloaded!
date  2006-07-13 00:00:00  can't be downloaded!
date  2006-07-14 00:00:00  can't be downloaded!
date  2006-07-15 00:00:00  can't be down

date  2006-12-13 00:00:00  can't be downloaded!
date  2006-12-14 00:00:00  can't be downloaded!
date  2006-12-15 00:00:00  can't be downloaded!
date  2006-12-16 00:00:00  can't be downloaded!
date  2006-12-17 00:00:00  can't be downloaded!
date  2006-12-18 00:00:00  can't be downloaded!
date  2006-12-19 00:00:00  can't be downloaded!
date  2006-12-20 00:00:00  can't be downloaded!
date  2006-12-21 00:00:00  can't be downloaded!
date  2006-12-22 00:00:00  can't be downloaded!
date  2006-12-23 00:00:00  can't be downloaded!
date  2006-12-24 00:00:00  can't be downloaded!
date  2006-12-25 00:00:00  can't be downloaded!
date  2006-12-26 00:00:00  can't be downloaded!
date  2006-12-27 00:00:00  can't be downloaded!
date  2006-12-28 00:00:00  can't be downloaded!
date  2006-12-29 00:00:00  can't be downloaded!
date  2006-12-30 00:00:00  can't be downloaded!
date  2006-12-31 00:00:00  can't be downloaded!
date  2007-01-01 00:00:00  can't be downloaded!
date  2007-01-02 00:00:00  can't be down

date  2007-06-02 00:00:00  can't be downloaded!
date  2007-06-03 00:00:00  can't be downloaded!
date  2007-06-04 00:00:00  can't be downloaded!
date  2007-06-05 00:00:00  can't be downloaded!
date  2007-06-06 00:00:00  can't be downloaded!
date  2007-06-07 00:00:00  can't be downloaded!
date  2007-06-08 00:00:00  can't be downloaded!
date  2007-06-09 00:00:00  can't be downloaded!
date  2007-06-10 00:00:00  can't be downloaded!
date  2007-06-11 00:00:00  can't be downloaded!
date  2007-06-12 00:00:00  can't be downloaded!
date  2007-06-13 00:00:00  can't be downloaded!
date  2007-06-14 00:00:00  can't be downloaded!
date  2007-06-15 00:00:00  can't be downloaded!
date  2007-06-16 00:00:00  can't be downloaded!
date  2007-06-17 00:00:00  can't be downloaded!
date  2007-06-18 00:00:00  can't be downloaded!
date  2007-06-19 00:00:00  can't be downloaded!
date  2007-06-20 00:00:00  can't be downloaded!
date  2007-06-21 00:00:00  can't be downloaded!
date  2007-06-22 00:00:00  can't be down

date  2007-11-20 00:00:00  can't be downloaded!
date  2007-11-21 00:00:00  can't be downloaded!
date  2007-11-22 00:00:00  can't be downloaded!
date  2007-11-23 00:00:00  can't be downloaded!
date  2007-11-24 00:00:00  can't be downloaded!
date  2007-11-25 00:00:00  can't be downloaded!
date  2007-11-26 00:00:00  can't be downloaded!
date  2007-11-27 00:00:00  can't be downloaded!
date  2007-11-28 00:00:00  can't be downloaded!
date  2007-11-29 00:00:00  can't be downloaded!
date  2007-11-30 00:00:00  can't be downloaded!
date  2007-12-01 00:00:00  can't be downloaded!
date  2007-12-02 00:00:00  can't be downloaded!
date  2007-12-03 00:00:00  can't be downloaded!
date  2007-12-04 00:00:00  can't be downloaded!
date  2007-12-05 00:00:00  can't be downloaded!
date  2007-12-06 00:00:00  can't be downloaded!
date  2007-12-07 00:00:00  can't be downloaded!
date  2007-12-08 00:00:00  can't be downloaded!
date  2007-12-09 00:00:00  can't be downloaded!
date  2007-12-10 00:00:00  can't be down

date  2008-05-09 00:00:00  can't be downloaded!
date  2008-05-10 00:00:00  can't be downloaded!
date  2008-05-11 00:00:00  can't be downloaded!
date  2008-05-12 00:00:00  can't be downloaded!
date  2008-05-13 00:00:00  can't be downloaded!
date  2008-05-14 00:00:00  can't be downloaded!
date  2008-05-15 00:00:00  can't be downloaded!
date  2008-05-16 00:00:00  can't be downloaded!
date  2008-05-17 00:00:00  can't be downloaded!
date  2008-05-18 00:00:00  can't be downloaded!
date  2008-05-19 00:00:00  can't be downloaded!
date  2008-05-20 00:00:00  can't be downloaded!
date  2008-05-21 00:00:00  can't be downloaded!
date  2008-05-22 00:00:00  can't be downloaded!
date  2008-05-23 00:00:00  can't be downloaded!
date  2008-05-24 00:00:00  can't be downloaded!
date  2008-05-25 00:00:00  can't be downloaded!
date  2008-05-26 00:00:00  can't be downloaded!
date  2008-05-27 00:00:00  can't be downloaded!
date  2008-05-28 00:00:00  can't be downloaded!
date  2008-05-29 00:00:00  can't be down

In [None]:
# Narrower daily window (both ends inclusive) used by the single-station call.
dates = pd.date_range(start='2012-07-03', end='2013-01-01', freq='D')

In [None]:
# Fetch only station 'kalb' for the currently defined date window.
write_daily_weather_data(airport='kalb', dates=dates)

In [85]:

def combine_weather_data(airport):
    '''Concatenate a station's per-day CSV files into one combined CSV.

    Reads every ``*.csv`` file in ``../data/wunderground/<airport>/`` in
    sorted filename order and writes
    ``../data/wunderground/<airport>_all.csv``. The header line is kept
    from the first file only.

    The output file is rewritten from scratch on every call, so running the
    function again does not duplicate rows.

    Args:
        airport: station code naming the input directory and output file.

    Returns:
        None. If the directory holds no CSV files a message is printed and
        no output file is written.

    Raises:
        OSError: if the station directory does not exist or is unreadable.
    '''
    station_dir = "../data/wunderground/" + airport + "/"
    # Sorted so the combined rows follow the filename order instead of the
    # arbitrary order returned by os.listdir.
    csvs = sorted(name for name in os.listdir(station_dir)
                  if name.endswith(".csv"))
    if not csvs:
        print("No daily files found for %s" % airport)
        return

    with open("../data/wunderground/" + airport + "_all.csv", "w") as fout:
        for position, name in enumerate(csvs):
            with open(station_dir + name) as f:
                if position:
                    # Every file after the first: drop its header line.
                    # The default keeps an empty file from raising.
                    next(f, None)
                fout.writelines(f)
    print("Files for %s have been combined" % airport)

In [None]:
# Build one combined CSV for each distinct station.
for station in airports:
    combine_weather_data(station)