https://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID=KPACONNE5&graphspan=month&month=02&day=1&year=2018&format=1

In [1]:
from calendar import Calendar
import pandas as pd
import time

In [2]:
station = 'KPACONNE5'
month = 2
year = 2018

In [3]:
def get_daily_in_month(station, month, year, only_ppt=True):
    url = 'https://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID={0}&graphspan=month&month={1}&day=1&year={2}&format=1'.format(station, month, year)
    data = pd.read_csv(url, na_values='<br>', parse_dates=['Date'])
    data.columns = map(lambda x: x.replace('<br>', ''), data.columns)
    if only_ppt:
        data = data[['Date', 'PrecipitationSumIn']]
    return data.dropna().set_index('Date')

In [6]:
dataFeb2018 = get_daily_in_month(station, month, year)

In [7]:
dataFeb2018.head()

Unnamed: 0_level_0,PrecipitationSumIn
Date,Unnamed: 1_level_1
2018-02-01,0.4
2018-02-02,0.01
2018-02-03,0.0
2018-02-04,0.54
2018-02-05,0.01


In [8]:
def get_daily_in_year(station, year, only_ppt=True):
    url = 'https://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID={0}&graphspan=year&month=1&day=1&year={1}&format=1'.format(station, year)
    data = pd.read_csv(url, na_values='<br>', parse_dates=['Date'])
    data.columns = map(lambda x: x.replace('<br>', ''), data.columns)
    if only_ppt:
        data = data[['Date', 'PrecipitationSumIn']]
    return data.dropna().set_index('Date')

In [40]:
def get_daily_in_years(station, years, only_ppt=True, limit_calls=True):
    if limit_calls and len(years) > 500:
        raise Exception('Let us keep to a maximum of 500 requests per day')
        
    t = time.time()
    for i, year in enumerate(sorted(years, reverse=True)):
        year_data = get_daily_in_year(station, year, only_ppt)
        dt = time.time() - t
        if limit_calls and not (i%10) and dt < 60:
            time.sleep(60 - dt)
        if dt > 60:
            t = time.time()
            
        if not year_data.shape[0]:
            # no more data - assuming that if in one year there were no data,
            # then there were no data in the previous years as well.
            break
            
        if i == 0:
            data = year_data.copy()
        else:
            data = data.append(year_data)
            
    return data[~data.index.duplicated(keep='first')]

In [10]:
west_conn = get_daily_in_years('KPACONNE5', range(2016, 2019))

In [16]:
west_conn.shape

(454, 1)

In [17]:
west_conn.to_csv('west_connellsville_2016-2018.csv')

In [18]:
south_conn = get_daily_in_years('KPACONNE6', range(2016, 2019))
south_conn.shape

(437, 1)

In [20]:
south_conn.to_csv('south_connellsville_2016-2018.csv')

To get 5-minute data we need to request a day at a time

https://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID=KPACONNE6&day=24&year=2018&month=2&graphspan=day&format=1

In [83]:
def get_5min_daily(station, day, month, year, only_ppt=True):
    url = ('https://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID={0}&'
           'graphspan=day&month={1}&day={2}&year={3}&format=1'.format(station, month, day, year))
    data = pd.read_csv(url, index_col=False, na_values='<br>', parse_dates=['Time'], infer_datetime_format=True)
    data.columns = map(lambda x: x.replace('<br>', ''), data.columns)
    if only_ppt:
        data = data[['Time', 'HourlyPrecipIn', 'dailyrainin']]
    return data.dropna().set_index('Time')

In [84]:
t = get_5min_daily('KPACONNE6', 24, 2, 2018)

In [85]:
t.head()

Unnamed: 0_level_0,HourlyPrecipIn,dailyrainin
Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-02-24 00:05:00,0.0,0.0
2018-02-24 00:10:00,0.0,0.0
2018-02-24 00:15:00,0.0,0.0
2018-02-24 00:21:00,0.0,0.0
2018-02-24 00:26:00,0.0,0.0


In [None]:
def get_5min_daily_in_year(station, year, only_ppt=True, limit_calls=True):
    cal = Calendar()
    t = time.time()
    i = 0
    for month in xrange(1, 13):
        days = [day for day in cal.itermonthdays(2018, 2) if day]
        for day in days:
            daily_data = get_5min_daily(station, day, month, year, only_ppt)
            dt = time.time() - t
            if limit_calls and not (i%10) and dt < 60:
                time.sleep(60 - dt)
            if dt > 60:
                t = time.time()

            if i == 0:
                data = daily_data.copy()
            else:
                data = data.append(daily_data)
                
            i += 1
            
    return data[~data.index.duplicated(keep='first')]

In [91]:
wc_5min_2018 = get_5min_daily_in_year('KPACONNE5', 2018)

In [93]:
wc_5min_2018.shape[0] / (24 * 60 / 5)

47

In [94]:
wc_5min_2018.head()

Unnamed: 0_level_0,HourlyPrecipIn,dailyrainin
Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:03:00,0.0,0.0
2018-01-01 00:08:00,0.0,0.0
2018-01-01 00:13:00,0.0,0.0
2018-01-01 00:19:00,0.0,0.0
2018-01-01 00:24:00,0.0,0.0
