In [1]:
# %load weather_data.py
#! /usr/bin/env python3
from datetime import datetime, timedelta
import time  
from collections import namedtuple  
import pandas as pd
import numpy as np
import requests  
import os

from utils import my_logger

# Module-level logger created by the project-local my_logger helper at DEBUG
# level; WeatherData.collect_data writes its progress messages to it.
logger = my_logger(__name__, level="DEBUG")
# Directory that holds the CSV files read and written by WeatherData.
try:
    # __file__ exists when this code runs as a script or an imported module.
    root_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # Interactive sessions (e.g. a notebook kernel after %load) do not define
    # __file__. Only that case is handled; any other error should surface.
    # NOTE(review): machine-specific absolute path -- consider os.getcwd() or
    # an environment variable so the notebook runs on other machines.
    root_path = '/Users/lichong/Sources/RPi_weather'


class WeatherData(object):
    """
    Fetch historical weather observations and preprocess them into
    training and test datasets.

    The processing steps are ``collect_data`` (download raw observations),
    ``clean_data``, ``add_features`` (optional lag features) and
    ``split_dataset``.  Each step reads ``self.df`` and replaces it with its
    result; ``get_dataset`` runs clean + split on a freshly loaded copy.
    """
    # Weather Underground history API URL template.
    # API_KEY: the key Weather Underground provides with your account
    # YYYYMMDD: a string representing the target date of the request
    # STATE: the state / region part of the location query
    # CITY: the name of the city associated with that state / region
    BASE_URL = 'http://api.wunderground.com/api/{API_KEY}/history_{YYYYMMDD}/q/{STATE}/{CITY}.json'
    # NOTE(security): a credential is committed in source. The environment
    # variable takes precedence; the literal is kept only so existing setups
    # keep working and should be rotated/removed.
    API_KEY = os.environ.get('WUNDERGROUND_API_KEY', 'dcfae7a1de117e98')
    STATE = '上海市'
    CITY = 'Guangfulin'
    # BASE_URL with everything but the date filled in; derived from the
    # constants above so the two can never drift apart.
    URL = BASE_URL.format(
        API_KEY=API_KEY, YYYYMMDD='{YYYYMMDD}', STATE=STATE, CITY=CITY)
    # Columns of the raw data set, in CSV order ('datetime' becomes the index).
    FEATURES = ['datetime', 'tempm', 'hum', 'pressurem', 'conds']
    DailyObservation = namedtuple("DailyObservation", FEATURES)
    # A condition is labelled hazardous when it contains any of these words.
    HAZARDOUS_WEATHER = ['Rain', 'Snow', 'Mist', 'Hail', 'Ice', 'Thunderstorm', 'Freezing']

    def __init__(self, data='data.csv'):
        """
        Load the raw observations from a CSV file.

        Args:
            data: CSV file name, resolved relative to the module-level
                ``root_path`` (an absolute path is used as is).  The cleaned
                and final data sets are written next to it as
                ``<name>_cleaned.csv`` and ``<name>_dataset.csv``.

        Raises:
            Whatever ``pandas.read_csv`` raises when the file cannot be read.
        """
        suffix = '.csv'
        # Strip only a trailing '.csv'; str.replace would also remove the
        # substring from the middle of a file or directory name.
        stem = data[:-len(suffix)] if data.endswith(suffix) else data
        self.data = os.path.join(root_path, data)
        self.data_cleaned = os.path.join(root_path, stem + '_cleaned.csv')
        self.dataset = os.path.join(root_path, stem + '_dataset.csv')
        # First CSV column (datetime) becomes a parsed DatetimeIndex.
        self.df = pd.read_csv(self.data, index_col=0, parse_dates=True)
        # Model input columns; add_features() appends lag columns to this list.
        self.features = ['tempm', 'hum', 'pressurem']
        # Name of the boolean label column created by split_dataset().
        self.result = 'hazardous'

    @staticmethod
    def _parse_observations(payload):
        """Convert one decoded history response into DailyObservation records."""
        parsed = []
        for obv in payload['history']['observations']:
            stamp = obv['date']
            # The date parts are concatenated and parsed as one timestamp.
            text = ''.join([stamp['year'], stamp['mon'], stamp['mday'],
                            stamp['hour'], stamp['min']])
            parsed.append(WeatherData.DailyObservation(
                datetime=datetime.strptime(text, '%Y%m%d%H%M'),
                tempm=obv['tempm'],
                hum=obv['hum'],
                pressurem=obv['pressurem'],
                conds=obv['conds']))
        return parsed

    @staticmethod
    def _is_hazardous(conds):
        """Return a boolean Series: True where a condition names hazardous weather."""
        pattern = '|'.join(WeatherData.HAZARDOUS_WEATHER)
        # case=False: raw data is capitalised ('Mist'), cleaned data is
        # lower-cased ('mist'); both must match.  na=False: a missing
        # condition is labelled False instead of producing a NaN label.
        return conds.str.contains(pattern, case=False, na=False)

    def collect_data(self, end_date, days, pause=6, timeout=30):
        """
        Download ``days`` days of observations, walking backwards from
        ``end_date``, store them in ``self.data`` and ``self.df``.

        Args:
            end_date: date/datetime of the most recent day to fetch.
            days: number of consecutive days to request.
            pause: seconds to sleep after each request (default 6, the
                original fixed delay).
            timeout: per-request timeout in seconds passed to requests.get,
                so a stalled connection cannot hang the run forever.

        Returns:
            The list of DailyObservation records that were fetched.  When
            nothing was fetched the existing CSV and ``self.df`` are left
            untouched.
        """
        records = []
        day = end_date
        for _ in range(days):
            day_str = day.strftime('%Y%m%d')
            response = requests.get(
                WeatherData.URL.format(YYYYMMDD=day_str), timeout=timeout)
            if response.status_code == 200:
                logger.debug('response success!')
                records.extend(self._parse_observations(response.json()))
            else:
                # Log the date only: the URL embeds the API key.
                logger.warning('request for %s failed with HTTP status %s',
                               day_str, response.status_code)
            time.sleep(pause)
            day += timedelta(days=-1)

        if not records:
            # Do not overwrite previously collected data with an empty file.
            logger.warning('no observations fetched; %s left unchanged', self.data)
            return records

        df = pd.DataFrame(records, columns=WeatherData.FEATURES).set_index('datetime')
        df.to_csv(self.data)
        self.df = df
        return records

    def clean_data(self):
        """
        Remove unusable rows from ``self.df``.

        Feature columns are coerced to numbers (anything non-numeric becomes
        NaN), the sentinel values -9999 and -999 become NaN, ``conds`` is
        stripped and lower-cased with 'unknown' treated as missing, every row
        containing a NaN is dropped and the frame is sorted by its index.
        The result is written to ``self.data_cleaned``, stored in ``self.df``
        and returned.
        """
        df = self.df.copy()

        # Numeric features: coerce, then blank out the sentinel values.
        numeric = df[self.features].apply(pd.to_numeric, errors='coerce')
        df[self.features] = numeric.replace([-9999, -999], np.nan)

        # Normalise the condition text and drop the 'unknown' placeholder.
        df['conds'] = df['conds'].str.strip().str.lower().replace('unknown', np.nan)

        # Drop incomplete rows, then order chronologically (index is datetime).
        df = df.dropna().sort_index()

        df.to_csv(self.data_cleaned)
        self.df = df
        return df

    def add_features(self, priors=2, features=None):
        """
        Add the values of the ``priors`` preceding samples as extra features.

        For each base feature ``f`` and each lag ``N`` in 1..priors a column
        ``f_N`` holding the value of ``f`` N rows earlier is added and its
        name is appended to ``self.features``.  Rows without a full history
        (the first ``priors`` rows) are dropped.

        Args:
            priors: number of preceding samples to add.
            features: base columns to lag; defaults to the current
                ``self.features``.

        Returns:
            The new frame (also stored in ``self.df``), with columns ordered
            as ``self.features + ['conds']``.
        """
        df = self.df.copy()
        # Snapshot the base columns: self.features grows inside the loop, and
        # a caller may pass self.features itself.
        base_features = list(features) if features else list(self.features)
        for lag in range(1, priors + 1):
            for feature in base_features:
                col_name = "{}_{}".format(feature, lag)
                # Positional shift: row i receives the value of row i - lag.
                df[col_name] = df[feature].shift(lag)
                if col_name not in self.features:
                    self.features.append(col_name)
        # Rearrange the column order and drop rows lacking a full history.
        df = df[self.features + ['conds']].dropna()
        self.df = df
        return df

    def split_dataset(self):
        """
        Label every row and split the data into training and test sets.

        Adds the boolean column ``self.result`` ('hazardous'), stores
        ``self.X`` / ``self.y`` and the four split parts on the instance,
        writes the labelled frame to ``self.dataset`` and returns
        ``(X_train, X_test, y_train, y_test)`` (80/20 split, fixed seed).
        """
        # Imported lazily so the rest of the class works without scikit-learn.
        from sklearn.model_selection import train_test_split
        df = self.df.copy()
        df[self.result] = self._is_hazardous(df['conds'])
        self.X = df[self.features]
        self.y = df[self.result]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=12)
        df.to_csv(self.dataset)
        self.df = df
        return self.X_train, self.X_test, self.y_train, self.y_test

    def get_dataset(self):
        """
        Reload this instance's raw CSV, clean it and return the split
        ``(X_train, X_test, y_train, y_test)``.

        A fresh WeatherData is used so this instance's ``df`` is not
        modified.  Lag features (add_features) are not applied here.
        """
        # self.data is already joined with root_path; os.path.join keeps an
        # absolute second argument unchanged, so the same file is loaded
        # instead of always falling back to the default 'data.csv'.
        wd = WeatherData(self.data)
        wd.clean_data()
        return wd.split_dataset()





In [2]:
wd = WeatherData()

In [3]:
whos

Variable      Type           Data/Info
--------------------------------------
WeatherData   type           <class '__main__.WeatherData'>
datetime      type           <class 'datetime.datetime'>
logger        Logger         <Logger __main__ (DEBUG)>
my_logger     function       <function my_logger at 0x1065fda60>
namedtuple    function       <function namedtuple at 0x104117510>
np            module         <module 'numpy' from '/Li<...>kages/numpy/__init__.py'>
os            module         <module 'os' from '/Libra<...>3.6/lib/python3.6/os.py'>
requests      module         <module 'requests' from '<...>es/requests/__init__.py'>
root_path     str            /Users/lichong/Sources/RPi_weather
time          module         <module 'time' (built-in)>
timedelta     type           <class 'datetime.timedelta'>
wd            WeatherData    <__main__.WeatherData object at 0x110963390>


In [4]:
who

WeatherData	 datetime	 logger	 my_logger	 namedtuple	 np	 os	 requests	 root_path	 
time	 timedelta	 wd	 


In [5]:
wd

<__main__.WeatherData at 0x110963390>

In [6]:
wd.data

'/Users/lichong/Sources/RPi_weather/data.csv'

In [7]:
wd.df

Unnamed: 0_level_0,tempm,hum,pressurem,conds
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-12-31 00:00:00,7.0,81.0,1028,Overcast
2017-12-31 00:30:00,7.0,76.0,1029,Partly Cloudy
2017-12-31 01:00:00,6.0,81.0,1028,Partly Cloudy
2017-12-31 01:30:00,6.0,81.0,1028,Partly Cloudy
2017-12-31 02:00:00,6.0,81.0,1029,Mostly Cloudy
2017-12-31 02:30:00,5.0,81.0,1029,Mostly Cloudy
2017-12-31 03:00:00,5.0,87.0,1029,Mostly Cloudy
2017-12-31 03:30:00,5.0,87.0,1028,Mostly Cloudy
2017-12-31 04:30:00,5.0,81.0,1028,Mostly Cloudy
2017-12-31 05:00:00,5.0,81.0,1028,Mostly Cloudy


In [8]:
type(wd.df)

pandas.core.frame.DataFrame

In [9]:
wd.df.head(10)

Unnamed: 0_level_0,tempm,hum,pressurem,conds
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-12-31 00:00:00,7.0,81.0,1028,Overcast
2017-12-31 00:30:00,7.0,76.0,1029,Partly Cloudy
2017-12-31 01:00:00,6.0,81.0,1028,Partly Cloudy
2017-12-31 01:30:00,6.0,81.0,1028,Partly Cloudy
2017-12-31 02:00:00,6.0,81.0,1029,Mostly Cloudy
2017-12-31 02:30:00,5.0,81.0,1029,Mostly Cloudy
2017-12-31 03:00:00,5.0,87.0,1029,Mostly Cloudy
2017-12-31 03:30:00,5.0,87.0,1028,Mostly Cloudy
2017-12-31 04:30:00,5.0,81.0,1028,Mostly Cloudy
2017-12-31 05:00:00,5.0,81.0,1028,Mostly Cloudy


In [14]:
pd.get_option('max_rows')

10

In [13]:
pd.set_option('max_rows', 10)

In [15]:
df = wd.df.sort_index()


In [16]:
df

Unnamed: 0_level_0,tempm,hum,pressurem,conds
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01 00:00:00,2.0,35.0,1033,Clear
2015-01-01 00:30:00,1.0,38.0,1033,Clear
2015-01-01 01:00:00,1.0,41.0,1033,Clear
2015-01-01 01:30:00,1.0,44.0,1033,Clear
2015-01-01 02:00:00,1.0,41.0,1033,Clear
...,...,...,...,...
2017-12-31 21:30:00,6.0,81.0,1028,Mist
2017-12-31 22:00:00,6.0,76.0,1028,Mist
2017-12-31 22:30:00,6.0,81.0,1028,Mist
2017-12-31 23:00:00,6.0,81.0,1028,Mist


In [17]:
df.describe()


Unnamed: 0,tempm,hum,pressurem
count,52243.0,52241.0,52243.0
mean,17.706353,71.433606,1016.158241
std,62.630583,17.787898,9.140462
min,-9999.0,12.0,986.0
25%,11.0,59.0,1008.0
50%,19.0,74.0,1016.0
75%,25.0,87.0,1023.0
max,42.0,100.0,1042.0


In [18]:
df.conds.describe()


count     52243
unique       23
top       Clear
freq      23255
Name: conds, dtype: object

In [19]:
df.dtypes

tempm        float64
hum          float64
pressurem      int64
conds         object
dtype: object

In [20]:
wd.clean_data()

Unnamed: 0_level_0,tempm,hum,pressurem,conds
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01 00:00:00,2.0,35.0,1033,clear
2015-01-01 00:30:00,1.0,38.0,1033,clear
2015-01-01 01:00:00,1.0,41.0,1033,clear
2015-01-01 01:30:00,1.0,44.0,1033,clear
2015-01-01 02:00:00,1.0,41.0,1033,clear
...,...,...,...,...
2017-12-31 21:30:00,6.0,81.0,1028,mist
2017-12-31 22:00:00,6.0,76.0,1028,mist
2017-12-31 22:30:00,6.0,81.0,1028,mist
2017-12-31 23:00:00,6.0,81.0,1028,mist


In [21]:
wd.df.describe()

Unnamed: 0,tempm,hum,pressurem
count,49331.0,49331.0,49331.0
mean,18.184894,71.483976,1016.106809
std,9.046758,17.858663,9.159749
min,-7.0,12.0,986.0
25%,11.0,60.0,1008.0
50%,19.0,74.0,1016.0
75%,25.0,87.0,1023.0
max,42.0,100.0,1042.0
