In [299]:
import numpy as np
import pandas as pd
from pandas import DataFrame, get_dummies, concat
import requests
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from ggplot import *
%matplotlib inline 

In [None]:
# TODO: optionally add year and day of month as predictors -- both are available from the API, but the model does not currently use them (reason unknown). Previous (lagged) values would also make good predictors, but it is not yet clear how to integrate them into forecasting.

In [None]:
"FCTTIME": {
		"hour": "2","hour_padded": "02","min": "00","min_unpadded": "0","sec": "0","year": "2015","mon": "10","mon_padded": "10","mon_abbrev": "Oct","mday": "3","mday_padded": "03","yday": "275","isdst": "0","epoch": "1443808800","pretty": "2:00 AM CST on October 03, 2015","civil": "2:00 AM","month_name": "October","month_name_abbrev": "Oct","weekday_name": "Saturday","weekday_name_night": "Saturday Night","weekday_name_abbrev": "Sat","weekday_name_unlang": "Saturday","weekday_name_night_unlang": "Saturday Night","ampm": "AM","tz": "","age": "","UTCDATE": ""
		},
		"temp": {"english": "56", "metric": "13"},
		"dewpoint": {"english": "43", "metric": "6"},
		"condition": "Clear",
		"icon": "clear",
		"icon_url":"http://icons.wxug.com/i/c/k/nt_clear.gif",
		"fctcode": "1",
		"sky": "0",
		"wspd": {"english": "6", "metric": "10"},
		"wdir": {"dir": "N", "degrees": "359"},
		"wx": "Clear",
		"uvi": "0",
		"humidity": "61",

In [406]:
class forecast(object):
    """Download the Weather Underground hourly 10-day forecast for Beijing
    and score every forecast hour with an already-fitted model.

    Parameters
    ----------
    forestColumnNames : iterable of str
        Feature column names the model was fitted on, in fitting order
        (e.g. ``X_train.columns``).
    frst : object
        Fitted estimator exposing ``predict(X)``.
    api_key : str, optional
        Weather Underground API key. When omitted it is read from the
        ``WUNDERGROUND_API_KEY`` environment variable, falling back to the
        empty string.

    Attributes
    ----------
    raw : list of dict
        The ``hourly_forecast`` entries of the API response.
    df : DataFrame
        Prepared feature frame plus a ``predictions`` column.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    KeyError
        If the response carries no ``hourly_forecast`` entry.
    """

    def __init__(self, forestColumnNames, frst, api_key=None):
        import os  # local import: only needed for the environment lookup

        self.forestColumnNames = forestColumnNames
        self.frst = frst
        if api_key is None:
            # Keep the credential out of the notebook source.
            api_key = os.environ.get('WUNDERGROUND_API_KEY', '')
        url = 'http://api.wunderground.com/api/' + api_key + '/hourly10day/q/Beijing/Beijing.json'
        # A timeout stops a stalled connection from hanging the kernel forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        self.raw = response.json()['hourly_forecast']
        self.df = self.prepForRF()

    def day_to_int(self, day_string):
        """Map an English weekday name to 0 (Sunday) .. 6 (Saturday).

        Unrecognised names yield the string ``'nothing'``.
        """
        dayvals = {
            'Sunday': 0,
            'Monday': 1,
            'Tuesday': 2,
            'Wednesday': 3,
            'Thursday': 4,
            'Friday': 5,
            'Saturday': 6
        }
        return dayvals.get(day_string, 'nothing')

    def extractRow(self, one):
        """Flatten one ``hourly_forecast`` entry into a dict of raw features.

        Values are copied as delivered by the API, except ``wday`` which is
        converted with :meth:`day_to_int`. Pressure and dew point use the
        ``english`` readings; temperature and wind speed use ``metric``.
        """
        date = one['FCTTIME']
        return dict(
            year=date['year'],
            month=date['mon'],
            day=date['mday'],
            wday=self.day_to_int(date['weekday_name']),
            hour=date['hour'],
            pressurei=one['mslp']['english'],
            icon=one['icon'],
            dewpti=one['dewpoint']['english'],
            hum=one['humidity'],
            temp=one['temp']['metric'],
            wdird=one['wdir']['degrees'],
            wdire=one['wdir']['dir'],
            wspdm=one['wspd']['metric'],
        )

    def extractRows(self):
        """Return a DataFrame with one row per entry of ``self.raw``."""
        return DataFrame([self.extractRow(x) for x in self.raw])

    def categorizeVars(self):
        """Append one-hot (dummy) columns for ``wdire`` and ``icon``."""
        df = self.extractRows()
        # Use the bare ``concat`` imported from pandas, as forest.catVars does.
        return concat([df, get_dummies(df.wdire), get_dummies(df.icon)], axis=1)

    def prepForRF(self):
        """Build the feature frame and attach the model's predictions."""
        df = self.categorizeVars()
        # Rename columns whose API names differ from the names expected
        # downstream (original note: inconsistencies in the wunderground API).
        df = df.rename(columns={'E': 'East', 'W': 'West', 'N': 'North', 'S': 'South',
                                'month': 'mon', 'temp': 'tempm'})
        # Drop the raw categoricals (now dummy-encoded) and unused fields.
        # Later: rebuild the model with day and year.
        df = df.drop(['icon', 'wdire', 'wdird', 'day', 'year'], axis=1)
        df = self.imputeVals(df)
        df = self.addPredictions(df)
        return df

    def imputeVals(self, df):
        """Add every model column absent from ``df`` as an all-zero column.

        ``df`` is modified in place and also returned.
        """
        # Explicit membership test instead of ``Index - Index``: the minus
        # operator no longer means set difference on pandas indexes.
        missing = [name for name in self.forestColumnNames if name not in df.columns]
        for name in missing:
            df[name] = 0.0
        return df

    def addPredictions(self, df):
        """Predict for every row of ``df`` and store it as ``predictions``.

        Only the model's own columns are passed to ``predict``, in the order
        given by ``forestColumnNames``, so extra dummy columns and a different
        column order in ``df`` cannot misalign the features.

        Raises
        ------
        KeyError
            If ``df`` lacks one of the model columns (run ``imputeVals`` first).
        """
        features = df[list(self.forestColumnNames)]
        df['predictions'] = self.frst.predict(features)
        return df
    

In [407]:
# Build the forecast frame; constructing `forecast` performs an HTTP request.
# NOTE(review): `X_train` and `rf` are not defined as globals in any visible
# cell -- the `forest` class stores them as attributes (`fr.X_train`, `fr.rf`).
# Presumably this ran against leftover kernel state; confirm, and make sure
# this cell runs after the model has been built.
frcast = forecast(X_train.columns, rf)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 200 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 450 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished


In [415]:
class forest(object):
    """Fit a random-forest regressor for the ``Value`` column of a CSV file.

    On construction the file is read, the ``wdire`` and ``icon`` columns are
    one-hot encoded, unused columns are dropped, the frame is split 80/20
    into train and test sets, and the forest is fitted on the training part.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to ``'data.csv'``.

    Attributes
    ----------
    df : DataFrame
        The prepared frame before splitting.
    X_train, X_test, y_train, y_test
        The train/test split.
    rf : RandomForestRegressor
        The fitted model.
    r2, mse, rmse : float
        Set by :meth:`metrics`.
    """

    def __init__(self, path='data.csv'):
        data = pd.read_csv(path)
        self.X_train, self.X_test, self.y_train, self.y_test = self.prepData(data)
        self.rf = self.buildForest(self.X_train, self.y_train)

    def prepData(self, df):
        """Encode, prune and split ``df``; returns the four split parts."""
        df = self.catVars(df)
        df = self.removeUnusedVars(df)
        self.df = df
        return self.splitSet(df)

    def catVars(self, df):
        """Append one-hot (dummy) columns for ``wdire`` and ``icon``."""
        wdire_cat = get_dummies(df.wdire)
        icon_cat = get_dummies(df.icon)
        return concat([df, wdire_cat, icon_cat], axis=1)

    def removeUnusedVars(self, df):
        """Drop columns that are not used as features.

        The raw ``wdire`` and ``icon`` columns go too; ``catVars`` has
        already dummy-encoded them.
        """
        return df.drop(['X', 'conds', 'datetime', 'icon', 'visi', 'wdird', 'wdire', 'Unnamed: 0'], axis=1)

    def splitSet(self, df):
        """Split into training (80%) and test (20%) sets with a fixed seed.

        Features are every column except ``Value``; the target is ``Value``.
        """
        features = df[df.columns.difference(['Value'])]
        return train_test_split(features, df.Value, test_size=0.2, random_state=42)

    def buildForest(self, X_train, y_train, n_estimators=500, random_state=42):
        """Fit and return a ``RandomForestRegressor``.

        ``random_state`` is fixed by default so a re-run reproduces the same
        model, matching the seeded train/test split.
        """
        rf = RandomForestRegressor(n_estimators=n_estimators, verbose=1,
                                   random_state=random_state)
        rf.fit(X_train, y_train)
        return rf

    def metrics(self, y_test, X_test, rf):
        """Store R^2, MSE and RMSE of ``rf`` on the given test set.

        Results are written to ``self.r2``, ``self.mse`` and ``self.rmse``.
        """
        # Predict once and reuse the result for every metric.
        predictions = rf.predict(X_test)
        self.r2 = r2_score(y_test, predictions)
        self.mse = np.mean((y_test - predictions) ** 2)
        # RMSE derives from the attribute just stored; there is no local `mse`.
        self.rmse = np.sqrt(self.mse)

    def plotMetrics(self, rf):
        """Return a ggplot bar chart of ``rf``'s feature importances.

        Feature names come from ``self.X_train``. The plot object is returned
        so that a notebook cell ending in this call renders it.
        """
        measures = DataFrame({"feature_importances_": rf.feature_importances_,
                              "names": self.X_train.columns})
        return (ggplot(measures, aes(x='names', y='feature_importances_'))
                + geom_bar(stat='bar')
                + theme(axis_text_x=element_text(angle=90, hjust=1)))
        

In [416]:
# Build the model: the constructor reads 'data.csv' and fits a 500-tree random
# forest (the recorded run below took about 3.6 minutes).
fr = forest()

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:   23.0s
[Parallel(n_jobs=1)]: Done 200 jobs       | elapsed:  1.5min
[Parallel(n_jobs=1)]: Done 450 jobs       | elapsed:  3.2min
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  3.6min finished
