In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, get_dummies, concat
import requests
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import r2_score
from ggplot import *
%matplotlib inline 

In [3]:
# predict on year, day of month optionally -- those are available but for some reason we aren't modeling on those - of course, predicting on previous values would also be nice but not sure how to integrate this into forecasting

In [3]:
class forecast(object):    
    
    def __init__(self, forestColumnNames, frst):
        self.forestColumnNames = forestColumnNames
        self.frst = frst
        API_KEY = 'a3d4db71573f30f9'
        url = 'http://api.wunderground.com/api/'+API_KEY+'/hourly10day/q/Beijing/Beijing.json'
        content = requests.get(url).json()
        self.raw = content['hourly_forecast']
        self.df = self.prepForRF()
        
    def day_to_int(self, day_string):
        dayvals = {
            'Sunday': 0,
            'Monday': 1,
            'Tuesday': 2,
            'Wednesday': 3,
            'Thursday': 4, 
            'Friday': 5, 
            'Saturday': 6
        }
        return dayvals.get(day_string, 'nothing')
        
    def extractRow(self, one):   
        date = one['FCTTIME']
        return dict(year=date['year'], month=date['mon'], day=date['mday'], wday=self.day_to_int(date['weekday_name']), hour=date['hour'], pressurei=one['mslp']['english'], icon=one['icon'], dewpti=one['dewpoint']['english'], hum=one['humidity'], temp=one['temp']['metric'], wdird=one['wdir']['degrees'], wdire=one['wdir']['dir'], wspdm=one['wspd']['metric'])
         
    def extractRows(self):
        return DataFrame([self.extractRow(x) for x in self.raw])
    
    # make into binary values categorical vars
    def categorizeVars(self):
        df = self.extractRows()
        return pd.concat([df, get_dummies(df.wdire), get_dummies(df.icon)], axis=1)
    
    def prepForRF(self):
        df = self.categorizeVars()
        # rename certain vars, E to East, W to West, Month to mon, because of inconsistencies in the wunderground API
        df = df.rename(columns={'E':'East', 'W':'West', 'N':'North', 'S':'South', 'month': 'mon', 'temp' : 'tempm'})
        df.columns
        # drop icon, wdire
        # later rebuild the model with day and year
        df = df.drop(['icon', 'wdire', 'wdird', 'day', 'year'], axis=1)
        df = self.imputeVals(df)
        df = self.addPredictions(df)
        return df
    
    def imputeVals(self, df):
        # any missing values in the test frame need to be imputed with all zeroes
        cols_to_add = self.forestColumnNames - df.columns
        for (idx, val) in enumerate(cols_to_add):
            df[val] = np.zeros(df.shape[0])        
        return df
    
    def addPredictions(self, df):
        df['predictions'] = self.frst.predict(df)
        return df
    

In [92]:
class forest():
    
    def __init__(self):
        data = pd.read_csv('data.csv')
        self.X_train, self.X_test, self.y_train, self.y_test = self.prepData(data)
        self.rf = self.buildForest(self.X_train, self.y_train)
        #self.measures = plotMetrics()
        
    def prepData(self, df):
        df = self.catVars(df)
        df = self.removeUnusedVars(df)
        self.df = df
        return self.splitSet(df)
        
    def catVars(self, df):
        wdire_cat = get_dummies(df.wdire)
        icon_cat = get_dummies(df.icon)
        return concat([df, wdire_cat, icon_cat], axis=1)
    
    def removeUnusedVars(self, df):
        return df.drop(['X', 'conds', 'datetime', 'icon', 'visi', 'wdird', 'wdire', 'Unnamed: 0'], axis=1)
    
    def splitSet(self, df):
        # create training (80%) and test (20%) sets
        X_train, X_test, y_train, y_test = train_test_split(df[df.columns.difference(['Value'])], df.Value, test_size=0.2, random_state=42)
        return X_train, X_test, y_train, y_test
    
    def buildForest(self, X_train, y_train):
        rf = RandomForestRegressor(n_estimators=50, verbose=1)
        rf.fit_transform(X_train, y_train)
        return rf
    
    def metrics(self, y_test, X_test, rf):
        self.r2 = r2_score(y_test, rf.predict(X_test))
        self.mse = np.mean((y_test - rf.predict(X_test))**2)
        self.rmse = np.sqrt(mse)
    
    def plotMetrics(self):
        measures = DataFrame({"feature_importances_": self.rf.feature_importances_, "names" : X_train.columns})
        ggplot(measures, aes(x='names', y='feature_importances_')) + geom_bar(stat='bar') + theme(axis_text_x = element_text(angle = 90, hjust = 1))
        
        

In [93]:
frT = forest()

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:   26.0s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   26.0s finished


In [20]:
frcast = forecast(fr.X_train.columns, fr.rf)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 200 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 450 jobs       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished


In [31]:
ggplot(measures, aes(x='names', y='feature_importances_')) + geom_bar(stat='bar') + theme(axis_text_x = element_text(angle = 90, hjust = 1))

NameError: name 'measures' is not defined

In [95]:
import cPickle

with open('rforestModel.cpickle', 'wb') as f:
    cPickle.dump(frT.rf, f)

In [94]:
frT.rf.score(frT.X_test, frT.y_test)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.2s finished


0.71631081170597599

In [55]:
measures = concat([DataFrame({"name":fr.X_test.columns}), DataFrame({"importance":fr.rf.feature_importances_})], axis=1)

In [60]:
measures.sort("importance", ascending=False)

Unnamed: 0,name,importance
17,clear,0.317509
23,hum,0.082328
21,hazy,0.079311
27,pressurei,0.078157
19,dewpti,0.077845
22,hour,0.060354
35,wspdm,0.05787
31,tempm,0.053088
24,mon,0.051246
34,wday,0.048758


In [68]:
frT.rf.score(frT.X_test, frT.y_test)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 200 jobs       | elapsed:    1.1s
[Parallel(n_jobs=1)]: Done 450 jobs       | elapsed:    2.4s
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    2.6s finished


0.72274540476990845

In [76]:
np.mean(fr.rf.feature_importances_)

0.027777777777777773

In [78]:
frT.rf.feature_importances_

array([  2.94917129e-03,   6.28787879e-03,   5.06512666e-03,
         2.29022786e-03,   4.30306659e-03,   3.57140091e-03,
         1.80284268e-03,   7.27357838e-03,   6.82429414e-03,
         5.83397796e-03,   3.98994528e-03,   1.61447589e-03,
         6.11637599e-03,   5.29339914e-03,   1.46440063e-03,
         5.96633894e-04,   1.19962871e-03,   3.17849175e-01,
         4.18897450e-03,   7.81169674e-02,   4.14882214e-03,
         7.89791295e-02,   6.04215625e-02,   8.12641938e-02,
         5.12873060e-02,   1.18620679e-03,   2.05567424e-03,
         7.80871288e-02,   8.24824708e-03,   8.89212949e-06,
         8.19644046e-04,   5.30013065e-02,   1.89028393e-03,
         4.40651675e-03,   4.86040254e-02,   5.89595185e-02])

In [81]:
fr.rf.transform(fr.X_train).shape

(44999, 10)

In [82]:
fr.X_train.shape

(44999, 36)