This python script trains a random forest model to predict if it rains.

In [9]:
import pandas
import numpy as np
from sklearn.cross_validation import cross_val_score,KFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import f1_score

In [10]:
raw_dataframe=pandas.read_csv("./data/improved-dataset/turnstile_weather_v2.csv")
dataframe=raw_dataframe

In [11]:
print(raw_dataframe.shape[0])
print(raw_dataframe.columns)

42649
Index(['UNIT', 'DATEn', 'TIMEn', 'ENTRIESn', 'EXITSn', 'ENTRIESn_hourly',
       'EXITSn_hourly', 'datetime', 'hour', 'day_week', 'weekday', 'station',
       'latitude', 'longitude', 'conds', 'fog', 'precipi', 'pressurei', 'rain',
       'tempi', 'wspdi', 'meanprecipi', 'meanpressurei', 'meantempi',
       'meanwspdi', 'weather_lat', 'weather_lon'],
      dtype='object')


### Prepare the features for training

In [12]:
selected_features=['tempi','pressurei','wspdi','precipi','fog','weekday']
features_to_dummy=['hour']
features = dataframe[[f for f in selected_features if f not in features_to_dummy]]

total_dummy_feature_num=0
for fd in features_to_dummy:

    # Add UNIT to features using dummy variables
    dummy_units = pandas.get_dummies(dataframe[fd], prefix=fd)
    total_dummy_feature_num+=dummy_units.shape[1]
    features = features.join(dummy_units)

values = dataframe['rain']
features_array=np.array(features)
values_array=np.array(values)

### Run Random Forest classification

In [13]:
rf=RandomForestClassifier()
rf.fit(features_array,values_array)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

### Print the importances of each trained features:

In [14]:
for i,r in enumerate(rf.feature_importances_):
    print(features.columns[i]+":"+str(r))

tempi:0.247063630483
pressurei:0.326992317629
wspdi:0.141390646133
precipi:0.203805064386
fog:0.00554135423726
weekday:0.0261809626921
hour_0:0.00751892092735
hour_4:0.00932259867797
hour_8:0.00704045195043
hour_12:0.00517219750272
hour_16:0.0134191915695
hour_20:0.00655266381197


### Run cross validation, use f1 score as metric

In [15]:
cs=cross_val_score(RandomForestClassifier(), features_array,values_array,cv=5,scoring='f1')
print(cs)

[ 0.99869758  1.          1.          0.99044175  0.98636481]


#### averaged f1 scores of the five fold classification:

In [16]:
np.mean(cs)

0.99510082591514382