In [None]:
import pandas as pd
import matplotlib
%matplotlib inline 
import numpy as np
# Load the raw Central Park CSV; the first column (index 0) is parsed as dates
nyc = pd.read_csv(
    'data/central-park-raw.csv',
    parse_dates=[0],
)

In [None]:
### Data transformation from previous notebooks
# Normalise the headers in one pass: trim surrounding whitespace, then swap
# inner spaces for underscores so columns can be reached as attributes.
nyc.columns = [x.strip().replace(' ', '_') for x in nyc.columns]
# PrecipitationIn mixes numeric strings with the marker "T"; substitute a small
# positive value for it so the whole column can be converted to numbers.
# (The earlier stand-alone replace() call discarded its result and was removed.)
nyc['PrecipitationIn'] = pd.to_numeric(nyc.PrecipitationIn.replace("T", '0.001'))
# Missing Events become the empty string so the column holds only text.
nyc['Events'] = nyc.Events.fillna('')

# Machine Learning

Pandas gives us easy integration with the sklearn library. Let's see if we can 
predict humidity (``y``) from the other columns (``X``).

We will train a Random Forest with a sample of our data, then test it with another sample to see how it performs.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [None]:
# Display the cleaned column names to see which features are available
nyc.columns

In [None]:
# Shift Humidity UP one row so each day lines up with the next day's value.
# shift(-1) puts row i + 1's value on row i; shift(1) would do the opposite and
# pair each day with the previous day's humidity.
# NOTE(review): assumes the rows are in ascending date order -- confirm.
pd.concat([nyc.Mean_Humidity, nyc.Mean_Humidity.shift(-1)], axis=1,
          keys=['Mean_Humidity', 'Next_Day_Humidity'])

In [None]:
# Regression - Try to predict the next day's Mean_Humidity (y) from the
# non humidity columns (X)
X = nyc[[x for x in nyc.columns if 'Humid' not in x]]
# shift(-1) moves every value up one row, so row i of y holds the humidity of
# row i + 1 (the following day, assuming rows are in ascending date order).
# The last row has no following day and becomes NaN.
y = nyc.Mean_Humidity.shift(-1)
# Get training set (X_train) and held-out test set (X_test); seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Create a model
# NOTE(review): this fit is expected to fail at this stage -- presumably because
# X still contains the text Events column; the next cell converts that column
# into dummy variables.
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)



In [None]:
# The Events column holds text labels, so expand it into one indicator
# ("dummy") column per distinct value and preview the result
nyc_dummy = pd.get_dummies(
    data=nyc,
    columns=['Events'],
)
nyc_dummy.head()

In [None]:
# Regression - Try to predict the next day's Mean_Humidity (y) from the
# non humidity columns (X)
X = nyc_dummy[[x for x in nyc_dummy.columns if 'Humid' not in x]]
# shift(-1) moves every value up one row, so row i of y holds the humidity of
# row i + 1 (the following day, assuming rows are in ascending date order).
# The last row has no following day and becomes NaN.
y = nyc_dummy.Mean_Humidity.shift(-1)
# Get training set (X_train) and held-out test set (X_test); seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Transpose only the first few rows: every feature becomes a row with a handful
# of sample values beside it, instead of transposing (and truncating) the whole frame.
X.head().T

In [None]:
# Create a model (whoops data needs to be floats)
# NOTE(review): expected to fail at this stage -- presumably because X still
# includes the parsed timestamp column, which the next cell filters out.
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)



In [None]:
# Need to remove timestamp
# Regression - Try to predict Mean_Humidity (y) from non humidity columns (X)
# Get training set (X_train)
def valid(col):
    """Return True when a column name may be used as a model feature.

    A name is rejected if it contains 'Humid' (the quantity being predicted)
    or 'EST' (the timestamp column the comment above asks to remove).
    """
    excluded_fragments = ('Humid', 'EST')
    return all(fragment not in col for fragment in excluded_fragments)
X = nyc_dummy[[x for x in nyc_dummy.columns if valid(x)]]
# shift(-1) moves every value up one row, so row i of y holds the humidity of
# row i + 1 (the following day, assuming rows are in ascending date order);
# shift(1) would train on the PREVIOUS day's humidity instead.
y = nyc_dummy.Mean_Humidity.shift(-1)
# Seeded split so the train/test partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Create a model
# NOTE(review): expected to fail at this stage -- presumably because the data
# still contains missing values; the next cell drops the NA rows.
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

In [None]:
# Need to remove NA
# Regression - Try to predict Mean_Humidity (y) from non humidity columns (X)
# Get training set (X_train)
def valid(col):
    """Tell whether the column called ``col`` should be kept as a feature.

    Columns whose name mentions 'Humid' or 'EST' are excluded; every other
    name is accepted.
    """
    is_excluded = 'Humid' in col or 'EST' in col
    return not is_excluded
# Drop every row with a missing value so the model only sees complete rows
nyc_dummy = nyc_dummy.dropna()
# Target: the humidity of the FOLLOWING row. shift(-1) moves values up one row,
# leaving NaN on the last row (it has no successor), which dropna() removes.
# NOTE(review): "following row" equals "next day" only where no rows were
# dropped in between and the data is in ascending date order -- confirm.
y = nyc_dummy.Mean_Humidity.shift(-1).dropna()
# Features: drop the last row as well so X and y cover exactly the same rows
X = nyc_dummy[[x for x in nyc_dummy.columns if valid(x)]].iloc[:-1]
# Seeded split so the train/test partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Create a model
# Seed the forest so the score and feature importances are reproducible across
# kernel restarts (the train/test split is already seeded with the same value).
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Get the R2 measure on the held-out test set (indicator of accuracy: 1 is
# perfect, 0 is no better than always predicting the mean, and it can go
# negative for a model that does worse than that)
rf_model.score(X_test, y_test)

In [None]:
# Check what kind of object train_test_split handed back for the test target
type(y_test)

In [None]:
# Put the model's predictions next to the true test values, row by row.
# The test target's index is reset so both columns share a 0..n-1 index.
predicted = pd.Series(rf_model.predict(X_test))
actual = y_test.reset_index(drop=True)
pd.concat([predicted, actual], axis=1)

In [None]:
# Rank the features by the importance the fitted forest assigned to them,
# most important first, as (column name, importance) pairs
importance_pairs = list(zip(X.columns, rf_model.feature_importances_))
importance_pairs.sort(key=lambda pair: pair[1], reverse=True)
importance_pairs

## Machine Learning Assignment
* Using the nino dataset, see if you can predict what the temperature (``air_temp_F``) will be for the next day 