In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from joblib import dump, load

## Rainfall Prediction

This is a cleaned version of a working jupyter notebook where I attempt to accurately predict whether it will rain on n+1 day given a set of conditions. This is a classification task and is quite straight forward. 

In [2]:
# Read in the full weather dataset
df = pd.read_csv(r"C:\Users\Kendall\Downloads\weatherAUS.csv\weatherAUS.csv")

In [3]:
CITY_TO_PREDICT = "Hobart"

In [4]:
city_subset_df = df.loc[df['Location'] == CITY_TO_PREDICT]

city_subset_df = city_subset_df.drop(columns=['Cloud9am', 'Cloud3pm'])
city_subset_df = city_subset_df.dropna()

city_subset_df['RainToday']  = city_subset_df.RainToday.map(dict(Yes=1, No=0))

wd = city_subset_df.WindDir3pm.map(dict(E=0, ENE=67.5, ESE=112.5, 
                                      N=90, NE=45, NNE=22.5, 
                                      NNW=337.5, NW=315, 
                                      S=180, SE=135, SSE=157.5, 
                                      SSW=202.5, SW=225, 
                                      W=270, WNW=292.5, WSW=247.5))
city_subset_df['numberWindDir3pm'] = wd

In [9]:
# Drop the unneccessary columns  
X = city_subset_df.drop(columns=['RainTomorrow','Date', 'WindGustDir','Location', 'WindDir9am', 'WindDir3pm', 'Rainfall', 'Sunshine', 'MaxTemp', 'MinTemp', 'Evaporation', 'WindSpeed9am', 'WindSpeed3pm', 'Pressure9am', 'Temp9am', 'Humidity9am'])
y = city_subset_df['RainTomorrow']

In [10]:
# Scale ready for classification model
X_scaled = StandardScaler().fit_transform(X)

In [12]:
# Create our test train split, setting a seed so I can eval different methods
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=.3, random_state=42)

In [13]:
# I have already determined these as the best parameters with trial and error and help from GridSearchCV
rf_model = RandomForestClassifier(n_estimators=200, max_depth=8, criterion='entropy')
rf_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=8, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [14]:
y_predict = rf_model.predict(X_test)

In [15]:
accuracy_score(y_test, y_predict)

0.8241042345276873

If we are happy with the score above we can dump the model to disk and reload it later for other uses.

In [16]:
dump(rf_model, 'sydney_rain_prediction_model.joblib')

['sydney_rain_prediction_model.joblib']