In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import os
from sklearn.model_selection import train_test_split

In [2]:
#Set File Paths for weather Data CSV Files
weatherAUS = "../ml-predictions-project/aus_weather/weatherAUS_clean.csv"

In [3]:
#Import weather CSV to Pandas Data Frame
weather = pd.read_csv(weatherAUS)
weather = weather.drop(columns=['RISK_MM'])

#Display Data Frame
weather.head(50)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,17.9,35.2,0.0,12.0,12.3,48.0,6.0,20.0,20.0,13.0,...,0,0,0,0,0,0,1,0,0,0
1,18.4,28.9,0.0,14.8,13.0,37.0,19.0,19.0,30.0,8.0,...,0,0,0,0,1,0,0,0,0,0
2,19.4,37.6,0.0,10.8,10.6,46.0,30.0,15.0,42.0,22.0,...,1,0,0,0,0,0,0,0,0,0
3,21.9,38.4,0.0,11.4,12.2,31.0,6.0,6.0,37.0,22.0,...,0,0,0,0,0,0,0,0,0,1
4,24.2,41.0,0.0,11.2,8.4,35.0,17.0,13.0,19.0,15.0,...,0,0,0,0,0,0,0,0,1,0
5,27.1,36.1,0.0,13.0,0.0,43.0,7.0,20.0,26.0,19.0,...,0,0,0,0,0,0,0,0,1,0
6,23.3,34.0,0.0,9.8,12.6,41.0,17.0,19.0,33.0,15.0,...,0,0,0,0,1,0,0,0,0,0
7,16.1,34.2,0.0,14.6,13.2,37.0,15.0,6.0,25.0,9.0,...,0,0,1,0,0,0,0,0,0,0
8,19.0,35.5,0.0,12.0,12.3,48.0,30.0,9.0,46.0,28.0,...,0,0,0,0,0,0,0,0,0,1
9,19.7,35.5,0.0,11.0,12.7,41.0,15.0,17.0,61.0,14.0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
#make sure all data types are floats or integers for the tree
weather.dtypes

MinTemp            float64
MaxTemp            float64
Rainfall           float64
Evaporation        float64
Sunshine           float64
WindGustSpeed      float64
WindSpeed9am       float64
WindSpeed3pm       float64
Humidity9am        float64
Humidity3pm        float64
Pressure9am        float64
Pressure3pm        float64
Cloud9am           float64
Cloud3pm           float64
Temp9am            float64
Temp3pm            float64
rain_today_b          bool
rain_tomorrow_b       bool
WindGustDir_E        int64
WindGustDir_ENE      int64
WindGustDir_ESE      int64
WindGustDir_N        int64
WindGustDir_NE       int64
WindGustDir_NNE      int64
WindGustDir_NNW      int64
WindGustDir_NW       int64
WindGustDir_S        int64
WindGustDir_SE       int64
WindGustDir_SSE      int64
WindGustDir_SSW      int64
                    ...   
WindDir9am_ESE       int64
WindDir9am_N         int64
WindDir9am_NE        int64
WindDir9am_NNE       int64
WindDir9am_NNW       int64
WindDir9am_NW        int64
W

In [5]:
#print out every column name so that I can double check spelling of 'rain_tomorrow_b' which is my output
weather.columns

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm', 'rain_today_b', 'rain_tomorrow_b',
       'WindGustDir_E', 'WindGustDir_ENE', 'WindGustDir_ESE', 'WindGustDir_N',
       'WindGustDir_NE', 'WindGustDir_NNE', 'WindGustDir_NNW',
       'WindGustDir_NW', 'WindGustDir_S', 'WindGustDir_SE', 'WindGustDir_SSE',
       'WindGustDir_SSW', 'WindGustDir_SW', 'WindGustDir_W', 'WindGustDir_WNW',
       'WindGustDir_WSW', 'WindDir9am_E', 'WindDir9am_ENE', 'WindDir9am_ESE',
       'WindDir9am_N', 'WindDir9am_NE', 'WindDir9am_NNE', 'WindDir9am_NNW',
       'WindDir9am_NW', 'WindDir9am_S', 'WindDir9am_SE', 'WindDir9am_SSE',
       'WindDir9am_SSW', 'WindDir9am_SW', 'WindDir9am_W', 'WindDir9am_WNW',
       'WindDir9am_WSW', 'WindDir3pm_E', 'WindDir3pm_ENE', 'WindDir3pm_ESE',
       'WindDir3pm_N', 'WindDir3pm_NE'

In [6]:
#choose my output column
target = weather['rain_tomorrow_b']


In [7]:
#Choose my input columns - everything except my output column
data = weather.drop('rain_tomorrow_b', axis=1)
feature_names = data.columns

In [8]:
# create my training data
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state = 42)

In [9]:
#fit and score data
random_forest = RandomForestClassifier(n_estimators = 400)
random_forest = random_forest.fit(X_train, y_train)
random_forest.score(X_test, y_test)

0.8618220489188231

In [10]:
#get value of each feature in a list
feature_value = list(random_forest.feature_importances_)
#list of feature names
feature_names = list(data.columns)
#loop to zip the feature name and its value together
features_and_values = [(feature, importance) for feature, importance in zip(feature_names, feature_value)]
features_and_values
sorted(features_and_values, key = lambda x: x[1], reverse = True)

[('Humidity3pm', 0.14432337880749369),
 ('Sunshine', 0.10281630712983189),
 ('Pressure3pm', 0.060101048667025765),
 ('Pressure9am', 0.05494606377366953),
 ('Cloud3pm', 0.05298814703040725),
 ('WindGustSpeed', 0.051387354764969494),
 ('Humidity9am', 0.04689770303922361),
 ('Rainfall', 0.04387645139036425),
 ('Temp3pm', 0.0420046793576393),
 ('MinTemp', 0.04009343918409978),
 ('MaxTemp', 0.03904186754888245),
 ('Temp9am', 0.03850587156376854),
 ('Evaporation', 0.03494310536896011),
 ('Cloud9am', 0.0303439590993751),
 ('WindSpeed3pm', 0.03001706648648661),
 ('WindSpeed9am', 0.028699169501421044),
 ('rain_today_b', 0.018552695514434833),
 ('WindDir9am_N', 0.00449252575413389),
 ('WindDir9am_NNE', 0.003852500966725496),
 ('WindDir3pm_N', 0.003571040516789735),
 ('WindGustDir_N', 0.0035701787393437804),
 ('WindDir9am_W', 0.0035664046391585135),
 ('WindDir3pm_WNW', 0.003566337359376166),
 ('WindGustDir_WNW', 0.0034736774537517654),
 ('WindGustDir_W', 0.0033997887059989333),
 ('WindDir3pm_W', 

In [11]:
#create new model dropping the bottom 5% of the data
weather_model2 = weather.drop(columns=['WindDir9am_NNW','WindDir3pm_WSW',
 'WindDir9am_SW',
 'WindDir9am_WSW',
 'WindGustDir_SE',
 'WindDir3pm_ENE',
 'WindDir3pm_NE',
 'WindDir9am_SSW',
 'WindGustDir_NNE',
 'WindGustDir_E',
 'WindGustDir_SSE',
 'WindDir3pm_SSW',
 'WindDir3pm_ESE',
 'WindDir9am_SSE',
 'WindDir3pm_SW',
 'WindDir3pm_SSE',
 'WindDir9am_ENE',
 'WindDir9am_S',
 'WindDir3pm_E',
 'WindDir9am_E',
 'WindGustDir_NE',
 'WindGustDir_ESE',
 'WindGustDir_ENE',
 'WindDir9am_SE',
 'WindDir9am_ESE']
)
weather.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,17.9,35.2,0.0,12.0,12.3,48.0,6.0,20.0,20.0,13.0,...,0,0,0,0,0,0,1,0,0,0
1,18.4,28.9,0.0,14.8,13.0,37.0,19.0,19.0,30.0,8.0,...,0,0,0,0,1,0,0,0,0,0
2,19.4,37.6,0.0,10.8,10.6,46.0,30.0,15.0,42.0,22.0,...,1,0,0,0,0,0,0,0,0,0
3,21.9,38.4,0.0,11.4,12.2,31.0,6.0,6.0,37.0,22.0,...,0,0,0,0,0,0,0,0,0,1
4,24.2,41.0,0.0,11.2,8.4,35.0,17.0,13.0,19.0,15.0,...,0,0,0,0,0,0,0,0,1,0


In [12]:
#choose my output column for model 2
target = weather_model2['rain_tomorrow_b']

In [13]:
#Choose my input columns - everything except my output column
data_model2 = weather_model2.drop('rain_tomorrow_b', axis=1)
feature_names_model2 = data_model2.columns

In [14]:
# create my training data
X_train2, X_test2, y_train2, y_test2 = train_test_split(data, target, random_state = 42)

In [15]:
#fit and score data for model 2
random_forest_model2 = RandomForestClassifier(n_estimators = 400)
random_forest_model2 = random_forest_model2.fit(X_train2, y_train2)
random_forest_model2.score(X_test2, y_test2)

0.8608294930875576

In [16]:
from sklearn.model_selection import GridSearchCV
import numpy as np
max_depth = list(np.arange(20, 71, 10))
max_features = list(np.arange(5, 26, 5))
max_features.append('auto')
max_features.append('log2')
max_depth.append(None)
param_grid = {
    'bootstrap': [True, False], 
    'n_estimators': [250],
    'max_features': max_features,
    'max_depth': max_depth,
    'n_jobs': [-1]
}
param_grid

{'bootstrap': [True, False],
 'n_estimators': [250],
 'max_features': [5, 10, 15, 20, 25, 'auto', 'log2'],
 'max_depth': [20, 30, 40, 50, 60, 70, None],
 'n_jobs': [-1]}

In [17]:
grid = GridSearchCV(random_forest_model2, param_grid, verbose=3)
grid.fit(X_train2, y_train2)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 98 candidates, totalling 294 fits
[CV] bootstrap=True, max_depth=20, max_features=5, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=20, max_features=5, n_estimators=250, n_jobs=-1, score=0.857, total=   9.0s
[CV] bootstrap=True, max_depth=20, max_features=5, n_estimators=250, n_jobs=-1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.0s remaining:    0.0s


[CV]  bootstrap=True, max_depth=20, max_features=5, n_estimators=250, n_jobs=-1, score=0.859, total=   5.8s
[CV] bootstrap=True, max_depth=20, max_features=5, n_estimators=250, n_jobs=-1 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.8s remaining:    0.0s


[CV]  bootstrap=True, max_depth=20, max_features=5, n_estimators=250, n_jobs=-1, score=0.859, total=   6.2s
[CV] bootstrap=True, max_depth=20, max_features=10, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=20, max_features=10, n_estimators=250, n_jobs=-1, score=0.859, total=   7.9s
[CV] bootstrap=True, max_depth=20, max_features=10, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=20, max_features=10, n_estimators=250, n_jobs=-1, score=0.860, total=   9.1s
[CV] bootstrap=True, max_depth=20, max_features=10, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=20, max_features=10, n_estimators=250, n_jobs=-1, score=0.859, total=   7.8s
[CV] bootstrap=True, max_depth=20, max_features=15, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=20, max_features=15, n_estimators=250, n_jobs=-1, score=0.860, total=  10.7s
[CV] bootstrap=True, max_depth=20, max_features=15, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=20, max_features=15

[CV]  bootstrap=True, max_depth=40, max_features=10, n_estimators=250, n_jobs=-1, score=0.861, total=   8.0s
[CV] bootstrap=True, max_depth=40, max_features=10, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=40, max_features=10, n_estimators=250, n_jobs=-1, score=0.861, total=   8.4s
[CV] bootstrap=True, max_depth=40, max_features=10, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=40, max_features=10, n_estimators=250, n_jobs=-1, score=0.859, total=   8.4s
[CV] bootstrap=True, max_depth=40, max_features=15, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=40, max_features=15, n_estimators=250, n_jobs=-1, score=0.861, total=  10.4s
[CV] bootstrap=True, max_depth=40, max_features=15, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=40, max_features=15, n_estimators=250, n_jobs=-1, score=0.861, total=  11.1s
[CV] bootstrap=True, max_depth=40, max_features=15, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=40, max_features=1

[CV]  bootstrap=True, max_depth=60, max_features=10, n_estimators=250, n_jobs=-1, score=0.860, total=  10.3s
[CV] bootstrap=True, max_depth=60, max_features=10, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=60, max_features=10, n_estimators=250, n_jobs=-1, score=0.859, total=  10.3s
[CV] bootstrap=True, max_depth=60, max_features=15, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=60, max_features=15, n_estimators=250, n_jobs=-1, score=0.860, total=  13.7s
[CV] bootstrap=True, max_depth=60, max_features=15, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=60, max_features=15, n_estimators=250, n_jobs=-1, score=0.862, total=  14.1s
[CV] bootstrap=True, max_depth=60, max_features=15, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=60, max_features=15, n_estimators=250, n_jobs=-1, score=0.860, total=  13.6s
[CV] bootstrap=True, max_depth=60, max_features=20, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=60, max_features=2

[CV]  bootstrap=True, max_depth=None, max_features=10, n_estimators=250, n_jobs=-1, score=0.860, total=  10.6s
[CV] bootstrap=True, max_depth=None, max_features=15, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=None, max_features=15, n_estimators=250, n_jobs=-1, score=0.860, total=  14.2s
[CV] bootstrap=True, max_depth=None, max_features=15, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=None, max_features=15, n_estimators=250, n_jobs=-1, score=0.861, total=  14.1s
[CV] bootstrap=True, max_depth=None, max_features=15, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=None, max_features=15, n_estimators=250, n_jobs=-1, score=0.859, total=  14.3s
[CV] bootstrap=True, max_depth=None, max_features=20, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_depth=None, max_features=20, n_estimators=250, n_jobs=-1, score=0.860, total=  17.3s
[CV] bootstrap=True, max_depth=None, max_features=20, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=True, max_dept

[CV]  bootstrap=False, max_depth=30, max_features=15, n_estimators=250, n_jobs=-1, score=0.861, total=  27.1s
[CV] bootstrap=False, max_depth=30, max_features=15, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=30, max_features=15, n_estimators=250, n_jobs=-1, score=0.863, total=  28.1s
[CV] bootstrap=False, max_depth=30, max_features=15, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=30, max_features=15, n_estimators=250, n_jobs=-1, score=0.858, total=  27.1s
[CV] bootstrap=False, max_depth=30, max_features=20, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=30, max_features=20, n_estimators=250, n_jobs=-1, score=0.861, total=  32.8s
[CV] bootstrap=False, max_depth=30, max_features=20, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=30, max_features=20, n_estimators=250, n_jobs=-1, score=0.861, total=  33.1s
[CV] bootstrap=False, max_depth=30, max_features=20, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=30, max

[CV]  bootstrap=False, max_depth=50, max_features=15, n_estimators=250, n_jobs=-1, score=0.863, total=  26.6s
[CV] bootstrap=False, max_depth=50, max_features=15, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=50, max_features=15, n_estimators=250, n_jobs=-1, score=0.858, total=  26.2s
[CV] bootstrap=False, max_depth=50, max_features=20, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=50, max_features=20, n_estimators=250, n_jobs=-1, score=0.861, total=  32.9s
[CV] bootstrap=False, max_depth=50, max_features=20, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=50, max_features=20, n_estimators=250, n_jobs=-1, score=0.862, total=  32.0s
[CV] bootstrap=False, max_depth=50, max_features=20, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=50, max_features=20, n_estimators=250, n_jobs=-1, score=0.857, total=  33.1s
[CV] bootstrap=False, max_depth=50, max_features=25, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=50, max

[CV]  bootstrap=False, max_depth=70, max_features=15, n_estimators=250, n_jobs=-1, score=0.859, total=  26.9s
[CV] bootstrap=False, max_depth=70, max_features=20, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=70, max_features=20, n_estimators=250, n_jobs=-1, score=0.861, total=  33.6s
[CV] bootstrap=False, max_depth=70, max_features=20, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=70, max_features=20, n_estimators=250, n_jobs=-1, score=0.863, total=  33.5s
[CV] bootstrap=False, max_depth=70, max_features=20, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=70, max_features=20, n_estimators=250, n_jobs=-1, score=0.859, total=  32.9s
[CV] bootstrap=False, max_depth=70, max_features=25, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=70, max_features=25, n_estimators=250, n_jobs=-1, score=0.860, total=  38.7s
[CV] bootstrap=False, max_depth=70, max_features=25, n_estimators=250, n_jobs=-1 
[CV]  bootstrap=False, max_depth=70, max

[Parallel(n_jobs=1)]: Done 294 out of 294 | elapsed: 119.0min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=400, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             i

In [18]:
#fit and score data for new "best fit" model based on the param_grid results
random_forest_bestfit = RandomForestClassifier(bootstrap= True, max_depth=20, max_features=15, n_estimators = 250,
                                              n_jobs=-1)
random_forest_bestfit = random_forest_bestfit.fit(X_train2, y_train2)
random_forest_bestfit.score(X_test2, y_test2)

0.8609712867777384

In [16]:
#Set File Paths for a new CSV which has been re-engineered 
engineer_weatherAUS = "../ml-predictions-project/aus_weather/weatherAUS_feature_engineer.csv"

In [18]:
#Import engineered weather CSV to Pandas Data Frame
engineered_weather = pd.read_csv(engineer_weatherAUS)

#Display Data Frame
engineered_weather.head(50)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,rain_today_b,rain_tomorrow_b,temp_change_9to3,temp_change_min_max,humidity_change,humidity_change_percent,pressure_change,wind_change_direction,wind_gust_change_3,wind_gust_change_9
0,17.9,35.2,0.0,12.0,12.3,48.0,6.0,20.0,20.0,13.0,...,False,False,6.8,17.3,7.0,0.35,1.9,False,True,True
1,18.4,28.9,0.0,14.8,13.0,37.0,19.0,19.0,30.0,8.0,...,False,False,6.7,10.5,22.0,0.733333,0.8,True,True,True
2,19.4,37.6,0.0,10.8,10.6,46.0,30.0,15.0,42.0,22.0,...,False,False,6.2,18.2,20.0,0.47619,3.1,False,True,False
3,21.9,38.4,0.0,11.4,12.2,31.0,6.0,6.0,37.0,22.0,...,False,False,6.5,16.5,15.0,0.405405,3.6,False,True,False
4,24.2,41.0,0.0,11.2,8.4,35.0,17.0,13.0,19.0,15.0,...,False,False,4.0,16.8,4.0,0.210526,3.3,False,False,True
5,27.1,36.1,0.0,13.0,0.0,43.0,7.0,20.0,26.0,19.0,...,False,False,3.6,9.0,7.0,0.269231,0.3,False,True,False
6,23.3,34.0,0.0,9.8,12.6,41.0,17.0,19.0,33.0,15.0,...,False,False,6.5,10.7,18.0,0.545455,1.4,False,True,True
7,16.1,34.2,0.0,14.6,13.2,37.0,15.0,6.0,25.0,9.0,...,False,False,12.1,18.1,16.0,0.64,4.1,False,True,False
8,19.0,35.5,0.0,12.0,12.3,48.0,30.0,9.0,46.0,28.0,...,False,False,9.9,16.5,18.0,0.391304,4.3,False,True,False
9,19.7,35.5,0.0,11.0,12.7,41.0,15.0,17.0,61.0,14.0,...,False,False,9.6,15.8,47.0,0.770492,2.1,False,True,True


In [19]:
#choose my output column
target = engineered_weather['rain_tomorrow_b']


In [20]:
#Choose my input columns - everything except my output column
data = engineered_weather.drop('rain_tomorrow_b', axis=1)
feature_names = data.columns

In [21]:
# create my training data
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state = 42)

In [22]:
#fit and score data
random_forest = RandomForestClassifier(n_estimators = 400)
random_forest = random_forest.fit(X_train, y_train)
random_forest.score(X_test, y_test)

0.8608210298883687

In [23]:
#get value of each feature in a list
feature_value = list(random_forest.feature_importances_)
#list of feature names
feature_names = list(data.columns)
#loop to zip the feature name and its value together
features_and_values = [(feature, importance) for feature, importance in zip(feature_names, feature_value)]
features_and_values
sorted(features_and_values, key = lambda x: x[1], reverse = True)

[('Humidity3pm', 0.1360601182988874),
 ('Sunshine', 0.10129612113239633),
 ('Pressure3pm', 0.05717424608686316),
 ('humidity_change_percent', 0.051924828921656466),
 ('WindGustSpeed', 0.049599891626928516),
 ('temp_change_min_max', 0.049472635713249224),
 ('Pressure9am', 0.04781806427987535),
 ('pressure_change', 0.04377678074641013),
 ('Rainfall', 0.04144516775480865),
 ('Humidity9am', 0.04121251565248109),
 ('humidity_change', 0.040170388928201635),
 ('Cloud3pm', 0.03881945480488366),
 ('temp_change_9to3', 0.03624560213590387),
 ('Temp3pm', 0.03333953241375356),
 ('MinTemp', 0.03274547691173235),
 ('MaxTemp', 0.03179074074859684),
 ('Temp9am', 0.03166500352934013),
 ('Evaporation', 0.03123160627091269),
 ('WindSpeed3pm', 0.026693100502506038),
 ('WindSpeed9am', 0.025360989477492525),
 ('Cloud9am', 0.020402913885740854),
 ('rain_today_b', 0.01817716535965074),
 ('wind_gust_change_3', 0.0049384438968088674),
 ('wind_gust_change_9', 0.0046508316354644625),
 ('wind_change_direction', 0.0

In [26]:
#Create a new dataframe by dropping the least important columns from previous dataframe
engineered_weather_model2 = engineered_weather.drop(columns=['wind_gust_change_3','wind_gust_change_9', 'wind_change_direction'])

#Display Data Frame
engineered_weather_model2.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,Cloud3pm,Temp9am,Temp3pm,rain_today_b,rain_tomorrow_b,temp_change_9to3,temp_change_min_max,humidity_change,humidity_change_percent,pressure_change
0,17.9,35.2,0.0,12.0,12.3,48.0,6.0,20.0,20.0,13.0,...,5.0,26.6,33.4,False,False,6.8,17.3,7.0,0.35,1.9
1,18.4,28.9,0.0,14.8,13.0,37.0,19.0,19.0,30.0,8.0,...,1.0,20.3,27.0,False,False,6.7,10.5,22.0,0.733333,0.8
2,19.4,37.6,0.0,10.8,10.6,46.0,30.0,15.0,42.0,22.0,...,6.0,28.7,34.9,False,False,6.2,18.2,20.0,0.47619,3.1
3,21.9,38.4,0.0,11.4,12.2,31.0,6.0,6.0,37.0,22.0,...,5.0,29.1,35.6,False,False,6.5,16.5,15.0,0.405405,3.6
4,24.2,41.0,0.0,11.2,8.4,35.0,17.0,13.0,19.0,15.0,...,6.0,33.6,37.6,False,False,4.0,16.8,4.0,0.210526,3.3


In [27]:
#choose my output column
target = engineered_weather_model2['rain_tomorrow_b']


In [28]:
#Choose my input columns - everything except my output column
data = engineered_weather_model2.drop('rain_tomorrow_b', axis=1)
feature_names = data.columns

In [29]:
#fit and score data
random_forest = RandomForestClassifier(n_estimators = 400)
random_forest = random_forest.fit(X_train, y_train)
random_forest.score(X_test, y_test)

0.8619013323730644

In [35]:
#change the test_size to increase the amount of training data used
X_train, X_test, y_train, y_test = train_test_split(data, target, train_size=0.8, random_state=42)

In [36]:
#fit and score data for engineered_weather_model2 with increased training data
random_forest_engineered = RandomForestClassifier(n_estimators = 400)
random_forest_engineered = random_forest_engineered.fit(X_train, y_train)
random_forest_engineered.score(X_test, y_test)

0.8607651912978245