# Basic Decision Tree Notebook

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
df = pd.read_csv("..\data\weatherAUS.csv")

## What does our data look like?

In [3]:
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [4]:
df.shape

(145460, 23)

In [5]:
df.dtypes

Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object

In [6]:
df.isna().sum()/df.shape[0]

Date             0.000000
Location         0.000000
MinTemp          0.010209
MaxTemp          0.008669
Rainfall         0.022419
Evaporation      0.431665
Sunshine         0.480098
WindGustDir      0.070989
WindGustSpeed    0.070555
WindDir9am       0.072639
WindDir3pm       0.029066
WindSpeed9am     0.012148
WindSpeed3pm     0.021050
Humidity9am      0.018246
Humidity3pm      0.030984
Pressure9am      0.103568
Pressure3pm      0.103314
Cloud9am         0.384216
Cloud3pm         0.408071
Temp9am          0.012148
Temp3pm          0.024811
RainToday        0.022419
RainTomorrow     0.022460
dtype: float64

For now, we will simply drop the missing rows from the dataframe.

In [7]:
df.dropna(inplace=True)

And now, convert the date into three seperate columns describing the year, month, and day.

In [8]:
df['year'] = pd.DatetimeIndex(df['Date']).year
df['month'] = pd.DatetimeIndex(df['Date']).month
df['day'] = pd.DatetimeIndex(df['Date']).day

df.drop(columns=["Date"], inplace=True)

In [9]:
y = df["RainToday"]
df.drop(columns = ["RainToday", "RainTomorrow", "Rainfall"], inplace = True)

For decision trees, we need OneHotEncoding. This is why we need this piece of code here.

In [10]:
df_new = pd.get_dummies(df ,columns=["Location", "WindGustDir", "WindDir9am", "WindDir3pm"],drop_first=False)
df_new

Unnamed: 0,MinTemp,MaxTemp,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
6049,17.9,35.2,12.0,12.3,48.0,6.0,20.0,20.0,13.0,1006.3,...,0,0,0,0,0,0,1,0,0,0
6050,18.4,28.9,14.8,13.0,37.0,19.0,19.0,30.0,8.0,1012.9,...,0,0,0,0,1,0,0,0,0,0
6052,19.4,37.6,10.8,10.6,46.0,30.0,15.0,42.0,22.0,1012.3,...,1,0,0,0,0,0,0,0,0,0
6053,21.9,38.4,11.4,12.2,31.0,6.0,6.0,37.0,22.0,1012.7,...,0,0,0,0,0,0,0,0,0,1
6054,24.2,41.0,11.2,8.4,35.0,17.0,13.0,19.0,15.0,1010.7,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142298,19.3,33.4,6.0,11.0,35.0,9.0,20.0,63.0,32.0,1013.9,...,0,0,0,0,0,0,0,0,0,0
142299,21.2,32.6,7.6,8.6,37.0,13.0,11.0,56.0,28.0,1014.6,...,0,0,0,1,0,0,0,0,0,0
142300,20.7,32.8,5.6,11.0,33.0,17.0,11.0,46.0,23.0,1015.3,...,0,0,0,0,0,0,0,1,0,0
142301,19.5,31.8,6.2,10.6,26.0,9.0,17.0,62.0,58.0,1014.9,...,1,0,0,0,0,0,0,0,0,0


# Preparing the decision trees

In [11]:
X = df_new

In [12]:
from sklearn import tree

decTree = tree.DecisionTreeClassifier()

decTree.fit(X, y)

DecisionTreeClassifier()

In [13]:
decTree.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [14]:
from sklearn.model_selection import cross_val_score

acc = cross_val_score(decTree, X, y, cv = 10, scoring = "f1_weighted")
acc = np.median(acc)

print(acc)

0.7056189347387394


Ok! This is our vanilla performance!

## Hyperparameter Tuning: Random Grid Search

For hyperparameter tuning, we consider a mathematical formula that can ensure sufficient results.

In [15]:
def getIterations(topPercent, probability):
    result = np.math.ceil(np.log(-probability + 1)/np.log(1-topPercent))
    return result

For this formula, we describe the goodness of the solution, and our chance to get this solution. For instance, in this code, we try to get one of the top 1% solutions, with a chance of 95%. The expected number of iterations will be returned when we use the function.

In [16]:
topPercent = 0.01
probability = 0.95

iterations = getIterations(topPercent, probability)
print(iterations)

299


This is the number of random grid search iteration that are needed for these parameters.

In [17]:
from sklearn.model_selection import RandomizedSearchCV

# Depth of the tree
max_depth = [int(x) for x in np.linspace(start = 1, stop = 40, num = 21)]
# Number of features to consider at every split
max_features = [int(x) for x in np.linspace(start = 1, stop = 92, num = 21)]
max_features.append('auto')
max_features.append('sqrt')
# Number of minimum samples at a leaf node
min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 1000, num = 21)]
# Number of miniumum samples for a split
min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 1000, num = 21)]

random_grid = {'max_depth': max_depth,
               'max_features': max_features,
               'min_samples_leaf': min_samples_leaf,
               'min_samples_split': min_samples_split}

print(random_grid)

{'max_depth': [1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40], 'max_features': [1, 5, 10, 14, 19, 23, 28, 32, 37, 41, 46, 51, 55, 60, 64, 69, 73, 78, 82, 87, 92, 'auto', 'sqrt'], 'min_samples_leaf': [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100], 'min_samples_split': [2, 51, 101, 151, 201, 251, 301, 351, 401, 451, 501, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]}


In [18]:
clf = tree.DecisionTreeClassifier()

rf_random = RandomizedSearchCV(estimator = clf, 
                               param_distributions = random_grid, 
                               n_iter = iterations, 
                               scoring = 'f1_weighted',
                               cv = 10, 
                               verbose=1, 
                               random_state=1, 
                               n_jobs = -1)
rf_random.fit(X, y)

Fitting 10 folds for each of 299 candidates, totalling 2990 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   53.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 2990 out of 2990 | elapsed: 13.4min finished


RandomizedSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_iter=299,
                   n_jobs=-1,
                   param_distributions={'max_depth': [1, 2, 4, 6, 8, 10, 12, 14,
                                                      16, 18, 20, 22, 24, 26,
                                                      28, 30, 32, 34, 36, 38,
                                                      40],
                                        'max_features': [1, 5, 10, 14, 19, 23,
                                                         28, 32, 37, 41, 46, 51,
                                                         55, 60, 64, 69, 73, 78,
                                                         82, 87, 92, 'auto',
                                                         'sqrt'],
                                        'min_samples_leaf': [1, 5, 10, 15, 20,
                                                             25, 30, 35, 40, 45,
                                                        

In [19]:
params = rf_random.best_params_

clf = tree.DecisionTreeClassifier(**params)
acc = cross_val_score(clf, X, y, cv = 10, scoring = "f1_weighted")
acc = np.median(acc)
print(acc, "\n")
print(rf_random.best_params_)

0.789149931452578 

{'min_samples_split': 451, 'min_samples_leaf': 100, 'max_features': 55, 'max_depth': 30}


In [20]:
bestParams = rf_random.best_params_

# max_depth
best_max_depth = bestParams['max_depth']
start = best_max_depth - 1
if(best_max_depth < 1):
    start = 0
stop = best_max_depth + 1

num = (stop - start) + 1

max_depth = n_estimators = [int(x) for x in np.linspace(start = start, stop = stop, num = int(num))]

# max_features
best_max_features = bestParams['max_features']
start = best_max_features - 3
if(best_max_features < 3):
    start = 0
stop = best_max_features + 3

num = (stop - start) + 1

max_features = n_estimators = [int(x) for x in np.linspace(start = start, stop = stop, num = int(num))]

# min_samples_leaf
best_min_samples_leaf = bestParams['min_samples_leaf']
start = best_min_samples_leaf - 4
if(best_min_samples_leaf < 4):
    start = 0
stop = best_min_samples_leaf + 4

num = (stop - start) + 1

min_samples_leaf = n_estimators = [int(x) for x in np.linspace(start = start, stop = stop, num = int(num))]

# min_samples_split
best_min_samples_split = bestParams['min_samples_split']
start = best_min_samples_split - 4
if(best_min_samples_split < 4):
    start = 0
stop = best_min_samples_split + 4

num = (stop - start)/2 + 1

min_samples_split = n_estimators = [int(x) for x in np.linspace(start = start, stop = stop, num = int(num))]


param_grid = {'max_depth': max_depth,
              'max_features': max_features,
              'min_samples_leaf': min_samples_leaf,
              'min_samples_split': min_samples_split}

param_grid

{'max_depth': [29, 30, 31],
 'max_features': [52, 53, 54, 55, 56, 57, 58],
 'min_samples_leaf': [96, 97, 98, 99, 100, 101, 102, 103, 104],
 'min_samples_split': [447, 449, 451, 453, 455]}

In [21]:
from sklearn.model_selection import GridSearchCV

clf2 = tree.DecisionTreeClassifier()
rf_grid = GridSearchCV(estimator = clf2, param_grid = param_grid, cv = 10, verbose = 1, n_jobs = -1, scoring = "f1_weighted")

rf_grid.fit(X, y)

Fitting 10 folds for each of 945 candidates, totalling 9450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 21.6min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 26.6min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 32.4min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 38.5min
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed: 45.2min
[Parallel(n_jobs=-1)]: Done 9450 out of 9450 | elapsed: 50.8min finished


GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': [29, 30, 31],
                         'max_features': [52, 53, 54, 55, 56, 57, 58],
                         'min_samples_leaf': [96, 97, 98, 99, 100, 101, 102,
                                              103, 104],
                         'min_samples_split': [447, 449, 451, 453, 455]},
             scoring='f1_weighted', verbose=1)

In [22]:
params = rf_grid.best_params_

clf2 = tree.DecisionTreeClassifier(**params)
acc = cross_val_score(clf, X, y, cv = 10, scoring = "f1_weighted")
acc = np.median(acc)
print(acc, "\n")
print(rf_grid.best_params_)

0.7945524889637866 

{'max_depth': 30, 'max_features': 56, 'min_samples_leaf': 100, 'min_samples_split': 447}
