# Predict Regulation Throughput Ratio with CART Decision Trees
## (Lower Frequency Regulation Service)

## 1. Import the necessary modules and libraries

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt

# machine learning
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn import tree

## 2. Exploring and Preprocessing the Data

In [2]:
# read data from csv file
df = pd.read_csv("df_lower_alldata.csv").fillna(0)

In [3]:
df = df.drop(['Unnamed: 0', 'Interval Ending (AEST)'],axis=1)
df.head(5)

Unnamed: 0,BLUFF1,CLEMGPWF,HALLWF1,HALLWF2,HDWF1,HDWF2,HDWF3,LKBONNY2,LKBONNY3,NBHWF1,SNOWNTH1,SNOWSTH1,SNOWTWN1,WATERLWF,Unnamed: 15,Average Frequency (Hz),Median Frequency (Hz),Cummulative Frequency (Hz),Difference Frequency (Hz),Lower Throughput Ratio
0,14.3,2.44919,13.6,10.63,22.2,15.6,16.6,32.53022,13.22982,21.5,13.4,7.4,12.43428,13.3,0.0,50.063867,50.07,2.340012,0.0,0.020234
1,9.9,1.51274,8.72,7.63,19.6,17.0,13.1,23.63865,9.43392,9.3,9.7,1.2,6.01231,8.9,0.0,50.0437,50.040001,1.950005,0.010002,0.126083
2,11.0,0.79239,9.27,7.26,14.6,16.0,12.0,22.36064,5.13563,9.1,8.9,2.3,4.41001,4.1,0.0,50.047967,50.040001,2.269928,0.050003,0.090646
3,4.7,1.65681,6.49,-0.06,10.7,21.9,16.2,26.73126,6.47535,12.0,11.4,7.3,8.2313,0.2,0.0,50.032533,50.029999,2.140041,0.02,0.181213
4,3.7,0.72035,5.34,-0.36,13.0,22.6,22.0,25.91605,7.81508,7.3,12.3,6.4,6.92434,-0.4,0.0,50.029,50.029999,2.140018,-0.060001,0.106281


In [4]:
df.describe()

Unnamed: 0,BLUFF1,CLEMGPWF,HALLWF1,HALLWF2,HDWF1,HDWF2,HDWF3,LKBONNY2,LKBONNY3,NBHWF1,SNOWNTH1,SNOWSTH1,SNOWTWN1,WATERLWF,Unnamed: 15,Average Frequency (Hz),Median Frequency (Hz),Cummulative Frequency (Hz),Difference Frequency (Hz),Lower Throughput Ratio
count,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0,20778.0
mean,19.203651,23.186787,40.469118,30.71016,48.014205,44.679442,47.878493,46.441632,11.57746,58.87639,55.925392,54.765276,43.443422,48.632761,0.797251,50.0107,50.009851,2.382877,-0.001477,0.278321
std,17.072084,18.295492,29.090983,22.213073,32.042073,31.893685,33.359502,44.204632,11.066033,41.338364,44.01771,39.212054,32.232361,39.069051,2.959013,0.039128,0.042637,0.231348,0.082418,0.230872
min,-0.2,-0.64832,-0.84,-0.6,-1.1,-1.2,-1.1,-1.43366,-0.33668,-1.2,0.0,0.0,-0.92882,-1.2,0.0,49.842233,49.810001,1.559982,-0.290001,1e-06
25%,2.7,5.25856,13.08375,9.91,17.3125,14.6,16.3,9.652405,2.501,19.4,15.1,17.355,10.596363,13.201,0.0,49.987033,49.990002,2.22997,-0.059998,0.08906
50%,14.6905,21.29,39.225,29.658,48.7,42.2,46.5595,31.355795,7.75926,59.7985,47.8,51.8,44.13842,39.3,0.0,50.0118,50.009998,2.370018,0.0,0.223874
75%,36.275108,40.48368,68.28,51.39,76.4,73.37489,77.734265,77.186723,19.03531,96.88189,98.8,90.354345,73.13488,86.0,0.0,50.035533,50.040001,2.520039,0.049999,0.414948
max,52.962,56.415,90.67,70.36,100.8,102.3,109.9,157.745,38.68466,131.059,143.3,125.017,96.67012,129.706,17.9,50.167866,50.169998,4.440033,0.34,1.0


In [40]:
a = np.percentile(df, [25, 50, 75])
a

array([ 2.19997787, 25.4       , 50.13999939])

In [5]:
df.columns

Index(['BLUFF1', 'CLEMGPWF', 'HALLWF1', 'HALLWF2', 'HDWF1', 'HDWF2', 'HDWF3',
       'LKBONNY2', 'LKBONNY3', 'NBHWF1', 'SNOWNTH1', 'SNOWSTH1', 'SNOWTWN1',
       'WATERLWF', 'Unnamed: 15', 'Average Frequency (Hz)',
       'Median Frequency (Hz)', 'Cummulative Frequency (Hz)',
       'Difference Frequency (Hz)', 'Lower Throughput Ratio'],
      dtype='object')

##  3. Split `df` into `df_train`, `df_test` with 30% as test

In [6]:
df_train, df_test = train_test_split(df, test_size = 0.3, random_state = 100)
display(df_train.head())
display(df_test.head())

Unnamed: 0,BLUFF1,CLEMGPWF,HALLWF1,HALLWF2,HDWF1,HDWF2,HDWF3,LKBONNY2,LKBONNY3,NBHWF1,SNOWNTH1,SNOWSTH1,SNOWTWN1,WATERLWF,Unnamed: 15,Average Frequency (Hz),Median Frequency (Hz),Cummulative Frequency (Hz),Difference Frequency (Hz),Lower Throughput Ratio
937,9.4,3.02547,38.61,11.85,36.7,9.8,43.2,61.07637,12.55995,42.3,113.7,102.8,0.0,41.0,0.0,49.982933,49.990002,2.349968,-0.110001,0.122113
12346,29.232,31.101,65.209,51.264,67.722,73.932,82.819,134.183,32.601,120.136,91.341,48.691,77.836,50.177,0.0,50.003134,50.0,2.160004,0.120003,0.049341
10428,49.889,21.189,62.029,42.499,69.059,66.49,65.059,48.747,10.916,114.801,82.251,94.841,88.295,94.585,0.0,49.962567,49.970001,2.459999,0.080002,0.256708
5590,0.212,4.997,0.0,0.0,11.387,9.49,6.195,21.272,7.048,0.0,27.899,22.248,20.328,11.187,0.0,50.0072,50.0,2.18998,-0.07,0.148876
13889,6.7,-0.21611,22.42,7.75,17.2,13.2,19.7,6.45276,0.05582,20.7,16.2,8.4,8.4683,1.7,0.0,50.019467,50.029999,2.400036,0.02,0.544832


Unnamed: 0,BLUFF1,CLEMGPWF,HALLWF1,HALLWF2,HDWF1,HDWF2,HDWF3,LKBONNY2,LKBONNY3,NBHWF1,SNOWNTH1,SNOWSTH1,SNOWTWN1,WATERLWF,Unnamed: 15,Average Frequency (Hz),Median Frequency (Hz),Cummulative Frequency (Hz),Difference Frequency (Hz),Lower Throughput Ratio
7303,3.5,53.44998,16.25,11.73,31.5,19.4,24.1,8.46766,1.22808,24.6,131.5,112.6,94.54403,20.7,0.0,49.9319,49.919998,2.239998,-0.080002,0.782171
10187,21.7,43.342,57.858,48.602,40.973,58.417,56.314,2.327,0.616,93.14,37.078,36.365,44.852,29.789,0.0,49.978,49.970001,2.669991,-0.209999,0.276639
13175,39.4,23.12324,69.1,54.35,74.6,82.5,79.8,117.63667,29.02745,96.6,105.2,85.1,72.40504,50.5,0.0,50.0866,50.09,2.450031,0.09,0.097938
9844,0.221,35.69,9.355,20.695,20.434,16.415,18.879,7.944,0.956,30.739,106.769,61.809,78.24,14.979,0.0,50.016867,50.009998,1.929989,-0.009998,0.353877
6565,9.4,52.72963,32.22,42.54,26.0,17.9,18.6,6.08401,3.07021,63.1,110.0,107.6,92.90159,45.5,0.0,50.033333,50.040001,2.749996,0.09,0.281637


In [7]:
# Split df_train
X_train = df_train.drop('Lower Throughput Ratio',axis=1) # Features
Y_train = df_train['Lower Throughput Ratio']

# Split df_train
X_test = df_test.drop('Lower Throughput Ratio',axis=1) # Features
Y_test = df_test['Lower Throughput Ratio']

In [8]:
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(14544, 19) (14544,)
(6234, 19) (6234,)


## 4. Fit Regression Model

In [22]:
param_grid = {'max_depth':np.arange(2,10),
             'min_samples_leaf':np.arange(1,3),
             'min_samples_split':np.arange(2,5)            
             }

clf = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5)
clf.fit(X_train,Y_train)

print("best_parms:{0}\nbest_score:{1}".format(clf.best_params_, clf.best_score_))

best_parms:{'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 4}
best_score:0.04847842976900411


In [31]:
max_d = clf.best_params_['max_depth']
min_sl = clf.best_params_['min_samples_leaf']
min_ss = clf.best_params_['min_samples_split']
max_d,min_sl,min_ss

(4, 1, 4)

In [32]:
DTR = DecisionTreeRegressor(max_depth=max_d,min_samples_leaf=min_sl,
                           min_samples_split=min_ss)
DTR.fit(X_train,Y_train)

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=4, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

## 5. Predict

In [37]:
Y_test_pred = DTR.predict(X_test)
mean_squared_error(Y_test, Y_test_pred)

0.049638047739171284

In [38]:
Y_train_pred = DTR.predict(X_train)
mean_squared_error(Y_train, Y_train_pred)

0.04984914322383897

In [41]:
with open("tree.dot", 'w') as f:
    f = tree.export_graphviz(DTR, out_file=f)