# Predict Regulation Throughput Ratio with CART Decision Trees
## (Raise Frequency Regulation Service)

## 1. Import the necessary modules and libraries

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt

# machine learning
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn import tree

## 2. Exploring and Preprocessing the Data

In [2]:
# read data from csv file
df = pd.read_csv("df_raise_alldata.csv").fillna(0)

In [3]:
df = df.drop(['Unnamed: 0', 'Interval Ending (AEST)'],axis=1)
df.head(5)

Unnamed: 0,BLUFF1,CLEMGPWF,HALLWF1,HALLWF2,HDWF1,HDWF2,HDWF3,LKBONNY2,LKBONNY3,NBHWF1,SNOWNTH1,SNOWSTH1,SNOWTWN1,WATERLWF,Unnamed: 15,Average Frequency (Hz),Median Frequency (Hz),Cummulative Frequency (Hz),Difference Frequency (Hz),Raise Throughput Ratio
0,11.9,3.24158,13.01,10.33,22.2,13.9,17.9,42.1314,10.27125,21.7,14.9,8.6,13.64382,11.9,0.0,50.056733,50.060001,2.159977,0.02,0.224593
1,13.0,5.83484,6.14,8.14,21.2,15.7,15.8,38.2406,12.6716,18.5,9.3,3.4,7.75261,14.2,0.0,49.992867,49.98,2.089989,0.150002,0.689341
2,13.7,6.12298,6.53,7.22,21.7,15.6,15.5,28.10419,9.54556,17.7,8.0,1.7,7.56961,12.5,0.0,50.010567,50.0,2.019958,-0.080002,0.266614
3,12.6,6.62722,6.68,6.67,22.9,16.8,15.2,29.15364,8.42912,16.2,8.6,1.6,6.4969,11.4,0.0,50.017,50.009998,1.929985,0.029999,0.703356
4,11.0,4.82635,7.59,7.14,21.1,16.9,14.5,27.82511,8.81988,11.5,8.4,1.1,5.87731,10.1,0.0,50.042333,50.049999,2.070015,0.07,0.003143


In [4]:
df.describe()

Unnamed: 0,BLUFF1,CLEMGPWF,HALLWF1,HALLWF2,HDWF1,HDWF2,HDWF3,LKBONNY2,LKBONNY3,NBHWF1,SNOWNTH1,SNOWSTH1,SNOWTWN1,WATERLWF,Unnamed: 15,Average Frequency (Hz),Median Frequency (Hz),Cummulative Frequency (Hz),Difference Frequency (Hz),Raise Throughput Ratio
count,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0,20535.0
mean,19.813264,21.624513,38.767415,29.861424,42.368068,39.863514,43.293565,49.959412,12.397599,53.183611,55.628494,52.804282,40.826846,48.532683,0.641488,50.006977,50.005772,2.373395,0.002109,0.396186
std,17.331952,17.932268,29.812225,22.913906,31.753946,31.344071,33.280824,46.305012,11.64682,41.942099,45.441511,40.19139,32.711374,40.42993,2.440933,0.042489,0.045957,0.233683,0.084491,0.274535
min,-0.2,-0.64832,-0.87,-0.57,-1.1,-1.2,-1.3,-1.43366,-0.33668,-1.3,0.0,0.0,-0.90558,-1.2,0.0,49.7079,49.689999,1.589897,-0.310001,1e-06
25%,2.7,3.997,9.95,7.56,12.1,10.7075,12.1,10.002665,2.45617,12.1,12.2,13.9,7.36214,12.2,0.0,49.982,49.98,2.219978,-0.050003,0.163533
50%,15.616,18.80114,35.05,27.868,38.3,34.0,38.0,34.48988,8.15001,47.7,45.4,46.58,37.72353,37.088,0.0,50.008767,50.0,2.36002,0.0,0.352865
75%,37.251405,39.314875,68.302685,51.55,72.414115,70.0,75.270565,89.30568,22.37,93.560735,100.9,90.315795,71.98962,88.2,0.0,50.034433,50.029999,2.519955,0.060001,0.600152
max,52.876,56.507,90.77,70.257,100.7,102.5,110.1,157.954,38.854,131.241,143.60001,125.076,96.64157,129.964,19.3,50.153433,50.16,4.219955,0.360001,0.999971


In [5]:
a = np.percentile(df, [25, 50, 75])
a

array([ 1.9600296 , 22.49778   , 50.04560003])

In [6]:
df.columns

Index(['BLUFF1', 'CLEMGPWF', 'HALLWF1', 'HALLWF2', 'HDWF1', 'HDWF2', 'HDWF3',
       'LKBONNY2', 'LKBONNY3', 'NBHWF1', 'SNOWNTH1', 'SNOWSTH1', 'SNOWTWN1',
       'WATERLWF', 'Unnamed: 15', 'Average Frequency (Hz)',
       'Median Frequency (Hz)', 'Cummulative Frequency (Hz)',
       'Difference Frequency (Hz)', 'Raise Throughput Ratio'],
      dtype='object')

##  3. Split `df` into `df_train`, `df_test` with 30% as test

In [7]:
df_train, df_test = train_test_split(df, test_size = 0.3, random_state = 100)
display(df_train.head())
display(df_test.head())

Unnamed: 0,BLUFF1,CLEMGPWF,HALLWF1,HALLWF2,HDWF1,HDWF2,HDWF3,LKBONNY2,LKBONNY3,NBHWF1,SNOWNTH1,SNOWSTH1,SNOWTWN1,WATERLWF,Unnamed: 15,Average Frequency (Hz),Median Frequency (Hz),Cummulative Frequency (Hz),Difference Frequency (Hz),Raise Throughput Ratio
18392,0.5,3.02547,0.85,-0.41,11.0,11.1,9.6,26.43571,3.74008,3.2,14.7,19.3,10.78864,-0.3,0.6,49.9999,50.0,2.099983,0.080002,0.351371
7248,31.024,9.014,56.823,50.793,53.503,43.948,44.0,0.58,0.014,81.609,73.065,108.946,35.0,50.842,0.0,49.934766,49.919998,2.489944,0.110001,0.307755
12115,43.5,29.17418,87.72,67.45,42.2,34.4,41.5,147.51599,36.22849,108.3,43.4,63.4,44.98869,101.1,0.0,49.9183,49.919998,2.460011,0.060001,0.320999
5407,3.4,1.36867,8.82,5.84,6.8,8.5,7.1,25.87181,4.13083,5.8,54.1,35.2,30.1685,16.5,0.0,50.0496,50.049999,2.630039,-0.070004,0.513354
7814,46.622,34.078,85.827,62.853,66.853,37.999,53.542,140.696,37.193,102.283,79.942,80.326,73.484,80.531,0.0,50.065033,50.07,2.459995,0.119999,0.256933


Unnamed: 0,BLUFF1,CLEMGPWF,HALLWF1,HALLWF2,HDWF1,HDWF2,HDWF3,LKBONNY2,LKBONNY3,NBHWF1,SNOWNTH1,SNOWSTH1,SNOWTWN1,WATERLWF,Unnamed: 15,Average Frequency (Hz),Median Frequency (Hz),Cummulative Frequency (Hz),Difference Frequency (Hz),Raise Throughput Ratio
5513,0.1,-0.36018,-0.32,-0.32,7.5,6.4,4.7,11.52644,2.7911,-0.5,0.3,2.7,-0.64303,-0.9,0.0,50.0209,50.029999,2.290092,0.07,0.189629
5561,1.3,21.46643,14.56,13.13,5.2,4.3,1.2,2.23821,1.84213,3.2,46.9,48.7,47.40813,10.8,0.0,50.077334,50.080002,2.47998,0.040001,0.602287
12074,46.718,45.044,80.544,66.349,75.376,55.342,59.622,144.192,37.991,85.038,47.163,93.593,45.834,126.74,0.0,50.020367,50.02,2.390011,-0.049999,0.351814
877,46.2,30.39878,57.45,54.66,23.5,25.9,33.7,14.81392,6.36371,31.6,136.60001,107.5,94.41513,80.3,0.0,49.9995,49.990002,2.340004,-0.040001,0.299124
4409,1.8,36.95396,40.07,24.32,70.4,42.0,44.3,1.06056,0.16747,69.3,119.9,102.4,94.02545,25.2,0.0,49.974434,49.98,1.909988,-0.010002,0.185118


In [9]:
# Split df_train
X_train = df_train.drop('Raise Throughput Ratio',axis=1) # Features
Y_train = df_train['Raise Throughput Ratio']

# Split df_train
X_test = df_test.drop('Raise Throughput Ratio',axis=1) # Features
Y_test = df_test['Raise Throughput Ratio']

In [10]:
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(14374, 19) (14374,)
(6161, 19) (6161,)


## 4. Fit Regression Model

In [11]:
param_grid = {'max_depth':np.arange(2,10),
             'min_samples_leaf':np.arange(1,3),
             'min_samples_split':np.arange(2,5)            
             }

clf = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5)
clf.fit(X_train,Y_train)

print("best_parms:{0}\nbest_score:{1}".format(clf.best_params_, clf.best_score_))

best_parms:{'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 3}
best_score:0.046269051770450764


In [12]:
max_d = clf.best_params_['max_depth']
min_sl = clf.best_params_['min_samples_leaf']
min_ss = clf.best_params_['min_samples_split']
max_d,min_sl,min_ss

(4, 2, 3)

In [13]:
DTR = DecisionTreeRegressor(max_depth=max_d,min_samples_leaf=min_sl,
                           min_samples_split=min_ss)
DTR.fit(X_train,Y_train)

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=2,
           min_samples_split=3, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

## 5. Predict

In [14]:
Y_test_pred = DTR.predict(X_test)
mean_squared_error(Y_test, Y_test_pred)

0.07233644803830541

In [15]:
Y_train_pred = DTR.predict(X_train)
mean_squared_error(Y_train, Y_train_pred)

0.07062009250754121

In [18]:
with open("tree.dot", 'w') as f:
    f = tree.export_graphviz(DTR, out_file=f)