# Training Data Cleaning & Exploration

**Overview of Implementation**
1. <a href="#section1">Data Cleaning</a>
2. <a href="#section2">Data Imputation</a>
3. <a href="#section3">CART</a>
4. <a href="#section4">Results</a>
5. <a href="#section5">CART w/o non-deterministic variables</a>

In [1]:
# import libraries
library(data.table)
library(ggplot2)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang


## <a id='section1'>1. Data Cleaning</a>
import data & explore statistics

In [2]:
# Import train data using data.table fread function
data <- fread("../data_without_imputation.csv", stringsAsFactors = T)
data.imputation <- fread("../data_with_imputation.csv", stringsAsFactors = T)

paste("Number of NA values: ", sum(is.na(data)))
paste("Number of NA for imputed: ", sum(is.na(data.imputation)))

summary(data)

   tracking_id      wind_speed      atmospheric_temperature shaft_temperature
 WM_1    :    1   Min.   :-496.21   Min.   :-53.80          Min.   :-99.00   
 WM_10   :    1   1st Qu.:  20.88   1st Qu.: 12.33          1st Qu.: 41.63   
 WM_1000 :    1   Median :  93.30   Median : 18.33          Median : 43.69   
 WM_10000:    1   Mean   :  69.04   Mean   : 19.21          Mean   : 40.09   
 WM_10001:    1   3rd Qu.:  95.27   3rd Qu.: 25.24          3rd Qu.: 45.67   
 WM_10002:    1   Max.   : 601.46   Max.   : 80.22          Max.   :169.82   
 (Other) :28194   NA's   :273       NA's   :7392            NA's   :2        
  blades_angle       gearbox_temperature engine_temperature  motor_torque   
 Min.   :-146.2595   Min.   :-244.97     Min.   : 3.167     Min.   : 500.0  
 1st Qu.:  -1.1977   1st Qu.:  40.56     1st Qu.:41.911     1st Qu.: 870.3  
 Median :  -0.4956   Median :  43.22     Median :43.525     Median :2031.8  
 Mean   :  -9.6540   Mean   :  41.03     Mean   :42.614     Mean   :

In [3]:
data[data == ""] <- NA # account for "" as NA

data$turbine_status <- droplevels(data$turbine_status) # removes unused "" level
data$cloud_level <- droplevels(data$cloud_level) # removes unused "" level

In [4]:
levels(data$cloud_level)
levels(data$turbine_status)

levels(data.imputation$cloud_level)
levels(data.imputation$turbine_status)

In [5]:
data$year = as.factor(data$year)
data$month = as.factor(data$month)
data$mday = as.factor(data$mday)
data$wday = as.factor(data$wday)

data.imputation$year = as.factor(data.imputation$year)
data.imputation$month = as.factor(data.imputation$month)
data.imputation$mday = as.factor(data.imputation$mday)
data.imputation$wday = as.factor(data.imputation$wday)

In [6]:
# drop column tracking_id and datatime
data[,tracking_id:=NULL]
data[,min:=NULL]
data[,mday:=NULL]
data[,wday:=NULL]
data[,year:=NULL]

data.imputation[,tracking_id:=NULL]
data.imputation[,min:=NULL]
data.imputation[,mday:=NULL]
data.imputation[,wday:=NULL]
data.imputation[,year:=NULL]

"Column 'tracking_id' does not exist to remove"

In [7]:
#sample split into train and test set
library(caTools)
set.seed(2021)
train <- sample.split(Y=data$windmill_generated_power, SplitRatio=0.7)
trainset<- subset(data, train==T)
testset<- subset(data, train==F)

trainset.imputation <- subset(data.imputation, train==T)
testset.imputation <- subset(data.imputation, train==F)
paste("number of rows of trainset: ",nrow(trainset))
paste("number of rows of imputed trainset: ", nrow(trainset.imputation))
paste("proportion of trainset: ", nrow(trainset)/nrow(data))
paste("number of rows of testset: ",nrow(testset))
paste("number of rows of imputed testset: ",nrow(testset.imputation))
paste("proportion of testset: ", nrow(testset)/nrow(data))

In [8]:
head(trainset)

wind_speed,atmospheric_temperature,shaft_temperature,blades_angle,gearbox_temperature,engine_temperature,motor_torque,generator_temperature,atmospheric_pressure,area_temperature,...,resistance,rotor_torque,turbine_status,cloud_level,blade_length,blade_breadth,windmill_height,windmill_generated_power,month,hour
94.82002,,41.72302,-0.9034229,82.41057,42.52302,2563.1245,76.66556,103402.96,26.89787,...,2730.311,42.08467,BA,Medium,2.217542,0.3140648,24.28169,6.766521,8,14
238.81942,,45.44391,15.1153228,44.75964,47.2821,2888.1341,95.38997,18689.73,46.02005,...,1964.503,42.7446,ABC,,4.857385,0.3671399,24.28777,14.851089,12,15
10.72289,,41.98118,1.7156961,-17.61646,43.46985,781.6954,37.42307,114468.17,34.57294,...,1177.516,13.38729,AAA,Medium,,0.4533737,27.97165,3.519074,5,3
16.02625,,44.07282,-0.1968448,41.68058,43.3849,778.11,40.28402,121813.38,33.84939,...,1222.931,11.80511,BD,Low,2.917922,0.4473414,33.59351,5.089173,4,18
48.73783,12.71681,43.21778,-99.0,-48.40509,44.12584,980.9885,43.69187,120923.02,30.55316,...,1177.637,18.38487,BA,Low,2.93881,0.354881,29.94482,8.536889,7,21
91.99617,,41.87308,69.4844587,-12.38164,43.13339,1146.9242,69.35794,16453.59,23.15148,...,1662.733,23.0571,BB,Medium,2.939582,0.301911,24.55546,3.90696,5,12


In [9]:
head(trainset.imputation)

blade_breadth,month,hour,wind_speed,atmospheric_temperature,shaft_temperature,blades_angle,gearbox_temperature,engine_temperature,motor_torque,...,area_temperature,windmill_body_temperature,wind_direction,resistance,rotor_torque,blade_length,windmill_height,windmill_generated_power,turbine_status,cloud_level
0.3140648,8,14,94.82002,30.519014,41.72302,-0.9034229,82.41057,42.52302,2563.1245,...,26.89787,52.49037,239.8364,2730.311,42.08467,2.217542,24.28169,6.766521,BA,Medium
0.3671399,12,15,238.81942,22.301115,45.44391,15.1153228,44.75964,47.2821,2888.1341,...,46.02005,44.82715,492.0815,1964.503,42.7446,4.857385,24.28777,14.851089,ABC,Medium
0.4533737,5,3,10.72289,-1.876392,41.98118,1.7156961,-17.61646,43.46985,781.6954,...,34.57294,-99.0,259.2746,1177.516,13.38729,6.845387,27.97165,3.519074,AAA,Medium
0.4473414,4,18,16.02625,14.100095,44.07282,-0.1968448,41.68058,43.3849,778.11,...,33.84939,43.00875,528.004,1222.931,11.80511,2.917922,33.59351,5.089173,BD,Low
0.354881,7,21,48.73783,12.716815,43.21778,-99.0,-48.40509,44.12584,980.9885,...,30.55316,-99.0,423.3216,1177.637,18.38487,2.93881,29.94482,8.536889,BA,Low
0.301911,5,12,91.99617,27.311698,41.87308,69.4844587,-12.38164,43.13339,1146.9242,...,23.15148,41.19579,248.8143,1662.733,23.0571,2.939582,24.55546,3.90696,BB,Medium


### We have 2 training datasets, one with data imputation to handle NAs and one without handling of NAs. We will put both through CART and against a testset that has imputation and one that has NAs

## <a id='section3'>3. CART</a>

In [10]:
library(rpart)
#install.packages("rpart.plot")
library(rpart.plot)

In [11]:
#cart for trainset with na
cart.na<- rpart(windmill_generated_power~., data=trainset, method='anova',control=rpart.control(minsplit=2,cp=0))

In [12]:
#print tree
printcp(cart.na)


Regression tree:
rpart(formula = windmill_generated_power ~ ., data = trainset, 
    method = "anova", control = rpart.control(minsplit = 2, cp = 0))

Variables actually used in tree construction:
 [1] area_temperature          atmospheric_pressure     
 [3] atmospheric_temperature   blade_breadth            
 [5] blade_length              blades_angle             
 [7] cloud_level               engine_temperature       
 [9] gearbox_temperature       generator_temperature    
[11] hour                      month                    
[13] motor_torque              resistance               
[15] rotor_torque              shaft_temperature        
[17] turbine_status            wind_direction           
[19] wind_speed                windmill_body_temperature
[21] windmill_height          

Root node error: 143089/19583 = 7.3068

n=19583 (157 observations deleted due to missingness)

              CP nsplit  rel error   xerror      xstd
1     4.3799e-01      0 1.0000e+00 1.000118 0.01110

In [13]:
#determine cp to prune at
CVerror.cap<-cart.na$cptable[which.min(cart.na$cptable[,"xerror"]),"xerror"]+ cart.na$cptable[which.min(cart.na$cptable[,"xerror"]),"xstd"]

In [14]:
i<-1;j<-4
while (cart.na$cptable[i,j]>CVerror.cap){
    i<-i+1
}

cp.opt = ifelse(i>1,sqrt(cart.na$cptable[i,1]*cart.na$cptable[i-1,1]),1)
i
cp.opt

In [15]:
#prune
cart.na.prune<-prune(cart.na,cp=cp.opt)
printcp(cart.na.prune, digits=3)


Regression tree:
rpart(formula = windmill_generated_power ~ ., data = trainset, 
    method = "anova", control = rpart.control(minsplit = 2, cp = 0))

Variables actually used in tree construction:
 [1] atmospheric_pressure    atmospheric_temperature blade_breadth          
 [4] blades_angle            engine_temperature      gearbox_temperature    
 [7] generator_temperature   hour                    month                  
[10] motor_torque            resistance              wind_speed             

Root node error: 143089/19583 = 7.31

n=19583 (157 observations deleted due to missingness)

         CP nsplit rel error xerror    xstd
1  0.437991      0    1.0000 1.0001 0.01111
2  0.108482      1    0.5620 0.5627 0.00772
3  0.058112      2    0.4535 0.4544 0.00760
4  0.048341      3    0.3954 0.3968 0.00681
5  0.035312      4    0.3471 0.3288 0.00625
6  0.027338      5    0.3118 0.2995 0.00610
7  0.023911      8    0.2297 0.2280 0.00573
8  0.014557      9    0.2058 0.2098 0.00558
9  0

In [16]:
#repeat for trainset with imputation
cart.imputed<- rpart(windmill_generated_power~., data=trainset.imputation, method='anova',control=rpart.control(minsplit=2,cp=0))

In [17]:
#print tree
printcp(cart.imputed)


Regression tree:
rpart(formula = windmill_generated_power ~ ., data = trainset.imputation, 
    method = "anova", control = rpart.control(minsplit = 2, cp = 0))

Variables actually used in tree construction:
 [1] area_temperature          atmospheric_pressure     
 [3] atmospheric_temperature   blade_breadth            
 [5] blade_length              blades_angle             
 [7] cloud_level               engine_temperature       
 [9] gearbox_temperature       generator_temperature    
[11] hour                      month                    
[13] motor_torque              resistance               
[15] rotor_torque              shaft_temperature        
[17] turbine_status            wind_direction           
[19] wind_speed                windmill_body_temperature
[21] windmill_height          

Root node error: 144331/19740 = 7.3116

n= 19740 

              CP nsplit  rel error   xerror      xstd
1     4.3675e-01      0 1.0000e+00 1.000114 0.0110460
2     1.0732e-01      1 5.6325

In [18]:
#determine cp to prune at
CVerror.cap<-cart.imputed$cptable[which.min(cart.imputed$cptable[,"xerror"]),"xerror"]+ cart.imputed$cptable[which.min(cart.imputed$cptable[,"xerror"]),"xstd"]

In [19]:
i<-1;j<-4
while (cart.imputed$cptable[i,j]>CVerror.cap){
    i<-i+1
}

cp.opt = ifelse(i>1,sqrt(cart.imputed$cptable[i,1]*cart.imputed$cptable[i-1,1]),1)
i
cp.opt

In [20]:
#prune
cart.imputed.prune<-prune(cart.imputed,cp=cp.opt)
printcp(cart.imputed.prune, digits=3)


Regression tree:
rpart(formula = windmill_generated_power ~ ., data = trainset.imputation, 
    method = "anova", control = rpart.control(minsplit = 2, cp = 0))

Variables actually used in tree construction:
 [1] area_temperature          atmospheric_pressure     
 [3] blade_breadth             blades_angle             
 [5] engine_temperature        gearbox_temperature      
 [7] generator_temperature     hour                     
 [9] month                     motor_torque             
[11] resistance                turbine_status           
[13] wind_direction            wind_speed               
[15] windmill_body_temperature

Root node error: 144331/19740 = 7.31

n= 19740 

         CP nsplit rel error xerror    xstd
1  0.436747      0    1.0000 1.0001 0.01105
2  0.107317      1    0.5633 0.5636 0.00771
3  0.049617      2    0.4559 0.4566 0.00758
4  0.048094      3    0.4063 0.4084 0.00680
5  0.037486      4    0.3582 0.3625 0.00630
6  0.026809      5    0.3207 0.3238 0.00581
7  

In [21]:
#trainset with NAs variable importance
cart.na.prune$variable.importance
scaledVarImpt <- round(100*cart.na.prune$variable.importance/sum(cart.na.prune$variable.importance))
scaledVarImpt[scaledVarImpt > 3]  # Print all var impt > cutoff

In [22]:
#trainset with imputation variable importance
cart.imputed.prune$variable.importance
scaledVarImpt <- round(100*cart.imputed.prune$variable.importance/sum(cart.imputed.prune$variable.importance))
scaledVarImpt[scaledVarImpt > 3]  # Print all var impt > cutoff

## <a id='section4'>4. Results</a>

In [23]:
library(Metrics)
#prediction using both trainsets on testset with NAs
train.na.predict.na<-predict(cart.na.prune, newdata=testset)
train.imputed.predict.na<-predict(cart.imputed.prune,newdata=testset)
testset.na.actual<-testset$windmill_generated_power
train.na.predict.imputed<-predict(cart.na.prune,newdata=testset.imputation)
train.imputed.predict.imputed<-predict(cart.imputed.prune,newdata=testset.imputation)
testset.imputed.actual<-testset.imputation$windmill_generated_power

In [24]:
#form result dataframe
rmse.df<- data.frame(train.na.predict.na,train.imputed.predict.na,testset.na.actual,train.na.predict.imputed,train.imputed.predict.imputed,testset.imputed.actual)
nrow(rmse.df) #8460

In [25]:
head(rmse.df)

train.na.predict.na,train.imputed.predict.na,testset.na.actual,train.na.predict.imputed,train.imputed.predict.imputed,testset.imputed.actual
6.275851,6.025558,5.966275,6.275851,6.025558,5.966275
3.290764,3.66738,2.874342,3.290764,3.66738,2.874342
4.869788,4.987416,4.94578,4.869788,4.987416,4.94578
8.143861,8.546583,8.739166,8.143861,8.546583,8.739166
1.674652,1.687126,1.94881,1.674652,1.687126,1.94881
4.869788,5.201112,5.22922,4.869788,5.201112,5.22922


In [26]:
#check NAs in testset target variable
sum(is.na(testset.na.actual)) #50
sum(is.na(testset.imputed.actual)) #0

In [27]:
#remove rows with NA in target variable
rmse2.df<-rmse.df[!(is.na(rmse.df$testset.na.actual)),]
nrow(rmse2.df) #8410
sum(is.na(rmse2.df$testset.na.actual)) #0

In [28]:
rmse(rmse2.df$testset.na.actual, rmse2.df$testset.imputed.actual)

In [29]:
#calculate rmse
rmse.train.na.test.na <- rmse(rmse2.df$train.na.predict.na,rmse2.df$testset.na.actual)
rmse.train.imputed.test.na <- rmse(rmse2.df$train.imputed.predict.na,rmse2.df$testset.na.actual)
rmse.train.na.test.imputed <- rmse(rmse2.df$train.na.predict.imputed,rmse2.df$testset.imputed.actual)
rmse.train.imputed.test.imputed <- rmse(rmse2.df$train.imputed.predict.imputed,rmse2.df$testset.imputed.actual)

In [30]:
paste("rmse of CART without data imputation: ",rmse.train.na.test.na)
paste("rmse of CART with training data imputation: ",rmse.train.imputed.test.na)
paste("rmse of CART with testing data imputation: ",rmse.train.na.test.imputed)
paste("rmse of CART with both training and testing data imputation: ",rmse.train.imputed.test.imputed)

## <a id='section5'>5. CART w/o non-deterministic variables</a>

In [None]:
#CART removing non-deterministic variables in top results for variable importance
cart2<- rpart(windmill_generated_power~wind_speed+
              atmospheric_temperature+
              blades_angle+
              atmospheric_pressure+
              area_temperature+
              wind_direction+
              turbine_status+
              cloud_level+
              blade_length+
              blade_breadth+
              windmill_height+
              resistance,
              data=trainset, method='anova',control=rpart.control(minsplit=2,cp=0))

In [None]:
printcp(cart2)

In [None]:
#determine cp to prune at
CVerror.cap<-cart2$cptable[which.min(cart2$cptable[,"xerror"]),"xerror"]+ cart2$cptable[which.min(cart2$cptable[,"xerror"]),"xstd"]

i<-1;j<-4
while (cart2$cptable[i,j]>CVerror.cap){
    i<-i+1
}

cp.opt = ifelse(i>1,sqrt(cart2$cptable[i,1]*cart2$cptable[i-1,1]),1)
i
cp.opt

In [None]:
#prune
cart2.prune<-prune(cart2,cp=cp.opt)
printcp(cart2.prune, digits=3)

In [None]:
cart2.prune$variable.importance
scaledVarImpt <- round(100*cart2.prune$variable.importance/sum(cart2.prune$variable.importance))
scaledVarImpt[scaledVarImpt > 3]  # Print all var impt > cutoff

In [None]:
deterministic.predict <- predict(cart2.prune,newdata=testset)
rmse3.df <- data.frame(deterministic.predict, testset.na.actual)
nrow(rmse3.df)
head(rmse3.df)

In [None]:
#remove rows with NA in target variable
rmse3.df<-rmse3.df[!(is.na(rmse3.df$testset.na.actual)),]
nrow(rmse3.df) #8410
sum(is.na(rmse3.df$testset.na.actual)) #0

In [None]:
#rmse calculation
rmse.deterministic <- rmse(rmse3.df$deterministic.predict, rmse3.df$testset.na.actual)
paste("rmse of CART: ",rmse.train.na.test.na)
paste("rmse of CART with only deterministic variables: ", rmse.deterministic)

In [None]:
#increase in error
paste("Increase in error in %: ",(rmse.deterministic/rmse.train.na.test.na - 1)*100)