In [None]:
# Install gbm only when it is not already available, so that re-running this
# cell does not reinstall the package every time; then attach it.
if (!requireNamespace("gbm", quietly = TRUE)) {
  install.packages("gbm")
}
library(gbm)

# Read training data file

In [21]:
# Load the training set from a space-separated text file.
# Each line is read as 28 fields: the first 26 are kept (id, cycle, three
# settings, sensors s1..s21) and the last two are dropped via
# colClasses = "NULL" (presumably empty fields produced by trailing
# separators -- TODO confirm against the raw file).
train_raw <- read.table(
  "../data/train_FD001.txt",
  sep = " ",
  colClasses = c(rep("numeric", 2), rep("double", 24), rep("NULL", 2)),
  # Spelled out in full: `col.name` only worked through partial argument
  # matching.
  col.names = c(
    "id", "cycle", "setting1", "setting2", "setting3",
    "s1", "s2", "s3", "s4", "s5", "s6",
    "s7", "s8", "s9", "s10", "s11", "s12",
    "s13", "s14", "s15", "s16", "s17", "s18",
    "s19", "s20", "s21", "na", "na"
  )
)
head(train_raw)

id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,⋯,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
1,1,-0.0007,-0.0004,100,518.67,641.82,1589.7,1400.6,14.62,⋯,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100,39.06,23.419
1,2,0.0019,-0.0003,100,518.67,642.15,1591.82,1403.14,14.62,⋯,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100,39.0,23.4236
1,3,-0.0043,0.0003,100,518.67,642.35,1587.99,1404.2,14.62,⋯,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100,38.95,23.3442
1,4,0.0007,0.0,100,518.67,642.35,1582.79,1401.87,14.62,⋯,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100,38.88,23.3739
1,5,-0.0019,-0.0002,100,518.67,642.37,1582.85,1406.22,14.62,⋯,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100,38.9,23.4044
1,6,-0.0043,-0.0001,100,518.67,642.1,1584.47,1398.37,14.62,⋯,521.68,2388.03,8132.85,8.4108,0.03,391,2388,100,38.98,23.3669


# Prepare training data
* Find max cycle for each engine id
* Append max cycle to each row
* Add a remaining-cycles column: RUL = max cycle - current cycle
* Select only needed columns for training

In [22]:
# Largest cycle number observed for each engine id, stored as column "max".
train_maxcycle <- aggregate(cycle ~ id, data = train_raw, FUN = max)
names(train_maxcycle) <- c("id", "max")

# Attach the per-engine maximum to every row of that engine.
train_labeled <- merge(train_raw, train_maxcycle, by = "id")

# Label: number of cycles left before the engine's last recorded cycle.
train_labeled[["RUL"]] <- train_labeled[["max"]] - train_labeled[["cycle"]]

# Keep only the identifier, the selected features and the label.
train_df <- train_labeled[
  c("id", "cycle", "s9", "s11", "s14", "s15", "RUL")
]
head(train_df)

id,cycle,s9,s11,s14,s15,RUL
1,1,9046.19,47.47,8138.62,8.4195,191
1,2,9044.07,47.49,8131.49,8.4318,190
1,3,9052.94,47.27,8133.23,8.4178,189
1,4,9049.48,47.13,8133.83,8.3682,188
1,5,9055.15,47.28,8133.8,8.4294,187
1,6,9049.68,47.16,8132.85,8.4108,186


# Read test data file

In [23]:
# Load the test set from a space-separated text file.
# Each line is read as 28 fields: the first 26 are kept (id, cycle, three
# settings, sensors s1..s21) and the last two are dropped via
# colClasses = "NULL" (presumably empty fields produced by trailing
# separators -- TODO confirm against the raw file).
test_raw <- read.table(
  "../data/test_FD001.txt",
  sep = " ",
  colClasses = c(rep("numeric", 2), rep("double", 24), rep("NULL", 2)),
  # Spelled out in full: `col.name` only worked through partial argument
  # matching.
  col.names = c(
    "id", "cycle", "setting1", "setting2", "setting3",
    "s1", "s2", "s3", "s4", "s5", "s6",
    "s7", "s8", "s9", "s10", "s11", "s12",
    "s13", "s14", "s15", "s16", "s17", "s18",
    "s19", "s20", "s21", "na", "na"
  )
)
head(test_raw)

id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,⋯,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
1,1,0.0023,0.0003,100,518.67,643.02,1585.29,1398.21,14.62,⋯,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100,38.86,23.3735
1,2,-0.0027,-0.0003,100,518.67,641.71,1588.45,1395.42,14.62,⋯,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100,39.02,23.3916
1,3,0.0003,0.0001,100,518.67,642.46,1586.94,1401.34,14.62,⋯,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100,39.08,23.4166
1,4,0.0042,0.0,100,518.67,642.44,1584.12,1406.42,14.62,⋯,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100,39.0,23.3737
1,5,0.0014,0.0,100,518.67,642.51,1587.19,1401.92,14.62,⋯,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100,38.99,23.413
1,6,0.0012,0.0003,100,518.67,642.11,1579.12,1395.13,14.62,⋯,521.92,2388.08,8127.46,8.4238,0.03,392,2388,100,38.91,23.3467


# Prepare test data
* Find max cycle for each engine id
* Keep only the row with max cycle for each engine id
* Select only necessary columns
* Order the data set by engine id

In [24]:
# Largest cycle number observed for each engine id in the test set.
test_maxcycle <- aggregate(cycle ~ id, data = test_raw, FUN = max)

# Keep only the row at each engine's last cycle. The join keys are named
# explicitly instead of relying on merge()'s default of "all shared columns",
# so an extra shared column can never silently change the join.
test_maxcycle_only <- merge(test_maxcycle, test_raw, by = c("id", "cycle"))
test_maxcycle_only <- test_maxcycle_only[
  c("id", "cycle", "s9", "s11", "s14", "s15")
]

# Order by engine id so rows can later be paired with the label file by
# position.
test_ordered <- test_maxcycle_only[order(test_maxcycle_only$id), ]
print(head(test_ordered), row.names = FALSE)

 id cycle      s9   s11     s14    s15
  1    31 9056.40 47.23 8130.11 8.4024
  2    49 9044.77 47.67 8126.90 8.4505
  3   126 9049.26 47.88 8131.46 8.4119
  4   106 9051.30 47.65 8133.64 8.4634
  5    98 9053.99 47.46 8125.74 8.4362
  6   105 9055.83 47.51 8139.02 8.4452


# Read labels (ground truth) for test data 

In [25]:
# Load the ground-truth labels for the test set as a single numeric column
# named "RUL".
rul_df <- read.table(
  "../data/RUL_FD001.txt",
  colClasses = "numeric",
  # Spelled out in full: `col.name` only worked through partial argument
  # matching.
  col.names = "RUL"
)
head(rul_df)

RUL
112
98
69
82
91
93


# Combine test data with label

In [26]:
# cbind() pairs rows purely by position, and data frames whose row counts
# divide evenly are recycled without any warning. Fail loudly if the label
# file does not have exactly one row per test engine.
stopifnot(nrow(test_ordered) == nrow(rul_df))

# Append the ground-truth RUL column to the per-engine test rows.
test_df <- cbind(test_ordered, rul_df)
print(head(test_df), row.names = FALSE)

 id cycle      s9   s11     s14    s15 RUL
  1    31 9056.40 47.23 8130.11 8.4024 112
  2    49 9044.77 47.67 8126.90 8.4505  98
  3   126 9049.26 47.88 8131.46 8.4119  69
  4   106 9051.30 47.65 8133.64 8.4634  82
  5    98 9053.99 47.46 8125.74 8.4362  91
  6   105 9055.83 47.51 8139.02 8.4452  93


# Train a Gradient Boosted regression model

In [27]:
# Model RUL as a function of the current cycle and four sensor readings.
formula <- as.formula("RUL ~ cycle + s9 + s11 + s14 + s15")

# Fit a gradient boosted regression model with squared-error loss
# ("gaussian"), 100 trees and a learning rate of 0.2.
# NOTE(review): no seed is set before fitting; gbm subsamples the training
# rows by default (bag.fraction), so repeated runs may give slightly
# different models -- confirm whether reproducibility is needed here.
gbt <- gbm(
  formula = formula,
  distribution = "gaussian",
  data = train_df,
  n.trees = 100,
  shrinkage = 0.2
)

# Evaluate the model
* Make predictions on the test data set
* Calculate the root mean squared error (RMSE) between predictions and ground truth

In [28]:
# Predict RUL for every row of the test set, using 100 trees of the fitted
# model.
predictions <- predict(gbt, newdata = test_df, n.trees = 100)

#' Compute the root mean squared error between observations and predictions.
#'
#' @param observed Numeric vector of ground-truth values.
#' @param predicted Numeric vector of predicted values. Must have the same
#'   length as `observed`, unless one of the two has length 1 (in which case
#'   it is broadcast against the other).
#' @return A named numeric vector of length 1; the element is named
#'   "Root Mean Squared Error".
evaluate_model <- function(observed, predicted) {
  n_observed <- length(observed)
  n_predicted <- length(predicted)

  # Vector arithmetic silently recycles the shorter input when the lengths
  # divide evenly, which would produce a plausible-looking but meaningless
  # score. Only an explicit scalar is allowed to broadcast.
  if (n_observed != n_predicted && n_observed != 1L && n_predicted != 1L) {
    stop(
      "`observed` and `predicted` must have the same length (got ",
      n_observed, " and ", n_predicted, ").",
      call. = FALSE
    )
  }

  squared_error <- (observed - predicted)^2
  c("Root Mean Squared Error" = sqrt(mean(squared_error)))
}

# Score the test-set predictions against the ground-truth RUL column, then
# display the result.
rmse <- evaluate_model(
  observed = test_df[["RUL"]],
  predicted = predictions
)
rmse