Skip to content
This repository
Browse code

Chapter 6 code

  • Loading branch information...
commit 76e9a75fd08115edb3dd055eaf6c1f531a288035 1 parent 24e55ba
John Myles White authored
494 06-Regularization/chapter06.R
... ... @@ -0,0 +1,494 @@
library('ggplot2')

# First snippet
# Simulate a noisy parabola: y = 1 - x^2 plus Gaussian noise (sd = 5).
set.seed(1)

x <- seq(-10, 10, by = 0.01)
y <- 1 - x ^ 2 + rnorm(length(x), 0, 5)

# Scatterplot with a nonparametric smoother: the curvature is obvious,
# so a plain linear model in x will fail.
ggplot(data.frame(X = x, Y = y), aes(x = X, y = Y)) +
  geom_point() +
  geom_smooth(se = FALSE)

# Second code snippet
# A squared copy of x turns the quadratic relationship into a linear one.
x.squared <- x ^ 2

# Third code snippet
# Against x^2 a straight line (method = 'lm') fits the data well.
ggplot(data.frame(XSquared = x.squared, Y = y), aes(x = XSquared, y = Y)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE)

# Fourth code snippet
# R^2 jumps from essentially 0 to ~0.97 once the predictor is squared.
summary(lm(y ~ x))$r.squared
#[1] 2.973e-06

summary(lm(y ~ x.squared))$r.squared
#[1] 0.9707
# Fifth code snippet
# Simulate one noisy period of a sine wave on [0, 1]: smooth but nonlinear.
set.seed(1)

x <- seq(0, 1, by = 0.01)
y <- sin(2 * pi * x) + rnorm(length(x), 0, 0.1)

df <- data.frame(X = x, Y = y)

ggplot(df, aes(x = X, y = Y)) +
  geom_point()

# Sixth code snippet
# A straight line captures only the overall downward trend (R^2 ~ 0.59).
summary(lm(Y ~ X, data = df))

#Call:
#lm(formula = Y ~ X, data = df)
#
#Residuals:
# Min 1Q Median 3Q Max
#-1.00376 -0.41253 -0.00409 0.40664 0.85874
#
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) 0.94111 0.09057 10.39 <2e-16 ***
#X -1.86189 0.15648 -11.90 <2e-16 ***
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 0.4585 on 99 degrees of freedom
#Multiple R-squared: 0.5885, Adjusted R-squared: 0.5843
#F-statistic: 141.6 on 1 and 99 DF, p-value: < 2.2e-16

# Seventh code snippet
# The linear fit drawn over the sine-wave data, to show the lack of fit.
ggplot(data.frame(X = x, Y = y), aes(x = X, y = Y)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE)
# Eighth code snippet
# Add squared and cubed copies of X so lm() can fit a cubic polynomial.
df <- transform(df, X2 = X ^ 2)
df <- transform(df, X3 = X ^ 3)

# A cubic tracks one sine period closely (R^2 ~ 0.97).
summary(lm(Y ~ X + X2 + X3, data = df))

#Call:
#lm(formula = Y ~ X + X2 + X3, data = df)
#
#Residuals:
# Min 1Q Median 3Q Max
#-0.32331 -0.08538 0.00652 0.08320 0.20239
#
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) -0.16341 0.04425 -3.693 0.000367 ***
#X 11.67844 0.38513 30.323 < 2e-16 ***
#X2 -33.94179 0.89748 -37.819 < 2e-16 ***
#X3 22.59349 0.58979 38.308 < 2e-16 ***
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 0.1153 on 97 degrees of freedom
#Multiple R-squared: 0.9745, Adjusted R-squared: 0.9737
#F-statistic: 1235 on 3 and 97 DF, p-value: < 2.2e-16
# Ninth code snippet
# Add polynomial terms X^4 through X^15 as explicit columns. A loop
# replaces the original twelve near-identical transform() calls; each
# iteration is equivalent to df <- transform(df, Xd = X ^ d).
for (d in 4:15)
{
  df[[paste0('X', d)]] <- df$X ^ d
}

# With raw powers up to X^14 the design matrix is numerically collinear:
# lm() drops X14 entirely ("1 not defined because of singularities") and
# no individual coefficient is significant despite a high R^2.
summary(lm(Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + X12 + X13 + X14,
           data = df))

#Call:
#lm(formula = Y ~ X + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 +
# X10 + X11 + X12 + X13 + X14, data = df)
#
#Residuals:
# Min 1Q Median 3Q Max
#-0.242662 -0.038179 0.002771 0.052484 0.210917
#
#Coefficients: (1 not defined because of singularities)
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) -6.909e-02 8.413e-02 -0.821 0.414
#X 1.494e+01 1.056e+01 1.415 0.161
#X2 -2.609e+02 4.275e+02 -0.610 0.543
#X3 3.764e+03 7.863e+03 0.479 0.633
#X4 -3.203e+04 8.020e+04 -0.399 0.691
#X5 1.717e+05 5.050e+05 0.340 0.735
#X6 -6.225e+05 2.089e+06 -0.298 0.766
#X7 1.587e+06 5.881e+06 0.270 0.788
#X8 -2.889e+06 1.146e+07 -0.252 0.801
#X9 3.752e+06 1.544e+07 0.243 0.809
#X10 -3.398e+06 1.414e+07 -0.240 0.811
#X11 2.039e+06 8.384e+06 0.243 0.808
#X12 -7.276e+05 2.906e+06 -0.250 0.803
#X13 1.166e+05 4.467e+05 0.261 0.795
#X14 NA NA NA NA
#
#Residual standard error: 0.09079 on 87 degrees of freedom
#Multiple R-squared: 0.9858, Adjusted R-squared: 0.9837
#F-statistic: 465.2 on 13 and 87 DF, p-value: < 2.2e-16
# Tenth code snippet
# poly() builds orthogonal polynomial terms, which sidesteps the
# collinearity that made the raw-power fit above degenerate.
summary(lm(Y ~ poly(X, degree = 14), data = df))

#Call:
#lm(formula = Y ~ poly(X, degree = 14), data = df)
#
#Residuals:
# Min 1Q Median 3Q Max
#-0.232557 -0.042933 0.002159 0.051021 0.209959
#
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) 0.010167 0.009038 1.125 0.2638
#poly(X, degree = 14)1 -5.455362 0.090827 -60.063 < 2e-16 ***
#poly(X, degree = 14)2 -0.039389 0.090827 -0.434 0.6656
#poly(X, degree = 14)3 4.418054 0.090827 48.642 < 2e-16 ***
#poly(X, degree = 14)4 -0.047966 0.090827 -0.528 0.5988
#poly(X, degree = 14)5 -0.706451 0.090827 -7.778 1.48e-11 ***
#poly(X, degree = 14)6 -0.204221 0.090827 -2.248 0.0271 *
#poly(X, degree = 14)7 -0.051341 0.090827 -0.565 0.5734
#poly(X, degree = 14)8 -0.031001 0.090827 -0.341 0.7337
#poly(X, degree = 14)9 0.077232 0.090827 0.850 0.3975
#poly(X, degree = 14)10 0.048088 0.090827 0.529 0.5979
#poly(X, degree = 14)11 0.129990 0.090827 1.431 0.1560
#poly(X, degree = 14)12 0.024726 0.090827 0.272 0.7861
#poly(X, degree = 14)13 0.023706 0.090827 0.261 0.7947
#poly(X, degree = 14)14 0.087906 0.090827 0.968 0.3358
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 0.09083 on 86 degrees of freedom
#Multiple R-squared: 0.986, Adjusted R-squared: 0.9837
#F-statistic: 431.7 on 14 and 86 DF, p-value: < 2.2e-16
# Eleventh code snippet
# Visualize orthogonal-polynomial fits of increasing degree. Each stanza
# overwrites poly.fit and df$PredictedY, then plots the fitted curve.

# Degree 1: underfits — just a straight line.
poly.fit <- lm(Y ~ poly(X, degree = 1), data = df)
df <- transform(df, PredictedY = predict(poly.fit))

ggplot(df, aes(x = X, y = PredictedY)) +
  geom_point() +
  geom_line()

# Degree 3: captures the sine shape well.
poly.fit <- lm(Y ~ poly(X, degree = 3), data = df)
df <- transform(df, PredictedY = predict(poly.fit))

ggplot(df, aes(x = X, y = PredictedY)) +
  geom_point() +
  geom_line()

# Degree 5: only a marginal improvement over degree 3.
poly.fit <- lm(Y ~ poly(X, degree = 5), data = df)
df <- transform(df, PredictedY = predict(poly.fit))

ggplot(df, aes(x = X, y = PredictedY)) +
  geom_point() +
  geom_line()

# Degree 25: wiggly overfit that chases the noise.
poly.fit <- lm(Y ~ poly(X, degree = 25), data = df)
df <- transform(df, PredictedY = predict(poly.fit))

ggplot(df, aes(x = X, y = PredictedY)) +
  geom_point() +
  geom_line()
# Twelfth code snippet
# Regenerate the sine-wave data with the same seed as before.
set.seed(1)

x <- seq(0, 1, by = 0.01)
y <- sin(2 * pi * x) + rnorm(length(x), 0, 0.1)

# Thirteenth code snippet
# Random 50/50 split into training and test sets. Sorting the sampled
# indices keeps the training x values in increasing order for plotting.
n <- length(x)

indices <- sort(sample(1:n, round(0.5 * n)))

training.x <- x[indices]
training.y <- y[indices]

# Negative indexing selects the complement: everything not in `indices`.
test.x <- x[-indices]
test.y <- y[-indices]

training.df <- data.frame(X = training.x, Y = training.y)
test.df <- data.frame(X = test.x, Y = test.y)
# Fourteenth code snippet
# Root mean squared error between observed values y and predictions h.
# Both arguments are numeric vectors of equal length; returns a scalar.
rmse <- function(y, h)
{
  errs <- y - h
  sqrt(mean(errs ^ 2))
}
# Fifteenth code snippet
# Fit polynomial models of degree 1..12 and record training and test
# RMSE for each. Rows are collected in a preallocated list and bound
# once at the end, instead of growing a data.frame with rbind() inside
# the loop (which copies the whole frame on every iteration).
rows <- vector('list', 2 * 12)

for (d in 1:12)
{
  poly.fit <- lm(Y ~ poly(X, degree = d), data = training.df)

  # In-sample error: always shrinks as the degree grows.
  rows[[2 * d - 1]] <- data.frame(Degree = d,
                                  Data = 'Training',
                                  RMSE = rmse(training.y, predict(poly.fit)))

  # Out-of-sample error: starts rising once the model overfits.
  rows[[2 * d]] <- data.frame(Degree = d,
                              Data = 'Test',
                              RMSE = rmse(test.y, predict(poly.fit,
                                                          newdata = test.df)))
}

performance <- do.call(rbind, rows)

# Sixteenth code snippet
# Training error keeps falling with degree while test error bottoms out
# and climbs: the classic bias-variance picture.
ggplot(performance, aes(x = Degree, y = RMSE, linetype = Data)) +
  geom_point() +
  geom_line()
# Seventeenth code snippet
# Measure model complexity as the sum of squared coefficients (L2 norm).
lm.fit <- lm(y ~ x)
model.complexity <- sum(coef(lm.fit) ^ 2)

# Eighteenth code snippet
# The two standard penalties: L2 (ridge) sums squared coefficients,
# L1 (lasso) sums their absolute values.
lm.fit <- lm(y ~ x)
l2.model.complexity <- sum(coef(lm.fit) ^ 2)
l1.model.complexity <- sum(abs(coef(lm.fit)))
# Nineteenth code snippet
# Regenerate the sine-wave data set.
set.seed(1)

x <- seq(0, 1, by = 0.01)
y <- sin(2 * pi * x) + rnorm(length(x), 0, 0.1)

# Twentieth code snippet
# glmnet() requires a matrix of predictors, not a bare vector.
x <- matrix(x)

library('glmnet')

# Fits the entire lasso regularization path: one row per Lambda value,
# from strong regularization (0 nonzero coefficients) down to almost none.
glmnet(x, y)

#Call: glmnet(x = x, y = y)
#
# Df %Dev Lambda
# [1,] 0 0.00000 0.542800
# [2,] 1 0.09991 0.494600
# [3,] 1 0.18290 0.450700
# [4,] 1 0.25170 0.410600
# [5,] 1 0.30890 0.374200
#...
#[51,] 1 0.58840 0.005182
#[52,] 1 0.58840 0.004721
#[53,] 1 0.58850 0.004302
#[54,] 1 0.58850 0.003920
#[55,] 1 0.58850 0.003571
# Twenty-first code snippet
# Rebuild the sine-wave data and the 50/50 train/test split from scratch
# so this section can be run independently of the earlier snippets.
set.seed(1)

x <- seq(0, 1, by = 0.01)
y <- sin(2 * pi * x) + rnorm(length(x), 0, 0.1)

n <- length(x)

indices <- sort(sample(1:n, round(0.5 * n)))

training.x <- x[indices]
training.y <- y[indices]

test.x <- x[-indices]
test.y <- y[-indices]

df <- data.frame(X = x, Y = y)

training.df <- data.frame(X = training.x, Y = training.y)
test.df <- data.frame(X = test.x, Y = test.y)
# Root mean squared error of predictions h against observations y
# (redefined here so this section is self-contained).
rmse <- function(y, h)
{
  sqrt(mean((h - y) ^ 2))
}
# Twenty-second code snippet
library('glmnet')

# Fit the lasso path on a degree-10 polynomial basis of the training X.
glmnet.fit <- with(training.df, glmnet(poly(X, degree = 10), Y))

# The lambda values glmnet chose along its regularization path.
lambdas <- glmnet.fit$lambda

# Evaluate the test RMSE at every lambda. Rows go into a preallocated
# list and are bound once, instead of growing a data.frame with rbind()
# inside the loop (quadratic copying).
# NOTE(review): poly() is recomputed on the test data, so its orthogonal
# basis differs from the training basis; this reproduces the book's code
# but is statistically questionable — confirm before reusing elsewhere.
rows <- vector('list', length(lambdas))

for (i in seq_along(lambdas))
{
  lambda <- lambdas[i]
  rows[[i]] <- data.frame(Lambda = lambda,
                          RMSE = rmse(test.y,
                                      with(test.df,
                                           predict(glmnet.fit,
                                                   poly(X, degree = 10),
                                                   s = lambda))))
}

performance <- do.call(rbind, rows)

# Twenty-third code snippet
# Test error as a function of regularization strength.
ggplot(performance, aes(x = Lambda, y = RMSE)) +
  geom_point() +
  geom_line()
# Twenty-fourth code snippet
# Pick the lambda with the lowest test RMSE. which.min() always returns
# a single index; the original which(RMSE == min(RMSE)) could return a
# vector on ties, making the later coef()/predict(s = best.lambda) calls
# operate on several lambda values at once.
best.lambda <- with(performance, Lambda[which.min(RMSE)])

# Refit the lasso path on the full data set before extracting coefficients.
glmnet.fit <- with(df, glmnet(poly(X, degree = 10), Y))

# Twenty-fifth code snippet
# Coefficients at the chosen lambda: the lasso has zeroed out most terms,
# keeping the odd low-order ones that describe the sine shape.
coef(glmnet.fit, s = best.lambda)

#11 x 1 sparse Matrix of class "dgCMatrix"
# 1
#(Intercept) 0.0101667
#1 -5.2132586
#2 0.0000000
#3 4.1759498
#4 0.0000000
#5 -0.4643476
#6 0.0000000
#7 0.0000000
#8 0.0000000
#9 0.0000000
#10 0.0000000
# Twenty-sixth code snippet
# Rank data for the top-100 O'Reilly books; Long.Desc. holds the blurbs.
ranks <- read.csv('data/oreilly.csv', stringsAsFactors = FALSE)

library('tm')

documents <- data.frame(Text = ranks$Long.Desc.)
# seq_len() is safe even for a zero-row frame, unlike 1:nrow(documents).
row.names(documents) <- seq_len(nrow(documents))

# NOTE(review): this matches the tm API the book was written against.
# Recent tm versions require content_transformer() around plain functions
# (e.g. tm_map(corpus, content_transformer(tolower))) and expect
# doc_id/text columns for DataframeSource — verify against your tm version.
corpus <- Corpus(DataframeSource(documents))
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords('english'))

# Rows = documents, columns = terms, entries = term counts.
dtm <- DocumentTermMatrix(corpus)

# Twenty-seventh code snippet
# Predictors: raw term counts. Response: reversed rank, so larger values
# correspond to more popular books.
x <- as.matrix(dtm)
y <- rev(1:100)
# Twenty-eighth code snippet
set.seed(1)

library('glmnet')

# Twenty-ninth code snippet
# For each candidate lambda, repeat a random 80/20 split 50 times and
# record the test RMSE. Rows are collected in a preallocated list and
# bound once at the end (avoids growing a data.frame with rbind() inside
# the loop). Loop order is unchanged so the RNG draws match the original.
rows <- vector('list', 6 * 50)
k <- 0

for (lambda in c(0.1, 0.25, 0.5, 1, 2, 5))
{
  for (i in 1:50)
  {
    indices <- sample(1:100, 80)

    training.x <- x[indices, ]
    training.y <- y[indices]

    test.x <- x[-indices, ]
    test.y <- y[-indices]

    glm.fit <- glmnet(training.x, training.y)

    predicted.y <- predict(glm.fit, test.x, s = lambda)

    # Named fold.rmse so the rmse() helper defined earlier in the file
    # is not clobbered by a numeric value.
    fold.rmse <- sqrt(mean((predicted.y - test.y) ^ 2))

    k <- k + 1
    rows[[k]] <- data.frame(Lambda = lambda,
                            Iteration = i,
                            RMSE = fold.rmse)
  }
}

performance <- do.call(rbind, rows)

# Thirtieth code snippet
# Bootstrap mean and confidence interval of RMSE for each lambda.
ggplot(performance, aes(x = Lambda, y = RMSE)) +
  stat_summary(fun.data = 'mean_cl_boot', geom = 'errorbar') +
  stat_summary(fun.data = 'mean_cl_boot', geom = 'point')
# Thirty-first code snippet
# Binary labels: the first 50 books (most popular) get 1, the rest 0.
y <- rep(c(1, 0), each = 50)

# Thirty-second code snippet
# Regularized logistic regression: only the family argument changes.
regularized.fit <- glmnet(x, y, family = 'binomial')

# Thirty-third code snippet
# The three calls contrast the default family (gaussian), an explicit
# gaussian, and the binomial family used for classification.
regularized.fit <- glmnet(x, y)

regularized.fit <- glmnet(x, y, family = 'gaussian')

regularized.fit <- glmnet(x, y, family = 'binomial')

# Thirty-fourth code snippet
# Raw binomial predictions are on the log-odds scale, not probabilities.
predict(regularized.fit, newx = x, s = 0.001)
#1 4.884576
#2 6.281354
#3 4.892129
#...
#98 -5.958003
#99 -5.677161
#100 -4.956271

# Thirty-fifth code snippet
# Threshold the log-odds at 0 to obtain 0/1 class labels.
ifelse(predict(regularized.fit, newx = x, s = 0.001) > 0, 1, 0)
#1 1
#2 1
#3 1
#...
#98 0
#99 0
#100 0

# Thirty-sixth code snippet
library('boot')

# inv.logit() maps log-odds to probabilities.
# (stats::plogis is the base-R equivalent.)
inv.logit(predict(regularized.fit, newx = x, s = 0.001))
#1 0.992494427
#2 0.998132627
#3 0.992550485
#...
#98 0.002578403
#99 0.003411583
#100 0.006989922
# Thirty-seventh code snippet
set.seed(1)

# For 250 random 80/20 splits, measure the 0/1 classification error at
# several lambdas. The glmnet fit depends only on the split, never on
# lambda (glmnet fits the whole path), so it is hoisted out of the inner
# loop — the original refit the identical model once per lambda. The fit
# is deterministic and sample() stays in the outer loop, so results and
# RNG draws match the original exactly. Rows are collected in a
# preallocated list and bound once, instead of rbind() growth.
# NOTE(review): 0.5 breaks the otherwise increasing lambda sequence —
# possibly a typo for 0.05 in the original; kept as-is to reproduce it.
lambdas <- c(0.0001, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.5, 0.1)

rows <- vector('list', 250 * length(lambdas))
k <- 0

for (i in 1:250)
{
  indices <- sample(1:100, 80)

  training.x <- x[indices, ]
  training.y <- y[indices]

  test.x <- x[-indices, ]
  test.y <- y[-indices]

  glm.fit <- glmnet(training.x, training.y, family = 'binomial')

  for (lambda in lambdas)
  {
    # Classify by thresholding the predicted log-odds at zero.
    predicted.y <- ifelse(predict(glm.fit, test.x, s = lambda) > 0, 1, 0)

    error.rate <- mean(predicted.y != test.y)

    k <- k + 1
    rows[[k]] <- data.frame(Lambda = lambda,
                            Iteration = i,
                            ErrorRate = error.rate)
  }
}

performance <- do.call(rbind, rows)
# Thirty-eighth code snippet
# Mean error rate with bootstrap confidence intervals per lambda, on a
# log-scaled x axis so the small lambda values remain distinguishable.
ggplot(performance, aes(x = Lambda, y = ErrorRate)) +
  stat_summary(fun.data = 'mean_cl_boot', geom = 'errorbar') +
  stat_summary(fun.data = 'mean_cl_boot', geom = 'point') +
  scale_x_log10()
1  06-Regularization/data/oreilly.csv
1 addition, 0 deletions not shown

0 comments on commit 76e9a75

Please sign in to comment.
Something went wrong with that request. Please try again.