# Ejercicio 2: Overfitting en R
Este notebook implementa el cálculo de R², R² ajustado y R² fuera de muestra usando modelos polinomiales.

In [None]:
library(ggplot2)
# Fix the RNG seed so the simulated sample is reproducible.
set.seed(123)

# Sample size.
n <- 1000

# Regressor: n uniform draws on [0, 1] (the runif() default range).
X <- runif(n)

# Noise: n standard-normal draws (the rnorm() default mean and sd).
e <- rnorm(n)

# Outcome: exponential signal in X plus additive noise.
y <- exp(4 * X) + e

# Mean squared error between observed values and predictions.
#
# y:     numeric vector of observed values.
# y_hat: numeric vector of predictions (recycled against y by the usual
#        R arithmetic rules if the lengths differ).
#
# Returns the average of the squared differences as a single number.
mse <- function(y, y_hat) {
  squared_error <- (y - y_hat)^2
  mean(squared_error)
}

# Coefficient of determination, R^2 = 1 - MSE(model) / MSE(baseline).
#
# The baseline is the constant prediction mean(y), i.e. the denominator is the
# variance of y about its mean. Dividing by mean(y^2) instead (the uncentered
# second moment) overstates R^2 whenever y has a non-zero mean, because a
# model that only predicts the mean would already score above zero.
#
# y:     numeric vector of observed outcomes.
# y_hat: numeric vector of predictions for the same observations.
#
# Returns a single number: 1 for a perfect fit, 0 for a model no better than
# predicting mean(y), negative for a worse one. The result is not finite when
# y is constant, because the baseline MSE is then zero.
r2_manual <- function(y, y_hat) {
  mse_model <- mse(y, y_hat)
  mse_baseline <- mse(y, mean(y))
  1 - mse_model / mse_baseline
}

# Adjusted R^2: penalizes R^2 for the number of regressors.
#
# r2: ordinary R^2 of the fitted model.
# n:  number of observations.
# k:  number of regressors (excluding the intercept).
#
# Returns 1 - (1 - r2) * (n - 1) / (n - k - 1), or NA when there are no
# residual degrees of freedom left (n - k - 1 <= 0).
r2_adjusted <- function(r2, n, k) {
  resid_df <- n - k - 1
  if (resid_df <= 0) {
    return(NA)
  }
  total_df <- n - 1
  1 - (1 - r2) * total_df / resid_df
}

# Polynomial degrees (number of features) to evaluate.
features_list <- c(1, 2, 5, 10, 20, 50, 100, 200, 500)

# One slot per degree; preallocated instead of growing the vectors with c().
r2_full <- numeric(length(features_list))
r2_adj <- numeric(length(features_list))
r2_out <- numeric(length(features_list))

for (j in seq_along(features_list)) {
  p <- features_list[j]

  # Design matrix with columns X^1, ..., X^p. vapply() returns an
  # n x p matrix for every p, so as.data.frame() names the columns V1..Vp.
  X_p <- vapply(seq_len(p), function(i) X^i, numeric(length(X)))
  X_p <- as.data.frame(X_p)

  # In-sample fit on the full sample.
  # NOTE(review): high powers of X in [0, 1] are nearly collinear, so lm() may
  # drop some columns and predict() may warn about a rank-deficient fit.
  model <- lm(y ~ ., data = X_p)
  y_hat <- predict(model, X_p)

  r2 <- r2_manual(y, y_hat)
  r2_full[j] <- r2
  # k is the nominal number of features p.
  r2_adj[j] <- r2_adjusted(r2, n, p)

  # Random 75% / 25% train/test split, redrawn for every degree.
  idx <- sample(seq_len(n), size = 0.75 * n)

  # drop = FALSE keeps a data frame (and the column name V1) when p == 1.
  # Without it the single column collapses to a vector, the train and test
  # predictors end up with different names, and predict() silently falls back
  # to the training predictor instead of using the test rows.
  trainX <- X_p[idx, , drop = FALSE]
  testX <- X_p[-idx, , drop = FALSE]
  trainY <- y[idx]
  testY <- y[-idx]

  train_data <- data.frame(trainY = trainY, trainX)
  test_data <- data.frame(testY = testY, testX)

  # Fit on the training rows only, then score on the held-out rows.
  model_train <- lm(trainY ~ ., data = train_data)
  y_pred_test <- predict(model_train, newdata = test_data)

  r2_out[j] <- r2_manual(test_data$testY, y_pred_test)
}

# Collect the three R² series in one table, one row per number of features.
df_results <- setNames(
  data.frame(features_list, r2_full, r2_adj, r2_out),
  c("Features", "R2_full", "R2_adj", "R2_out")
)

# In-sample R² against the number of polynomial features (log-scaled x axis).
ggplot(data = df_results, mapping = aes(x = Features, y = R2_full)) +
  geom_line() +
  geom_point() +
  scale_x_log10() +
  labs(
    title = "R² en toda la muestra (R)",
    x = "Número de features",
    y = "R²"
  )

# Adjusted R² against the number of polynomial features.
ggplot(data = df_results, mapping = aes(x = Features, y = R2_adj)) +
  geom_line(color = "orange") +
  geom_point(color = "orange") +
  scale_x_log10() +
  labs(
    title = "R² ajustado (R)",
    x = "Número de features",
    y = "R² ajustado"
  )

# Out-of-sample (test split) R² against the number of polynomial features.
ggplot(data = df_results, mapping = aes(x = Features, y = R2_out)) +
  geom_line(color = "green") +
  geom_point(color = "green") +
  scale_x_log10() +
  labs(
    title = "R² en test (R)",
    x = "Número de features",
    y = "R² fuera de muestra"
  )