In [None]:
using CSV
using DataFrames
using Plots
using Random
using LinearAlgebra
using Statistics
using GLMNet

# Seed the global RNG so that code drawing from it (such as the `shuffle` used
# for the train/test split) produces the same result on every run.
Random.seed!(12_092_024)

In [None]:
# Load the district-level literacy CSV into a DataFrame.
df = CSV.File("Districtwise_literacy_rates.csv") |> DataFrame

# Remove, in place, every row that has a missing value in any column, then
# report the remaining (rows, columns) shape.
dropmissing!(df)
println("Dataset limpio: $(size(df))")

In [None]:
# Histograms of the two literacy columns, each written to its own PNG file.
default(size=(900, 600))

# One entry per figure: (column, title, bar colour, output file).
for (columna, titulo, tono, archivo) in (
    (:FEMALE_LIT, "Distribución alfabetización femenina", :purple,
     "female_literacy_hist.png"),
    (:MALE_LIT, "Distribución alfabetización masculina", :blue,
     "male_literacy_hist.png"),
)
    figura = histogram(df[!, columna]; bins=30, title=titulo,
                       xlabel=String(columna), ylabel="Frecuencia", color=tono)
    savefig(figura, archivo)
end

In [None]:
# Random 70/30 split of the row indices into training and test sets.
n = size(df, 1)
idx = shuffle(1:n)        # random permutation of the row indices
ntr = round(Int, 0.7n)    # number of training rows (70 %, rounded)
tr, te = idx[1:ntr], idx[(ntr + 1):end]

# Response variable: female literacy as a Float64 vector.
y = Float64.(df.FEMALE_LIT)

In [None]:
# Simple model: LASSO regression of FEMALE_LIT on MALE_LIT only.
# Force Float64 so the design matrix has the same element type as the response.
X_low = Matrix{Float64}(df[:, [:MALE_LIT]])

# Pick λ by cross-validation on the training rows (alpha=1 is the pure L1
# penalty), then refit on the training rows at that single λ.
cv_low = glmnetcv(X_low[tr, :], y[tr], alpha=1)
λ_low = cv_low.lambda[argmin(cv_low.meanloss)]
fit_low = glmnet(X_low[tr, :], y[tr], alpha=1, lambda=[λ_low])

# GLMNet.jl's `predict` selects models by integer index along the path, not by
# λ value. `fit_low` holds exactly one model, so predict with the default
# selection and flatten the result to a vector.
ŷ_low = vec(predict(fit_low, X_low[te, :]))

# Out-of-sample R² on the held-out rows.
r2_low = 1 - sum(abs2, y[te] .- ŷ_low) / sum(abs2, y[te] .- mean(y[te]))

# A GLMNetPath stores the intercepts in `a0` and the slopes in `betas` (one
# column per λ). Keep the intercept first so `coefs_low[2:end]` are the slopes.
coefs_low = vcat(fit_low.a0[1], fit_low.betas[:, 1])
nnz_low = count(!iszero, coefs_low[2:end])

println("R² simple = ", round(r2_low, digits=3),
        " | λ* = ", λ_low,
        " | #coef≠0 = ", nnz_low)

In [None]:
# Flexible model: LASSO on the base predictors, their squares and all pairwise
# interactions.
pred = Symbol[
    :MALE_LIT, :TOTPOPULAT, :P_URB_POP, :SEXRATIO, :OVERALL_LI, :AREA_SQKM,
    Symbol("TOT_6_10_15"), Symbol("TOT_11_13_15"), :SCH1, :SCH2, :SCH3
]

# Base design matrix, converted to Float64 once; the squared and interaction
# terms are derived from it so no arithmetic is done on raw integer columns.
Xbase = Matrix{Float64}(df[:, pred])
Xsq = Xbase .^ 2

# Pairwise interactions (i < j), one named column per pair. The original loop
# referenced an undefined variable (`ints!`) and used an integer as the column
# name; here each column is assigned under an explicit "<a>_x_<b>" name.
ints = DataFrame()
for i in 1:(length(pred) - 1), j in (i + 1):length(pred)
    ints[!, Symbol(pred[i], "_x_", pred[j])] = Xbase[:, i] .* Xbase[:, j]
end
Xint = Matrix{Float64}(ints)

X_high = hcat(Xbase, Xsq, Xint)

# Pick λ by cross-validation on the training rows (alpha=1 is the pure L1
# penalty), then refit on the training rows at that single λ.
cv_high = glmnetcv(X_high[tr, :], y[tr], alpha=1)
λ_high = cv_high.lambda[argmin(cv_high.meanloss)]
fit_high = glmnet(X_high[tr, :], y[tr], alpha=1, lambda=[λ_high])

# GLMNet.jl's `predict` selects models by integer index along the path, not by
# λ value. `fit_high` holds exactly one model, so predict with the default
# selection and flatten the result to a vector.
ŷ_high = vec(predict(fit_high, X_high[te, :]))

# Out-of-sample R² on the held-out rows.
r2_high = 1 - sum(abs2, y[te] .- ŷ_high) / sum(abs2, y[te] .- mean(y[te]))

# A GLMNetPath stores the intercepts in `a0` and the slopes in `betas` (one
# column per λ). Keep the intercept first so `coefs_high[2:end]` are the slopes.
coefs_high = vcat(fit_high.a0[1], fit_high.betas[:, 1])
nnz_high = count(!iszero, coefs_high[2:end])

println("R² flexible = ", round(r2_high, digits=3),
        " | λ* = ", λ_high,
        " | #coef≠0 = ", nnz_high)

In [None]:
# Coefficient path: fit the LASSO over a decreasing, log-spaced λ grid and plot
# every coefficient against λ.
λ_grid = exp.(range(log(1.0e4), stop=log(1.0e-3), length=100))
fit_path = glmnet(X_high[tr, :], y[tr], alpha=1, lambda=λ_grid)

# Use the λ values stored on the fitted path for the x-axis: the path may hold
# fewer models than the requested grid, and `betas` has one column per stored λ.
λ_path = fit_path.lambda

# Dense (predictors × λ) coefficient matrix. `betas` excludes the intercept,
# which is kept separately in `fit_path.a0`.
C = Matrix{Float64}(fit_path.betas)

plt = plot(xscale=:log10, xlabel="λ (alpha)", ylabel="Coeficientes",
           title="Trayectoria de coeficientes LASSO", legend=false)
for trayectoria in eachrow(C)
    plot!(plt, λ_path, collect(trayectoria))
end
# `linestyle` sets the dash pattern; `linetype` would set the series type.
hline!(plt, [0], linestyle=:dash, color=:black)
savefig(plt, "lasso_path.png")

println("Listo: gráficos guardados")