In [66]:
using JuMP, Gurobi, CSV, LinearAlgebra, DataFrames, Random, Distributions, Statistics,MLBase, ROCAnalysis

In [77]:
# Single Gurobi environment, created once here and passed to every `GurobiSolver`
# constructed by the functions below (they read this global by name).
gurobi_env = Gurobi.Env()

Academic license - for non-commercial use only


Gurobi.Env(Ptr{Nothing} @0x00007fd793153c00)

# Loading Data

In [87]:
# Read both splits first, then slice each table into features and labels.
train_data = CSV.read("data/adult_train.csv")
test_data = CSV.read("data/adult_test.csv")

# Columns 1:91 are taken as the feature matrix, column 92 as the label vector.
X_train, y_train = convert(Matrix, train_data[:, 1:91]), train_data[:, 92]
X_test, y_test = convert(Matrix, test_data[:, 1:91]), test_data[:, 92]

println("Got the data for X dataset.")

Got the data for X dataset.


# Utility Functions

In [88]:
"""
    compute_∇f(w_k, z_k, y, X, λ)

Return the gradient with respect to `w`, evaluated at `w_k`, of

    f(w) = Σᵢ z_k[i] * log(1 + exp(-y[i] * ⟨X[i, :], w⟩)) + λ * ⟨w, w⟩

with the sample weights `z_k` held fixed.

# Arguments
- `w_k`: coefficient vector of length `p` at which the gradient is evaluated.
- `z_k`: per-sample weights, length `n`.
- `y`: per-sample labels, length `n` (the loss form suggests ±1 labels — confirm
  against the data).
- `X`: `n × p` feature matrix, one sample per row.
- `λ`: coefficient of the squared-norm penalty.

# Returns
A length-`p` vector. When `X` has zero rows the result is just `2λ .* w_k`.
"""
function compute_∇f(w_k, z_k, y, X, λ)
    # One matrix-vector product yields every margin y[i] * ⟨X[i, :], w_k⟩ at once,
    # instead of copying each row of X twice per sample.
    margins = y .* (X * w_k)
    # d/dm log(1 + exp(-m)) = -1 / (1 + exp(m)); if exp overflows to Inf the
    # per-sample factor becomes 0, which is the correct limit.
    sample_factors = @. -z_k * y / (1 + exp(margins))
    # Σᵢ sample_factors[i] * X[i, :] is Xᵀ * sample_factors; add the penalty gradient.
    return transpose(X) * sample_factors + 2 * λ .* w_k
end

compute_∇f (generic function with 1 method)

In [89]:
"""
    solve_inner_max_problem(w_k, y, X, K, λ)

Solve the inner maximisation for fixed coefficients `w_k`:

    max_z  Σᵢ z[i] * log(1 + exp(-y[i] * ⟨X[i, :], w_k⟩))
    s.t.   0 ≤ z[i] ≤ 1,   Σᵢ z[i] ≤ K

as a linear program with Gurobi (using the global `gurobi_env`).

# Arguments
- `w_k`: coefficient vector of length `p`.
- `y`: per-sample labels, length `n`.
- `X`: `n × p` feature matrix, one sample per row.
- `K`: upper bound on the total sample weight `Σᵢ z[i]`.
- `λ`: coefficient of the squared-norm penalty added to the returned objective.

# Returns
`(optimal_z_k, optimal_f_k)`: the maximising weights and the LP objective value
plus `λ * ⟨w_k, w_k⟩`.
"""
function solve_inner_max_problem(w_k, y, X, K, λ)
    n = size(X, 1)

    # The per-sample losses are constants of the LP, so compute them once, outside
    # the JuMP macro, from a single matrix-vector product.
    neg_margins = -(y .* (X * w_k))
    # log(1 + exp(a)) written as max(a, 0) + log1p(exp(-|a|)): the naive form
    # overflows to Inf once a exceeds ~709, which would hand Gurobi an infinite
    # objective coefficient; this form never exponentiates a positive number.
    losses = @. max(neg_margins, 0) + log1p(exp(-abs(neg_margins)))

    model_inner_max = Model(solver=GurobiSolver(OutputFlag=0,gurobi_env))
    @variable(model_inner_max, z[1:n] >= 0)
    @constraint(model_inner_max, [i=1:n], 1 >= z[i])
    @constraint(model_inner_max, sum(z) <= K)
    @objective(
        model_inner_max,
        Max,
        sum(z[i]*losses[i] for i=1:n)
    )
    solve(model_inner_max)

    optimal_z_k = getvalue(z)
    # The penalty does not depend on z, so it is added after the solve.
    optimal_f_k = getobjectivevalue(model_inner_max) + λ*dot(w_k,w_k)
    return optimal_z_k, optimal_f_k
end

solve_inner_max_problem (generic function with 1 method)

# Cutting Planes Implementation

In [110]:
"""
    stable_LR_cutting_planes(y, X, ε, K, λ; max_cuts=20000, log_every=100)

Minimise over `w ∈ [-1, 1]^p` the value returned by `solve_inner_max_problem`
using a cutting-plane scheme: an outer LP minimises an epigraph variable `t`
subject to the linear cuts `t ≥ f_k + ⟨∇f_k, w - w_k⟩` collected so far, and each
new outer solution `w_k` is evaluated by the inner problem to produce the next cut.

# Arguments
- `y`, `X`: labels (length `n`) and `n × p` feature matrix.
- `ε`: stop once `|f_k - t_k| < ε`, where `t_k` is the outer LP value and `f_k`
  the inner-problem value at the same `w_k`.
- `K`: passed through to `solve_inner_max_problem`.
- `λ`: penalty coefficient, passed through to the inner problem and the gradient.

# Keywords
- `max_cuts=20000`: stop after the number of cuts exceeds this value, even if
  the gap is still above `ε` (a warning is emitted in that case).
- `log_every=100`: print the current gap every `log_every` cuts.

# Returns
`(t_k, f_k, w_k, z_k, errors)` for the last iterate, where `errors` holds the gap
`f_k - t_k` recorded after every outer solve.
"""
function stable_LR_cutting_planes(y, X, ε, K,λ; max_cuts=20000, log_every=100)
    errors = Float64[]
    p = size(X, 2)

    # Step 0: evaluate the inner problem and its gradient at the origin.
    w_0 = zeros(p)
    z_0, f_0 = solve_inner_max_problem(w_0, y, X, K, λ)
    ∇f_0 = compute_∇f(w_0, z_0, y, X, λ)

    # Outer minimisation problem: epigraph variable t, box-constrained w.
    outer_min_model = Model(solver=GurobiSolver(OutputFlag=0, gurobi_env))
    @variable(outer_min_model, t >= 0)
    @variable(outer_min_model, w[1:p])
    @constraint(outer_min_model, t >= f_0 + dot(∇f_0, w)-dot(∇f_0, w_0))
    @constraint(outer_min_model, [j=1:p], 1 >= w[j])
    @constraint(outer_min_model, [j=1:p], w[j] >= -1)
    @objective(outer_min_model, Min, t)
    k = 1 # Number of cuts in the outer problem so far
    solve(outer_min_model)

    # First iterate produced by the outer problem.
    t_k = getvalue(t)
    w_k = getvalue(w)
    z_k, f_k = solve_inner_max_problem(w_k, y, X, K, λ)
    ∇f_k = compute_∇f(w_k, z_k, y, X, λ)

    while abs(f_k - t_k) >= ε
        push!(errors, f_k - t_k)
        # Cut off the current iterate with the linearisation of f at w_k.
        @constraint(outer_min_model,t >= f_k + dot(∇f_k, w)-dot(∇f_k, w_k))
        k += 1
        solve(outer_min_model)

        # Refresh the iterate, its inner-problem value and the gradient.
        t_k = getvalue(t)
        w_k = getvalue(w)
        z_k, f_k = solve_inner_max_problem(w_k, y, X, K, λ)
        ∇f_k = compute_∇f(w_k, z_k, y, X, λ)

        if k % log_every == 0
            println("Number of constraints: ", k, "\t Error = ", abs(t_k - f_k))
        end
        if k > max_cuts
            # Do not return an unconverged solution silently.
            if abs(f_k - t_k) >= ε
                @warn "Cutting-plane limit reached before the gap fell below ε" cuts=k gap=(f_k - t_k)
            end
            break
        end
    end
    # Record the gap of the returned iterate as well.
    push!(errors, f_k - t_k)
    return t_k, f_k, w_k, z_k, errors
end

stable_LR_cutting_planes (generic function with 1 method)

In [111]:
"""
    scores(preds, gt)

Compare predicted labels `preds` with ground-truth labels `gt`, both coded as
`1` / `-1`, and return `(acc, TPR, FPR)`:

- `acc`: fraction of entries where `preds` equals `gt`.
- `TPR`: true positives over (true positives + false negatives).
- `FPR`: false positives over (false positives + true negatives).

Entries of `preds` that are neither `1` nor `-1` enter only `acc`. A rate whose
denominator is zero comes out as `NaN`.
"""
function scores(preds, gt)
    acc = count(preds .== gt) / size(preds, 1)

    # Boolean masks for each predicted / actual class.
    said_pos = preds .== 1
    said_neg = preds .== -1
    is_pos = gt .== 1
    is_neg = gt .== -1

    # Confusion-matrix cells.
    true_pos = count(said_pos .& is_pos)
    false_neg = count(said_neg .& is_pos)
    false_pos = count(said_pos .& is_neg)
    true_neg = count(said_neg .& is_neg)

    TPR = true_pos / (true_pos + false_neg)
    FPR = false_pos / (false_pos + true_neg)
    return acc, TPR, FPR
end

scores (generic function with 1 method)

In [112]:
# 50% train data: run the cutting-plane solver with K set to half the training samples.
n, p = size(X_train)
t_opt, f_opt, w_opt, z_opt, errors = stable_LR_cutting_planes(
    y_train, X_train, 0.01, round(Int, n / 2), 0.1
)

# Predicted label = sign of the linear score X * w_opt.
preds = X_train * w_opt
train_sc = scores(sign.(preds), y_train)
preds = X_test * w_opt
test_sc = scores(sign.(preds), y_test)

# Report (acc, TPR, FPR) for both splits.
for line in (
    "########## 50% data ##########",
    "########## Train Scores ##########",
    train_sc,
    "########## Test Scores ##########",
    test_sc
)
    println(line)
end

Number of constraints: 100	 Error = 30506.699243119783
Number of constraints: 200	 Error = 18700.28616590511
Number of constraints: 300	 Error = 8239.290518416337
Number of constraints: 400	 Error = 5505.1124951493
Number of constraints: 500	 Error = 3418.7603904502703
Number of constraints: 600	 Error = 2077.710359108709
Number of constraints: 700	 Error = 1543.1784504268999
Number of constraints: 800	 Error = 858.5312766481948
Number of constraints: 900	 Error = 642.4722961490752
Number of constraints: 1000	 Error = 540.9061045182098
Number of constraints: 1100	 Error = 361.8555419157983
Number of constraints: 1200	 Error = 285.9702688003472
Number of constraints: 1300	 Error = 186.76249553490197
Number of constraints: 1400	 Error = 209.90593917820115
Number of constraints: 1500	 Error = 137.2595974827782
Number of constraints: 1600	 Error = 143.32307084100466
Number of constraints: 1700	 Error = 96.73688145849519
Number of constraints: 1800	 Error = 85.39791169868658
Number of const

In [114]:
# 60% train data: run the cutting-plane solver with K set to 60% of the training samples.
n, p = size(X_train)
t_opt, f_opt, w_opt, z_opt, errors = stable_LR_cutting_planes(
    y_train, X_train, 0.01, round(Int, .6 * n), 0.1
)

# Predicted label = sign of the linear score X * w_opt.
preds = X_train * w_opt
train_sc = scores(sign.(preds), y_train)
preds = X_test * w_opt
test_sc = scores(sign.(preds), y_test)

# Report (acc, TPR, FPR) for both splits.
for line in (
    "########## 60% data ##########",
    "########## Train Scores ##########",
    train_sc,
    "########## Test Scores ##########",
    test_sc
)
    println(line)
end

Number of constraints: 100	 Error = 38838.91543297889
Number of constraints: 200	 Error = 20570.64376068709
Number of constraints: 300	 Error = 8887.016796861639
Number of constraints: 400	 Error = 5838.261684543708
Number of constraints: 500	 Error = 3492.5897934638324
Number of constraints: 600	 Error = 2641.833894532958
Number of constraints: 700	 Error = 1704.848026620688
Number of constraints: 800	 Error = 1067.3938910891302
Number of constraints: 900	 Error = 931.4926872739761
Number of constraints: 1000	 Error = 577.687351961782
Number of constraints: 1100	 Error = 483.4782063941675
Number of constraints: 1200	 Error = 327.25771003082264
Number of constraints: 1300	 Error = 297.3077182740708
Number of constraints: 1400	 Error = 205.1298003876418
Number of constraints: 1500	 Error = 147.8041734866656
Number of constraints: 1600	 Error = 131.9617796649145
Number of constraints: 1700	 Error = 106.02508573947307
Number of constraints: 1800	 Error = 92.91417073259618
Number of constr

In [115]:
# 70% train data: run the cutting-plane solver with K set to 70% of the training samples.
n, p = size(X_train)
t_opt, f_opt, w_opt, z_opt, errors = stable_LR_cutting_planes(
    y_train, X_train, 0.01, round(Int, .7 * n), 0.1
)

# Predicted label = sign of the linear score X * w_opt.
preds = X_train * w_opt
train_sc = scores(sign.(preds), y_train)
preds = X_test * w_opt
test_sc = scores(sign.(preds), y_test)

# Report (acc, TPR, FPR) for both splits.
for line in (
    "########## 70% data ##########",
    "########## Train Scores ##########",
    train_sc,
    "########## Test Scores ##########",
    test_sc
)
    println(line)
end

Number of constraints: 100	 Error = 36375.850374060756
Number of constraints: 200	 Error = 20087.024592071488
Number of constraints: 300	 Error = 12760.075070488918
Number of constraints: 400	 Error = 6671.2952592634665
Number of constraints: 500	 Error = 5261.616625764473
Number of constraints: 600	 Error = 2723.497381427529
Number of constraints: 700	 Error = 2141.6818462299325
Number of constraints: 800	 Error = 1627.3196252931793
Number of constraints: 900	 Error = 1182.562500368671
Number of constraints: 1000	 Error = 835.8368927293632
Number of constraints: 1100	 Error = 512.7072320222123
Number of constraints: 1200	 Error = 485.37677524134597
Number of constraints: 1300	 Error = 380.64139382197754
Number of constraints: 1400	 Error = 303.0271580628905
Number of constraints: 1500	 Error = 287.301817135949
Number of constraints: 1600	 Error = 202.03381878633627
Number of constraints: 1700	 Error = 153.3105115278231
Number of constraints: 1800	 Error = 130.91683969494625
Number of 