In [2]:
using Distributions, Random, Plots, DataFrames, Optim, Statistics
using LinearAlgebra, StatsFuns, ScikitLearn, CSV, SparseArrays, GLM, LaTeXTabulars

In [3]:
# Load the raw inputs: the bid table, the item table, and the item attributes,
# which are stored on disk as three columns that are handed to `sparse` in order.
bids = CSV.File("../Data/bids.csv") |> DataFrame;
items = CSV.File("../Data/items.csv") |> DataFrame;
attributes = CSV.File("../Data/sparse_attributes.csv") |> DataFrame;
# Columns 1, 2 and 3 of the attribute table become the (I, J, V) arguments of `sparse`.
sparse_attributes = sparse((attributes[!, k] for k in 1:3)...);

# 1. Some summary stats

In [4]:
# Work with log bids throughout.
bids.log_bid_value = log.(bids.bid_value)
# One row per item with the number of bids observed for it.
gbids = combine(groupby(bids, :item_num), nrow => :num_bids);
# The same five summary statistics are reported for both variables.
summary_funs = [mean, std, median, maximum, minimum]
bids_sum = combine(bids, (:log_bid_value .=> summary_funs)...);
gbids_sum = combine(gbids, (:num_bids .=> summary_funs)...);

In [5]:
# Write the summary statistics (rounded to three digits) as a LaTeX table:
# one row for log bid values and one for the number of bids per item.
summary_header = ["", "Mean", "SD", "Median", "Maximum", "Minimum"]
log_bid_row = hcat("Log Bid Value", round.(Array(bids_sum), digits=3))
num_bids_row = hcat("Number of Bids", round.(Array(gbids_sum), digits=3))
latex_tabular("output/ps3_q3_part1summary.tex",
              Tabular("cccccc"),
              [Rule(:top),
               summary_header,
               Rule(:mid),
               log_bid_row,
               num_bids_row,
               Rule(:bottom)])

# 2. Estimate $\gamma$

In [6]:
# Assemble one bid-level data frame carrying item information and item attributes.
# `row_num` is assigned before any join so that it records the original row order of
# `items`; it is later matched against the row index of the attribute matrix.
items[!, :row_num] = collect(1:nrow(items))
items = innerjoin(gbids, items, on=:item_num);
bids = innerjoin(bids, items, on=:item_num)
# Densify the sparse attribute matrix into auto-named columns and index its rows.
attributes = DataFrame(Matrix{Float64}(sparse_attributes), :auto)
attributes[!, :row_num] = collect(1:nrow(attributes))
attribute_bids = innerjoin(bids, attributes, on=:row_num);
# Every observed bid is an entry decision of 1.
attribute_bids[!, :enter] .= 1
# Predicted participants minus bids actually observed for the item.
attribute_bids.difference = attribute_bids.pred_n_participant .- attribute_bids.num_bids;


In [7]:
# Build the non-entrant observations: for every item, add `difference`
# (= pred_n_participant - num_bids) rows with enter = 0 that carry the item's
# identifiers and attribute columns but no bid.
nobids_parts = DataFrame[]  # concretely typed so reduce(vcat, ...) concatenates in one pass
for x in items.item_num
    # Rows belonging to this item; computed once and reused below.
    item_rows = attribute_bids[attribute_bids[!, :item_num] .== x, :]
    rownum = item_rows[1, :row_num]
    rep = Integer(item_rows[1, :difference])

    # Only items with more predicted participants than observed bids add rows.
    if rep > 0
        chars = repeat(DataFrame(item_rows[1, [:item_num, :pred_n_participant]]), rep)
        atts = repeat(attributes[attributes[!, :row_num] .== rownum, :], rep)
        entry = DataFrame(enter=zeros(Int, rep))
        push!(nobids_parts, hcat(chars, atts, entry))
    end
end
# Guard the case where no item has any non-entrant (reduce over an empty list throws).
nobids = isempty(nobids_parts) ? DataFrame() : reduce(vcat, nobids_parts)
attribute_bids = sort(vcat(nobids, attribute_bids, cols=:union), :item_num);


In [8]:
# Probit of the entry indicator on all attribute columns, used to build the
# inverse Mills ratio.
# Columns that are identifiers or outcomes rather than regressors.
non_regressors = [:log_bid_value, :bid_value, :row_num, :item_num,
                  :pred_n_participant, :enter, :num_bids, :difference]
att = sum(Term.(Symbol.(names(attribute_bids, Not(non_regressors)))))
probit = glm(Term(:enter) ~ att, attribute_bids, Binomial(), ProbitLink())
# GLM.predict returns the fitted response, i.e. the entry probability Φ(x'β).
attribute_bids.probit_fit = GLM.predict(probit, attribute_bids[:, Not(non_regressors)]);
# The inverse Mills ratio is φ(x'β) / Φ(x'β), so the index x'β has to be recovered from
# the fitted probability first. Probabilities are clamped away from 0 and 1 so the
# normal quantile stays finite and the ratio never becomes 0/0.
p_enter = clamp.(attribute_bids.probit_fit, eps(), 1 - eps())
probit_index = quantile.(Normal(), p_enter)
attribute_bids.imr = pdf.(Normal(), probit_index) ./ p_enter;

In [9]:
# Regression sample: only rows with an observed bid (enter == 1).
entered = attribute_bids.enter .== 1
# Regressors are every column except the identifiers and outcomes listed here.
X = sparse(Matrix(attribute_bids[entered, Not([:log_bid_value, :bid_value, :row_num, :item_num, :pred_n_participant, :enter, :num_bids, :difference])]));
Y = Vector(attribute_bids[entered, :log_bid_value]);

In [10]:
# Method 1: least-squares fit with scikit-learn's LinearRegression
@sk_import linear_model: LinearRegression
ols = LinearRegression()
ScikitLearn.fit!(ols, X, Y)
# Predictions on the same sample used for fitting, so the MSE below is in-sample.
Ŷ = ScikitLearn.predict(ols, X);
ols_resid = Ŷ .- Y
ols_mse = mean(ols_resid .^ 2)

┌ Info: Running `conda install -y -c conda-forge llvm-openmp` in root environment
└ @ Conda /Users/junwong/.julia/packages/Conda/x2UxR/src/Conda.jl:127


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



1.2193714288249031

In [11]:
# Method 2: Lasso, with the penalty chosen by AIC (LassoLarsIC)
@sk_import linear_model: LassoLarsIC;
lasso = LassoLarsIC(criterion="aic")
ScikitLearn.fit!(lasso, X, Y);
Ŷ = ScikitLearn.predict(lasso, X)
lasso_resid = Ŷ .- Y
lasso_mse = mean(lasso_resid .^ 2)
# Penalty value at which the information criterion is smallest.
best_alpha = lasso.alphas_[argmin(lasso.criterion_)]
println("Lasso MSE is ", lasso_mse, " with alpha ", best_alpha)


┌ Info: Running `conda install -y -c conda-forge llvm-openmp` in root environment
└ @ Conda /Users/junwong/.julia/packages/Conda/x2UxR/src/Conda.jl:127


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Lasso MSE is 1.2264922269223795 with alpha 1.160438988948313e-5


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LassoLarsIC())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


In [12]:
# Method 3: Neural net (one hidden layer with 250 units)
@sk_import neural_network: MLPRegressor
# `(250)` is just the integer 250 in Julia; `(250,)` is the one-element tuple that
# scikit-learn documents for `hidden_layer_sizes`.
# `random_state` pins the weight initialisation so the fitted values, and the residuals
# reused further down as homogenized bids, are the same on every run.
clf = MLPRegressor(hidden_layer_sizes=(250,), random_state=0)
nn_reg = ScikitLearn.fit!(clf, X, Y)
# In-sample predictions; `Ŷ` is deliberately left holding the neural-net fit.
Ŷ = ScikitLearn.predict(nn_reg, X)
nn_mse = mean((Ŷ .- Y).^2)

┌ Info: Running `conda install -y -c conda-forge llvm-openmp` in root environment
└ @ Conda /Users/junwong/.julia/packages/Conda/x2UxR/src/Conda.jl:127


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



0.5800010103208214

In [13]:
# Write the in-sample MSE of the three fits (rounded to three digits) as a LaTeX table.
mse_rows = [[label, round(mse, digits=3)] for (label, mse) in
            (("OLS", ols_mse), ("Lasso", lasso_mse), ("Neural Net", nn_mse))]
latex_tabular("output/ps3_q3_mse.tex",
              Tabular("cc"),
              [Rule(:top),
               ["", "MSE"],
               Rule(:mid),
               mse_rows...,
               Rule(:bottom)])

In [14]:
# Homogenized log bids
# Keep only observed bids; this is the same filter (and row order) used to build X and Y,
# so the residual vector lines up row by row.
attribute_bids = attribute_bids[attribute_bids[:,:enter].==1,:]
# `Ŷ` holds the predictions of whichever model was fitted last.
attribute_bids.resid_bid = Y - Ŷ;

# Two highest homogenized bids per item. The rank label is assigned inside each group so
# that an item with a single bid cannot shift the labels of the items that follow it.
rank_labels = ["first", "second"]
highest_bids = combine(groupby(attribute_bids, :item_num)) do sdf
    top = first(sort(sdf, :resid_bid, rev=true), 2)[:, [:item_num, :pred_n_participant, :resid_bid]]
    top.ranking = rank_labels[1:nrow(top)]
    top
end
# reshape into two columns
highest_bids = unstack(highest_bids, [:item_num, :pred_n_participant], :ranking, :resid_bid)
# Items without a second bid cannot be used below; drop them instead of passing missings on.
dropmissing!(highest_bids, [:first, :second])

# Split the (first, second) pairs by predicted number of participants: 2-3, 4-7, 8+.
X_3 = Array(highest_bids[2 .<= highest_bids[!,:pred_n_participant] .<= 3, [:first, :second]]);
X_7 = Array(highest_bids[4 .<= highest_bids[!,:pred_n_participant] .<= 7, [:first, :second]]);
X_8 = Array(highest_bids[8 .<= highest_bids[!,:pred_n_participant], [:first, :second]]);

In [15]:
# Summary statistics of the homogenized log bids, written as a LaTeX table.
# Note that this overwrites the earlier `bids_sum`.
bids_sum = combine(attribute_bids, (:resid_bid .=> [mean, std, median, maximum, minimum])...);
resid_row = hcat("Homogenized Log Bid", round.(Array(bids_sum), digits=3))
latex_tabular("output/ps3_q3_part7summary.tex",
              Tabular("cccccc"),
              [Rule(:top),
               ["", "Mean", "SD", "Median", "Maximum", "Minimum"],
               Rule(:mid),
               resid_row,
               Rule(:bottom)])

# 3. Song (2004)

In [16]:
@sk_import mixture: GaussianMixture 

# Evaluation grid for the homogenized log bid. The mixture is fitted on
# (first, second) pairs, so each grid value v is scored at the point (v, v).
bid_grid = -2:0.01:2
grid_points = hcat(bid_grid, bid_grid)

# One density / CDF vector per participant group, in the order 2-3, 4-7, 8+.
all_cdf = Vector{Float64}[]
all_pdf = Vector{Float64}[]
for x in [X_3, X_7, X_8]
    # random_state makes the EM initialisation, and hence the fit, reproducible.
    mixing = ScikitLearn.fit!(GaussianMixture(n_components=3, random_state=0), x)
    # score_samples returns log-densities. Locals are not named `pdf`/`cdf` so they do
    # not shadow the Distributions functions of the same name.
    density = exp.(score_samples(mixing, grid_points))
    # Running sum over the grid, rescaled so that its largest value is 1.
    cumulative = cumsum(density)
    push!(all_pdf, density)
    push!(all_cdf, cumulative ./ maximum(cumulative))
end


┌ Info: Running `conda install -y -c conda-forge llvm-openmp` in root environment
└ @ Conda /Users/junwong/.julia/packages/Conda/x2UxR/src/Conda.jl:127


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [45]:
# Overlay the estimated CDFs of the three participant groups on the evaluation grid
# and save the figure.
cdf_grid = -2:0.01:2
cdf_plot = plot(cdf_grid, all_cdf[1], width=2, labels = "2-3 participants")
plot!(cdf_plot, cdf_grid, all_cdf[2], width=2, labels = "4-7 participants")
plot!(cdf_plot, cdf_grid, all_cdf[3], width=2, labels ="8+ participants", legend = :topleft,
      xlabel="Homogenized Log Bid", ylabel="F(Homogenized Log Bid)")
savefig(cdf_plot, "output/ps3_q3_bids_cdf.pdf")

"/Users/junwong/Documents/io2_psets/ps3/output/ps3_q3_bids_cdf.pdf"

In [44]:
# plot optimal reserve price

"""
    reserve_price_grid(grid, F, f)

For every seller valuation `v` in `grid`, search the same grid for the reserve price `r`
that minimizes `abs(v - r + (1 - F(r)) / f(r))`, where `F` and `f` hold the CDF and
density values at the points of `grid` (same length and order as `grid`).

Return a `length(grid) × 2` matrix: column 1 is the minimizing grid value and column 2
is the minimized absolute value. Ties go to the first grid point.
"""
function reserve_price_grid(grid, F, f)
    result = zeros(length(grid), 2)
    for (i, v) in enumerate(grid)
        # Objective over all candidate reserve prices at once; findmin returns the
        # smallest value together with the index of its first occurrence.
        gap, j = findmin(abs.(v .- grid .+ (1 .- F) ./ f))
        result[i, 1] = grid[j]
        result[i, 2] = gap
    end
    return result
end

# Coarse grid search from -2 to 2 in steps of 0.01, the same grid on which
# all_cdf / all_pdf were evaluated.
valuation_range = -2:0.01:2
# Only the first group (all_cdf[1] / all_pdf[1]) is used here.
optimal_price = reserve_price_grid(valuation_range, all_cdf[1], all_pdf[1])

# Grid values are in logs; exponentiate both axes for the plot.
plot(exp.(valuation_range), exp.(optimal_price[:,1]), legend=false, 
     width=1.5, xlabel="Seller Valuation", ylabel="Optimal Reserve Price")
savefig("output/ps3_q3_reserve_prices.pdf")

"/Users/junwong/Documents/io2_psets/ps3/output/ps3_q3_reserve_prices.pdf"