In [13]:
using Distributions, Random, Plots, DataFrames, Optim, Statistics
using LinearAlgebra, StatsFuns, ScikitLearn, CSV, SparseArrays, GLM, LaTeXTabulars

In [14]:
# Read the three CSV inputs into DataFrames.
bids = CSV.File("../Data/bids.csv") |> DataFrame;
items = CSV.File("../Data/items.csv") |> DataFrame;
attributes = CSV.File("../Data/sparse_attributes.csv") |> DataFrame;
# Columns 1, 2 and 3 of `attributes` are passed, in that order, as the
# row-index, column-index and value arguments of `sparse`.
sparse_attributes = sparse((attributes[!, k] for k in 1:3)...);

# 1. Some summary stats

In [15]:
# Work with bids in logs from here on.
bids.log_bid_value = log.(bids.bid_value)

# One row per item with the number of bids observed for it.
gbids = combine(groupby(bids, :item_num), nrow => :num_bids)

# The same five statistics are reported for both variables; the resulting
# columns are named `<variable>_<statistic>` in the order listed here.
summary_stats = [mean, std, median, maximum, minimum]
bids_sum = combine(bids, :log_bid_value .=> summary_stats)
gbids_sum = combine(gbids, :num_bids .=> summary_stats);

In [16]:
# Build one table row: a text label followed by the statistics in `stats`
# (a one-row DataFrame), rounded to three digits.
summary_row(label, stats) = hcat(label, round.(Array(stats), digits=3))

# Write the part-1 summary statistics (log bid value, bids per item) to LaTeX.
latex_tabular("output/ps3_q3_part1summary.tex",
              Tabular("cccccc"),
              [Rule(:top),
               ["", "Mean", "SD", "Median", "Maximum", "Minimum"],
               Rule(:mid),
               summary_row("Log Bid Value", bids_sum),
               summary_row("Number of Bids", gbids_sum),
               Rule(:bottom)])

# 2. Estimate $\gamma$

In [17]:
# Create a dataframe with items that observe full participation, i.e. items
# whose number of observed bids equals `pred_n_participant`.
#
# `row_num` records each item's position in `items`; it is used below to pick
# rows of `sparse_attributes`.
# NOTE(review): this presumes the rows of `sparse_attributes` are ordered like
# the rows of `items` — confirm against the data description.
items.row_num = 1:nrow(items)
gbids = innerjoin(gbids, items, on=:item_num);
full_participation = gbids.num_bids .== gbids.pred_n_participant
no_selection_items = gbids[full_participation, [:item_num, :row_num, :pred_n_participant]];
num = no_selection_items.row_num
# Attach the item-level columns to every individual bid on the selected items.
no_selection_items = innerjoin(bids, no_selection_items, on=:item_num);
# Slice the sparse matrix first and densify only the selected rows; converting
# the full matrix to dense before indexing allocates every row needlessly.
no_selection_attributes = DataFrame(Matrix{Float64}(sparse_attributes[num, :]), :auto);
no_selection_attributes.row_num = num;
no_selection_items = innerjoin(no_selection_items, no_selection_attributes, on=:row_num);

In [18]:
# Every column except the outcome, the raw bid and the identifier/count
# columns listed here is used as a regressor.
non_feature_cols = [:log_bid_value, :bid_value, :row_num, :item_num, :pred_n_participant]
X = sparse(Matrix(no_selection_items[:, Not(non_feature_cols)]));
# Outcome: log bid value, as a plain vector.
Y = collect(no_selection_items.log_bid_value);

In [19]:
@sk_import linear_model: LinearRegression
@sk_import linear_model: Lasso
@sk_import neural_network: MLPRegressor

# In-sample mean squared error of a prediction vector against `Y`.
in_sample_mse(pred) = mean((pred .- Y).^2)

# Method 1: Linear regression
ols = ScikitLearn.fit!(LinearRegression(), X, Y)
pred_ols = ScikitLearn.predict(ols, X)
ols_mse = in_sample_mse(pred_ols)

# Method 2: Lasso
lasso = ScikitLearn.fit!(Lasso(alpha=0.001), X, Y)
pred_lasso = ScikitLearn.predict(lasso, X)
lasso_mse = in_sample_mse(pred_lasso)

# Method 3: Neural net
clf = MLPRegressor(alpha=1e-5, random_state=1)
nn_reg = ScikitLearn.fit!(clf, X, Y)
pred_nn = ScikitLearn.predict(nn_reg, X)

# `Ŷ` is left holding the neural-net predictions; the homogenized-bid cell
# further down computes residuals as `Y - Ŷ`.
Ŷ = pred_nn
nn_mse = in_sample_mse(Ŷ)

┌ Info: Running `conda install -y -c conda-forge llvm-openmp` in root environment
└ @ Conda /Users/junwong/.julia/packages/Conda/x2UxR/src/Conda.jl:127


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



┌ Info: Running `conda install -y -c conda-forge llvm-openmp` in root environment
└ @ Conda /Users/junwong/.julia/packages/Conda/x2UxR/src/Conda.jl:127


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



┌ Info: Running `conda install -y -c conda-forge llvm-openmp` in root environment
└ @ Conda /Users/junwong/.julia/packages/Conda/x2UxR/src/Conda.jl:127


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



0.2899437096764593

In [20]:
# One [label, rounded MSE] row per fitted model, in the order they were fit.
mse_rows = [[label, round(mse, digits=3)] for (label, mse) in
            (("OLS", ols_mse), ("Lasso", lasso_mse), ("Neural Net", nn_mse))]

# Write the in-sample MSE comparison to LaTeX.
latex_tabular("output/ps3_q3_mse.tex",
              Tabular("cc"),
              [Rule(:top),
               ["", "MSE"],
               Rule(:mid),
               mse_rows...,
               Rule(:bottom)])

In [21]:
# Homogenized log bids: residual of the log bid after removing the fitted
# value. `Ŷ` is whatever prediction vector was assigned last; in the model
# fitting cell above that is the neural-net prediction.
no_selection_items.resid_bid = Y - Ŷ;

# Keep the two largest residual bids of every item and label them inside the
# group. Labelling per group (instead of repeating ["first", "second"] over the
# stacked result) stays correct when an item contributes a single row: the old
# approach then either failed on an odd row count or shifted labels across items.
rank_labels = ["first", "second"]
highest_bids = combine(groupby(no_selection_items, :item_num)) do sdf
    top = first(sort(sdf, :resid_bid, rev=true), 2)
    DataFrame(pred_n_participant = top.pred_n_participant,
              resid_bid = top.resid_bid,
              ranking = rank_labels[1:nrow(top)])
end
# reshape into two columns (:first, :second), one row per item; an item with a
# single bid gets `missing` in :second
highest_bids = unstack(highest_bids, [:item_num, :pred_n_participant], :ranking, :resid_bid)

# Split by number of participants: 2-3, 4-7 and 8 or more. Items with fewer
# than two participants fall in none of the groups.
X_3 = Array(highest_bids[2 .<= highest_bids[!,:pred_n_participant] .<= 3, [:first, :second]]);
X_7 = Array(highest_bids[4 .<= highest_bids[!,:pred_n_participant] .<= 7, [:first, :second]]);
X_8 = Array(highest_bids[8 .<= highest_bids[!,:pred_n_participant], [:first, :second]]);

In [22]:
# Summary statistics of the homogenized (residual) log bids; note this
# overwrites the earlier `bids_sum`.
resid_stats = [mean, std, median, maximum, minimum]
bids_sum = combine(no_selection_items, :resid_bid .=> resid_stats)
homogenized_row = hcat("Homogenized Log Bid", round.(Array(bids_sum), digits=3))

latex_tabular("output/ps3_q3_part7summary.tex",
              Tabular("cccccc"),
              [Rule(:top),
               ["", "Mean", "SD", "Median", "Maximum", "Minimum"],
               Rule(:mid),
               homogenized_row,
               Rule(:bottom)])

# 3. Song (2004)

In [23]:
@sk_import mixture: GaussianMixture 

# Evaluation grid for the homogenized bids, and the two-column points at which
# the fitted mixtures are evaluated (both coordinates set to the same value).
# NOTE(review): each mixture is fit on the two columns (:first, :second) of
# X_3 / X_7 / X_8, so the density below is the joint density along the
# diagonal, not a marginal density — confirm this is the intended object.
bid_grid = -2:0.01:2
eval_points = hcat(bid_grid, bid_grid)

# One density vector and one normalised cumulative vector per participant
# group, in the order X_3, X_7, X_8.
all_cdf = Vector{Float64}[]
all_pdf = Vector{Float64}[]
for x in [X_3, X_7, X_8]
    # Fixed seed so the fitted mixture (and everything derived from it) is
    # reproducible across runs, as already done for the MLPRegressor.
    mixing = ScikitLearn.fit!(GaussianMixture(n_components=3, random_state=1), x)
    # score_samples returns log-densities; exponentiate to get densities.
    # Local names avoid shadowing `pdf` / `cdf` exported by Distributions.
    density = exp.(score_samples(mixing, eval_points))
    cumulative = cumsum(density)
    push!(all_pdf, density)
    # Normalise so the cumulative curve ends at 1 on the grid.
    push!(all_cdf, cumulative ./ maximum(cumulative))
end


┌ Info: Running `conda install -y -c conda-forge llvm-openmp` in root environment
└ @ Conda /Users/junwong/.julia/packages/Conda/x2UxR/src/Conda.jl:127


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.





In [24]:
# Overlay the three normalised cumulative curves (one per participant group)
# on a common grid and save the figure.
cdf_grid = -2:0.01:2
cdf_plot = plot(cdf_grid, all_cdf[1], labels = "2-3 participants")
plot!(cdf_plot, cdf_grid, all_cdf[2], labels = "4-7 participants")
plot!(cdf_plot, cdf_grid, all_cdf[3], labels ="8+ participants", legend = :topleft)
savefig(cdf_plot, "output/bids_cdf.pdf")

"/Users/junwong/Documents/io2_psets/ps3/output/bids_cdf.pdf"

In [None]:
# Locate the reserve price for the first participant group (2-3 participants).
# For every grid point x, compute r = (1 - F(x)) / f(x) from the fitted curves
# and record the gap |x - r|; the grid point with the smallest gap is the
# approximate solution of x = (1 - F(x)) / f(x).
eh = -2:0.01:2
p = map(zip(eh, all_cdf[1], all_pdf[1])) do (x, F, f)
    gap = abs(x - (1 - F) / f)
    # 0/0 (F == 1 with an underflowed density) gives NaN, which argmin would
    # otherwise select; treat such points as infinitely far from a solution.
    return isnan(gap) ? Inf : gap
end

# Grid value with the smallest gap. The original hard-coded index 209 (grid
# value 0.08), presumably the minimiser found by inspection of `p`.
eh[argmin(p)]

In [None]:
# Exponentiate 0.08, which is the value of the grid -2:0.01:2 at index 209.
# Presumably this converts the log-scale reserve price back to levels — TODO confirm.
exp(0.08)