In [1]:
using Random
Random.seed!(13)

using CSV
using Plots
using DataFrames
using Statistics
using Missings
using Distributions
using StatsPlots
using LowRankModels, LinearAlgebra

In [2]:
# Pull in the local solver file; it presumably defines the `proxgrad` routine called
# by the fitting cells (its contents are not visible here — confirm).
include("proxgrad.jl")

# Read the airline review table into a DataFrame.
# The sink-less form `CSV.read(path)` was deprecated and then removed in later CSV.jl
# releases; materializing a `CSV.File` through the `DataFrame` constructor works on both
# older and newer versions.
# NOTE: "airline.csv" is resolved relative to the current working directory.
df = DataFrame(CSV.File("airline.csv"))

ArgumentError: ArgumentError: "airline.csv" is not a valid file

In [3]:
# Organize and display features along with variable type
feature_names = names(df)

# Walk over every column rather than a hard-coded 1:20, so the listing neither throws a
# BoundsError on a narrower table nor silently omits columns on a wider one.
# Each line prints: column position, column name, element type of the column.
for (i, name) in enumerate(feature_names)
    println(i, "\t", name, "\t\t\t", eltype(df[!, i]))
end

UndefVarError: UndefVarError: df not defined

In [4]:
# Reduce number of datapoints for computation time

# Keep only the reviews that carry an overall rating (used below as the target).
df_OR = df[completecases(df, [:overall_rating]), :];

# Additionally require a seat comfort rating.
df_OR_filtered = df_OR[completecases(df_OR, [:seat_comfort_rating]), :];

# Finally require the remaining ratings: cabin staff, food/beverages,
# inflight entertainment and value for money.
remaining_ratings = [
    :cabin_staff_rating,
    :food_beverages_rating,
    :inflight_entertainment_rating,
    :value_money_rating,
]
data_OR_filtered = df_OR_filtered[completecases(df_OR_filtered, remaining_ratings), :];

train_proportion = 0.8
n = size(data_OR_filtered, 1)
println("Size of dataset: ", n)

# Put the first ntrain observations into the training set, and the rest into the test set
ntrain = round(Int, train_proportion * n)

# Report the sizes that the split below really uses. The previous version printed
# independently rounded floats, which could disagree with `ntrain`.
println("Size of train: ", ntrain)
println("Size of test: ", n - ntrain)

target = data_OR_filtered[:, :overall_rating]

# Rows of the training set (all columns are kept; features are selected in a later cell)
train_x = data_OR_filtered[1:ntrain, :]

# Rows of the test set
test_x = data_OR_filtered[ntrain+1:end, :]

# Labels of the training and test examples. Rows with a missing overall rating were
# already removed above, so `skipmissing` drops nothing here; collecting it yields a
# vector whose element type no longer admits `missing`.
train_y = collect(skipmissing(target[1:ntrain]))
test_y = collect(skipmissing(target[ntrain+1:end]));

UndefVarError: UndefVarError: df not defined

In [5]:
"""This function just computes the mean squared error."""
function MSE(y, pred)
    # Element-wise squared difference between the two inputs.
    squared_residuals = abs.((y .- pred) .^ 2)
    # Normalize by the extent of `y` along its first dimension.
    return sum(squared_residuals) / size(y, 1)
end

"""This function plots the main diagonal; 
for a "predicted vs true" plot with perfect predictions,
all data lies on this line"""
function plotDiagonal(xmin, xmax)
    # The same two endpoints are used for both coordinates, so the segment lies on y = x.
    endpoints = [xmin, xmax]
    return plot!(endpoints, endpoints; color=:black)
end

"""This helper funciton plots x vs, y and labels the axes."""
function plotdata(x, y, xname, yname; margin=.05, plotDiag=true, zeromin=false)
    # Scatter the points and label both axes.
    scatter(x, y, label="data")
    xlabel!(xname)
    ylabel!(yname)

    # Data extent on each axis and the padding derived from it.
    ylo, yhi = extrema(y)
    xlo, xhi = extrema(x)
    ypad = margin * (yhi - ylo)
    xpad = margin * (xhi - xlo)

    # Optional y = x reference line spanning the padded x extent.
    plotDiag && plotDiagonal(xlo - xpad, xhi + xpad)

    # Lower limits are pinned to zero when `zeromin` is set, otherwise padded like the
    # upper limits.
    ylower = zeromin ? 0.0 : ylo - ypad
    xlower = zeromin ? 0.0 : xlo - xpad
    ylims!((ylower, yhi + ypad))
    return xlims!((xlower, xhi + xpad))
end

"""This function plots the predicted labels vs the actual labels
(We only plots the first 1000 points to avoid slow plots.)"""
function plot_pred_true(test_pred, test_y, max_points = 1000)
    # Cap the sample at what is actually available: slicing `1:max_points` directly throws
    # a BoundsError whenever fewer than `max_points` predictions (or labels) exist.
    npoints = min(max_points, length(test_pred), length(test_y))
    # Predictions go on the x axis and true values on the y axis; both axes start at zero.
    return plotdata(
        test_pred[1:npoints],
        test_y[1:npoints],
        "Predicted Rating",
        "True Rating",
        zeromin=true,
    )
end

plot_pred_true

In [6]:
"This function converts strings to floating point values.
Strings that cannot be represented as a number (like NA) are converted to zeros"
function string_to_float(str::AbstractString)
    # `tryparse` returns `nothing` for text that is not a number instead of throwing, so
    # no blanket `catch` is needed and unrelated exceptions are no longer swallowed.
    parsed = tryparse(Float64, str)
    return parsed === nothing ? 0.0 : parsed
end

# Fallback for non-string inputs (e.g. `missing`): the previous try/catch implementation
# mapped these to 0.0 as well, because `parse(Float64, x)` has no method for them.
string_to_float(::Any) = 0.0

# Column names treated as real-valued; a later cell uses this list to filter the
# feature columns it selects.
labels_real = Symbol[
    :overall_rating,
    :seat_comfort_rating,
    :cabin_staff_rating,
    :food_beverages_rating,
    :inflight_entertainment_rating,
    :ground_service_rating,
    :wifi_connectivity_rating,
    :value_money_rating,
    :recommended,
]

# Column names treated as strings.
# NOTE(review): these names are not referenced by any visible cell and do not resemble
# the rating columns above — confirm they belong to this dataset.
labels_string = Symbol[:beds, :security_deposit, :cleaning_fee]

3-element Array{Symbol,1}:
 :beds            
 :security_deposit
 :cleaning_fee    

In [7]:
# Features used by the model: seat comfort, cabin staff, food/beverage, inflight
# entertainment and value for money. The list is declared once so the train and test
# matrices always use the same columns in the same order; it is still filtered
# against `labels_real`, as before.
feature_columns = filter(
    in(labels_real),
    [
        :seat_comfort_rating,
        :cabin_staff_rating,
        :food_beverages_rating,
        :inflight_entertainment_rating,
        :value_money_rating,
    ],
)

# `Matrix(df)` replaces `convert(Matrix, df)`, which later DataFrames.jl releases
# deprecated and removed; the constructor form is accepted by older releases too.
train_vals_real = Matrix(train_x[:, feature_columns]);
test_vals_real = Matrix(test_x[:, feature_columns]);

# Add offset: append a column of ones so the model can learn an intercept term.
train_vals_real = [train_vals_real ones(size(train_vals_real, 1))];
test_vals_real = [test_vals_real ones(size(test_vals_real, 1))];

# Show the training design matrix next to its labels as the cell output.
[train_vals_real train_y]

UndefVarError: UndefVarError: train_x not defined

In [8]:
# Convert both design matrices to a concrete Float64 element type. Previously only the
# training matrix was converted, leaving the test matrix with whatever element type it
# inherited from the DataFrame columns (presumably one that still admits `missing`).
# Converting both keeps train and test predictions on the same numeric footing.
train_vals_real = convert(Array{Float64,2},train_vals_real);
test_vals_real = convert(Array{Float64,2},test_vals_real);

UndefVarError: UndefVarError: train_vals_real not defined

In [9]:
# Single fit with the L1 loss; the quadratic regularizer is built with weight λ = 0.
λ = 0
loss_L1 = (1 / n) * L1Loss()
reg = QuadReg(λ)

# Fit the weight vector on the training data, passing `maxiters=10` to `proxgrad`.
w_L1 = proxgrad(loss_L1, reg, train_vals_real, train_y, maxiters=10)

# Linear predictions for both splits.
train_pred = train_vals_real * w_L1
test_pred = test_vals_real * w_L1

# Mean squared error of the predictions on each split.
train_MSE = MSE(train_pred, train_y)
test_MSE = MSE(test_pred, test_y)

println("Train MSE:\t", train_MSE)
println("Test MSE: \t", test_MSE)

UndefVarError: UndefVarError: n not defined

In [10]:
# L1 loss with a quadratic regularizer: sweep λ over 0, 0.01, …, 1 and record the
# train/test MSE obtained for each value.
loss_L1 = 1/n*L1Loss()

train_MSE_array = zeros(101)
test_MSE_array = zeros(101)

for i = 0:100
    λ = i/100
    println(λ)
    reg = QuadReg(λ)
    w_L1 = proxgrad(loss_L1, reg, train_vals_real, train_y, maxiters=10)
    train_pred = train_vals_real*w_L1
    test_pred = test_vals_real*w_L1
    train_MSE = MSE(train_pred,train_y)
    test_MSE = MSE(test_pred,test_y)
    # Slot i+1 holds the result for λ = i/100 (arrays are 1-based).
    train_MSE_array[i+1] = train_MSE
    test_MSE_array[i+1] = test_MSE
end

println("Train MSE:\t", minimum(train_MSE_array))
# Bug fix: take the minimum over the recorded array. `test_MSE` is only the scalar from
# the last loop iteration, so `minimum(test_MSE)` merely echoed the λ = 1 result.
println("Test MSE: \t", minimum(test_MSE_array))

# Plot last so the figure is the cell's value and actually gets displayed; the range is
# passed directly because materializing it with `collect` is unnecessary.
plot(0:0.01:1, train_MSE_array)
plot!(0:0.01:1, test_MSE_array)

UndefVarError: UndefVarError: n not defined

In [11]:
# Quadratic (L2) loss with a quadratic regularizer: sweep the regularization weight and
# record train/test MSE for every value, passing `maxiters=1000` to `proxgrad`.
loss_L2 = (1 / n) * QuadLoss()

# Regularization weights to try, and the per-weight errors collected in the same order.
λs = 0:0.01:1
train_MSE_array = Float64[]
test_MSE_array = Float64[]

for λ in λs
    # Progress indicator: echo the weight currently being fitted.
    print(λ)
    reg = QuadReg(λ)
    w_L2 = proxgrad(loss_L2, reg, train_vals_real, train_y, maxiters=1000)

    train_pred = train_vals_real * w_L2
    test_pred = test_vals_real * w_L2

    train_MSE = MSE(train_pred, train_y)
    test_MSE = MSE(test_pred, test_y)

    push!(train_MSE_array, train_MSE)
    push!(test_MSE_array, test_MSE)
end

# Terminate the progress line, then summarize the sweep.
println("")
println("Min Train MSE:\t", minimum(train_MSE_array))
println("Min Test MSE: \t", minimum(test_MSE_array))
# The reported weight is the one with the lowest test MSE.
println("Optimal λ: \t", λs[argmin(test_MSE_array)])

# Train curve first, test curve overlaid on the same axes.
plot(λs, train_MSE_array)
plot!(λs, test_MSE_array)

UndefVarError: UndefVarError: n not defined

In [12]:
# Same quadratic-loss sweep over the regularization weight as the previous cell, but
# with `maxiters=2000` passed to `proxgrad`.
loss_L2 = (1 / n) * QuadLoss()

# Candidate weights and the errors recorded for each, in matching order.
λs = 0:0.01:1
train_MSE_array = Float64[]
test_MSE_array = Float64[]

for λ in λs
    # Progress indicator: echo the weight currently being fitted.
    print(λ)
    reg = QuadReg(λ)
    w_L2 = proxgrad(loss_L2, reg, train_vals_real, train_y, maxiters=2000)

    train_pred = train_vals_real * w_L2
    test_pred = test_vals_real * w_L2

    train_MSE = MSE(train_pred, train_y)
    test_MSE = MSE(test_pred, test_y)

    push!(train_MSE_array, train_MSE)
    push!(test_MSE_array, test_MSE)
end

# Terminate the progress line, then report the best errors seen during the sweep.
println("")
println("Min Train MSE:\t", minimum(train_MSE_array))
println("Min Test MSE: \t", minimum(test_MSE_array))
# The reported weight is the one with the lowest test MSE.
println("Optimal λ: \t", λs[argmin(test_MSE_array)])

# Train curve first, test curve overlaid on the same axes.
plot(λs, train_MSE_array)
plot!(λs, test_MSE_array)

UndefVarError: UndefVarError: n not defined