In [1]:
# import gurobi and numpy
from gurobipy import *
import numpy as np
from numpy import genfromtxt
import csv

# get variable names
with open('AirbnbTest.csv') as csvFile:
    reader = csv.reader(csvFile)
    feat_names = next(reader)

# load data
train = genfromtxt('AirbnbTrain.csv', delimiter=',', skip_header = 1)
test = genfromtxt('AirbnbTest.csv', delimiter=',', skip_header = 1)

# separate features from prices
feat_train = train[:, :-1]
prices_train = train[:, -1]

# remove rows with nan
feat_mask = ~np.any(np.isnan(feat_train), axis=1)
feat_train = feat_train[feat_mask]
# remove corresponding prices
prices_train = prices_train[feat_mask]

feat_test = test[:, :-1]
prices_test = test[:, -1]


# Model 1

In [2]:
# initialize model
mod_1 = Model()

# number of features
d = 12
# number of samples
n = 1698

# initialize beta vector variables
beta_vec_1 = mod_1.addVars(d)
# initialize error vector
e_1 = mod_1.addVars(n)

# compute error by accounting for absolute value
for i in range(n):
    mod_1.addConstr(
        e_1[i] >= prices_train[i] - sum(beta_vec_1[j] * feat_train[i][j] for j in range(d))
    )
    mod_1.addConstr(
        e_1[i] >= sum(beta_vec_1[j] * feat_train[i][j] for j in range(d)) - prices_train[i]
    )

# initialize objective value
mod_1.setObjective((1 / n) * sum(e_1[i] for i in range(n)), GRB.MINIMIZE)


Using license file C:\Users\HP\gurobi.lic
Academic license - for non-commercial use only - expires 2021-01-27


In [3]:
# update and solve model
mod_1.update()
mod_1.optimize()


Gurobi Optimizer version 9.1.0 build v9.1.0rc0 (win64)
Thread count: 2 physical cores, 4 logical processors, using up to 4 threads
Optimize a model with 3396 rows, 1710 columns and 41324 nonzeros
Model fingerprint: 0x837d49d5
Coefficient statistics:
  Matrix range     [5e-01, 5e+02]
  Objective range  [6e-04, 6e-04]
  Bounds range     [0e+00, 0e+00]
  RHS range        [1e+01, 2e+03]

Concurrent LP optimizer: dual simplex and barrier
Showing barrier log only...

Presolve time: 0.04s
Presolved: 3396 rows, 1710 columns, 41324 nonzeros

Ordering time: 0.00s

Barrier statistics:
 Dense cols : 12
 AA' NZ     : 3.963e+04
 Factor NZ  : 4.310e+04 (roughly 2 MBytes of memory)
 Factor Ops : 5.486e+05 (less than 1 second per iteration)
 Threads    : 1

                  Objective                Residual
Iter       Primal          Dual         Primal    Dual     Compl     Time
   0   2.12287784e+03  0.00000000e+00  1.36e+03 0.00e+00  2.67e+01     0s
   1   1.62106388e+03  1.35470661e+00  0.00e+00 6

In [4]:
# display optimal error value (on training set)
opt_val_1 = mod_1.objval
print("Minimum error:", opt_val_1)


Minimum error: 36.40538294619911


In [6]:
# display beta vector
for j in range(d):
    print(beta_vec_1[j].x)


306.94902983495786
88.84144342711934
36.926152475997654
9.962147072793314
31.47343459217322
19.877157981000167
0.0
0.31096103666759495
0.0
0.0
0.26541922446623234
5.038274571440098


What is the prediction error, in $/night, of your model on the test set (provided in AirbnbTest.csv)?

In [7]:
# size of test set
n_test = len(prices_test)
# initialize list of errors for each sample
pred_error = [0 for i in range(n_test)]
# compute prediction error for each sample
for i in range(n_test):
    true_price = prices_test[i]
    pred_price = sum(beta_vec_1[j].x * feat_test[i][j] for j in range(d))
    pred_error[i] = abs(true_price - pred_price)
    
# compute mean error and display
print(f'Mean prediction error on test set: {sum(pred_error) / n_test:.2f} $/night')


Mean prediction error on test set: 35.61 $/night


# Model 2

In [9]:
# initialize model
mod_2 = Model()

# number of features
d = 12
# number of features allowed in regression
k = 3
# number of samples
n = 1698

# initialize beta vector variable
beta_vec_2 = mod_2.addVars(d)
# initialize binary indicator of whether or not a given variable has a nonzero coefficient
indicator_2 = mod_2.addVars(d, vtype = GRB.BINARY)
# initialize error vector
e_2 = mod_2.addVars(n)

# compute error by accounting for absolute value
for i in range(n):
    mod_2.addConstr(e_2[i] >= prices_train[i] -
                    sum(beta_vec_2[j] * feat_train[i][j] * indicator_2[j] for j in range(d)))
    mod_2.addConstr(
        e_2[i] >= sum(beta_vec_2[j] * feat_train[i][j] * indicator_2[j] for j in range(d)) -
                    prices_train[i]
    )

# restrict number of features in regression
mod_2.addConstr(sum(indicator_2[j] for j in range(d)) <= k)

# initialize objective value
mod_2.setObjective((1 / n) * sum(e_2[i] for i in range(n)), GRB.MINIMIZE)


In [10]:
# update and solve model
mod_2.update()
mod_2.optimize()


Gurobi Optimizer version 9.1.0 build v9.1.0rc0 (win64)
Thread count: 2 physical cores, 4 logical processors, using up to 4 threads
Optimize a model with 1 rows, 1722 columns and 12 nonzeros
Model fingerprint: 0xad44c584
Model has 3396 quadratic constraints
Variable types: 1710 continuous, 12 integer (12 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  QMatrix range    [5e-01, 5e+02]
  QLMatrix range   [1e+00, 1e+00]
  Objective range  [6e-04, 6e-04]
  Bounds range     [1e+00, 1e+00]
  RHS range        [3e+00, 3e+00]
  QRHS range       [1e+01, 2e+03]
Found heuristic solution: objective 2.000000e+09
Presolve added 822 rows and 409 columns
Presolve time: 0.25s
Presolved: 3417 rows, 2167 columns, 32930 nonzeros
Presolved model has 24 SOS constraint(s)
Variable types: 2143 continuous, 24 integer (24 binary)

Root relaxation: objective 3.640538e+01, 2741 iterations, 0.32 seconds

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  Obj  

In [11]:
# display optimal error value (on training set)
opt_val_2 = mod_2.objval
print("Minimum error:", opt_val_2)


Minimum error: 38.30565371024743


a) List the names and coefficients of the three variables selected by the optimization model.

In [13]:
for j in range(d):
    if beta_vec_2[j].x != 0:
        print(f'{feat_names[j]}: {beta_vec_2[j].x}')
    

Entire home: 52.0
accommodates: 14.0
bedrooms: 32.0


b) What is the new prediction error, in $/night, of Model 2?

In [14]:
# size of test set
n_test = len(prices_test)
# initialize list of errors for each sample
pred_error = [0 for i in range(n_test)]
# compute prediction error for each sample
for i in range(n_test):
    true_price = prices_test[i]
    pred_price = sum(beta_vec_2[j].x * feat_test[i][j] for j in range(d))
    pred_error[i] = abs(true_price - pred_price)
    
# compute mean error and display
print(f'Mean prediction error on test set: {sum(pred_error) / n_test:.2f} $/night')


Mean prediction error on test set: 37.74 $/night


# Model 3

In [15]:
# initialize model
mod_3 = Model()

# number of features
d = 12
# number of features allowed in regression
k = 3
# number of samples
n = 1698

# initialize beta vector variable
beta_vec_3 = mod_3.addVars(d)
# initialize binary indicator of whether or not a given variable has a nonzero coefficient
indicator_3 = mod_3.addVars(d, vtype = GRB.BINARY)
# initialize error vector
e_3 = mod_3.addVars(n)

# compute error by accounting for absolute value
for i in range(n):
    mod_3.addConstr(
        e_3[i] >= prices_train[i] -
                    sum(beta_vec_3[j] * feat_train[i][j] * indicator_3[j] for j in range(d))
    )
    mod_3.addConstr(
        e_3[i] >= sum(beta_vec_3[j] * feat_train[i][j] * indicator_3[j] for j in range(d)) -
                    prices_train[i]
    )

# require number of beds to be selected as a predictor
idx_beds = feat_names.index('beds')
mod_3.addConstr(indicator_3[idx_beds] == 1)
    
# restrict number of features in regression
mod_3.addConstr(sum(indicator_3[j] for j in range(d)) <= k)

# initialize objective value
mod_3.setObjective((1 / n) * sum(e_3[i] for i in range(n)), GRB.MINIMIZE)


In [16]:
# update and solve model (on training set)
mod_3.update()
mod_3.optimize()


Gurobi Optimizer version 9.1.0 build v9.1.0rc0 (win64)
Thread count: 2 physical cores, 4 logical processors, using up to 4 threads
Optimize a model with 2 rows, 1722 columns and 13 nonzeros
Model fingerprint: 0x3cd5a632
Model has 3396 quadratic constraints
Variable types: 1710 continuous, 12 integer (12 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  QMatrix range    [5e-01, 5e+02]
  QLMatrix range   [1e+00, 1e+00]
  Objective range  [6e-04, 6e-04]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 3e+00]
  QRHS range       [1e+01, 2e+03]
Presolve added 821 rows and 408 columns
Presolve time: 0.24s
Presolved: 3415 rows, 2163 columns, 32924 nonzeros
Presolved model has 22 SOS constraint(s)
Variable types: 2141 continuous, 22 integer (22 binary)
Found heuristic solution: objective 60.4690813

Root relaxation: objective 3.640538e+01, 2467 iterations, 0.28 seconds

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  Obj  De

In [17]:
# display optimal error value
opt_val_3 = mod_3.objval
print("Minimum error:", opt_val_3)


Minimum error: 40.05064782096585


a) List the names and coefficients of the two other variables selected by the optimization model.

In [18]:
for j in range(d):
    if beta_vec_3[j].x != 0 and feat_names[j] != 'beds':
        print(f'{feat_names[j]}: {beta_vec_3[j].x}')


Entire home: 67.875
bedrooms: 47.375


b) Which variable was in Model 2 but is no longer in Model 3? Briefly explain in 1-2 sentences
why this variable might have been dropped.

The variable 'accomodates' was in Model 2 but is no longer in Model 3. This is likely the case since the number of beds is closely related to the number of accomodates and thus the benefit of including the number of accomodates in our model is much lower when we already include the number of beds.

c) What is the new prediction error, in $/night, of Model 3?

In [19]:
# size of test set
n_test = len(prices_test)
# initialize list of errors for each sample
pred_error = [0 for i in range(n_test)]
# compute prediction error for each sample
for i in range(n_test):
    true_price = prices_test[i]
    pred_price = sum(beta_vec_3[j].x * feat_test[i][j] for j in range(d))
    pred_error[i] = abs(true_price - pred_price)
    
# compute mean error and display
print(f'Mean prediction error on test set: {sum(pred_error) / n_test:.2f} $/night')


Mean prediction error on test set: 38.60 $/night
