In [222]:
import numpy as np
np.set_printoptions(precision=4)  # Print few decimal places
np.set_printoptions(suppress=True)  # Suppress scientific notation
import cvxpy as cp
import pandas as pd
from numpy.linalg import cholesky as llt
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [223]:
def norm_pred_error(beta, X, y):
    """Return the relative prediction error ``||X @ beta - y|| / ||y||``.

    Both norms are computed with ``np.linalg.norm`` using its default
    arguments, so the result is the residual norm expressed as a fraction
    of the norm of the targets ``y``.
    """
    residual = X @ beta - y
    residual_norm = np.linalg.norm(residual)
    target_norm = np.linalg.norm(y)
    return residual_norm / target_norm

In [224]:
# Load the raw housing table; the path is relative to the working directory.
rawdata = pd.read_csv('rawhousingdata.csv')

In [225]:
# Display the raw table as loaded (yes/no and furnishingstatus are still strings).
rawdata

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [226]:
# furnishingstatus -> categorize with pd.get_dummies
# mainroad, guestroom, basement, hotwaterheating, airconditioning, prefarea -> binarize

In [227]:
# Preview the one-hot encoding of furnishingstatus; the result is only
# displayed here, not stored.
pd.get_dummies(rawdata['furnishingstatus'])

Unnamed: 0,furnished,semi-furnished,unfurnished
0,1,0,0
1,1,0,0
2,0,1,0
3,1,0,0
4,1,0,0
...,...,...,...
540,0,0,1
541,0,1,0
542,0,0,1
543,1,0,0


In [298]:
# Build an all-numeric copy of the raw table, leaving rawdata untouched.
dfnum = rawdata.copy()

# Yes/no columns become 1/0 integer indicators.
binarize_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
dfnum[binarize_cols] = dfnum[binarize_cols].eq('yes').astype(int)

# One-hot encode the furnishing status. dtype=int is requested explicitly:
# pandas >= 2.0 returns bool dummies by default, and bool columns mixed with
# int/float columns make DataFrame.to_numpy() produce an object array, which
# would break the numeric design matrix built from this frame.
furn = pd.get_dummies(rawdata['furnishingstatus'], dtype=int)

dfnum = pd.concat([dfnum, furn], axis=1)
# The string column is superseded by its dummies; hotwaterheating is removed
# from the table as well.
dfnum = dfnum.drop(columns=['furnishingstatus', 'hotwaterheating'])

In [299]:
# Display the numeric table: indicators are 0/1 and the three furnishing
# dummies are appended as the last columns.
dfnum

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,airconditioning,parking,prefarea,furnished,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,1,2,1,1,0,0
1,12250000,8960,4,4,4,1,0,0,1,3,0,1,0,0
2,12250000,9960,3,2,2,1,0,1,0,2,1,0,1,0
3,12215000,7500,4,2,2,1,0,1,1,3,1,1,0,0
4,11410000,7420,4,1,2,1,1,1,1,2,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,1,0,1,0,2,0,0,0,1
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,1,0
542,1750000,3620,2,1,1,1,0,0,0,0,0,0,0,1
543,1750000,2910,3,1,1,0,0,0,0,0,0,1,0,0


In [300]:
# Rescale the two large-magnitude columns on a fresh frame (dfnum is left
# unchanged): price is divided by 1e6 and area by 1e3.
df_norm = dfnum.assign(
    price=dfnum['price'] / 1e6,
    area=dfnum['area'] / 1e3,
)

In [301]:
# Keep only the 'furnished' indicator out of the three furnishing dummies.
# Note: this rebinds df_norm, so re-running the cell raises a KeyError
# because the two columns are already gone.
df_norm = df_norm.drop(['semi-furnished', 'unfurnished'], axis='columns')
df_norm

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,airconditioning,parking,prefarea,furnished
0,13.30000,7.42,4,2,3,1,0,0,1,2,1,1
1,12.25000,8.96,4,4,4,1,0,0,1,3,0,1
2,12.25000,9.96,3,2,2,1,0,1,0,2,1,0
3,12.21500,7.50,4,2,2,1,0,1,1,3,1,1
4,11.41000,7.42,4,1,2,1,1,1,1,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
540,1.82000,3.00,2,1,1,1,0,1,0,2,0,0
541,1.76715,2.40,3,1,1,0,0,0,0,0,0,0
542,1.75000,3.62,2,1,1,1,0,0,0,0,0,0
543,1.75000,2.91,3,1,1,0,0,0,0,0,0,1


In [302]:
# Convert the table to one array: column 0 is price (the target), the
# remaining columns are the features.
Xy = df_norm.to_numpy()
y = Xy[:, 0]
# Prepend a column of ones so the first coefficient acts as an intercept.
X = np.hstack([np.ones((Xy.shape[0], 1)), Xy[:, 1:]])

In [303]:
# Next: create synthetic but reasonable prices.

In [304]:
# Hold out 20% of the rows for testing; random_state=0 fixes the split so it
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [317]:
# Column labels in df_norm order: 'price' first, then the feature names.
cols = df_norm.columns.tolist()
# Column 0 of X_train is the constant intercept column, so it is replaced by
# the target; the saved table then has the same layout as df_norm.
dftrain = pd.DataFrame(np.column_stack([y_train, X_train[:, 1:]]), columns=cols)
dftrain.to_csv('train.csv', index=False)
dftrain

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,airconditioning,parking,prefarea,furnished
0,1.750,3.62,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.695,4.00,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.870,3.04,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.590,3.60,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.515,9.86,3.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
431,6.790,4.00,3.0,2.0,2.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
432,4.305,10.36,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
433,9.800,5.75,3.0,2.0,4.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
434,3.710,3.60,3.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [318]:
# Same layout as the training table: the intercept column of X_test is
# replaced by the target, and the labels come from `cols` (df_norm order).
dftest = pd.DataFrame(np.column_stack([y_test, X_test[:, 1:]]), columns=cols)
dftest.to_csv('test.csv', index=False)
dftest

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,airconditioning,parking,prefarea,furnished
0,4.5850,4.00,3.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,6.0830,9.62,3.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,1.0,1.0
2,4.0075,3.46,4.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,6.9300,13.20,2.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
4,2.9400,3.66,4.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
104,6.6500,6.42,3.0,2.0,3.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
105,5.8100,5.20,3.0,1.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
106,4.1230,6.06,2.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
107,3.0800,4.50,2.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [307]:
# Sanity check of the split: row counts per part and the number of design
# columns (intercept + features).
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((436, 12), (436,), (109, 12), (109,))

In [336]:
# performance with real data
# l1
# Fit beta by minimizing the l1 norm of the training residual plus an l1
# penalty on beta, subject to every coefficient being nonnegative.
N, p = X_train.shape  # p is reused by the l2 cell that follows
beta = cp.Variable(p)
lambd = 0  # penalty weight; 0 switches the regularization term off
obj = cp.Minimize(cp.norm(X_train @ beta - y_train, 1) + lambd * cp.norm(beta, 1))
constraints = [beta >= 0]  # applies to all entries, including the intercept
prob = cp.Problem(obj, constraints)
res = prob.solve()  # optimal objective value
print(res)
print(beta.value)
print('test error:', norm_pred_error(beta.value, X_test, y_test))
# Kept under its own name: the synthetic-coefficient cell starts from beta_l1.
beta_l1 = beta.value

349.7838893807218
[0.     0.2429 0.1098 0.9599 0.4851 0.3455 0.2295 0.2659 0.6119 0.1978
 0.7688 0.3211]
test error: 0.1919012948270386


In [342]:
# l2
# Least-squares fit: sum of squared training residuals plus a squared-l2
# penalty on beta, with no constraints. Relies on p from the l1 cell.
beta = cp.Variable(p)
lambd = 0  # penalty weight; 0 switches the regularization term off
obj = cp.Minimize(cp.sum_squares(X_train @ beta - y_train) + lambd * cp.sum_squares(beta))
constraints = []
prob = cp.Problem(obj, constraints)
res = prob.solve()  # optimal objective value
print(res)
print(beta.value)
# Negative coefficients are clipped to zero before scoring on the test set.
# NOTE(review): clipping an unconstrained solution is not the same as solving
# with beta >= 0; presumably done to compare against the nonnegative l1 fit —
# confirm this is intended.
beta_test = np.maximum(beta.value, 0)
print('test error:', norm_pred_error(beta_test, X_test, y_test))

539.4059392665804
[-0.3901  0.2502  0.1641  1.0604  0.4385  0.4322  0.3983  0.3453  0.7756
  0.2749  0.6768  0.284 ]
test error: 0.20796809520319737


In [310]:
# Ended up just using the real prices; the numbers work out fine.

In [312]:
# zero out a few of the columns
# The design matrix has the intercept at index 0 followed by the df_norm
# feature columns, so indices 6, 8 and 11 are guestroom, airconditioning and
# furnished.
# NOTE(review): the saved output of this cell does not match the beta_l1
# printed by the real-data l1 cell, and the execution counters are out of
# order — it appears to have run against an earlier beta_l1. Re-run the
# notebook top to bottom before relying on these values.
beta_synth = beta_l1.copy()
beta_synth[[6, 8, 11]] = 0
beta_synth

array([-0.    ,  0.3335,  0.0817,  0.9532,  0.5006,  0.1368,  0.    ,
        0.2806,  0.    ,  0.1293,  0.5181,  0.    ])

In [313]:
# Seed NumPy's global RNG so the noise below is reproducible.
np.random.seed(0)
# Synthetic targets: linear model with beta_synth plus standard-normal noise.
# The training noise is drawn before the test noise from the same seeded
# stream, so swapping these two lines would change both vectors.
ysynth_train = X_train @ beta_synth + np.random.normal(size=y_train.shape)
ysynth_test = X_test @ beta_synth + np.random.normal(size=y_test.shape)
# ysynth_train, ysynth_test

In [314]:
# performance with synth data
# l1
# Same nonnegative l1 fit as for the real prices, but trained and scored on
# the synthetic targets.
N, p = X_train.shape
beta = cp.Variable(p)
lambd = 0  # penalty weight; 0 switches the regularization term off
obj = cp.Minimize(cp.norm(X_train @ beta - ysynth_train, 1) + lambd * cp.norm(beta, 1))
constraints = [beta >= 0]  # applies to all entries, including the intercept
prob = cp.Problem(obj, constraints)
res = prob.solve()  # optimal objective value
print(res)
print(beta.value)
print('test error:', norm_pred_error(beta.value, X_test, ysynth_test))
# Left disabled, so beta_l1 is not overwritten by this fit.
# beta_l1 = beta.value

342.4988785718045
[0.2885 0.3692 0.0413 0.8452 0.4218 0.     0.1716 0.065  0.1422 0.0534
 0.5896 0.    ]
test error: 0.2223222484082424


In [315]:
# l2
# Unconstrained least-squares fit on the synthetic targets. Relies on p from
# the preceding l1 cell.
beta = cp.Variable(p)
lambd = 0  # penalty weight; 0 switches the regularization term off
obj = cp.Minimize(cp.sum_squares(X_train @ beta - ysynth_train) + lambd * cp.sum_squares(beta))
constraints = []
prob = cp.Problem(obj, constraints)
res = prob.solve()  # optimal objective value
print(res)
# Negative coefficients are clipped to zero before scoring; the raw and the
# clipped vectors are printed side by side.
beta_test = np.maximum(beta.value, 0)
print(beta.value, beta_test)
print('test error:', norm_pred_error(beta_test, X_test, ysynth_test))

416.3696940802548
[ 0.2744  0.3468 -0.0302  1.0276  0.5498 -0.0488  0.2317  0.168  -0.0135
  0.0543  0.5952 -0.1183] [0.2744 0.3468 0.     1.0276 0.5498 0.     0.2317 0.168  0.     0.0543
 0.5952 0.    ]
test error: 0.22702583322535663


In [316]:
# Display the normalized table again for reference.
df_norm

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,airconditioning,parking,prefarea,furnished
0,13.30000,7.42,4,2,3,1,0,0,1,2,1,1
1,12.25000,8.96,4,4,4,1,0,0,1,3,0,1
2,12.25000,9.96,3,2,2,1,0,1,0,2,1,0
3,12.21500,7.50,4,2,2,1,0,1,1,3,1,1
4,11.41000,7.42,4,1,2,1,1,1,1,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
540,1.82000,3.00,2,1,1,1,0,1,0,2,0,0
541,1.76715,2.40,3,1,1,0,0,0,0,0,0,0
542,1.75000,3.62,2,1,1,1,0,0,0,0,0,0
543,1.75000,2.91,3,1,1,0,0,0,0,0,0,1
