In [1]:
import pandas as pd

import sklearn
from sklearn.linear_model import *


In [2]:
# Load the pre-split feature matrices and targets; every frame is indexed by property_id.
X_train = pd.read_csv("./data/X_train.csv").set_index("property_id")
X_test = pd.read_csv("./data/X_test.csv").set_index("property_id")

y_train = pd.read_csv("./data/y_train.csv").set_index("property_id")
y_test = pd.read_csv("./data/y_test.csv").set_index("property_id")


In [3]:
def train_and_test_model(regressor: "sklearn.base.RegressorMixin", X_train, y_train, X_test, y_test):
    """Fit ``regressor`` and print its train/test scores and ranked coefficients.

    Args:
        regressor: Any estimator exposing ``fit(X, y)`` and ``score(X, y)``
            (the notebook passes linear models, trees and ensembles).
        X_train: Training features.
        y_train: Training target.
        X_test: Held-out features.
        y_test: Held-out target.

    Returns:
        None. The results are printed:
        * the value of ``regressor.score`` on the training data,
        * the value of ``regressor.score`` on the test data,
        * when the fitted estimator exposes both ``coef_`` and
          ``feature_names_in_``, a list of ``(coefficient, feature_name)``
          pairs sorted by decreasing absolute coefficient.
    """
    regressor.fit(X_train, y_train)
    print("Train score is: ", regressor.score(X_train, y_train))
    print("Test score is: ", regressor.score(X_test, y_test))

    # Not every estimator has coefficients or recorded feature names. Skip the
    # ranking only in that case; any other error is real and should surface.
    coefficients = getattr(regressor, "coef_", None)
    feature_names = getattr(regressor, "feature_names_in_", None)
    if coefficients is None or feature_names is None:
        return

    # flatten() accepts both 1-D and 2-D coefficient arrays.
    coeffs = list(zip(coefficients.flatten().tolist(), feature_names))
    coeffs.sort(key=lambda pair: abs(pair[0]), reverse=True)
    print(coeffs)

In [15]:
train_and_test_model(LinearRegression(), X_train, y_train, X_test, y_test)
# Very low test score, attributed to collinear features: note the ~1e17-magnitude coefficients in the output.

Train score is:  0.6577083818861549
Test score is:  -3.719477528086696e+19
[(3.9018846398804794e+17, 'number_of_facades_5.0'), (3.9018846398801286e+17, 'number_of_facades_4.0'), (3.901884639879836e+17, 'number_of_facades_3.0'), (3.90188463987968e+17, 'number_of_facades_2.0'), (3.9018846398795603e+17, 'number_of_facades_1.0'), (-2.2602344279975043e+17, 'state_of_building_TO_RESTORE'), (-2.2602344279967904e+17, 'state_of_building_TO_RENOVATE'), (-2.2602344279964355e+17, 'state_of_building_TO_BE_DONE_UP'), (-2.2602344279960006e+17, 'state_of_building_GOOD'), (-2.2602344279957926e+17, 'state_of_building_JUST_RENOVATED'), (-2.2602344279954374e+17, 'state_of_building_AS_NEW'), (2.144688605468549e+17, 'kitchen_type_USA_HYPER_EQUIPPED'), (2.1446886054683754e+17, 'kitchen_type_USA_SEMI_EQUIPPED'), (2.1446886054683264e+17, 'kitchen_type_HYPER_EQUIPPED'), (2.1446886054682848e+17, 'kitchen_type_USA_UNINSTALLED'), (2.1446886054680826e+17, 'kitchen_type_NOT_INSTALLED'), (2.144688605468075e+17, 'kitc

In [40]:
# Keep only the columns that do not belong to one of the prefixed feature groups.
excluded_prefixes = ("postal", "latlon", "property_subtype", "number", "kitchen", "state")
cols = [col for col in X_train.columns if not col.startswith(excluded_prefixes)]

print(cols)

['living_area', 'garden', 'garden_area', 'furnished', 'open_fire', 'terrace']


In [41]:
# Train LinearRegression on the reduced column list `cols`, i.e. without the collinear feature groups.
train_and_test_model(LinearRegression(), X_train[cols], y_train, X_test[cols], y_test)

Train score is:  0.25111283590446987
Test score is:  0.21535579735044696
[(1237184.0718787957, 'living_area'), (212482.11431260777, 'furnished'), (-97861.40328653052, 'open_fire'), (-42608.84947569718, 'garden'), (-5019.943726785686, 'garden_area'), (-3233.235578004853, 'terrace')]


In [24]:
# Single-feature baseline: Ridge (default hyperparameters) on living_area only.
columns_to_keep = ["living_area"]
train_and_test_model(Ridge(), X_train[columns_to_keep], y_train, X_test[columns_to_keep], y_test)

Train score is:  0.22777244750078995
Test score is:  0.19212943569797358
[(1185922.860416972, 'living_area')]


In [18]:
# Ridge on living_area plus the columns number_of_rooms_1.0 ... number_of_rooms_21.0.
columns_to_keep = [
    "living_area",
    *(f"number_of_rooms_{i}.0" for i in range(1, 22)),
]
train_and_test_model(
    Ridge(),
    X_train[columns_to_keep],
    y_train,
    X_test[columns_to_keep],
    y_test,
)

Train score is:  0.3297410141098597
Test score is:  0.2785904116212454
[(1033054.337334426, 'living_area'), (896228.9658988384, 'number_of_rooms_17.0'), (-418011.5320157245, 'number_of_rooms_20.0'), (361129.3403855837, 'number_of_rooms_4.0'), (256631.13276918835, 'number_of_rooms_15.0'), (245249.84677005513, 'number_of_rooms_16.0'), (-221397.48109120136, 'number_of_rooms_2.0'), (-211350.11693128996, 'number_of_rooms_1.0'), (-199153.40363936176, 'number_of_rooms_7.0'), (-179256.53590998176, 'number_of_rooms_8.0'), (-171739.36445241436, 'number_of_rooms_10.0'), (-159120.35691118025, 'number_of_rooms_9.0'), (138063.0978555053, 'number_of_rooms_3.0'), (-127731.29645343746, 'number_of_rooms_6.0'), (-103389.5616942063, 'number_of_rooms_19.0'), (-87368.47561903449, 'number_of_rooms_13.0'), (-77607.28428581836, 'number_of_rooms_11.0'), (69964.59376243223, 'number_of_rooms_14.0'), (68019.02394112186, 'number_of_rooms_21.0'), (-58190.855534382965, 'number_of_rooms_12.0'), (-45962.935788625124, '

In [19]:
# Ridge on living_area plus the columns postal_code_1.0 ... postal_code_23.0.
columns_to_keep = [
    "living_area",
    *(f"postal_code_{i}.0" for i in range(1, 24)),
]
train_and_test_model(
    Ridge(),
    X_train[columns_to_keep],
    y_train,
    X_test[columns_to_keep],
    y_test,
)

Train score is:  0.506166677712013
Test score is:  0.507247094203608
[(1226400.502872481, 'living_area'), (587070.4023467721, 'postal_code_17.0'), (-163076.40611585684, 'postal_code_14.0'), (-151018.91278040086, 'postal_code_15.0'), (-134482.9774549331, 'postal_code_22.0'), (-120159.40444466655, 'postal_code_10.0'), (-117386.46365818974, 'postal_code_9.0'), (-104323.54894518458, 'postal_code_16.0'), (-99386.8737857298, 'postal_code_6.0'), (-99357.1323308653, 'postal_code_13.0'), (-98102.53861372784, 'postal_code_3.0'), (-93419.27202057844, 'postal_code_11.0'), (-93326.89212385872, 'postal_code_23.0'), (-90095.18493719856, 'postal_code_20.0'), (-77155.77355503962, 'postal_code_7.0'), (67611.585148216, 'postal_code_4.0'), (-66805.98948910263, 'postal_code_19.0'), (-50272.894410995716, 'postal_code_1.0'), (37516.20008423059, 'postal_code_2.0'), (-26517.843729166267, 'postal_code_12.0'), (-22726.87738813865, 'postal_code_5.0'), (-18770.592270836925, 'postal_code_18.0'), (11809.973057264513

In [20]:
# Ridge on living_area plus every column whose name contains "latlon".
columns_to_keep = [
    "living_area",
    *(name for name in X_train.columns if "latlon" in name),
]
train_and_test_model(
    Ridge(),
    X_train[columns_to_keep],
    y_train,
    X_test[columns_to_keep],
    y_test,
)

Train score is:  0.5348583781940546
Test score is:  0.4977898596716359
[(1204601.582740117, 'living_area'), (986845.4477040761, 'latlon_22x4'), (770767.7010766002, 'latlon_22x3'), (600668.8461254812, 'latlon_21x3'), (435954.3300718195, 'latlon_12x5'), (357711.10472800804, 'latlon_21x4'), (295517.0814320541, 'latlon_5x19'), (255182.54650370078, 'latlon_5x13'), (244665.11497019604, 'latlon_3x13'), (223181.8990150892, 'latlon_6x21'), (-205075.15947215472, 'latlon_4x9'), (197213.86457963838, 'latlon_4x19'), (196332.67456146114, 'latlon_2x13'), (188191.34250576695, 'latlon_12x7'), (181375.69859114394, 'latlon_4x13'), (-179212.67473357136, 'latlon_3x2'), (-171598.139269295, 'latlon_0x18'), (-166283.2820348223, 'latlon_0x19'), (-166209.25077181717, 'latlon_7x3'), (163864.44881850996, 'latlon_3x15'), (162959.9186602593, 'latlon_11x5'), (155590.49563161726, 'latlon_8x20'), (152893.53805050263, 'latlon_19x15'), (150731.5595422095, 'latlon_2x14'), (148924.84146907434, 'latlon_3x12'), (-145669.994

In [21]:
# Ridge on living_area, postal_code_1.0 ... postal_code_23.0 and every "latlon" column.
columns_to_keep = [
    "living_area",
    *(f"postal_code_{i}.0" for i in range(1, 24)),
    *(name for name in X_train.columns if "latlon" in name),
]
train_and_test_model(
    Ridge(),
    X_train[columns_to_keep],
    y_train,
    X_test[columns_to_keep],
    y_test,
)

Train score is:  0.6019057332538413
Test score is:  0.5796106905001287
[(1215628.954933328, 'living_area'), (669303.5084042334, 'postal_code_17.0'), (-534730.048258152, 'latlon_21x2'), (-520183.9460261601, 'latlon_18x3'), (-504882.69541087595, 'latlon_17x4'), (-491083.28357778507, 'latlon_16x3'), (475853.6744915832, 'latlon_12x5'), (-415709.41450030764, 'latlon_20x2'), (-380260.92930862505, 'latlon_17x3'), (350515.1763235928, 'latlon_5x19'), (326441.1166923049, 'latlon_22x4'), (-283825.1277825598, 'latlon_21x4'), (252757.32274899856, 'latlon_6x21'), (243487.48313257773, 'latlon_4x19'), (239310.42697410085, 'latlon_5x13'), (-228492.59718195535, 'latlon_19x14'), (214395.83729239303, 'latlon_3x13'), (-194555.12750642395, 'latlon_3x2'), (186570.98662452016, 'latlon_11x5'), (184059.24234483214, 'latlon_7x19'), (180735.47387170407, 'latlon_5x15'), (-175498.45686298292, 'latlon_7x3'), (172623.07308240866, 'latlon_12x8'), (171134.22892789828, 'latlon_8x20'), (167306.33481044252, 'latlon_8x15')

In [22]:
# Ridge on living_area, number_of_rooms_1.0 ... 21.0, postal_code_1.0 ... 23.0 and every "latlon" column.
columns_to_keep = [
    "living_area",
    *(f"number_of_rooms_{i}.0" for i in range(1, 22)),
    *(f"postal_code_{i}.0" for i in range(1, 24)),
    *(name for name in X_train.columns if "latlon" in name),
]
train_and_test_model(
    Ridge(),
    X_train[columns_to_keep],
    y_train,
    X_test[columns_to_keep],
    y_test,
)

Train score is:  0.6352296581055074
Test score is:  0.6094011550236391
[(1126158.850817441, 'living_area'), (593217.5507602543, 'postal_code_17.0'), (528502.7848158437, 'number_of_rooms_17.0'), (-480518.1930455247, 'latlon_21x2'), (-457381.6746618908, 'latlon_18x3'), (-447180.9954413652, 'latlon_16x3'), (-446910.9636742902, 'latlon_17x4'), (408228.20528886275, 'latlon_12x5'), (-356248.3293011782, 'latlon_20x2'), (-355706.5941216891, 'latlon_17x3'), (351516.02085166937, 'latlon_22x4'), (309713.4340874825, 'latlon_5x19'), (-304880.2026341019, 'latlon_3x2'), (-301207.9304362873, 'number_of_rooms_20.0'), (-280116.97534675203, 'latlon_7x3'), (266561.8311812004, 'latlon_6x21'), (-259960.4731554867, 'latlon_20x5'), (259941.2270413083, 'number_of_rooms_4.0'), (-257091.8946363723, 'latlon_6x1'), (-233729.26980413584, 'latlon_21x4'), (-209803.33133196042, 'latlon_12x0'), (203612.80685298037, 'latlon_5x13'), (201401.8332854388, 'latlon_4x19'), (195141.3400053179, 'latlon_12x8'), (192896.916841229

In [23]:
# Ridge (default hyperparameters) on the full feature set.
train_and_test_model(Ridge(), X_train, y_train, X_test, y_test)

Train score is:  0.6562928934459692
Test score is:  0.6234299962051523
[(1076781.0917499163, 'living_area'), (570897.3223329096, 'postal_code_17.0'), (537448.5068525604, 'number_of_rooms_17.0'), (-450895.52770091663, 'latlon_21x2'), (-437468.3278304705, 'latlon_16x3'), (-429203.8844913182, 'latlon_18x3'), (380979.5900099334, 'latlon_12x5'), (-379850.2431425606, 'latlon_17x4'), (-353155.5325927343, 'latlon_17x3'), (349075.4649737646, 'latlon_22x4'), (-345490.10850602353, 'latlon_20x2'), (-276799.9217205105, 'number_of_rooms_20.0'), (-267094.6377974837, 'latlon_20x5'), (266738.7388559233, 'latlon_5x19'), (-266686.8376741313, 'latlon_7x3'), (-258998.06846398496, 'latlon_3x2'), (-237143.0273664584, 'latlon_6x1'), (-223338.8020644511, 'latlon_21x4'), (220752.28010731746, 'number_of_rooms_4.0'), (207551.12822749274, 'latlon_12x8'), (-200307.11397072082, 'latlon_12x0'), (190684.6502869452, 'latlon_5x13'), (180194.18775545273, 'latlon_13x8'), (-171523.6299685418, 'latlon_17x1'), (166675.214683

In [14]:
# ElasticNet with alpha=0.1 (other hyperparameters left at their defaults) on the full feature set.
train_and_test_model(ElasticNet(alpha = 0.1), X_train, y_train, X_test, y_test)

Train score is:  0.4895672407019035
Test score is:  0.482390995648847
[(308575.3612194913, 'postal_code_17.0'), (297308.05996634666, 'living_area'), (187278.36480119886, 'latlon_22x4'), (136852.4170804479, 'number_of_rooms_4.0'), (-129939.76754652012, 'number_of_rooms_1.0'), (91646.09608619742, 'property_subtype_PENTHOUSE'), (67045.13449271132, 'state_of_building_AS_NEW'), (62802.23246506205, 'latlon_22x3'), (61588.97785141081, 'latlon_12x7'), (61305.6697683901, 'kitchen_type_USA_HYPER_EQUIPPED'), (-60160.28139560175, 'property_subtype_FLAT_STUDIO'), (52832.899928070394, 'latlon_21x3'), (52696.55345582743, 'kitchen_type_HYPER_EQUIPPED'), (-50442.13430372744, 'postal_code_15.0'), (-49554.50949276277, 'latlon_21x2'), (48618.87178697305, 'number_of_rooms_3.0'), (-47987.17788796457, 'kitchen_type_SEMI_EQUIPPED'), (-44564.147633521614, 'postal_code_14.0'), (44193.13541485427, 'postal_code_4.0'), (-41579.782814678525, 'postal_code_3.0'), (-41162.23938200958, 'kitchen_type_INSTALLED'), (-3535

In [26]:
# Lasso (default hyperparameters) on the full feature set.
# NOTE(review): the recorded output ends with a warning raised from the coordinate-descent
# solver -- presumably a convergence warning; confirm and consider raising max_iter.
train_and_test_model(Lasso(), X_train, y_train, X_test, y_test)

Train score is:  0.6576972650405695
Test score is:  0.6199910309903351
[(1089148.4211055045, 'living_area'), (-684534.2014995484, 'latlon_16x3'), (659506.3587459403, 'number_of_rooms_17.0'), (-646857.4061629189, 'latlon_7x3'), (563327.2196131618, 'postal_code_17.0'), (-542099.7838271998, 'latlon_20x5'), (-531030.5156782917, 'latlon_17x4'), (-508916.6171856825, 'latlon_18x3'), (-482582.6063434089, 'latlon_21x2'), (450678.5647598375, 'latlon_12x5'), (-409155.4332554541, 'latlon_17x3'), (-406638.94513000466, 'number_of_rooms_20.0'), (-388247.4568502027, 'latlon_20x2'), (-387245.99304713984, 'latlon_3x2'), (-367458.7043350592, 'latlon_6x1'), (325050.2078315872, 'latlon_5x19'), (323382.81870828953, 'latlon_22x4'), (-305563.36063392117, 'latlon_8x1'), (-298212.0722041227, 'latlon_11x4'), (-276765.42684331094, 'latlon_12x0'), (-272908.21522200265, 'latlon_5x1'), (-263333.90264360514, 'latlon_13x3'), (-256319.16980060787, 'latlon_21x4'), (252637.58748227684, 'number_of_rooms_4.0'), (-249009.03

  model = cd_fast.enet_coordinate_descent(


In [27]:
from sklearn.tree import *

In [28]:
# Decision tree with default hyperparameters; in the recorded run the train score (~0.997)
# is far above the test score (~0.436).
train_and_test_model(DecisionTreeRegressor(), X_train, y_train, X_test, y_test)

Train score is:  0.9970055214935338
Test score is:  0.43638387521629884


In [35]:
# Same tree, but requiring at least 15 samples in every leaf (min_samples_leaf=15).
train_and_test_model(DecisionTreeRegressor(min_samples_leaf=15), X_train, y_train, X_test, y_test)

Train score is:  0.7434782193538454
Test score is:  0.6524882438466739


In [36]:
from sklearn.ensemble import *

In [37]:
# Stack Ridge ('lr') with the leaf-size-constrained tree ('dt'); no final_estimator is
# passed, so StackingRegressor falls back to its default.
train_and_test_model(StackingRegressor([('lr', Ridge()),('dt', DecisionTreeRegressor(min_samples_leaf=15))]), X_train, y_train, X_test, y_test)

  y = column_or_1d(y, warn=True)


Train score is:  0.7473131556913437
Test score is:  0.6929307347802562


In [38]:
# Gradient boosting with default hyperparameters on the full feature set.
train_and_test_model(GradientBoostingRegressor(), X_train, y_train, X_test, y_test)

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Train score is:  0.7930347882567576
Test score is:  0.7303363693880325


In [50]:
# Gradient boosting with n_estimators=1000 instead of the default.
train_and_test_model(GradientBoostingRegressor(n_estimators=1000), X_train, y_train, X_test, y_test)

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Train score is:  0.8704313593601994
Test score is:  0.7452683479579716


In [51]:
# Stack two 1000-estimator gradient-boosting models that differ only in random_state (42 vs 43).
train_and_test_model(StackingRegressor([('gb1', GradientBoostingRegressor(n_estimators=1000, random_state=42)),('gb2', GradientBoostingRegressor(n_estimators=1000, random_state=43))]), X_train, y_train, X_test, y_test)

  y = column_or_1d(y, warn=True)


Train score is:  0.8716575336512519
Test score is:  0.7458347030213972


: 

In [40]:
# Random forest requiring at least 15 samples in every leaf (min_samples_leaf=15).
# NOTE(review): no random_state is set, so the scores will presumably vary between runs.
train_and_test_model(RandomForestRegressor(min_samples_leaf=15), X_train, y_train, X_test, y_test)

  return fit_method(estimator, *args, **kwargs)


Train score is:  0.7456317606272493
Test score is:  0.7019502695957017


In [41]:
# Stack gradient boosting ('gb') with a leaf-size-constrained random forest ('rf').
# The labels now match the estimators they name; they were previously 'lr' and 'dt'.
train_and_test_model(StackingRegressor([('gb', GradientBoostingRegressor()),('rf', RandomForestRegressor(min_samples_leaf=15))]), X_train, y_train, X_test, y_test)

  y = column_or_1d(y, warn=True)


Train score is:  0.7963628956468634
Test score is:  0.7317843532134196


In [43]:
from sklearn.neural_network import *

In [49]:
#train_and_test_model(MLPRegressor(hidden_layer_sizes=[8,4], max_iter=2000), X_train, y_train, X_test, y_test)

  y = column_or_1d(y, warn=True)


Train score is:  0.6801621009353542
Test score is:  0.6407117724294851




In [None]:
# TODO: run cross-validation to check the scores.

# TODO: assess how important each feature is, using the model coefficients.

