In [145]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's legacy global RNG. np.random.seed() returns None, so the seed
# value is kept in its own variable to make `seed` usable by later code.
seed = 0
np.random.seed(seed)

# Notebook-wide pandas configuration: silence the chained-assignment warning
# and widen the console display limits.
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 3000)


In [146]:
def _attach_buildings(apartments, buildings):
    """Left-join building attributes onto apartment rows via `building_id`."""
    return pd.merge(apartments, buildings.set_index('id'), how='left', left_on='building_id', right_index=True)


# --- Raw tables -------------------------------------------------------------
train_data_apartments = pd.read_csv("../input/housing-prices/data/apartments_train.csv")
train_data_buildings = pd.read_csv("../input/housing-prices/data/buildings_train.csv")
test_data_apartments = pd.read_csv("../input/housing-prices/data/apartments_test.csv")
test_data_buildings = pd.read_csv("../input/housing-prices/data/buildings_test.csv")

# Auxiliary tables: metro station coordinates and the Sberbank data set.
metro_coordinates = pd.read_csv("../input/housing-prices/data/metro.csv")
add_data = pd.read_csv("../input/housing-prices/data/sberbank.csv")
add_data_sub_areas = pd.read_csv("../input/housing-prices/data/sberbank_sub_areas.csv")

# --- Training frame ---------------------------------------------------------
train_data = _attach_buildings(train_data_apartments, train_data_buildings)
num_of_training_samples = train_data.shape[0]
# Don't drop outliers, as it worsens submit performance
y_train = np.asarray(train_data["price"])

# --- Test frame -------------------------------------------------------------
test_data = _attach_buildings(test_data_apartments, test_data_buildings)
# Keep the apartment ids for the submission before the column is dropped.
test_ids = test_data["id"]

train_data = train_data.drop(columns="id")
test_data = test_data.drop(columns="id")

print("Amount of duplicates in training data: ", train_data.duplicated().sum())


In [147]:
# List every column name available in the auxiliary Sberbank table.
print(add_data.columns.values)

In [148]:
# Stack train and test so feature engineering is applied to both at once.
# NOTE: the original row labels are kept, so labels repeat between the train
# and the test part; use positional or boolean-mask access on `all_data`.
all_data = pd.concat([train_data, test_data])

plt.plot(all_data.ceiling)
print("Amount of ceilings higher than 200m: ", len(all_data.ceiling[all_data.ceiling >200]))
print("Amount of ceilings between 25m and 200m: ", len(all_data.ceiling[(all_data.ceiling > 25) & (all_data.ceiling < 200)]))

# Rescaling out of scale ceilings. The work is done on a plain array and the
# whole column is written back, instead of chained assignment
# (`all_data.ceiling[mask] = ...`), which only works through a cached view and
# silently does nothing under pandas copy-on-write.
ceiling_values = all_data["ceiling"].to_numpy(dtype=float, copy=True)
far_too_high = ceiling_values > 200
ceiling_values[far_too_high] = ceiling_values[far_too_high] / 100
# Evaluated after the first rescale on purpose: a value such as 2600 becomes
# 26 and is then scaled once more, exactly like the two sequential statements
# this replaces.
too_high = (ceiling_values > 25) & (ceiling_values < 200)
ceiling_values[too_high] = ceiling_values[too_high] / 10
all_data["ceiling"] = ceiling_values

In [149]:
def haversine_array(lat1, lng1, lat2 = 55.75, lng2 = 37.6):
    """Great-circle (haversine) distance in kilometres between two points.

    All four arguments are angles in degrees and may be scalars or NumPy /
    pandas array-likes; the arithmetic is element-wise, so the usual
    broadcasting rules apply. `lat2`/`lng2` default to the fixed reference
    point (55.75, 37.6).

    Returns the distance(s) computed with a mean earth radius of 6371 km.
    """
    earth_radius_km = 6371
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1
    # Haversine term: squared half-chord length between the two points.
    half_chord_sq = np.sin(delta_phi * 0.5) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam * 0.5) ** 2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(half_chord_sq))

In [150]:
def euclid_dist_from_center(lat, long, lat2 = 55.75, long2 = 37.6):
    """Straight-line distance between two points in raw coordinate units.

    The result is the plain Euclidean norm of the coordinate differences
    (no spherical correction). `lat2`/`long2` default to the fixed reference
    point (55.75, 37.6).
    """
    lat_gap = np.abs(lat2 - lat)
    long_gap = np.abs(long2 - long)
    squared_distance = np.power(lat_gap, 2) + np.power(long_gap, 2)
    return np.sqrt(squared_distance)

In [151]:
# Assign every apartment the sub-area whose reference coordinate is closest
# (haversine distance). The pairwise shape is derived from the data instead of
# a hard-coded row count, so the cell keeps working when the data size changes.
amount_of_sub_areas = len(add_data_sub_areas)
amount_of_apartments = len(all_data)
pairwise_shape = (amount_of_apartments, amount_of_sub_areas)

# Apartment coordinates as columns, repeated once per sub-area.
broadcasted_lat = np.broadcast_to(np.expand_dims(np.asarray(all_data.latitude), axis = 1), pairwise_shape)
broadcasted_long = np.broadcast_to(np.expand_dims(np.asarray(all_data.longitude), axis = 1), pairwise_shape)

# IN THE CSV LATITUDE AND LONGITUDE HAVE BEEN MIXED UP!
# Hence the "latitude" column is read as longitude and vice versa.
broadcasted_sub_areas_long = np.broadcast_to(np.expand_dims(np.asarray(add_data_sub_areas.latitude), axis=1).T, pairwise_shape)
broadcasted_sub_areas_lat = np.broadcast_to(np.expand_dims(np.asarray(add_data_sub_areas.longitude), axis=1).T, pairwise_shape)

# NOTE(review): apartment coordinates are only repaired in a later cell, so
# rows whose coordinates are still NaN here produce an all-NaN distance row,
# for which np.argmin returns index 0 (i.e. the first sub-area) — confirm this
# is acceptable or move the coordinate fixes before this step.
distance_to_sub_areas = haversine_array(broadcasted_lat, broadcasted_long, broadcasted_sub_areas_lat, broadcasted_sub_areas_long)
idx_of_closest_sub_area = np.argmin(distance_to_sub_areas, axis = 1)
all_data["sub_area"] = add_data_sub_areas.sub_area.iloc[idx_of_closest_sub_area].values
print(len(np.unique(all_data.sub_area)))

In [152]:
# Visual sanity check of the sub-area assignment for two neighbouring areas:
# first the apartments assigned to each area, then the reference coordinates
# of the areas themselves.
apartments_pervomajskoe = all_data[all_data.sub_area == "Poselenie Pervomajskoe"]
apartments_peredelkino = all_data[all_data.sub_area == "Novo-Peredelkino"]
reference_pervomajskoe = add_data_sub_areas[add_data_sub_areas.sub_area == "Poselenie Pervomajskoe"]
reference_peredelkino = add_data_sub_areas[add_data_sub_areas.sub_area == "Novo-Peredelkino"]

plt.scatter(apartments_pervomajskoe.latitude, apartments_pervomajskoe.longitude)
plt.scatter(apartments_peredelkino.latitude, apartments_peredelkino.longitude)

# The reference table's columns are passed in (longitude, latitude) order; the
# cell that assigns sub-areas reads the same two columns swapped as well.
plt.scatter(reference_pervomajskoe.longitude, reference_pervomajskoe.latitude)
plt.scatter(reference_peredelkino.longitude, reference_peredelkino.latitude)

In [153]:
# Per-sub-area averages from the Sberbank data, attached to every apartment
# through its `sub_area`.
#
# Candidate columns that were tried and are currently switched off:
#   raion_popul, green_zone_part, indust_part, preschool_quota,
#   children_preschool, preschool_education_centers_raion, children_school,
#   school_quota, school_education_centers_raion,
#   school_education_centers_top_20_raion, university_top_20_raion,
#   culture_objects_top_25_raion, shopping_centers_raion, office_raion
sub_area_features = [
    "hospital_beds_raion",
    "healthcare_centers_raion",
    "railroad_km",
    "railroad_station_walk_min",
    "metro_min_avto",
    "metro_km_avto",
    "cafe_count_5000",
    "public_healthcare_km",
    "public_transport_station_km",
    "theater_km",
    "kindergarten_km",
]

# Aggregate once, and only over the columns that are needed: averaging the
# whole frame was repeated for every feature and raises on non-numeric columns
# in pandas >= 2.0.
sub_area_means = add_data.groupby("sub_area")[sub_area_features].mean()

for feature_name in sub_area_features:
    all_data[feature_name] = all_data.sub_area.map(sub_area_means[feature_name])

In [154]:
# Inspect the `material` column. It used to be printed twice, around a
# filtering statement that is commented out, so both dumps were identical.
#all_data.material = all_data.material[all_data.material == 1]
print(all_data.material)

In [155]:
# Unifying street and address
all_data["street_and_address"] = all_data.street + " " + all_data.address
# Adding shared and private bathrooms
all_data["bathrooms"] = all_data.bathrooms_shared  + all_data.bathrooms_private
# Adding balconies and loggias
all_data["balconies_and_loggias"] = all_data.balconies + all_data.loggias

# Imputing coordinates outside of moscow and NaNs.
# Each address maps to the (latitude, longitude) written to all of its rows.
#   Бунинские Луга ЖК к2/2/1 = Ulitsa Aleksandry Monakhovoy, 97
#   пос. Коммунарка Москва А101 ЖК is the address whose coordinates were NaN
# NOTE(review): the earlier note for "улица Центральная 75"/"48" listed
# ESTIMATED coordinates 55.750651, 37.6083208, but the values actually
# assigned are the Бунинские Луга ones; kept unchanged — confirm the intent.
coordinate_fixes = {
    "Бунинские Луга ЖК к2/2/1": (55.5415152, 37.4821752),
    "Бунинские Луга ЖК к2/2/2": (55.5415152, 37.4821752),
    "улица 1-я Линия 57": (55.6324711, 37.4536057),
    "улица Центральная 75": (55.5415152, 37.4821752),
    "улица Центральная 48": (55.5415152, 37.4821752),
    "пос. Коммунарка Москва А101 ЖК": (55.5676692, 37.4816608),
}
for fixed_address, (fixed_latitude, fixed_longitude) in coordinate_fixes.items():
    at_address = all_data.street_and_address == fixed_address
    # .loc writes into the frame itself; chained assignment
    # (`all_data.latitude[mask] = ...`) is silently lost under copy-on-write.
    all_data.loc[at_address, "latitude"] = fixed_latitude
    all_data.loc[at_address, "longitude"] = fixed_longitude

# New feature: haversine (great-circle) distance in km from the city center
all_data["dist_from_city_center"] = haversine_array(all_data.latitude, all_data.longitude)

# Drop the raw columns that were merged above and the attributes not used.
all_data = all_data.drop(["address", "street", "bathrooms_shared", "bathrooms_private", "balconies", "loggias",
"windows_court", "windows_street", "elevator_service", "elevator_passenger", "garbage_chute", 
"layout", "parking", "heating", "elevator_without", "new"], axis = 1)
print(all_data.keys())


In [156]:
# Distance from every apartment to every metro station. The pairwise shape is
# derived from the data rather than hard-coded as (33222, 268).
pairwise_shape = (len(all_data), len(metro_coordinates))

broadcasted_lat = np.broadcast_to(np.expand_dims(np.asarray(all_data.latitude), axis = 1), pairwise_shape)
broadcasted_long = np.broadcast_to(np.expand_dims(np.asarray(all_data.longitude), axis = 1), pairwise_shape)

broadcasted_metro_lat = np.broadcast_to(np.expand_dims(np.asarray(metro_coordinates.latitude), axis=1).T, pairwise_shape)
broadcasted_metro_long = np.broadcast_to(np.expand_dims(np.asarray(metro_coordinates.longitude), axis=1).T, pairwise_shape)

# Computed once and reused for both the minimum and its position.
dist_to_metros = haversine_array(broadcasted_lat, broadcasted_long, broadcasted_metro_lat, broadcasted_metro_long)

dist_to_closest_metro = np.min(dist_to_metros, axis = 1)
idx_of_closest_metros = np.argmin(dist_to_metros, axis = 1)
print(idx_of_closest_metros)

closest_metro_lat = metro_coordinates.latitude[idx_of_closest_metros].values
closest_metro_long = metro_coordinates.longitude[idx_of_closest_metros].values
print(closest_metro_lat)

# Feature: distance to the nearest station multiplied by that station's
# distance to the city center (haversine_array's default reference point).
dist_of_closest_metro_to_center = haversine_array(closest_metro_lat, closest_metro_long)
all_data["metro"] = dist_to_closest_metro*dist_of_closest_metro_to_center
print(all_data.metro)

# Idea kept for later: number of stations within 5 km,
#all_data["metros_in_5km"] = np.sum(dist_to_metros <= 5, axis = 1)


In [157]:
# Manually assigned districts, keyed by the exact coordinate of the affected
# rows. (The commented-out inspection this replaces printed the coordinates of
# rows with a NaN district and the per-district mean coordinates.)
district_by_coordinate = {
    (55.59516, 37.741109): 5.0,
    (55.5676692, 37.4816608): 11.0,
    (55.921627, 37.781578): 3.0,
    (55.5415152, 37.4821752): 11.0,
    (55.6324711, 37.4536057): 6.0,
    (55.583551, 37.711356): 5.0,
    (55.932127, 37.793705): 3.0,
}

for (known_latitude, known_longitude), known_district in district_by_coordinate.items():
    # A tiny absolute tolerance replaces `==`, so a last-digit difference
    # introduced by float parsing cannot make the match miss; NaN coordinates
    # never match.
    at_coordinate = (
        np.isclose(all_data.latitude, known_latitude, rtol=0, atol=1e-9)
        & np.isclose(all_data.longitude, known_longitude, rtol=0, atol=1e-9)
    )
    # .loc writes into the frame itself; chained assignment
    # (`all_data.district[mask] = ...`) is silently lost under copy-on-write.
    all_data.loc[at_coordinate, "district"] = known_district

In [158]:
# Build the duplicate-free frame used to fit imputers.
# Duplicates are removed with a boolean mask: row labels repeat between the
# train and test part of `all_data`, so dropping by label would also remove
# non-duplicated rows that merely share a label.
is_duplicate_row = all_data.duplicated()
print("Amount of duplicates in all data: ", is_duplicate_row.sum())
# Drop dups from imputation data
all_data_no_dups = all_data[~is_duplicate_row]
print("Amount of duplicates in all data without duplicates: ", all_data_no_dups.duplicated().sum())

#print((all_data[all_data.duplicated(subset = ["price", "street_and_address", "area_total"], keep = False)].sort_values(["price", "street_and_address", "area_total"])))
# The target is removed from the de-duplicated frame. This used to start again
# from `all_data`, which discarded the de-duplication above.
all_data_no_dups = all_data_no_dups.drop("price", axis = 1)
print(len(all_data))


In [159]:
import category_encoders as ce

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder

# Column groups used by the preprocessing steps.
categorical_columns = ["seller", "condition", "material", "street_and_address"]
numerical_columns = ["area_total", "area_living", "area_kitchen", "bathrooms", "floor", "rooms", "ceiling", "phones", "balconies_and_loggias", "constructed", "stories"]
boolean_columns = []

# Encode the string-valued columns into integers. One encoder instance is
# refitted per column, so afterwards it only holds the classes of the column
# encoded last ("sub_area").
string_encoder = LabelEncoder()
for text_column in ("street_and_address", "sub_area"):
    all_data[text_column] = string_encoder.fit_transform(all_data[text_column])

# Most-frequent imputer for the categorical columns. It is only fitted here
# (on the de-duplicated frame); nothing in this cell applies it to `all_data`.
categorical_imputer = SimpleImputer(missing_values = np.nan, strategy = "most_frequent")
categorical_imputer.fit(all_data_no_dups[categorical_columns])

# An IterativeImputer for the numerical columns was tried and is disabled.

print(len(all_data))


In [160]:
# Replace the total area by its natural logarithm. From here on `area_total`
# holds log(area); re-running this cell would apply the log a second time.
all_data["area_total"] = np.log(all_data["area_total"])

In [161]:
from sklearn.neighbors import BallTree
# Spatial index over the raw (latitude, longitude) pairs; for every row, look
# up its 1000 nearest rows. `dist` holds the distances, `ind` the positions.
coordinate_columns = all_data[["latitude", "longitude"]]
tree = BallTree(coordinate_columns)
dist, ind = tree.query(coordinate_columns, k=1000)

In [162]:
# Mean price/area ratio over each row's nearest neighbours (`ind`).
# The ratio is computed once for the whole frame instead of slicing two Series
# with .iloc and dividing them again for every row.
# `area_total` was log-transformed in an earlier cell, so the ratio is
# price / log(area) despite the feature's name.
price_area_ratio = (all_data.price / all_data.area_total).to_numpy()

# np.nanmean ignores rows without a price (NaN), matching the NaN-skipping
# mean of a pandas Series; a neighbourhood with no known price yields NaN.
# NOTE(review): the neighbour lists come from querying the tree with its own
# points, so every row with a known price contributes its own target value to
# its feature (target leakage) — consider excluding the row itself.
mean_sqm_price_of_cluster = [np.nanmean(price_area_ratio[neighbour_positions]) for neighbour_positions in ind]

all_data["mean_sqm_price_of_cluster"] = mean_sqm_price_of_cluster

In [163]:
# Mark every remaining missing value with the sentinel -999.
all_data = all_data.fillna(-999)

In [164]:
# Split the stacked frame back into its train and test parts by position.
# Explicit copies are taken because both parts get new columns assigned later;
# without them those writes would target slices of `all_data`.
train_data = all_data.iloc[:num_of_training_samples].copy()
test_data = all_data.iloc[num_of_training_samples:].copy()

In [165]:

#print(test_data.head(100))

print("Amount of duplicates in train data: ", len(train_data[train_data.duplicated()]))
# Keep the first occurrence of every duplicated training row.
train_data = train_data[~train_data.duplicated()]

# Mean price and mean (log) area per district. Only the two needed columns are
# aggregated instead of averaging the whole frame.
price_per_district = train_data.groupby("district")["price"].mean()
area_per_district = train_data.groupby("district")["area_total"].mean()

# Bucket the distance from the city center into 150 quantile bins learned on
# the training rows, then apply the same bin edges to the test rows.
# .assign returns new frames, so no slice of `all_data` is written to.
train_distance_bins, bins = pd.qcut(train_data.dist_from_city_center, q = 150, retbins = True)
train_data = train_data.assign(price_per_sq_dist_cat = train_distance_bins)
test_data = test_data.assign(price_per_sq_dist_cat = pd.cut(test_data.dist_from_city_center, bins = bins))

price_per_dist = train_data.groupby("price_per_sq_dist_cat", observed = False)["price"].mean()
area_per_dist = train_data.groupby("price_per_sq_dist_cat", observed = False)["area_total"].mean()

# Log of (mean price / mean area) per district (`a`) and per distance bin (`b`).
a = np.log(price_per_district/area_per_district)
b = np.log(price_per_dist/area_per_dist)

# NOTE(review): both encodings are derived from the target over the complete
# training set before any validation split is made, so validation scores
# computed later see information from their own targets.
train_data["district"] = train_data["district"].map(a).astype(float)
test_data["district"] = test_data["district"].map(a).astype(float)

# Test rows whose bin has no counterpart among the training bins map to NaN.
train_data["sqm_per_dist"] = train_data["price_per_sq_dist_cat"].map(b).astype(float)
test_data["sqm_per_dist"] = test_data["price_per_sq_dist_cat"].map(b).astype(float)

train_data = train_data.drop("price_per_sq_dist_cat", axis = 1)
test_data = test_data.drop("price_per_sq_dist_cat", axis = 1)

In [166]:
# Test rows whose distance falls outside the training bins have no
# `sqm_per_dist` yet: give them the value of the nearest (first / last) bin.
lowest_bin = b.index[0]
highest_bin = b.index[-1]


def _count_unfilled(frame):
    """Return how many rows are still NaN below the first / above the last bin."""
    still_missing = frame.sqm_per_dist.isna()
    below = still_missing & (frame.dist_from_city_center < lowest_bin.right)
    above = still_missing & (frame.dist_from_city_center > highest_bin.left)
    return int(below.sum()), int(above.sum())


# Counts before the fill.
for unfilled_count in _count_unfilled(test_data):
    print(unfilled_count)

# The former district fallback was removed: its condition combined
# `isnan(district)` with `district < ...` / `district > ...`, which is never
# true for a NaN, so the two statements could not change any row.

# Positional access (.iloc) is explicit; plain `b[0]` on this non-integer
# index relies on a deprecated positional fallback. .loc replaces chained
# assignment so the write always lands in `test_data`.
below_first_bin = test_data.sqm_per_dist.isna() & (test_data.dist_from_city_center < lowest_bin.right)
test_data.loc[below_first_bin, "sqm_per_dist"] = b.iloc[0]
above_last_bin = test_data.sqm_per_dist.isna() & (test_data.dist_from_city_center > highest_bin.left)
test_data.loc[above_last_bin, "sqm_per_dist"] = b.iloc[-1]

# Counts after the fill (previously the "before" counts were printed twice).
for unfilled_count in _count_unfilled(test_data):
    print(unfilled_count)



In [167]:
# Columns excluded from the model input, shared by the train and test frames.
excluded_features = ["building_id", "material", "phones", "seller", "healthcare_centers_raion", "district", "metro_km_avto"]

# Targets: the price plus the (log) total area of each training row.
y_train = train_data[["price", "area_total"]]

# Building ids are kept aside; they serve as groups for the validation split.
x_train_building_id = train_data.building_id
x_train = train_data.drop(["price"] + excluded_features, axis = 1)

# Same selection for the test rows; their area is kept aside as well.
test_area = test_data.area_total
test_data = test_data.drop(["price"] + excluded_features, axis = 1)

In [168]:
# Show the final list of model input columns.
print(x_train.columns)


In [169]:
from tensorflow.keras.losses import mean_squared_logarithmic_error
from tensorflow.keras.backend import sqrt

def root_mean_squared_log_error(y_true, y_pred):
    """Return the square root of Keras' mean squared logarithmic error.

    Both arguments are forwarded unchanged to
    `mean_squared_logarithmic_error`; the result is whatever the Keras
    backend `sqrt` returns for it.
    """
    msle = mean_squared_logarithmic_error(y_true, y_pred)
    return sqrt(msle)

In [170]:

from sklearn.ensemble import RandomForestRegressor

# Random forest baseline.
# `criterion` and `max_features` are deliberately left at their defaults: the
# spellings used before ('mse' and 'auto') were removed from recent
# scikit-learn releases, while the defaults select the same behaviour
# (squared-error splits, all features considered) in old and new versions.
model2 = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
#    min_samples_split=2,
#    min_samples_leaf=2,
    min_weight_fraction_leaf=0.0,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=42,
    verbose=0,
    warm_start=False,
    ccp_alpha=0.0,
    max_samples=None
)

In [184]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, ElasticNet, RidgeCV, Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, BaggingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline

from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from lightgbm import LGBMRegressor

# Gradient-boosting base learners and the stacking ensemble built from them.
#
# `enable_categorical` is no longer passed to the LightGBM models: it is an
# XGBoost option, not a LightGBM parameter, so LightGBM only reported it as
# unknown (hidden by verbose=-1) without using it.
# NOTE(review): LightGBM applies `subsample` only when `subsample_freq` is
# positive; with the default frequency the 0.7 below presumably has no
# effect — confirm against the LightGBM parameter documentation.
final_lgbm = LGBMRegressor(
    objective="regression",
    num_leaves=16,
    max_depth=4, 
    random_state=42, 
    n_jobs=-2, 
    n_estimators=1000,
    colsample_bytree=1,
    subsample=1,
    verbose = -1
)

model5 = LGBMRegressor(
    objective="regression",
    num_leaves=32,
    max_depth=5, 
    random_state=42, 
    n_jobs=-2, 
    n_estimators=5000,
    colsample_bytree=0.7,
    subsample=0.7,
    verbose = -1
)

model_xgb_1 = XGBRegressor(
    #objective = "reg:logistic",
    n_estimators=1500,
    max_depth=5,
    n_jobs=-2,
    booster='gbtree',
    enable_categorical = True,
    colsample_bytree=0.7,
    subsample=0.7,
    learning_rate = 0.1,
    random_state=42069,
)

model_cat_1 = CatBoostRegressor(
    #loss_function="MAE",
    iterations=2000,
    depth = 7,
    learning_rate=0.1,
    l2_leaf_reg=0.45,
    silent=True,
    random_seed = 42069,
)

# Meta-learner that combines the base learners' predictions.
final_model = RidgeCV()


# Active base learners; the commented entries are earlier experiments.
base_learners = [
    ('xgb_tree', model_xgb_1),
    #("random_forest", model2),
    ("lgbm", model5),
    # ('knn_256', model3),
    #('knn_512', model_knn_512),
    # ('knn_1024', model5),
    ('catboost', model_cat_1),
]

stacking_model = StackingRegressor(estimators=base_learners, n_jobs=-2, final_estimator=final_model, cv = 5, verbose = 1)

In [188]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.base import clone

# Plain K-fold splitter; not used by the loop below, which splits by building.
kf = KFold(random_state = 42069, shuffle = True)

models = []
errors = []
predictions_test = []
train_idxs = []
test_idxs = []

# Grouped validation: all apartments of one building land on the same side of
# a split, 80% of the buildings being used for fitting in each of the 5 splits.
g = GroupShuffleSplit(n_splits = 5, train_size = 0.8, random_state = 42069)
for train_index, test_index in g.split(x_train, groups = x_train_building_id):

    train_idxs.append(train_index)
    test_idxs.append(test_index)

    print("TRAIN:", train_index, "TEST:", test_index)
    x_train_fold, x_test_fold = x_train.iloc[train_index], x_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    # The model is fitted on the log of the price; predictions are mapped back
    # with exp() below.
    y_train_fold = np.log(y_train_fold.price)#/y_train_fold.area_total)

    # A fresh clone is fitted per split. Fitting `stacking_model` itself
    # returns that same object every time, so `models` ended up holding five
    # references to the estimator fitted on the last split.
    current_model = clone(stacking_model).fit(x_train_fold, (y_train_fold))
    models.append(current_model)
    predictions_val = np.exp(current_model.predict(x_test_fold))#*y_test_fold.area_total
    error = root_mean_squared_log_error(y_test_fold.price, predictions_val)
    errors.append(error)

    # Test-set prediction of this split's model; averaged over splits later.
    predictions_test.append(np.exp(current_model.predict(test_data)))#*test_area)

    print("Error: ", error)
print("Mean error: ", np.mean(errors))

In [187]:
# Average the per-split test predictions element-wise into one prediction per
# test apartment.
prediction = np.asarray(predictions_test).mean(axis = 0)
print(prediction)

# Submission table: apartment id next to its predicted price.
submission = pd.DataFrame({'id': test_ids, "price_prediction": prediction})
#submission.to_csv('simple_nn_submission.csv', index=False)