In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [84]:
df = pd.read_csv("real_estate_novosibirsk.csv")
df

Unnamed: 0,area,area_raw,item_id,type_of_house,floor,floors_in_house,rooms_number,price,district,price_order_id
0,91.0,91.0,198518500076,Кирпичный,3.0,16.0,3,5100000.0,Кировский,1
1,91.0,91.0,198518500076,Кирпичный,3.0,16.0,3,5100000.0,Кировский,2
2,91.0,91.0,198518500076,Кирпичный,3.0,16.0,3,5100000.0,Кировский,3
3,18.7,18.7,257626750244,Кирпичный,4.0,5.0,Студия,1300000.0,Ленинский,1
4,,,367907500565,Кирпичный,4.0,4.0,4,3500000.0,Кировский,1
...,...,...,...,...,...,...,...,...,...,...
124571,33.0,33.0,832255750867,Кирпичный,10.0,10.0,Студия,2950000.0,Заельцовский,2
124572,33.0,33.0,832255750867,Кирпичный,10.0,10.0,Студия,2950000.0,Заельцовский,3
124573,51.0,51.0,833043000375,Панельный,4.0,10.0,2,3300000.0,Октябрьский,1
124574,51.0,51.0,833043000375,Панельный,4.0,10.0,2,3300000.0,Октябрьский,2


In [85]:
df = df.drop_duplicates(subset=["item_id"], keep="last")
df = df.dropna(subset=["area"])

In [86]:
def process_floor(row):
    if row["floor"] == 1:
        return "first"

    if row["floor"] == row["floors_in_house"]:
        return "last"

    return "middle"


df["floor_cat"] = df.apply(process_floor, axis=1)
df["price_per_meter"] = df["price"] / df["area"]
df = df[(df["area"] > 0) & (df["price"] > 0)]

In [87]:
q1 = df["price_per_meter"].quantile(0.25)
q3 = df["price_per_meter"].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
q1, q3, iqr, lower_bound, upper_bound

(54744.52554744526,
 78031.212484994,
 23286.686937548744,
 19814.49514112214,
 112961.24289131712)

In [88]:
filtered_df = df[
    (df["price_per_meter"] >= lower_bound) & (df["price_per_meter"] <= upper_bound)
]

In [89]:
df["price_per_meter"].mean(), filtered_df["price_per_meter"].mean()

(68278.3232953137, 66014.60245121935)

In [90]:
def mape(y_true, prediction):
    return ((y_true - prediction) / y_true).abs().mean()


def fit(df):
    return df["price_per_meter"].mean()


def predict(df, mean_price):
    return mean_price * df["area"]

In [91]:
def get_bounds(df):
    q1 = df["price_per_meter"].quantile(0.25)
    q3 = df["price_per_meter"].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return lower_bound, upper_bound

In [92]:
X, y = df.drop("price", axis=1), df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y)
mean_price = fit(X_train)
prediction_train = predict(X_train, mean_price)
prediction_test = predict(X_test, mean_price)
mape(y_train, prediction_train), mape(y_test, prediction_test)

(0.5864909958462239, 0.4084335780025154)

In [93]:
X, y = df.drop("price", axis=1), df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y)
lower_bound, upper_bound = get_bounds(X_train)
filter = (X_train["price_per_meter"] >= lower_bound) & (
    X_train["price_per_meter"] <= upper_bound
)
X_train = X_train[filter]
y_train = y_train[filter]
mean_price = fit(X_train)
prediction_train = predict(X_train, mean_price)
prediction_test = predict(X_test, mean_price)
mape(y_train, prediction_train), mape(y_test, prediction_test)

(0.23431075034592166, 0.25141536669265413)

In [94]:
def fit_with_district(df):
    mean_price = df["price_per_meter"].mean()
    mean_price_by_district = df.groupby("district")["price_per_meter"].mean()
    return mean_price, mean_price_by_district


def predict_with_district(df, mean_price, mean_price_by_district):
    def get_prediction(row):
        if row["district"] in mean_price_by_district:
            return row["area"] * mean_price_by_district[row["district"]]
        else:
            return row["area"] * mean_price

    return df.apply(get_prediction, axis=1)

In [95]:
X, y = df.drop("price", axis=1), df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y)
lower_bound, upper_bound = get_bounds(X_train)
filter = (X_train["price_per_meter"] >= lower_bound) & (
    X_train["price_per_meter"] <= upper_bound
)
X_train = X_train[filter]
y_train = y_train[filter]
mean_price, mean_price_by_district = fit_with_district(X_train)
prediction_train = predict_with_district(X_train, mean_price, mean_price_by_district)
prediction_test = predict_with_district(X_test, mean_price, mean_price_by_district)
mape(y_train, prediction_train), mape(y_test, prediction_test)

(0.20724926416882886, 0.21928450728338314)

In [96]:
def fit_with_district_type(df):
    mean_price = df["price_per_meter"].mean()
    mean_price_by_district = df.groupby("district")["price_per_meter"].mean()
    mean_price_by_district_type = df.groupby(["district", "type_of_house"])[
        "price_per_meter"
    ].mean()
    return mean_price, mean_price_by_district, mean_price_by_district_type


def predict_with_district_type(
    df, mean_price, mean_price_by_district, mean_price_by_district_type
):
    def get_prediction(row):
        if row["district"] in mean_price_by_district:
            if row["type_of_house"] in mean_price_by_district_type[row["district"]]:
                return (
                    row["area"]
                    * mean_price_by_district_type[row["district"]][row["type_of_house"]]
                )
            return row["area"] * mean_price_by_district[row["district"]]
        return row["area"] * mean_price

    return df.apply(get_prediction, axis=1)

In [97]:
X, y = df.drop("price", axis=1), df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y)
lower_bound, upper_bound = get_bounds(X_train)
filter = (X_train["price_per_meter"] >= lower_bound) & (
    X_train["price_per_meter"] <= upper_bound
)
X_train = X_train[filter]
y_train = y_train[filter]
(
    mean_price,
    mean_price_by_district,
    mean_price_by_district_type,
) = fit_with_district_type(X_train)
prediction_train = predict_with_district_type(
    X_train, mean_price, mean_price_by_district, mean_price_by_district_type
)
prediction_test = predict_with_district_type(
    X_test, mean_price, mean_price_by_district, mean_price_by_district_type
)
mape(y_train, prediction_train), mape(y_test, prediction_test)

(0.19092185004108497, 0.2046326013305992)

In [98]:
def fit_with_district_type_floor(df):
    mean_price = df["price_per_meter"].mean()
    mean_price_by_district = df.groupby("district")["price_per_meter"].mean()
    mean_price_by_district_type = df.groupby(["district", "type_of_house"])[
        "price_per_meter"
    ].mean()
    mean_price_by_district_type_floor = df.groupby(
        ["district", "type_of_house", "floor_cat"]
    )["price_per_meter"].mean()
    return (
        mean_price,
        mean_price_by_district,
        mean_price_by_district_type,
        mean_price_by_district_type_floor,
    )


def predict_with_district_type_floor(
    df,
    mean_price,
    mean_price_by_district,
    mean_price_by_district_type,
    mean_price_by_district_type_floor,
):
    def get_prediction(row):
        if row["district"] in mean_price_by_district_type_floor:
            if (
                row["type_of_house"]
                in mean_price_by_district_type_floor[row["district"]]
            ):
                if (
                    row["floor_cat"]
                    in mean_price_by_district_type_floor[row["district"]][
                        row["type_of_house"]
                    ]
                ):
                    return (
                        row["area"]
                        * mean_price_by_district_type_floor[row["district"]][
                            row["type_of_house"]
                        ][row["floor_cat"]]
                    )
                return (
                    row["area"]
                    * mean_price_by_district_type[row["district"]][row["type_of_house"]]
                )
            return row["area"] * mean_price_by_district[row["district"]]
        return row["area"] * mean_price

    return df.apply(get_prediction, axis=1)

In [99]:
X, y = df.drop("price", axis=1), df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y)
lower_bound, upper_bound = get_bounds(X_train)
filter = (X_train["price_per_meter"] >= lower_bound) & (
    X_train["price_per_meter"] <= upper_bound
)
X_train = X_train[filter]
y_train = y_train[filter]
(
    mean_price,
    mean_price_by_district,
    mean_price_by_district_type,
    mean_price_by_district_type_floor,
) = fit_with_district_type_floor(X_train)
prediction_train = predict_with_district_type_floor(
    X_train,
    mean_price,
    mean_price_by_district,
    mean_price_by_district_type,
    mean_price_by_district_type_floor,
)
prediction_test = predict_with_district_type_floor(
    X_test,
    mean_price,
    mean_price_by_district,
    mean_price_by_district_type,
    mean_price_by_district_type_floor,
)
mape(y_train, prediction_train), mape(y_test, prediction_test)

(0.18599240008693435, 0.21688010060028978)

In [102]:
X, y = df.drop("price", axis=1), df["price"]
lower_bound, upper_bound = get_bounds(X)
filter = (X["price_per_meter"] >= lower_bound) & (X["price_per_meter"] <= upper_bound)
X = X[filter]
y = y[filter]
(
    mean_price,
    mean_price_by_district,
    mean_price_by_district_type,
) = fit_with_district_type(X)
prediction_train = predict_with_district_type(
    X, mean_price, mean_price_by_district, mean_price_by_district_type
)
mape(y, prediction_train)

0.19128490259262002

In [105]:
test_df = pd.read_csv("dataset_521000_13.txt", sep=";")

In [107]:
prediction_test = predict_with_district_type(
    test_df, mean_price, mean_price_by_district, mean_price_by_district_type
)

In [109]:
prediction_test.to_csv("solution.csv", header=False, index=False)