In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [8]:

# === 1. Загрузка данных ===
room_1_csv = '../data/raw/1_room.csv'
room_2_csv = '../data/raw/2_room.csv'
room_3_csv = '../data/raw/3_room.csv'
raw_df = pd.concat([pd.read_csv(room_1_csv), pd.read_csv(room_2_csv), pd.read_csv(room_3_csv)])

In [13]:
# === 2. Предобработка данных ===
df = raw_df[['url', 'floor', 'rooms_count', 'total_meters', 'author_type', 'price']]
def get_url_id(url):
    return url[30:-1]
df['id'] = df['url'].apply(get_url_id)
df = df.drop('url', axis=1)
df = df.set_index('id')
df = df.drop_duplicates()
df = df[df['price'] > 0]  # Удалим строки с нереальной ценой -1


# Категориальные переменные
cat_features = ['author_type']

for col in cat_features:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['id'] = df['url'].apply(get_url_id)


Unnamed: 0_level_0,floor,rooms_count,total_meters,author_type,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
315745436,25,1,60.0,3,75000000
315045740,13,1,37.9,3,12500000
317607506,4,1,26.8,5,8400000
313640113,6,1,40.1,3,35500000
314672030,14,1,40.0,4,12700000
...,...,...,...,...,...
314082897,2,4,62.2,3,14000000
316585431,22,3,74.2,3,44950000
317369751,7,3,74.0,4,14900000
317640391,9,3,65.6,0,20900000


In [15]:
# === 3. Разделим на X и y ===
X = df.drop('price', axis=1)
y = df['price']

# === 4. Разделение на train/test ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === 5. Обучение моделей ===

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42)
}

# === 6. Оценка моделей ===
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    
    mae = mean_absolute_error(y_test, preds)
    rmse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    
    results[name] = {
        "MAE": round(mae, 2),
        "RMSE": round(rmse, 2),
        "R²": round(r2, 4)
    }

# === 7. Вывод результатов ===
results_df = pd.DataFrame(results).T
print(results_df)

                          MAE          RMSE      R²
Linear Regression  9595622.65  1.822370e+14  0.5571
Random Forest      8686798.69  1.718549e+14  0.5823
XGBoost            8158475.00  1.503276e+14  0.6346
