In [1]:
import pandas as pd


# Read the listings CSV into a DataFrame and preview its first rows.
csv_path = "data/AB_NYC_2019.csv"
df = pd.read_csv(csv_path)

df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [2]:
# Class balance of the column that is later used as the prediction target.
room_type_column = df["room_type"]
room_type_column.value_counts()

room_type
Entire home/apt    25409
Private room       22326
Shared room         1160
Name: count, dtype: int64

In [3]:
# Remove the identifier, name and fine-grained neighbourhood columns.
# errors="ignore" keeps this cell idempotent: because `df` is rebound to the
# reduced frame, a second run would otherwise raise KeyError for columns that
# have already been dropped.
df = df.drop(
    columns=[
        "id",
        "name",
        "host_id",
        "host_name",
        "neighbourhood",
    ],
    errors="ignore",
)

# Summarise the remaining columns, their dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   neighbourhood_group             48895 non-null  object 
 1   latitude                        48895 non-null  float64
 2   longitude                       48895 non-null  float64
 3   room_type                       48895 non-null  object 
 4   price                           48895 non-null  int64  
 5   minimum_nights                  48895 non-null  int64  
 6   number_of_reviews               48895 non-null  int64  
 7   last_review                     38843 non-null  object 
 8   reviews_per_month               38843 non-null  float64
 9   calculated_host_listings_count  48895 non-null  int64  
 10  availability_365                48895 non-null  int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 4.1+ MB


In [4]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load the raw data
df = pd.read_csv("data/AB_NYC_2019.csv")

# Work on a copy so the raw frame stays untouched
df_prep = df.copy()

# Encode the target classes (room_type) as integer labels
le = LabelEncoder()
df_prep["room_type"] = le.fit_transform(df_prep["room_type"])

# Numeric feature columns that get imputed and standardised
numeric_columns = [
    "price",
    "minimum_nights",
    "number_of_reviews",
    "reviews_per_month",
    "calculated_host_listings_count",
    "availability_365",
    "latitude",
    "longitude",
]

# Decide the train/test membership BEFORE estimating any statistics, so the
# imputation means and the scaler never see test rows (no data leakage).
# Splitting row positions with the same test_size / random_state selects the
# same rows as splitting the feature matrix itself would.
row_positions = pd.RangeIndex(len(df_prep)).to_numpy()
train_idx, test_idx = train_test_split(
    row_positions, test_size=0.2, random_state=42
)

# Fill missing values with the column means of the training rows only
train_means = df_prep.iloc[train_idx][numeric_columns].mean()
df_prep[numeric_columns] = df_prep[numeric_columns].fillna(train_means)

# Standardise numeric features: fit on training rows, apply to every row
scaler = StandardScaler()
scaler.fit(df_prep.iloc[train_idx][numeric_columns])
df_prep[numeric_columns] = scaler.transform(df_prep[numeric_columns])

# One-hot encode neighbourhood_group with pandas.
# dtype=float: the default bool dummies mixed with float64 columns would make
# DataFrame.to_numpy() return a dtype=object matrix.
neighbourhood_group_encoded = pd.get_dummies(
    df_prep["neighbourhood_group"], prefix="neighbourhood_group", dtype=float
)

# Replace the original column with its encoded counterparts
df_prep = pd.concat(
    [df_prep.drop("neighbourhood_group", axis=1), neighbourhood_group_encoded], axis=1
)

# Build the feature matrix and the target vector
X = df_prep.drop(
    ["room_type", "id", "name", "host_id", "host_name", "neighbourhood", "last_review"],
    axis=1,
)
y = df_prep["room_type"]
X = X.to_numpy(dtype=float)
y = y.to_numpy()

# Materialise the split chosen above
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print("Размер обучающей выборки:", X_train.shape)
print("Размер тестовой выборки:", X_test.shape)

Размер обучающей выборки: (39116, 13)
Размер тестовой выборки: (9779, 13)


In [5]:
from random_forest import RandomForestClassifier


# Train the project's own random forest on the training split.
cls = RandomForestClassifier(
    n_estimators=100,
    bootstrap_size=0.5,
    min_oob_score=0.5,
)
cls.fit(X_train, y_train)

<random_forest.RandomForestClassifier at 0x11fd786e0>

In [6]:
# Display the fitted model's predictions for the test features.
cls.predict(X_test)

array([1, 1, 0, ..., 1, 0, 1], shape=(9779,))

In [7]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Score the fitted custom forest on the held-out test split.
y_pred = cls.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Точность модели: {accuracy:.4f}")

# Per-class precision / recall / F1
report = classification_report(y_test, y_pred)
print("\nОтчет о классификации:", report, sep="\n")

Точность модели: 0.8550

Отчет о классификации:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      5029
           1       0.84      0.85      0.85      4509
           2       0.87      0.28      0.42       241

    accuracy                           0.85      9779
   macro avg       0.86      0.67      0.71      9779
weighted avg       0.86      0.85      0.85      9779



In [12]:
# Candidate values for each hyper-parameter of the custom forest
n_estimators_values = [50, 100, 300]
bootstrap_size_values = [0.5, 0.7]
min_oob_score_values = [0.5, 0.7]

# Every combination, in the order: n_estimators -> bootstrap_size -> min_oob_score
param_grid = [
    (n_est, bs_size, min_oob_score)
    for n_est in n_estimators_values
    for bs_size in bootstrap_size_values
    for min_oob_score in min_oob_score_values
]

# One record per evaluated combination
results = []

# NOTE(review): accuracy is measured on X_test / y_test, so the best cell of
# this table is an optimistic estimate if it is also used to pick the model.
for n_est, bs_size, min_oob_score in param_grid:
    print(f"n_estimators: {n_est}, bootstrap_size: {bs_size}, min_oob_score: {min_oob_score}")

    # Fit a fresh model with the current settings
    clf = RandomForestClassifier(n_estimators=n_est, bootstrap_size=bs_size, min_oob_score=min_oob_score)
    clf.fit(X_train, y_train)

    # Predict and score
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    results.append(
        dict(n_estimators=n_est, bootstrap_size=bs_size, min_oob_score=min_oob_score, accuracy=accuracy)
    )

# Collect the records into a frame
results_df = pd.DataFrame(results)

# Rows: n_estimators; columns: (bootstrap_size, min_oob_score)
pivot_table = results_df.pivot(
    index="n_estimators", columns=["bootstrap_size", "min_oob_score"], values="accuracy"
)

# Print the rounded table
print("Точность модели для разных параметров:")
print("\nbootstrap_size →")
print(pivot_table.round(4))

n_estimators: 50, bootstrap_size: 0.5, min_oob_score: 0.5
n_estimators: 50, bootstrap_size: 0.5, min_oob_score: 0.7
n_estimators: 50, bootstrap_size: 0.7, min_oob_score: 0.5
n_estimators: 50, bootstrap_size: 0.7, min_oob_score: 0.7
n_estimators: 100, bootstrap_size: 0.5, min_oob_score: 0.5
n_estimators: 100, bootstrap_size: 0.5, min_oob_score: 0.7
n_estimators: 100, bootstrap_size: 0.7, min_oob_score: 0.5
n_estimators: 100, bootstrap_size: 0.7, min_oob_score: 0.7
n_estimators: 300, bootstrap_size: 0.5, min_oob_score: 0.5
n_estimators: 300, bootstrap_size: 0.5, min_oob_score: 0.7
n_estimators: 300, bootstrap_size: 0.7, min_oob_score: 0.5
n_estimators: 300, bootstrap_size: 0.7, min_oob_score: 0.7
Точность модели для разных параметров:

bootstrap_size →
bootstrap_size     0.5             0.7        
min_oob_score      0.5     0.7     0.5     0.7
n_estimators                                  
50              0.8535  0.8516  0.8563  0.8541
100             0.8581  0.8563  0.8590  0.8558
300 

In [13]:
from sklearn.ensemble import RandomForestClassifier as SklearnRF

# Fit scikit-learn's random forest as a reference implementation.
sklearn_clf = SklearnRF(
    n_estimators=100,
    max_samples=0.5,
    random_state=42,
)
sklearn_clf.fit(X_train, y_train)

# Predict on the held-out test split
y_pred_sklearn = sklearn_clf.predict(X_test)

# Overall accuracy
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)
print(f"Точность модели sklearn: {accuracy_sklearn:.4f}")

# Per-class precision / recall / F1
sklearn_report = classification_report(y_test, y_pred_sklearn)
print("\nОтчет о классификации sklearn:", sklearn_report, sep="\n")

Точность модели sklearn: 0.8593

Отчет о классификации sklearn:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      5029
           1       0.85      0.85      0.85      4509
           2       0.92      0.25      0.39       241

    accuracy                           0.86      9779
   macro avg       0.88      0.67      0.71      9779
weighted avg       0.86      0.86      0.85      9779



In [None]:
from sklearn.model_selection import cross_val_score
import time
import numpy as np


from sklearn.model_selection import StratifiedKFold

# Settings shared by both implementations
n_estimators = 100
bootstrap_size = 0.5
n_folds = 5

# One splitter for both models, so each implementation is trained and scored
# on exactly the same folds of the training data. Previously the custom forest
# was refitted on all of X_train and scored on X_test while sklearn was
# cross-validated inside X_train, which made both accuracy and timing
# incomparable.
cv = StratifiedKFold(n_splits=n_folds)

# Compare fit time and quality
print("Сравнение реализаций Random Forest:")
print("-" * 50)

# Our implementation: manual cross-validation loop over the shared folds
start_time = time.perf_counter()
our_scores = []
for train_idx, val_idx in cv.split(X_train, y_train):
    # Fit a fresh model on this fold's training part
    our_clf = RandomForestClassifier(
        n_estimators=n_estimators, bootstrap_size=bootstrap_size
    )
    our_clf.fit(X_train[train_idx], y_train[train_idx])
    # Score it on this fold's validation part
    our_scores.append(
        accuracy_score(y_train[val_idx], our_clf.predict(X_train[val_idx]))
    )
our_time = time.perf_counter() - start_time

print("Наша реализация:")
print(f"Среднее качество: {np.mean(our_scores):.4f} ± {np.std(our_scores):.4f}")
print(f"Время обучения: {our_time:.2f} секунд")

# sklearn implementation: cross_val_score over the same splitter
start_time = time.perf_counter()
sklearn_clf = SklearnRF(
    n_estimators=n_estimators, max_samples=bootstrap_size, random_state=42
)
sklearn_scores = cross_val_score(sklearn_clf, X_train, y_train, cv=cv)
sklearn_time = time.perf_counter() - start_time

print("\nSklearn реализация:")
print(f"Среднее качество: {np.mean(sklearn_scores):.4f} ± {np.std(sklearn_scores):.4f}")
print(f"Время обучения: {sklearn_time:.2f} секунд")

Сравнение реализаций Random Forest:
--------------------------------------------------
