
    Теперь решаем задачу регрессии - предскажем цены на недвижимость. Использовать датасет https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data (train.csv)
    Данных немного, поэтому необходимо использовать 10-fold кросс-валидацию для оценки качества моделей
    Построить случайный лес, вывести важность признаков
    Обучить стекинг как минимум 3х моделей, использовать хотя бы 1 линейную модель и 1 нелинейную
    Для валидации модели 2-го уровня использовать отдельный hold-out датасет, как на занятии
    Показать, что использование ансамблей моделей действительно улучшает качество (стекинг vs другие модели сравнивать на hold-out)
    В качестве решения: Jupyter notebook с кодом, комментариями и графиками, ссылка на гитхаб

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
data = pd.read_csv("house-prices/train.csv")
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [2]:
#Находим категориальные признаки
cat_feat = list(data.dtypes[data.dtypes == object].index)
#закодируем пропущенные значений строкой, факт пропущенного значения тоже может нести в себе информацию
data[cat_feat] = data[cat_feat].fillna('nan').copy()

#отфильтруем непрерывные признаки
num_feat = [f for f in data if f not in (cat_feat + ['Id', 'SalePrice'])]
#Заменяем пропуски на специальное значение -999, чтобы деревья могли их отличить
data[num_feat] = data[num_feat].fillna(-999).copy()

# Смотрим сколько у нас значений по каждому категориальному признаку
cat_nunique = data[cat_feat].nunique()
print("cat_nunique")
for cat, count in cat_nunique.items():
    if count > 20:
        print(f"{cat} : {count}")

cat_nunique
Neighborhood : 25


Максимальное количество значений по одному признаку = 25, что приемлемо

In [3]:
# Создаем дамми-переменные для категорий
dummy = pd.get_dummies(data[cat_feat], columns=cat_feat)

prepared_data = pd.concat([data[num_feat], dummy], axis=1)
prepared_data

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,62.0,7917,6,5,1999,2000,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
1456,20,85.0,13175,6,6,1978,1988,119.0,790,163,...,0,0,0,1,0,0,0,0,1,0
1457,70,66.0,9042,7,9,1941,2006,0.0,275,0,...,0,0,0,1,0,0,0,0,1,0
1458,20,68.0,9717,5,6,1950,1996,0.0,49,1029,...,0,0,0,1,0,0,0,0,1,0


In [4]:
X_prepared = prepared_data.copy()
y_prepared = data[["SalePrice"]].copy()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_prepared, y_prepared, test_size=0.2)

Для начала, ради интереса, попробуем предсказать с помощью desision tree

In [5]:
from sklearn.tree import DecisionTreeRegressor
rg_tree = DecisionTreeRegressor(max_depth=15, min_samples_leaf=20)
rg_tree.fit(X_train, y_train)

y_predict = rg_tree.predict(X_test)

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_predict, y_test)

27094.99023248844

использовать 10-fold кросс-валидацию для оценки качества моделей

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

scores = cross_val_score(rg_tree, X_prepared, y_prepared, cv=10, scoring=make_scorer(mean_absolute_error))
np.mean(scores)

23850.6418852318

Построить случайный лес

In [7]:
from sklearn.ensemble import RandomForestRegressor

rg_rf = RandomForestRegressor(n_estimators=10, max_depth=5, min_samples_leaf=20, max_features=0.5, n_jobs=-1)
scores = cross_val_score(rg_rf, X_prepared, y_prepared.values.ravel(), cv=10, scoring=make_scorer(mean_absolute_error))
print(f"cross-val score: {np.mean(scores)}")
rg_rf.fit(X_train, y_train.values.ravel())  
y_predict = rg_rf.predict(X_test)
print(f"test score: {mean_absolute_error(y_predict, y_test)}")

rg_rf = RandomForestRegressor(n_estimators=10, n_jobs=-1)
scores = cross_val_score(rg_rf, X_prepared, y_prepared.values.ravel(), cv=10, scoring=make_scorer(mean_absolute_error))
print(f"cross-val score: {np.mean(scores)}")
rg_rf.fit(X_train, y_train.values.ravel())  
y_predict = rg_rf.predict(X_test)
print(f"test score: {mean_absolute_error(y_predict, y_test)}")

cross-val score: 21240.87605015826
test score: 22501.096798454073
cross-val score: 18806.587123287674
test score: 19060.919863013696


Ради интереса беггинг

In [8]:
from sklearn.ensemble import BaggingRegressor
bag_rf = BaggingRegressor(n_estimators=10, base_estimator=DecisionTreeRegressor(), n_jobs=-1) 
scores = cross_val_score(bag_rf, X_prepared, y_prepared.values.ravel(), cv=10, scoring=make_scorer(mean_absolute_error))
print(f"cross-val score: {np.mean(scores)}")
bag_rf = BaggingRegressor(n_estimators=10, base_estimator=DecisionTreeRegressor(), n_jobs=-1)
bag_rf.fit(X_train, y_train.values.ravel())  
y_predict = bag_rf.predict(X_test)
print(f"test score: {mean_absolute_error(y_predict, y_test)}")

cross-val score: 18636.846506849317
test score: 19573.15719178082


вывести важность признаков

In [9]:
rg_rf = RandomForestRegressor(n_estimators=10, n_jobs=-1)
rg_rf.fit(X_train, y_train.values.ravel())  
imp = pd.Series(rg_rf.feature_importances_)
df = pd.DataFrame(imp)
df = df.reset_index()
df['feat_names'] = df["index"].apply(lambda ind: X_train.columns[ind])
most_important_feats = df.sort_values(0, ascending=False).copy()
most_important_feats

Unnamed: 0,index,0,feat_names
3,3,0.594724,OverallQual
15,15,0.109939,GrLivArea
11,11,0.047029,TotalBsmtSF
12,12,0.033745,1stFlrSF
26,26,0.032423,GarageArea
...,...,...,...
191,191,0.000000,BsmtCond_Po
42,42,0.000000,Street_Pave
94,94,0.000000,Condition1_RRAe
55,55,0.000000,Utilities_NoSeWa


In [10]:
best_20_feats = list(most_important_feats.feat_names.values[:20])
best_20_feats

['OverallQual',
 'GrLivArea',
 'TotalBsmtSF',
 '1stFlrSF',
 'GarageArea',
 'BsmtFinSF1',
 'LotArea',
 'TotRmsAbvGrd',
 'YearBuilt',
 'YearRemodAdd',
 'FullBath',
 '2ndFlrSF',
 'WoodDeckSF',
 'BsmtUnfSF',
 'KitchenQual_Gd',
 'GarageYrBlt',
 'OverallCond',
 'HalfBath',
 'GarageCars',
 'LotFrontage']

Обучить стекинг как минимум 3х моделей, использовать хотя бы 1 линейную модель и 1 нелинейную

In [11]:
#Заново подготовим данные. То что годилось для деревьев решений не годится для других моделей.
#Потому по другому заполним пробелы и отскейлим.
data = pd.read_csv("house-prices/train.csv")

#Находим категориальные признаки
cat_feat = list(data.dtypes[data.dtypes == object].index)
#закодируем пропущенные значений строкой, факт пропущенного значения тоже может нести в себе информацию
data[cat_feat] = data[cat_feat].fillna('nan').copy()

#отфильтруем непрерывные признаки
num_feat = [f for f in data if f not in (cat_feat + ['Id', 'SalePrice'])]
#закодируем пропущенные значения средним значением

data[num_feat] = data[num_feat].fillna(data[num_feat].mean()).copy()

# Смотрим сколько у нас значений по каждому категориальному признаку
cat_nunique = data[cat_feat].nunique()
print("cat_nunique")
for cat, count in cat_nunique.items():
    if count > 20:
        print(f"{cat} : {count}")

cat_nunique
Neighborhood : 25


In [12]:
# скейлим числовые признаки
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(data[num_feat])
num_feat_scaled = pd.DataFrame(scaler.transform(data[num_feat]), columns=num_feat)

In [13]:
# Создаем дамми-переменные для категорий
dummy = pd.get_dummies(data[cat_feat], columns=cat_feat)

In [14]:
prepared_data = pd.concat([num_feat_scaled, dummy], axis=1)

#Оставляем только 20 самых важных фич
prepared_data = prepared_data[best_20_feats]

X_prepared = prepared_data
y_prepared = data[["SalePrice"]].values.ravel()
X_train, X_test, y_train, y_test = train_test_split(X_prepared, y_prepared, test_size=0.2)
prepared_data

Unnamed: 0,OverallQual,GrLivArea,TotalBsmtSF,1stFlrSF,GarageArea,BsmtFinSF1,LotArea,TotRmsAbvGrd,YearBuilt,YearRemodAdd,FullBath,2ndFlrSF,WoodDeckSF,BsmtUnfSF,KitchenQual_Gd,GarageYrBlt,OverallCond,HalfBath,GarageCars,LotFrontage
0,0.651479,0.370333,-0.459303,-0.793434,0.351000,0.575425,-0.207142,0.912210,1.050994,0.878668,0.789741,1.161852,-0.752176,-0.944591,1,1.021157,-0.517200,1.227585,0.311725,-0.229372
1,-0.071836,-0.482512,0.466465,0.257140,-0.060731,1.171992,-0.091886,-0.318683,0.156734,-0.429577,0.789741,-0.795163,1.626195,-0.641228,0,-0.104483,2.179628,-0.761621,0.311725,0.451936
2,0.651479,0.515013,-0.313369,-0.627826,0.631726,0.092907,0.073480,-0.318683,0.984752,0.830215,0.789741,1.189351,-0.752176,-0.301643,1,0.937776,-0.517200,1.227585,0.311725,-0.093110
3,0.651479,0.383659,-0.687324,-0.521734,0.790804,-0.499274,-0.096897,0.296763,-1.863632,-0.720298,-1.026041,0.937276,-0.752176,-0.061670,1,0.812705,-0.517200,-0.761621,1.650307,-0.456474
4,1.374795,1.299326,0.199680,-0.045611,1.698485,0.463568,0.375148,1.527656,0.951632,0.733308,0.789741,1.617877,0.780197,-0.174865,1,0.896086,-0.517200,1.227585,1.650307,0.633618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,-0.071836,0.250402,-0.238122,-0.542435,-0.060731,-0.973018,-0.260560,0.296763,0.918511,0.733308,0.789741,0.795198,-0.752176,0.873321,0,0.854395,-0.517200,1.227585,0.311725,-0.365633
1456,-0.071836,1.061367,1.104925,2.355701,0.126420,0.759659,0.266407,0.296763,0.222975,0.151865,0.789741,-0.795163,2.033231,0.049262,0,-0.021102,0.381743,-0.761621,0.311725,0.679039
1457,0.651479,1.569647,0.215641,0.065656,-1.033914,-0.369871,-0.147810,1.527656,-1.002492,1.024029,0.789741,1.844744,-0.752176,0.701265,1,-1.563645,3.078570,-0.761621,-1.026858,-0.183951
1458,-0.795151,-0.832788,0.046905,-0.218982,-1.090059,-0.865548,-0.080160,-0.934130,-0.704406,0.539493,-1.026041,-0.795163,2.168910,-1.284176,1,-1.188432,0.381743,-0.761621,-1.026858,-0.093110


In [15]:
# делаем стекинг
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neighbors import KNeighborsRegressor

def get_model_name(model):
    return type(model).__name__

def fit_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train);
    print(f"score {get_model_name(model)}: {mean_absolute_error(model.predict(X_test), y_test)}")
    return model


stacked_df = pd.DataFrame()
stacked_test_df = pd.DataFrame()

for model in (LinearRegression(), LogisticRegression(), KNeighborsRegressor()):
    
    fitted_model = fit_model(model, X_train, X_test, y_train, y_test)
    
    stacked_df[get_model_name(model)] = fitted_model.predict(X_train)
    stacked_test_df[get_model_name(model)] = fitted_model.predict(X_test)
    
stacked_df


score LinearRegression: 23647.452627528965




score LogisticRegression: 35404.91095890411
score KNeighborsRegressor: 21969.528767123287


Unnamed: 0,LinearRegression,LogisticRegression,KNeighborsRegressor
0,180799.654390,122000,133380.0
1,198526.030876,190000,213035.6
2,119570.489539,110000,141400.0
3,255539.971477,190000,234158.0
4,166519.054798,147000,143680.0
...,...,...,...
1163,164023.199725,148500,149800.0
1164,132988.795224,165000,169180.0
1165,243416.530692,151400,226780.0
1166,214091.094501,214000,200300.0


In [16]:
# обучаем на стекинге RandomForest
rf = RandomForestRegressor(n_estimators=10, max_depth=5, min_samples_leaf=20, max_features=0.5, n_jobs=-1)
rf.fit(stacked_df, y_train)

print(f"score {get_model_name(rf)}: {mean_absolute_error(rf.predict(stacked_test_df), y_test)}")

score RandomForestRegressor: 19943.765474978616


Реально видим, что score стекинга лучше чем score каждой из использующихся в отдельности моделей 