In [2]:
import matplotlib as mlp

mlp.rcParams['lines.linewidth'] = 5

mlp.rcParams['xtick.major.size'] = 20
mlp.rcParams['xtick.major.width'] = 5
mlp.rcParams['xtick.labelsize'] = 20
mlp.rcParams['xtick.color'] = '#FF5533'

mlp.rcParams['ytick.major.size'] = 20
mlp.rcParams['ytick.major.width'] = 5
mlp.rcParams['ytick.labelsize'] = 20
mlp.rcParams['ytick.color'] = '#FF5533'

mlp.rcParams['axes.labelsize'] = 20
mlp.rcParams['axes.titlesize'] = 20
mlp.rcParams['axes.titlecolor'] = '#00B050'
mlp.rcParams['axes.labelcolor'] = '#00B050'

In [787]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
import warnings

warnings.filterwarnings('ignore')
pd.options.display.max_columns = 500

In [746]:
def custom_cv(model, data=data, kf=kf):
    X = data.drop('SalePrice', axis=1)
    y = np.log1p(data['SalePrice'])
    
    
    cv_result = cross_validate(model, X, y,
                          scoring='neg_root_mean_squared_error',
                          cv=kf, return_train_score=True)

    cv_error_test = -np.mean(cv_result['test_score'])
    cv_error_train = -np.mean(cv_result['train_score'])

    print(f"RMSLE на Кросс-валидации на трейне: {cv_error_train}")
    print(f"RMSLE на Кросс-валидации на тесте: {cv_error_test}")
    
    return cv_error_test

In [747]:
def test_cv_new_data(data, new_col, col_drop=None):
    print('Текущая ошибка на кросс-валидации')
    custom_cv(pipe2, data)
    new_data = data.copy()
    new_data[col] = new_col
    if col_drop:
        new_data = new_data.drop(col_drop, axis=1)
    print("Ошибка после преобразования на кросс-валидации")
    custom_cv(pipe2, new_data)
    return

In [748]:
# Секретные функции для фильтрации признаков
def get_redundant_pairs(df):
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i + 1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop


def get_top_abs_correlations(df, n=5):
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

In [749]:
def correlation(df, threshold):
    col_corr = set() # все удаленные колонки
    corr_matrix = df.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (corr_matrix.iloc[i, j] > threshold) and (corr_matrix.columns[j] not in col_corr):
                colname = corr_matrix.columns[i]
                col_corr.add(colname)
                if colname in df.columns:
                    del df[colname] # удаляем из датафрейма коррелирующую
                    
    print('Dropped cols:', col_corr)
    
    return df   

In [750]:
def fully_weighted(data, data_test, pipe, col, alpha1=0.71, alpha2=0.71, info=True):
    q_low = data[col].quantile(0.25)
    q_high = data[col].quantile(0.75) 

    tentacle_len = q_high - q_low

    upper_tentacle = q_high + alpha1 * tentacle_len
    lower_tentacle = q_low - alpha2 * tentacle_len

    before = data.shape[0]
    if info:
        print('Соотношение выбросов с остальными данными на тесте')
        print(data[data[col] >= upper_tentacle].shape[0], before)
        print(data[data[col] <= lower_tentacle].shape[0], before)
        print()
    
    data_outs = data[(data[col] > lower_tentacle) & (data[col] < upper_tentacle)]

    data_only_outs = data[(data[col] <= lower_tentacle) | (data[col] >= upper_tentacle)]

    n = before
    
    if info:
        print('Процент выбросов:', data_only_outs.shape[0] / n)
        print()
        
        weighted_cv_error = (data_only_outs.shape[0] / n) * custom_cv(model=pipe, data=data_only_outs) + \
        (data_outs.shape[0] / n) * custom_cv(model=pipe, data=data_outs)

        print("Взвешенная ошибка двух моделей на кросс-валидации", weighted_cv_error)
    
    x_outs = data_outs.drop('SalePrice', axis=1) 
    y_outs = data_outs['SalePrice']
    
    x_only_outs = data_only_outs.drop('SalePrice', axis=1)    
    y_only_outs = data_only_outs['SalePrice']
    
    pipe_outs = pipe.fit(x_outs, y_outs)
    pipe_only_outs = pipe.fit(x_only_outs, y_only_outs)
    
    # Test part 
    
    q_low_t = data_test[col].quantile(0.25)
    q_high_t = data_test[col].quantile(0.75) 

    tentacle_len_t = q_high_t - q_low_t

    upper_tentacle_t = q_high_t + alpha1 * tentacle_len_t
    lower_tentacle_t = q_low_t - alpha2 * tentacle_len_t

    before = data_test.shape[0]
    
    if info:
        print()
        print('Соотношение выбросов с остальными данными на трейне')
        print(data_test[data_test[col] >= upper_tentacle_t].shape[0], before)
        print(data_test[data_test[col] <= lower_tentacle_t].shape[0], before)
        print()
    
    data_outs_test = data_test[(data_test[col] > lower_tentacle_t) & (data_test[col] < upper_tentacle_t)]
    data_outs_test_idx = data_outs_test.index
    
    data_only_outs_test = data_test[(data_test[col] <= lower_tentacle_t) | (data_test[col] >= upper_tentacle_t)]
    data_only_outs_test_idx = data_only_outs_test.index

    if info:
        print('Процент выбросов:', data_only_outs_test.shape[0] / before)
        
    pred_outs = pipe_outs.predict(data_outs_test)
    pred_only_outs = pipe_only_outs.predict(data_only_outs_test)
       
    one = pd.DataFrame(pred_outs).set_index(data_outs_test_idx)
    two = pd.DataFrame(pred_only_outs).set_index(data_only_outs_test_idx)

    preds = pd.concat([one, two], ignore_index=False).rename(columns={0: 'y'})

    print('Mean value:', preds.mean())
    
    return preds

In [824]:
data = pd.read_csv('house_prices_kaggle/train.csv')
data = data.set_index('Id')
data.columns = data.columns.astype(str)
target = data['SalePrice']

data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [825]:
data.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1379.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,567.240411,1057.429452,1162.626712,346.992466,5.844521,1515.463699,0.425342,0.057534,1.565068,0.382877,2.866438,1.046575,6.517808,0.613014,1978.506164,1.767123,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,441.866955,438.705324,386.587738,436.528436,48.623081,525.480383,0.518911,0.238753,0.550916,0.502885,0.815778,0.220338,1.625393,0.644666,24.689725,0.747315,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,223.0,795.75,882.0,0.0,0.0,1129.5,0.0,0.0,1.0,0.0,2.0,1.0,5.0,0.0,1961.0,1.0,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,477.5,991.5,1087.0,0.0,0.0,1464.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,1980.0,2.0,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,808.0,1298.25,1391.25,728.0,0.0,1776.75,1.0,0.0,2.0,1.0,3.0,1.0,7.0,1.0,2002.0,2.0,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,2336.0,6110.0,4692.0,2065.0,572.0,5642.0,3.0,2.0,3.0,2.0,8.0,3.0,14.0,3.0,2010.0,4.0,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [826]:
data.describe(include='object')

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
count,1460,1460,91,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,588,1460,1460,1460,1423,1423,1422,1423,1422,1460,1460,1460,1459,1460,1460,770,1379,1379,1379,1379,1460,7,281,54,1460,1460
unique,5,2,2,4,4,2,5,3,25,9,8,5,8,6,8,15,16,3,4,5,6,4,4,4,6,6,6,5,2,5,4,7,5,6,3,5,5,3,3,4,4,9,6
top,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,TA,TA,PConc,TA,TA,No,Unf,Unf,GasA,Ex,Y,SBrkr,TA,Typ,Gd,Attchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
freq,1151,1454,50,925,1311,1459,1052,1382,225,1260,1445,1220,726,1141,1434,515,504,445,906,1282,647,649,1311,953,430,1256,1428,741,1365,1334,735,1360,380,870,605,1311,1326,1340,3,157,49,1267,1198


In [827]:
# Посомтрим на колонки где нанов больше n% процентов
for col in data.columns:
    if data[col].isna().any() and (data[col].isna().value_counts().loc[True] / data.shape[0]) > 0.05:
        print('Nanfull col:', col)
        

Nanfull col: LotFrontage
Nanfull col: Alley
Nanfull col: MasVnrType
Nanfull col: FireplaceQu
Nanfull col: GarageType
Nanfull col: GarageYrBlt
Nanfull col: GarageFinish
Nanfull col: GarageQual
Nanfull col: GarageCond
Nanfull col: PoolQC
Nanfull col: Fence
Nanfull col: MiscFeature


In [828]:
# Функция для работы с данными
def data_wrangling(data):
    data = data.drop(['PoolQC', 'PoolArea'], axis=1)

    data['MiscFeature'] = data['MiscFeature'].fillna('No misc feature')
    data['Alley'] = data['Alley'].fillna('No alley access')
    data['Fence'] = data['Fence'].fillna('No fence')
    
    df_filtered = data[(data['MasVnrArea'] >= 288) & (data['MasVnrArea'] <= 344)]
    
    masvnr_type_mode = df_filtered['MasVnrType'].mode()

    if not masvnr_type_mode.empty:
        moda_value = masvnr_type_mode[0]

        data.loc[(data['MasVnrType'].isna()) & 
                     (data['MasVnrArea'] >= 288) & 
                     (data['MasVnrArea'] <= 344), 
                     'MasVnrType'] = moda_value

        data.loc[(data['MasVnrArea'] == 1.0) & 
                     (data['MasVnrType'].isna()), 
                     'MasVnrType'] = moda_value

    data['MasVnrType'] = data['MasVnrType'].fillna('No Masonry veneer type')
    
    data['MasVnrArea'] = data['MasVnrArea'].fillna(0.0)
    data['FireplaceQu'] = data['FireplaceQu'].fillna('No fireplace')
    
    # Creating a KNN imputer using selected variables
    imputer = KNNImputer(n_neighbors=5)

    # Selecting variables for imputation, including 'LotFrontage'
    features = ['LotFrontage', 'LotArea', 'OverallQual', 'GrLivArea', 'YearBuilt']

    # Applying the imputer
    df_imputed = pd.DataFrame(imputer.fit_transform(data[features]), columns=features)

    # Creating a flag indicating whether the value was imputed
    data['LotFrontage_Imputed_Flag'] = data['LotFrontage'].isnull().astype(int)

    # Replacing only the null values in the original column
    data['LotFrontage'] = data.apply(lambda row: df_imputed.loc[row.name, 'LotFrontage'] if pd.isnull(row['LotFrontage']) else row['LotFrontage'], axis=1)
    
    data['GarageYrBlt'] = data['GarageYrBlt'].fillna(-1)

    # Defining labels and bins
    bins = [-1, 1900, 1940, 1960, 1980, 2000, 2010, 2025]
    labels = ['Null', '1900-1940', '1941-1960', '1961-1980', '1981-2000', '2001-2010', '2011+']

    # Creating the bin column for df_train
    data['GarageYrBlt_Bin'] = pd.cut(data['GarageYrBlt'], bins=bins, labels=labels, right=False)
    data['GarageYrBlt_Bin'] = data['GarageYrBlt_Bin'].astype(object)

    data['GarageType'] = data['GarageType'].fillna('No garage type')

    data['GarageFinish'] = data['GarageFinish'].fillna('No garage finish')

    data['GarageQual'] = data['GarageQual'].fillna('No garage quality')

    data['GarageCond'] = data['GarageCond'].fillna('No garage condition')

    data['BsmtFinType1'] = data['BsmtFinType1'].fillna('No Basement')

    data['BsmtFinType2'] = data['BsmtFinType2'].fillna('No Basement')

    data['BsmtExposure'] = data['BsmtExposure'].fillna('No Basement')

    data['BsmtCond'] = data['BsmtCond'].fillna('No Basement')

    data['BsmtQual'] = data['BsmtQual'].fillna('No Basement')
    
    for col in ['Electrical', 'MSZoning', 'Functional', 'Utilities', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType']:
        data[col] = data[col].fillna(data[col].mode()[0])
    
    for col in ['BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1']:
        data[col] = data[col].fillna(0)
    
    return data

In [829]:
data = data_wrangling(data)

In [830]:
# Feature engineering (s.loc[(s['x2'] == 'GdPrv'), 'x2a'] = 300)

# WoodDeckSF: Площадь деревянной палубы в квадратных футах
# OpenPorchSF: Площадь открытого крыльца в квадратных футах
# EnclosedPorch: Площадь закрытого крыльца в квадратных футах
# 3SsnPorch: Площадь крыльца для трех сезонов в квадратных футах
# ScreenPorch: Площадь крыльца в квадратных футах
s = pd.DataFrame()
s['y'] = data['SalePrice']
s['x1'] = data['WoodDeckSF']
s['x2'] = data['EnclosedPorch']
s['x3'] = data['OpenPorchSF']
s['x5'] = data['ScreenPorch']
s['x'] = (2 * s['x1'] + 3.5 * s['x3'] - 2 * s['x2'] + 3 * s['x5'])
s.loc[:, s.dtypes!=object].corr()

data['PorchFeature'] = s['x']

In [831]:
numeric_columns = data.loc[:, data.dtypes!=object].columns
categorical_columns = data.loc[:, data.dtypes==object].columns

In [832]:
# Замена вещественных нанов средним
for col in numeric_columns:
    to_fill = data[col].mean()
    data[col] = data[col].fillna(to_fill)
    
data[numeric_columns].isna().any().any()

False

In [833]:
# Линейно зависимые признаки
print("Top Absolute Correlations")
print(get_top_abs_correlations(data[numeric_columns.drop('SalePrice')], 41))

Top Absolute Correlations
GarageCars     GarageArea      0.882475
GrLivArea      TotRmsAbvGrd    0.825489
TotalBsmtSF    1stFlrSF        0.819530
2ndFlrSF       GrLivArea       0.687501
BedroomAbvGr   TotRmsAbvGrd    0.676620
BsmtFinSF1     BsmtFullBath    0.649212
OpenPorchSF    PorchFeature    0.633346
GrLivArea      FullBath        0.630012
WoodDeckSF     PorchFeature    0.626003
2ndFlrSF       TotRmsAbvGrd    0.616423
               HalfBath        0.609707
OverallQual    GarageCars      0.600671
GarageYrBlt    GarageCars      0.597993
OverallQual    GrLivArea       0.593007
YearBuilt      YearRemodAdd    0.592855
OverallQual    YearBuilt       0.572323
1stFlrSF       GrLivArea       0.566024
OverallQual    GarageArea      0.562022
GarageYrBlt    GarageArea      0.560771
FullBath       TotRmsAbvGrd    0.554784
OverallQual    YearRemodAdd    0.550684
               FullBath        0.550600
YearBuilt      GarageCars      0.537850
OverallQual    TotalBsmtSF     0.537808
BsmtFinSF1    

In [834]:
# удалим колонки, где абс корреляция оказывается >= 0.83
 
new_df = correlation(data[numeric_columns.drop('SalePrice')], 0.88)
new_numeric_columns = new_df.loc[:,new_df.dtypes!=object].columns

Dropped cols: {'GarageArea'}


In [835]:
dropped_cols = ['GarageArea']

In [836]:
# Замена категориальных нанов на самые популярные
for col in categorical_columns:
    most_popular = data[col].value_counts().index[0]
    data[col] = data[col].fillna(most_popular)

data[categorical_columns].isna().any().any()

False

In [837]:
# Конкатим обработанный датасет
data = pd.concat((data[categorical_columns], data[new_numeric_columns]), axis=1)
data = pd.concat((data, target), axis=1)
data.head()

Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,Fence,MiscFeature,SaleType,SaleCondition,GarageYrBlt_Bin,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,MiscVal,MoSold,YrSold,LotFrontage_Imputed_Flag,PorchFeature,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1
1,RL,Pave,No alley access,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,No fireplace,Attchd,RFn,TA,TA,Y,No fence,No misc feature,WD,Normal,2001-2010,60,65.0,8450,7,5,2003,2003,196.0,706,0,150,856,856,854,0,1710,1,0,2,1,3,1,8,0,2003.0,2,0,61,0,0,0,0,2,2008,0,213.5,208500
2,RL,Pave,No alley access,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,Gable,CompShg,MetalSd,MetalSd,No Masonry veneer type,TA,TA,CBlock,Gd,TA,Gd,ALQ,Unf,GasA,Ex,Y,SBrkr,TA,Typ,TA,Attchd,RFn,TA,TA,Y,No fence,No misc feature,WD,Normal,1961-1980,20,80.0,9600,6,8,1976,1976,0.0,978,0,284,1262,1262,0,0,1262,0,1,2,0,3,1,6,1,1976.0,2,298,0,0,0,0,0,5,2007,0,596.0,181500
3,RL,Pave,No alley access,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Mn,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,No fence,No misc feature,WD,Normal,2001-2010,60,68.0,11250,7,5,2001,2002,162.0,486,0,434,920,920,866,0,1786,1,0,2,1,3,1,6,1,2001.0,2,0,42,0,0,0,0,9,2008,0,147.0,223500
4,RL,Pave,No alley access,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,Gable,CompShg,Wd Sdng,Wd Shng,No Masonry veneer type,TA,TA,BrkTil,TA,Gd,No,ALQ,Unf,GasA,Gd,Y,SBrkr,Gd,Typ,Gd,Detchd,Unf,TA,TA,Y,No fence,No misc feature,WD,Abnorml,1981-2000,70,60.0,9550,7,5,1915,1970,0.0,216,0,540,756,961,756,0,1717,1,0,1,0,3,1,7,1,1998.0,3,0,35,272,0,0,0,2,2006,0,-421.5,140000
5,RL,Pave,No alley access,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,No fence,No misc feature,WD,Normal,2001-2010,60,84.0,14260,8,5,2000,2000,350.0,655,0,490,1145,1145,1053,0,2198,1,0,2,1,4,1,9,1,2000.0,3,192,84,0,0,0,0,12,2008,0,678.0,250000


In [838]:
# Создаем траснформер с мин таргетом, ван-хот
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from category_encoders.one_hot import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Делим колонки на onehot и target
cols_ohe = [x for x in categorical_columns if data[x].nunique() < 5]
cols_mte = [x for x in categorical_columns if data[x].nunique() >= 5]

# возьмем индексы колонок
ohe_index = [list(data.drop('SalePrice', axis=1).columns).index(col) for col in cols_ohe]
mte_index = [list(data.drop('SalePrice', axis=1).columns).index(col) for col in cols_mte]
num_index = [list(data.drop('SalePrice', axis=1).columns).index(col) for col in new_numeric_columns]

# создаем кастомный трансформер

t = [('OneHotEncoder', OneHotEncoder(), ohe_index),
     ('MeanTargetEncoder', TargetEncoder(), mte_index),
     ('StandardScaler', StandardScaler(), num_index)]

col_transform = ColumnTransformer(transformers=t)

In [839]:
# Разбиваем выборку
from sklearn.model_selection import train_test_split

X = data.drop('SalePrice', axis=1)

y = target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=True, random_state=42)

In [840]:
from sklearn.model_selection import KFold, cross_validate

kf = KFold(n_splits=5, shuffle=True, random_state=42)

kf1 = KFold(n_splits=15, shuffle=True, random_state=42)

In [841]:
# !pip install xgboost

In [842]:
import xgboost as xgb

xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'max_depth': 2,
    'learning_rate': 0.12,
    'n_estimators': 1000,
    'gamma': 0.000001,
    'subsample': 0.7,
    'colsample_bytree': 0.9,
    'reg_alpha': 3
#     'reg_lambda': 100
}

xgb_model = xgb.XGBRegressor(**xgb_params)

pipe2 = Pipeline([
    ('trans', col_transform),
    ('boostin', xgb_model)
])

custom_cv(pipe2, data, kf)
# 0.1272
pipe2.fit(X_train, np.log1p(y_train))
preds = pipe2.predict(X_test)

print('RMSLE:', np.mean((preds - np.log1p(y_test)) ** 2) ** 0.5)

RMSLE на Кросс-валидации на трейне: 0.09922795919669229
RMSLE на Кросс-валидации на тесте: 0.12840840441850304
RMSLE: 0.13359002618238616


In [843]:
data_test = pd.read_csv('house_prices_kaggle/test.csv')
data_test_id = data_test['Id']
data_test = data_test.drop('Id', axis=1)
data_test.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,,,,0,1,2010,WD,Normal


In [844]:
data_test = data_wrangling(data_test)

In [845]:
for col in dropped_cols:
    if col in data_test.columns:
        data_test = data_test.drop(col, axis=1)

In [846]:
s = pd.DataFrame()
s['x1'] = data_test['WoodDeckSF']
s['x2'] = data_test['EnclosedPorch']
s['x3'] = data_test['OpenPorchSF']
s['x5'] = data_test['ScreenPorch']
s['x'] = (2 * s['x1'] + 3.5 * s['x3'] - 2 * s['x2'] + 3 * s['x5'])

data_test['PorchFeature'] = s['x']

In [847]:
# базовые модели для стэккинга
base_models = [
    ('ridge', Ridge(alpha=0.5)),
    ('lasso', Lasso(alpha=0.01)),
    ('elasticnet', ElasticNet(alpha=0.05, l1_ratio=0.3)),
    ('svr', SVR(kernel='rbf', C=0.5)),
    ('gboost', GradientBoostingRegressor(learning_rate=0.01, n_estimators=100)),
    ('xgboost', xgb.XGBRegressor(learning_rate=0.1, max_depth=3, n_estimators=100)),
    ('rf', RandomForestRegressor(max_depth=3, n_estimators=100))
]

# Defining the meta model for stacking
meta_model = xgb_model

# Creating the stacking model using the base models and the meta model
stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# Defining the RMSLE function for scoring
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

# Creating a scorer object for RMSLE
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

In [849]:
#  Scaling the features using col_transform

# X = col_transform.fit_transform(X, y)  
# data_test = col_transform.transform(data_test) 

# # Adjusting the scaled training and test data to avoid negative values
# offset = 1.0 - np.min(X)
# X = X + offset
# data_test = data_test + offset

# # Applying log1p transformation to the adjusted scaled features
# log_transformer = FunctionTransformer(func=np.log1p, validate=True)
# X_logt = log_transformer.fit_transform(X)
# data_test_logt = log_transformer.transform(data_test)

# Defining the parameter distribution for RandomizedSearchCV
param_dist = {
#     'final_estimator__learning_rate': [0.05],
#     'final_estimator__max_depth': [3],
    'ridge__alpha': [0.5],
    'lasso__alpha': [0.01],
    'elasticnet__alpha': [0.05],
    'elasticnet__l1_ratio': [0.3],
    'svr__C': [0.5],
    'gboost__learning_rate': [0.01],
    'gboost__n_estimators': [1000],
    'xgboost__learning_rate': [0.1],
    'xgboost__max_depth': [3],
    'rf__max_depth': [3],
    'rf__n_estimators': [1000]
}

# Setting up RandomizedSearchCV with the stacking model and parameter distribution
random_search = RandomizedSearchCV(estimator=stacking_model, param_distributions=param_dist, n_iter=1, cv=5, scoring=rmsle_scorer, n_jobs=-1, random_state=42)

# Fitting the RandomizedSearchCV
random_search.fit(X_logt, np.log1p(y))

# Retrieving the best model and parameters from the search
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Retrieving and printing the best RMSLE score
best_score = -random_search.best_score_
print(f"Best RMSLE from cross-validation: {best_score}")
print(f"Best parameters: {best_params}")

Best RMSLE from cross-validation: 0.009678961670358525
Best parameters: {'xgboost__max_depth': 3, 'xgboost__learning_rate': 0.1, 'svr__C': 0.5, 'ridge__alpha': 0.5, 'rf__n_estimators': 1000, 'rf__max_depth': 3, 'lasso__alpha': 0.01, 'gboost__n_estimators': 1000, 'gboost__learning_rate': 0.01, 'elasticnet__l1_ratio': 0.3, 'elasticnet__alpha': 0.05}


In [850]:
# Make predictions on the test set
predictions = best_model.predict(data_test_logt)
# Convert the predictions from Log1p back to the original scale,
# since Kaggle requires predicted values not to be in this normalization format.
predictions = np.expm1(predictions)

submission = pd.DataFrame({
    'Id': data_test_id,
    'SalePrice': predictions
}).set_index('Id')

submission.to_csv('house_prices_kaggle/submission.csv')