In [117]:
# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Important setting for the pipeline to work correctly: transformers return pandas DataFrames!
import sklearn
sklearn.set_config(transform_output="pandas")

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, OrdinalEncoder, TargetEncoder
from sklearn.model_selection import GridSearchCV, KFold

# for model learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

#models
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.svm import SVC
from catboost import CatBoostRegressor

# Metrics
from sklearn.metrics import *


# hyperparameter tuning
import optuna

In [118]:
# Display options: never truncate columns or rows when a frame is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Load the training data.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df = pd.read_csv('/home/zef/DS_Bootcamp/HousePrices/TempData/train.csv')

# Work with the target on a log scale. np.log is applied to the whole column
# at once instead of being called row by row through Series.map.
df['SalePrice'] = np.log(df['SalePrice'])

# Only the last expression of a cell is rendered, so the shape check goes last
# (in the middle of the cell it was silently discarded).
df.shape

In [119]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
X_train, X_valid, y_train, y_valid = train_test_split(
    df.drop(columns='SalePrice'),
    df['SalePrice'],
    test_size=0.2,
    random_state=42,
)

In [120]:
# Per-column summary (NaN count, non-null count, dtype), narrowed down to the
# object-typed columns that contain at least one NaN.
Cat_With_NaNs = pd.DataFrame(
    {
        'NaN_count': df.isna().sum(),
        'Sum': df.count(),
        'data_type': df.dtypes,
    }
).loc[lambda stats: (stats['NaN_count'] != 0) & (stats['data_type'] == 'object')]

# 'Electrical' is split off into its own list; every other column stays in
# NaN_Is_A_Class. The two lists are imputed differently further down
# (most-frequent value vs. a constant 'No_Class' label).
NaN_Is_A_Class = Cat_With_NaNs.index.to_list()
NaN_Is_A_Class.remove('Electrical')  # raises ValueError if the column is absent
NaN_Is_Abcence = ['Electrical']

NaN_Is_A_Class, NaN_Is_Abcence

(['Alley',
  'MasVnrType',
  'BsmtQual',
  'BsmtCond',
  'BsmtExposure',
  'BsmtFinType1',
  'BsmtFinType2',
  'FireplaceQu',
  'GarageType',
  'GarageFinish',
  'GarageQual',
  'GarageCond',
  'PoolQC',
  'Fence',
  'MiscFeature'],
 ['Electrical'])

In [121]:
# Number of distinct levels in every object-typed column; NaN counts as a level.
pp_cat = (
    df.select_dtypes('object')
    .nunique(axis=0, dropna=False)
    .to_frame(name='Uniqie_amount')
)

# Columns with exactly two levels go to the ordinal encoder,
# columns with more than two levels go to the target encoder.
OrdEncCol = pp_cat.index[pp_cat['Uniqie_amount'] == 2].to_list()
TrgEncCol = pp_cat.index[pp_cat['Uniqie_amount'] > 2].to_list()
OrdEncCol, TrgEncCol

(['Street', 'Utilities', 'CentralAir'],
 ['MSZoning',
  'Alley',
  'LotShape',
  'LandContour',
  'LotConfig',
  'LandSlope',
  'Neighborhood',
  'Condition1',
  'Condition2',
  'BldgType',
  'HouseStyle',
  'RoofStyle',
  'RoofMatl',
  'Exterior1st',
  'Exterior2nd',
  'MasVnrType',
  'ExterQual',
  'ExterCond',
  'Foundation',
  'BsmtQual',
  'BsmtCond',
  'BsmtExposure',
  'BsmtFinType1',
  'BsmtFinType2',
  'Heating',
  'HeatingQC',
  'Electrical',
  'KitchenQual',
  'Functional',
  'FireplaceQu',
  'GarageType',
  'GarageFinish',
  'GarageQual',
  'GarageCond',
  'PavedDrive',
  'PoolQC',
  'Fence',
  'MiscFeature',
  'SaleType',
  'SaleCondition'])

In [122]:
drop_features = ['Id']

# Step 1 - imputation. Drops the identifier column, fills 'Electrical' with its
# most frequent value, and replaces NaN in the remaining categorical columns
# with an explicit 'No_Class' label. All other columns pass through untouched.
imputer = ColumnTransformer(
    transformers=[
        ('drop_features', 'drop', drop_features),  # drop Id
        ('cat_imputer_mf', SimpleImputer(strategy='most_frequent'), NaN_Is_Abcence),
        ('cat_imputer_no_class', SimpleImputer(strategy='constant', fill_value='No_Class'), NaN_Is_A_Class),
    ],
    verbose_feature_names_out=False,
    remainder='passthrough',
)

# Step 2 - categorical encoding.
#  * Binary columns are ordinal-encoded. A category that was not seen during fit
#    is encoded as -1 instead of raising, so a level that only occurs in the
#    validation/test data cannot break transform().
#  * Multi-level columns are target-encoded. The encoder cross-fits with
#    shuffling inside fit_transform, so random_state is pinned to make the
#    encoded values reproducible between runs.
Categorial_NonNan_Encoder = ColumnTransformer(
    transformers=[
        (
            'ordinal_encoding',
            OrdinalEncoder(dtype='int64', handle_unknown='use_encoded_value', unknown_value=-1),
            OrdEncCol,
        ),
        (
            'target_encoding_columns',
            TargetEncoder(target_type='continuous', random_state=42),
            TrgEncCol,
        ),
    ],
    verbose_feature_names_out=False,
    remainder='passthrough',
)

# Step 3 - standardise the target-encoded columns only; everything else passes through.
Encoded_Cat_Scaler = ColumnTransformer(
    [
        ('scaling_num_columns', StandardScaler(), TrgEncCol),
    ],
    verbose_feature_names_out=False,
    remainder='passthrough',
)

# Quick preview of encoder + scaler.
# NOTE(review): this preview is fitted on the raw X_train, i.e. without the
# imputer step, so 'Id' is still present and NaNs reach the encoders directly.
NNE = Categorial_NonNan_Encoder.fit_transform(X_train, y_train)
NNE = Encoded_Cat_Scaler.fit_transform(NNE, y_train)
NNE.head(10)

Unnamed: 0,MSZoning,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,Street,Utilities,CentralAir,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
254,0.323815,0.151786,-0.793156,-0.110062,-0.233965,-0.38506,-0.587411,0.129609,-0.13111,0.227619,-0.248565,-0.47675,-0.202693,-1.007769,-1.032332,-0.722004,-0.705773,-0.635126,-0.754919,-0.820536,0.132973,-0.489806,-1.018225,0.152964,0.084751,-1.051488,0.263656,-0.780356,0.184792,-1.027356,0.53862,0.598424,0.252314,0.291811,0.280258,-0.096275,0.355535,0.028733,-0.23043,-0.124985,1,0,1,255,20,70.0,8400,5,6,1957,1957,0.0,922,0,392,1314,1314,0,0,1314,1,0,1,0,3,1,5,0,1957.0,1,294,250,0,0,0,0,0,0,6,2010
1066,0.359774,0.205287,1.079035,0.063419,-0.321166,-0.039721,0.491572,0.214399,0.146055,0.314176,1.161958,-0.468193,-0.018068,1.072921,1.111017,-0.743549,1.092943,0.310801,1.070271,0.611206,0.170927,-0.452255,-0.226311,0.198417,0.144418,-0.697768,0.300354,-0.809788,0.243735,0.736006,0.568254,0.612352,0.286239,0.296527,0.30403,-0.007217,0.455641,0.190739,-0.211813,-0.138519,1,0,1,1067,60,59.0,7837,6,7,1993,1994,0.0,0,0,799,799,799,772,0,1571,0,0,2,1,3,1,7,1,1993.0,2,380,0,40,0,0,0,0,0,5,2009
638,0.323815,0.151786,-0.793156,-0.110062,-0.233965,-0.38506,-1.128702,-2.442652,-0.13111,0.227619,-0.248565,-0.47675,-0.202693,-1.007769,-0.987523,-0.722004,-0.705773,0.18021,-0.754919,-1.398587,0.132973,-0.489806,-0.254726,0.152964,0.084751,-0.551071,-3.141656,-0.780356,0.184792,-1.027356,-2.584743,-2.395025,-3.732406,-3.872411,-1.693917,-0.096275,-1.738653,0.028733,-0.23043,-0.124985,1,0,1,639,30,67.0,8777,5,7,1910,1950,0.0,0,0,796,796,796,0,0,796,0,0,1,0,2,1,4,0,,0,0,328,0,164,0,0,0,0,5,2008
799,0.381549,0.18419,-0.760906,-0.073524,-0.51562,-0.328325,-0.473682,-2.689456,-0.092129,0.257969,-1.548132,-0.562583,-0.193116,-1.124124,-1.15707,0.778801,-0.679262,0.187404,-1.175782,0.582506,0.136889,-0.518885,-0.406653,0.182301,0.05823,0.95073,0.281473,0.697696,0.23843,0.729302,-1.066435,-0.791142,0.253559,0.289958,0.293039,-0.119029,-2.171598,0.061378,-0.272115,-0.21894,1,0,1,800,50,60.0,7200,5,7,1937,1950,252.0,569,0,162,731,981,787,0,1768,1,0,1,1,3,1,7,2,1939.0,1,240,0,0,264,0,0,0,0,6,2007
380,0.381549,-0.317267,-0.760906,-0.073524,-0.344188,-0.328325,-0.473682,0.170518,-0.092129,0.257969,-1.548132,-0.562583,-0.193116,0.873549,-1.15707,-0.785675,-0.679262,0.187404,-1.175782,-0.781628,0.136889,-0.518885,-1.104933,0.182301,0.05823,-1.041657,0.281473,0.697696,0.23843,0.982799,-1.066435,-0.791142,0.253559,0.289958,0.293039,-0.119029,0.388402,0.061378,-0.272115,-0.21894,1,0,1,381,50,50.0,5000,5,6,1924,1950,0.0,218,0,808,1026,1026,665,0,1691,0,0,2,0,3,1,6,1,1924.0,1,308,0,0,242,0,0,0,0,5,2010
303,0.43805,0.330139,-0.630135,0.107413,0.089017,0.274997,0.60938,0.304332,0.455559,0.414362,-0.152249,-0.368721,0.209,1.064145,1.133942,-0.705824,-0.678332,0.4079,1.147751,-0.802982,0.225625,-0.414196,-0.507105,0.333713,0.247138,-1.00831,0.345866,-0.764678,0.379195,-0.930309,0.637853,-0.820857,0.301397,0.329084,0.355725,0.126501,-2.62227,0.325724,-0.177995,-1.457126,1,0,1,304,20,70.0,9800,5,7,1972,1972,0.0,894,0,0,894,894,0,0,894,1,0,1,0,3,1,5,0,1975.0,2,552,256,0,0,0,0,0,0,7,2006
86,0.359774,0.205287,2.808217,0.063419,-0.321166,-0.039721,0.491572,0.214399,0.146055,0.314176,1.161958,-0.468193,-0.018068,1.072921,1.111017,-0.743549,1.092943,0.310801,1.070271,0.611206,0.170927,1.228647,-0.226311,0.198417,0.144418,0.975676,0.300354,0.751824,0.243735,1.037137,1.567027,1.175812,0.286239,0.296527,0.30403,-0.007217,0.455641,0.190739,-0.211813,-0.138519,1,0,1,87,60,122.0,11911,6,5,2005,2005,0.0,0,0,684,684,684,876,0,1560,0,0,2,1,3,1,6,1,2005.0,2,400,100,38,0,0,0,0,0,3,2009
1385,-2.075844,0.205287,-0.71053,0.063419,-0.321166,-0.039721,-1.703482,0.214399,0.146055,0.314176,-1.921156,-0.468193,-0.018068,1.072921,1.111017,-0.743549,-0.697702,-1.137083,-1.269143,-0.801879,0.170927,-0.452255,-0.860952,0.198417,0.144418,-0.697768,0.300354,0.751824,0.243735,-0.973053,-1.093635,-0.82598,0.286239,0.296527,-3.737969,-0.007217,-1.95529,0.190739,-0.211813,-0.138519,1,0,1,1386,50,40.0,5436,4,8,1922,2007,0.0,735,0,61,796,796,358,0,1154,1,0,1,0,3,1,7,0,1922.0,1,240,0,96,0,0,0,0,0,5,2010
265,0.318749,0.151996,1.098479,-0.01529,-0.292563,-0.371329,0.338529,0.164211,-0.136881,0.210726,-0.334757,-0.592699,-0.216002,-0.968706,-0.979683,0.801316,-0.698246,-0.936149,-0.696102,0.561811,0.138785,-0.47902,1.448908,-0.72078,0.070096,-1.059765,0.266676,0.718228,0.188622,0.792157,0.545961,1.155521,0.258354,0.285775,0.242197,-0.145255,0.696037,0.006343,-0.252341,-0.183107,1,0,1,266,20,78.0,12090,6,6,1981,1981,210.0,588,228,606,1422,1422,0,0,1422,0,0,2,0,3,1,7,1,1981.0,2,576,276,0,0,0,0,0,0,6,2008
793,0.323815,0.151786,-0.793156,-0.110062,-0.233965,-0.38506,0.838193,0.129609,-0.13111,0.227619,-0.248565,-0.47675,-0.202693,1.12393,1.144709,2.246693,1.058251,0.18021,1.069703,0.608771,0.132973,0.992724,-0.254726,0.152964,0.084751,0.948448,0.263656,0.734339,0.184792,-1.027356,0.53862,1.143005,0.252314,0.291811,0.280258,-0.096275,0.355535,0.028733,3.056844,2.962403,1,0,1,794,20,76.0,9158,8,5,2007,2007,140.0,0,0,1496,1496,1496,0,0,1496,0,0,2,0,3,1,7,0,2007.0,2,474,168,130,0,0,0,0,0,6,2007


In [123]:
# Chain the preprocessing stages: imputation -> categorical encoding -> scaling
# of the encoded columns. No estimator is attached yet, so the pipeline acts
# purely as a feature transformer.
ml_pipeline = Pipeline(
    steps=[
        ('imputer', imputer),
        ('encoder', Categorial_NonNan_Encoder),
        ('scaler', Encoded_Cat_Scaler),
    ]
)

# fit_transform is called in one go on purpose: for the target encoder it is
# not the same as fit() followed by transform().
temp = ml_pipeline.fit_transform(X_train, y_train)
temp.head(5)

Unnamed: 0,MSZoning,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,Street,Utilities,CentralAir,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
254,0.376628,0.245149,-0.649237,0.017046,-0.214104,-0.121943,-0.486875,0.242697,0.107658,0.280281,-0.144244,-0.46268,-0.013392,-0.933213,-0.898821,-0.740162,-0.697164,-0.541528,-0.69669,-0.825668,0.184002,-0.471082,-0.955042,0.240356,0.155776,-1.023923,0.327558,-0.787589,0.264953,-0.96137,0.591793,0.613649,0.318719,0.332387,0.329036,0.044046,0.501332,0.214027,-0.237285,-0.133609,1,0,1,20,70.0,8400,5,6,1957,1957,0.0,922,0,392,1314,1314,0,0,1314,1,0,1,0,3,1,5,0,1957.0,1,294,250,0,0,0,0,0,0,6,2010
1066,0.35681,0.128837,1.061507,-0.021825,-0.366545,-0.275263,0.460934,0.108499,-0.1719,0.195822,1.098237,-0.546109,-0.254757,1.039794,1.116341,-0.815487,1.060399,0.150859,1.063232,0.56529,0.104102,-0.478153,-0.271734,0.107674,0.022698,-0.776661,0.240112,-0.792648,0.209794,0.805332,0.597518,0.639167,0.235823,0.257411,0.25626,-0.188211,0.311362,-0.05228,-0.266001,-0.237588,1,0,1,60,59.0,7837,6,7,1993,1994,0.0,0,0,799,799,799,772,0,1571,0,0,2,1,3,1,7,1,1993.0,2,380,0,40,0,0,0,0,0,5,2009
638,0.358744,0.173693,-0.728217,-0.043982,-0.34497,-0.184219,-1.1078,-2.420559,-0.036765,0.283591,-0.197277,-0.614386,-0.177168,-1.125904,-1.087782,-0.744133,-0.696336,0.264648,-0.706224,-1.337734,0.128914,-0.435277,-0.231679,0.188903,0.102559,-0.66573,-2.663785,-0.769205,0.194041,-1.008941,-2.284749,-2.124876,-3.401441,-3.397711,-1.912519,-0.078859,-1.940416,0.055146,-0.238607,-0.171973,1,0,1,30,67.0,8777,5,7,1910,1950,0.0,0,0,796,796,796,0,0,796,0,0,1,0,2,1,4,0,,0,0,328,0,164,0,0,0,0,5,2008
799,0.35681,0.128837,-0.771769,-0.021825,-0.495256,-0.275263,-0.68178,-2.313203,-0.1719,0.195822,-1.756104,-0.546109,-0.254757,-1.11965,-1.154067,0.772351,-0.686755,0.150859,-1.334798,0.56529,0.104102,-0.478153,-0.478026,0.107674,0.022698,0.927775,0.240112,0.75026,0.209794,0.805332,-1.134225,-0.869938,0.235823,0.257411,0.25626,-0.188211,-1.91782,-0.05228,-0.266001,-0.237588,1,0,1,50,60.0,7200,5,7,1937,1950,252.0,569,0,162,731,981,787,0,1768,1,0,1,1,3,1,7,2,1939.0,1,240,0,0,264,0,0,0,0,6,2007
380,0.391284,-0.593509,-0.787016,0.066639,-0.181903,0.02869,-0.729475,0.302037,0.269555,0.390804,-1.606958,-0.320124,0.144637,0.829651,-1.066544,-0.717957,-0.688443,0.335012,-1.326971,-0.792669,0.240437,-0.470165,-0.744412,0.321007,0.239616,-1.123942,0.35419,0.762721,0.358756,1.047442,-1.102997,-0.821754,0.316517,0.328495,0.339319,0.094685,0.530125,0.336262,-0.183053,-0.109564,1,0,1,50,50.0,5000,5,6,1924,1950,0.0,218,0,808,1026,1026,665,0,1691,0,0,2,0,3,1,6,1,1924.0,1,308,0,0,242,0,0,0,0,5,2010


In [132]:
# Correlation of every transformed feature with the (log) SalePrice target,
# ordered by absolute strength. Nothing is plotted in this cell, so no figure
# is created - an unused plt.figure(...) only renders an empty canvas.
(
    pd.concat([temp, y_train], axis=1)
    .corr()['SalePrice']
    .sort_values(ascending=False, key=np.abs)
)

SalePrice        1.000000
OverallQual      0.807946
Neighborhood     0.732702
GrLivArea        0.689011
GarageCars       0.684451
KitchenQual      0.667634
BsmtQual         0.666445
ExterQual        0.656724
GarageArea       0.654520
GarageFinish     0.618243
TotalBsmtSF      0.597214
FullBath         0.583747
1stFlrSF         0.580947
YearBuilt        0.576725
GarageType       0.574519
YearRemodAdd     0.562152
FireplaceQu      0.545306
Foundation       0.532544
GarageYrBlt      0.530321
TotRmsAbvGrd     0.520082
HeatingQC        0.489835
Fireplaces       0.481761
BsmtFinType1     0.467507
MasVnrArea       0.422217
MasVnrType       0.415056
Exterior1st      0.410556
GarageCond       0.384596
Exterior2nd      0.374311
MSZoning         0.372734
BsmtExposure     0.367981
CentralAir       0.364677
GarageQual       0.363837
BsmtFinSF1       0.340878
WoodDeckSF       0.340702
LotFrontage      0.334227
SaleCondition    0.331122
HouseStyle       0.315553
SaleType         0.313685
2ndFlrSF    

<Figure size 4500x3000 with 0 Axes>