In [247]:
# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Важная настройка для корректной настройки pipeline!
import sklearn
sklearn.set_config(transform_output="pandas")

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, OrdinalEncoder, TargetEncoder
from sklearn.model_selection import GridSearchCV, KFold

# for model learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

#models
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.svm import SVC
from catboost import CatBoostRegressor

# Metrics
from sklearn.metrics import root_mean_squared_log_error


# tunning hyperparamters model
import optuna

### Подгружаем датасет ###

In [248]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df = pd.read_csv('/home/zef/DS_Bootcamp/HousePrices/TempData/train.csv')

#Логарифмируем цену
df['SalePrice'] = df['SalePrice'].map(np.log)
#df.Utilities.value_counts()

In [249]:
#Подгружена заготовленная таблица параметров
PT = pd.read_csv('/home/zef/DS_Bootcamp/HousePrices/ParamTable.csv',index_col=[0])
PT.head(10)

Unnamed: 0,NaN_count,diff_vals,data_type,drop,NaN_imp_type,encoder,to_scale
Id,0,1460,int64,True,,,True
YearBuilt,0,112,int64,False,,,True
YearRemodAdd,0,61,int64,False,,,True
KitchenAbvGr,0,4,int64,False,,,True
BedroomAbvGr,0,8,int64,False,,,True
HalfBath,0,3,int64,False,,,True
FullBath,0,4,int64,False,,,True
OverallCond,0,9,int64,False,,,True
BsmtHalfBath,0,3,int64,False,,,True
GrLivArea,0,861,int64,False,,,True


In [250]:
#Сплит
X_train, X_valid, y_train, y_valid = train_test_split(df.drop('SalePrice', axis=1), df['SalePrice'], test_size=0.2, random_state=42)

In [251]:
#Первичный отбор признаков на заполнение: делим все где NaN это отдельный класс и где это действительно пропуск (увы, пока вручную)
Cat_With_NaNs = pd.DataFrame(data={'NaN_count': df.isna().sum(), 'Sum':df.count(), 'data_type':df.dtypes})
Cat_With_NaNs = Cat_With_NaNs[(Cat_With_NaNs['NaN_count'] != 0) & (Cat_With_NaNs['data_type'] == 'object')]
NaN_Is_A_Class = Cat_With_NaNs.index.to_list()
NaN_Is_Abcence = [NaN_Is_A_Class.pop(NaN_Is_A_Class.index('Electrical'))]

#результат
NaN_Is_A_Class, NaN_Is_Abcence

(['Alley',
  'MasVnrType',
  'BsmtQual',
  'BsmtCond',
  'BsmtExposure',
  'BsmtFinType1',
  'BsmtFinType2',
  'FireplaceQu',
  'GarageType',
  'GarageFinish',
  'GarageQual',
  'GarageCond',
  'PoolQC',
  'Fence',
  'MiscFeature'],
 ['Electrical'])

In [252]:
#Подставляем в таблицу
for col in NaN_Is_A_Class:
    PT.loc[col,'NaN_imp_type'] = 'constant'

for col in NaN_Is_Abcence:
    PT.loc[col,'NaN_imp_type'] = 'most_frequent'

PT

Unnamed: 0,NaN_count,diff_vals,data_type,drop,NaN_imp_type,encoder,to_scale
Id,0,1460,int64,True,,,True
YearBuilt,0,112,int64,False,,,True
YearRemodAdd,0,61,int64,False,,,True
KitchenAbvGr,0,4,int64,False,,,True
BedroomAbvGr,0,8,int64,False,,,True
HalfBath,0,3,int64,False,,,True
FullBath,0,4,int64,False,,,True
OverallCond,0,9,int64,False,,,True
BsmtHalfBath,0,3,int64,False,,,True
GrLivArea,0,861,int64,False,,,True


In [253]:
#Определяем тип энкодера - кого больше 2 - target/catboost/(можно и one hot), кого меньше - label
pp_cat = df.select_dtypes('object').nunique(0,dropna=False).to_frame().rename(columns={0:'Uniqie_amount'})
OrdEncCol = pp_cat[pp_cat['Uniqie_amount'] == 2].index.to_list()
TrgEncCol = pp_cat[pp_cat['Uniqie_amount'] > 2].index.to_list() 
OrdEncCol, TrgEncCol

(['Street', 'Utilities', 'CentralAir'],
 ['MSZoning',
  'Alley',
  'LotShape',
  'LandContour',
  'LotConfig',
  'LandSlope',
  'Neighborhood',
  'Condition1',
  'Condition2',
  'BldgType',
  'HouseStyle',
  'RoofStyle',
  'RoofMatl',
  'Exterior1st',
  'Exterior2nd',
  'MasVnrType',
  'ExterQual',
  'ExterCond',
  'Foundation',
  'BsmtQual',
  'BsmtCond',
  'BsmtExposure',
  'BsmtFinType1',
  'BsmtFinType2',
  'Heating',
  'HeatingQC',
  'Electrical',
  'KitchenQual',
  'Functional',
  'FireplaceQu',
  'GarageType',
  'GarageFinish',
  'GarageQual',
  'GarageCond',
  'PavedDrive',
  'PoolQC',
  'Fence',
  'MiscFeature',
  'SaleType',
  'SaleCondition'])

In [254]:
#Подставляем в таблицу
for col in OrdEncCol:
    PT.loc[col,'encoder'] = 'ordinal'

for col in TrgEncCol:
    PT.loc[col,'encoder'] = 'target'

In [255]:
# PT['to_scale'] = False

# for col in X_train.select_dtypes(exclude='object').columns:
#     if df[col].min() < -1 or df[col].max() > 1:
#         PT.loc[col,'to_scale'] = True

# PT.drop(['SalePrice'])
# PT.to_csv('ParamTable.csv')

In [256]:
#Собственно зачем создавалась таблица
PT = pd.read_csv('ParamTable.csv',index_col=[0])
PT

Unnamed: 0,NaN_count,diff_vals,data_type,drop,NaN_imp_type,encoder,to_scale
Id,0,1460,int64,True,,,True
YearBuilt,0,112,int64,False,,,True
YearRemodAdd,0,61,int64,False,,,True
KitchenAbvGr,0,4,int64,False,,,True
BedroomAbvGr,0,8,int64,False,,,True
HalfBath,0,3,int64,False,,,True
FullBath,0,4,int64,False,,,True
OverallCond,0,9,int64,False,,,True
BsmtHalfBath,0,3,int64,False,,,True
GrLivArea,0,861,int64,False,,,True


In [257]:
#Получаем колонки для дропа
drop_features = PT[PT['drop'] == True].index.to_numpy()

#Сразу отсечем их, чтобы они не попали в остальные препроцессоры
PT = PT[PT['drop'] != True]

#Заполнение категориальных НаНов
NaN_most_freq = PT[PT['NaN_imp_type'] == 'most_frequent'].index.to_numpy()
NaN_is_a_class = PT[PT['NaN_imp_type'] == 'constant'].index.to_numpy()

#Заполнение числовых НаНов
NaN_nums = PT[PT['NaN_imp_type'] == 'mean'].index.to_numpy()
NaN_nums_med = PT[PT['NaN_imp_type'] == 'median'].index.to_numpy()

#Энкодер
OrdEncCol = PT[PT['encoder'] == 'ordinal'].index.to_numpy()
TrgEncCol = PT[PT['encoder'] == 'target'].index.to_numpy()

OneHotCol = np.array([])

To_scale = PT[PT['to_scale'] == True].index.to_numpy()

drop_features, NaN_is_a_class, NaN_most_freq, OrdEncCol, TrgEncCol, NaN_nums, To_scale

(array(['Id', 'Utilities'], dtype=object),
 array(['BsmtFinType1', 'GarageQual', 'GarageCond', 'Alley', 'PoolQC',
        'Fence', 'MiscFeature', 'GarageFinish', 'GarageType',
        'FireplaceQu', 'BsmtExposure', 'BsmtCond', 'BsmtQual',
        'MasVnrType', 'BsmtFinType2'], dtype=object),
 array(['Electrical'], dtype=object),
 array(['Street', 'CentralAir'], dtype=object),
 array(['BsmtFinType1', 'Condition1', 'Neighborhood', 'GarageQual',
        'GarageCond', 'PavedDrive', 'LandSlope', 'LotConfig',
        'LandContour', 'LotShape', 'Alley', 'PoolQC', 'Fence',
        'MiscFeature', 'MSZoning', 'SaleType', 'GarageFinish',
        'Condition2', 'GarageType', 'FireplaceQu', 'BsmtExposure',
        'BsmtCond', 'Heating', 'SaleCondition', 'Electrical', 'BsmtQual',
        'Foundation', 'ExterCond', 'ExterQual', 'MasVnrType',
        'Exterior2nd', 'Exterior1st', 'RoofMatl', 'RoofStyle',
        'KitchenQual', 'HouseStyle', 'Functional', 'BldgType',
        'BsmtFinType2', 'HeatingQC']

In [258]:
full_scale = np.concatenate([To_scale,TrgEncCol,NaN_nums_med])
full_scale = np.unique(full_scale)

In [259]:

num_imputer = ColumnTransformer(
    transformers = [
        ('num_imputer_mean', SimpleImputer(strategy='mean'), NaN_nums),
        ('num_imputer_median', SimpleImputer(strategy='median'), NaN_nums_med)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)    

cat_imputer = ColumnTransformer(
    transformers = [
        ('drop_features', 'drop', drop_features), #дроп id
        ('cat_imputer_mf', SimpleImputer(strategy='most_frequent'), NaN_most_freq),
        ('cat_imputer_no_class', SimpleImputer(strategy='constant',fill_value='No_Class'), NaN_is_a_class)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)    

cat_encoder = ColumnTransformer(
    transformers = [
        ('ordinal_encoding', OrdinalEncoder(dtype='int64'), OrdEncCol),
        ('target_encoding',TargetEncoder(target_type='continuous'),TrgEncCol),    
        #('onehot_encoding',OneHotEncoder(sparse_output=False),OneHotCol),    
        ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)    

cat_scaler = ColumnTransformer(
    [
        ('scaling_num_columns', StandardScaler(), full_scale)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)

LRG = LinearRegression()

ml_pipeline = Pipeline(
    [
        ('cat_imputer', cat_imputer),
        ('num_imputer', num_imputer),
        ('cat_encoder', cat_encoder),
        ('cat_scaler', cat_scaler),
        ('model', LRG)
    ]
)


ml_pipeline.fit(X_train, y_train)

print('train accuracy:', root_mean_squared_log_error(y_train, ml_pipeline.predict(X_train))) # Доля правильных ответов на выборке, которая была использована для обучения
print('valid accuracy:', root_mean_squared_log_error(y_valid, ml_pipeline.predict(X_valid))) # Доля правильных ответов на выборке, которую обученный алгоритм еще не видел

train accuracy: 0.009664301180235136
valid accuracy: 0.011049540484099212


In [260]:
cross_validation_result = cross_val_score(
    ml_pipeline, 
    df.drop('SalePrice', axis=1),
    df['SalePrice'],
    cv=KFold(n_splits=5, random_state=42, shuffle=True)
)

cross_validation_result

array([0.89510139, 0.55861241, 0.66452705, 0.90769074, 0.03438981])

In [261]:
# def objective(trial):
    
#     # Параметры самой модели
#     model_params = {
#         'max_depth' : trial.suggest_int('max_depth', 2, 10, 1),
#         'criterion' : trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
#     }
    
    
#     ml_pipeline.named_steps['model'].set_params(**model_params)
#     #model = ml_pipeline.set_params(**params)
    
    
    
#     # Можно добавить выбор encoder'а
#     encoder_type_label = trial.suggest_categorical('encoder_type', ['OneHotEnc', 'TargetEnc'])
#     encoder_mapping = {'OneHotEnc': OneHotEncoder(sparse_output=False), 'TargetEnc': TargetEncoder()}
    
    
#     ml_pipeline.named_steps['scaler_and_encoder'].set_params(
#         one_hot_encoding_features=encoder_mapping[encoder_type_label]
#     )
        
#     # Создание сплитов для кросс-валидации
#     cv = KFold(n_splits=5, random_state=666, shuffle=True)

#     # Вычисление метрик точности с использованием кросс-валидации
#     scores = cross_val_score(ml_pipeline, X, y, cv=cv)
    
#     accuracy = scores.mean()
    
#     return accuracy


# # Создание объекта для проведения оптимизации
# study = optuna.create_study(direction='maximize')


# # Запуск оптимизации
# study.optimize(objective, n_trials=100)


# # Вывод результатов
# best_params = study.best_params
# best_value = study.best_value


# print(f"Лучшие параметры: {best_params}")
# print(f"Лучшее значение: {best_value}")