In [52]:
# base
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Важная настройка для корректной настройки pipeline!
import sklearn
sklearn.set_config(transform_output="pandas")

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, OrdinalEncoder, TargetEncoder
from sklearn.model_selection import GridSearchCV, KFold

# for model learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

#models
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.svm import SVC
from catboost import CatBoostRegressor

# Metrics
from sklearn.metrics import root_mean_squared_log_error

import category_encoders as ce 
# tunning hyperparamters model
import optuna

### Подгружаем датасет ###

In [53]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df = pd.read_csv('/home/zef/DS_Bootcamp/HousePrices/TempData/train.csv')

#Логарифмируем цену
df['SalePrice'] = df['SalePrice'].map(np.log)
#df.Utilities.value_counts()

In [54]:
#Подгружена заготовленная таблица параметров
PT = pd.read_csv('/home/zef/DS_Bootcamp/HousePrices/ParamTable.csv',index_col=[0])
PT.head(10)

Unnamed: 0,NaN_count,diff_vals,data_type,drop,NaN_imp_type,encoder,to_scale
Id,0,1460,int64,True,median,,True
YearBuilt,0,112,int64,False,median,,True
YearRemodAdd,0,61,int64,False,median,,True
KitchenAbvGr,0,4,int64,False,median,,True
BedroomAbvGr,0,8,int64,False,median,,True
HalfBath,0,3,int64,False,median,,True
FullBath,0,4,int64,False,median,,True
OverallCond,0,9,int64,False,median,,True
BsmtHalfBath,0,3,int64,False,median,,True
GrLivArea,0,861,int64,False,median,,True


In [55]:
#Сплит
X_train, X_valid, y_train, y_valid = train_test_split(df.drop('SalePrice', axis=1), df['SalePrice'], test_size=0.2, random_state=42)

In [56]:
#Первичный отбор признаков на заполнение: делим все где NaN это отдельный класс и где это действительно пропуск (увы, пока вручную)
Cat_With_NaNs = pd.DataFrame(data={'NaN_count': df.isna().sum(), 'Sum':df.count(), 'data_type':df.dtypes})
Cat_With_NaNs = Cat_With_NaNs[(Cat_With_NaNs['NaN_count'] != 0) & (Cat_With_NaNs['data_type'] == 'object')]
NaN_Is_A_Class = Cat_With_NaNs.index.to_list()
NaN_Is_Abcence = [NaN_Is_A_Class.pop(NaN_Is_A_Class.index('Electrical'))]

#результат
NaN_Is_A_Class, NaN_Is_Abcence

(['Alley',
  'MasVnrType',
  'BsmtQual',
  'BsmtCond',
  'BsmtExposure',
  'BsmtFinType1',
  'BsmtFinType2',
  'FireplaceQu',
  'GarageType',
  'GarageFinish',
  'GarageQual',
  'GarageCond',
  'PoolQC',
  'Fence',
  'MiscFeature'],
 ['Electrical'])

In [57]:
#Подставляем в таблицу
for col in NaN_Is_A_Class:
    PT.loc[col,'NaN_imp_type'] = 'constant'

for col in NaN_Is_Abcence:
    PT.loc[col,'NaN_imp_type'] = 'most_frequent'

PT

Unnamed: 0,NaN_count,diff_vals,data_type,drop,NaN_imp_type,encoder,to_scale
Id,0,1460,int64,True,median,,True
YearBuilt,0,112,int64,False,median,,True
YearRemodAdd,0,61,int64,False,median,,True
KitchenAbvGr,0,4,int64,False,median,,True
BedroomAbvGr,0,8,int64,False,median,,True
HalfBath,0,3,int64,False,median,,True
FullBath,0,4,int64,False,median,,True
OverallCond,0,9,int64,False,median,,True
BsmtHalfBath,0,3,int64,False,median,,True
GrLivArea,0,861,int64,False,median,,True


In [58]:
#Определяем тип энкодера - кого больше 2 - target/catboost/(можно и one hot), кого меньше - label
pp_cat = df.select_dtypes('object').nunique(0,dropna=False).to_frame().rename(columns={0:'Uniqie_amount'})
OrdEncCol = pp_cat[pp_cat['Uniqie_amount'] == 2].index.to_list()
TrgEncCol = pp_cat[pp_cat['Uniqie_amount'] > 2].index.to_list() 
OrdEncCol, TrgEncCol

(['Street', 'Utilities', 'CentralAir'],
 ['MSZoning',
  'Alley',
  'LotShape',
  'LandContour',
  'LotConfig',
  'LandSlope',
  'Neighborhood',
  'Condition1',
  'Condition2',
  'BldgType',
  'HouseStyle',
  'RoofStyle',
  'RoofMatl',
  'Exterior1st',
  'Exterior2nd',
  'MasVnrType',
  'ExterQual',
  'ExterCond',
  'Foundation',
  'BsmtQual',
  'BsmtCond',
  'BsmtExposure',
  'BsmtFinType1',
  'BsmtFinType2',
  'Heating',
  'HeatingQC',
  'Electrical',
  'KitchenQual',
  'Functional',
  'FireplaceQu',
  'GarageType',
  'GarageFinish',
  'GarageQual',
  'GarageCond',
  'PavedDrive',
  'PoolQC',
  'Fence',
  'MiscFeature',
  'SaleType',
  'SaleCondition'])

In [59]:
#Подставляем в таблицу
for col in OrdEncCol:
    PT.loc[col,'encoder'] = 'ordinal'

for col in TrgEncCol:
    PT.loc[col,'encoder'] = 'target'

In [60]:
# PT['to_scale'] = False

# for col in X_train.select_dtypes(exclude='object').columns:
#     if df[col].min() < -1 or df[col].max() > 1:
#         PT.loc[col,'to_scale'] = True

# PT.drop(['SalePrice'])
# PT.to_csv('ParamTable.csv')

In [61]:
#Собственно зачем создавалась таблица
PT = pd.read_csv('ParamTable.csv',index_col=[0])
PT

Unnamed: 0,NaN_count,diff_vals,data_type,drop,NaN_imp_type,encoder,to_scale
Id,0,1460,int64,True,median,,True
YearBuilt,0,112,int64,False,median,,True
YearRemodAdd,0,61,int64,False,median,,True
KitchenAbvGr,0,4,int64,False,median,,True
BedroomAbvGr,0,8,int64,False,median,,True
HalfBath,0,3,int64,False,median,,True
FullBath,0,4,int64,False,median,,True
OverallCond,0,9,int64,False,median,,True
BsmtHalfBath,0,3,int64,False,median,,True
GrLivArea,0,861,int64,False,median,,True


In [62]:

def change_nan_imp(df):
    if (df['NaN_imp_type'].isna()) & (df['data_type'] == 'int64'):
        return 'median'
    elif (df['NaN_imp_type'].isna()) & (df['data_type'] == 'int64'):
        return 'most_frequent'
    else:
        return np.nan
    
PT.loc['Id':'BsmtFinSF1','NaN_imp_type'] = PT.loc['Id':'BsmtFinSF1','NaN_imp_type'].fillna(value='median')
PT.loc['BsmtFinType1':'HeatingQC','NaN_imp_type'] = PT.loc['BsmtFinType1':'HeatingQC','NaN_imp_type'].fillna(value='most_frequent')
# PT.to_csv('/home/zef/DS_Bootcamp/HousePrices/ParamTable.csv')

In [63]:
#Получаем колонки для дропа
drop_features = PT[PT['drop'] == True].index.to_numpy()

#Сразу отсечем их, чтобы они не попали в остальные препроцессоры
PT = PT[PT['drop'] != True]

#Заполнение категориальных НаНов
NaN_most_freq = PT[PT['NaN_imp_type'] == 'most_frequent'].index.to_numpy()
NaN_is_a_class = PT[PT['NaN_imp_type'] == 'constant'].index.to_numpy()

#Заполнение числовых НаНов
NaN_nums = PT[PT['NaN_imp_type'] == 'mean'].index.to_numpy()
NaN_nums_med = PT[PT['NaN_imp_type'] == 'median'].index.to_numpy()

#Энкодер
OrdEncCol = PT[PT['encoder'] == 'ordinal'].index.to_numpy()
TrgEncCol = PT[PT['encoder'] == 'target'].index.to_numpy()

OneHotCol = np.array([])

To_scale = PT[PT['to_scale'] == True].index.to_numpy()

drop_features, NaN_is_a_class, NaN_most_freq, OrdEncCol, TrgEncCol, NaN_nums, NaN_nums_med, To_scale

(array(['Id', 'Utilities'], dtype=object),
 array(['BsmtFinType1', 'GarageQual', 'GarageCond', 'Alley', 'PoolQC',
        'Fence', 'MiscFeature', 'GarageFinish', 'GarageType',
        'FireplaceQu', 'BsmtExposure', 'BsmtCond', 'BsmtQual',
        'MasVnrType', 'BsmtFinType2'], dtype=object),
 array(['Condition1', 'Neighborhood', 'PavedDrive', 'LandSlope',
        'LotConfig', 'LandContour', 'LotShape', 'Street', 'MSZoning',
        'SaleType', 'Condition2', 'Heating', 'SaleCondition', 'CentralAir',
        'Electrical', 'Foundation', 'ExterCond', 'ExterQual',
        'Exterior2nd', 'Exterior1st', 'RoofMatl', 'RoofStyle',
        'KitchenQual', 'HouseStyle', 'Functional', 'BldgType', 'HeatingQC'],
       dtype=object),
 array(['Street', 'CentralAir'], dtype=object),
 array(['BsmtFinType1', 'Condition1', 'Neighborhood', 'GarageQual',
        'GarageCond', 'PavedDrive', 'LandSlope', 'LotConfig',
        'LandContour', 'LotShape', 'Alley', 'PoolQC', 'Fence',
        'MiscFeature', 'MSZonin

In [64]:
full_scale = np.concatenate([To_scale,TrgEncCol,NaN_nums_med])
full_scale = np.unique(full_scale)

In [65]:

num_imputer = ColumnTransformer(
    transformers = [
        ('num_imputer_mean', SimpleImputer(strategy='mean'), NaN_nums),
        ('num_imputer_median', SimpleImputer(strategy='median'), NaN_nums_med)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)    

cat_imputer = ColumnTransformer(
    transformers = [
        ('drop_features', 'drop', drop_features), #дроп id
        ('cat_imputer_mf', SimpleImputer(strategy='most_frequent'), NaN_most_freq),
        ('cat_imputer_no_class', SimpleImputer(strategy='constant',fill_value='No_Class'), NaN_is_a_class)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)    

cat_encoder = ColumnTransformer(
    transformers = [
        ('ordinal_encoding', OrdinalEncoder(dtype='int64'), OrdEncCol),
        ('target_encoding',ce.CatBoostEncoder(),TrgEncCol),    
        #('onehot_encoding',OneHotEncoder(sparse_output=False),OneHotCol),    
        ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)    

cat_scaler = ColumnTransformer(
    [
        ('scaling_num_columns', StandardScaler(), full_scale)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)

LRG = LinearRegression()
KNNR = KNeighborsRegressor(n_neighbors=5,algorithm='auto',p=2,leaf_size=30)
CBR = CatBoostRegressor(iterations=2,learning_rate=1,depth=3)

ml_pipeline = Pipeline(
    [
        ('cat_imputer', cat_imputer),
        ('num_imputer', num_imputer),
        ('cat_encoder', cat_encoder),
        ('cat_scaler', cat_scaler),
        ('model', LRG)
    ]
)


ml_pipeline.fit(X_train, y_train)

# pickle.dump(ml_pipeline, open('pipl.pkl', 'wb'))

print('train accuracy:', root_mean_squared_log_error(y_train, ml_pipeline.predict(X_train))) # Доля правильных ответов на выборке, которая была использована для обучения
print('valid accuracy:', root_mean_squared_log_error(y_valid, ml_pipeline.predict(X_valid))) # Доля правильных ответов на выборке, которую обученный алгоритм еще не видел

train accuracy: 0.009733645098155897
valid accuracy: 0.010533878148831386


In [66]:
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

((1168, 80), (1168,), (292, 80), (292,))

In [67]:
def objective(trial):
    
    # # Параметры самой модели
    # model_params = {
    #     'n_neighbors' : trial.suggest_int('n_neighbors', 3, 15, 1),
    #     'algorithm' : trial.suggest_categorical('algorithm', ['ball_tree', 'kd_tree', 'brute']),
    #     'p' : trial.suggest_int('p', 1, 2, 1)
    # }

    # Параметры самой модели
    model_params = {
        'iterations' : trial.suggest_int('iterations', 2, 15, step=1),
        'depth' : trial.suggest_int('depth', 2,5,step=1),
        'learning_rate' : trial.suggest_float('learning_rate', 0.1, 1, step=0.1)
    }
    
    
    ml_pipeline.named_steps['model'].set_params(**model_params)
    #model = ml_pipeline.set_params(**params)
    ml_pipeline.fit(X_train,y_train)
    y_pred = ml_pipeline.predict(X_valid)
    
    RMSLE = root_mean_squared_log_error(y_valid, y_pred)
    
    return RMSLE


# Создание объекта для проведения оптимизации
study = optuna.create_study(direction='minimize')


# Запуск оптимизации
study.optimize(objective, n_trials=100)


# Вывод результатов
best_params = study.best_params
best_value = study.best_value


print(f"Лучшие параметры: {best_params}")
print(f"Лучшее значение: {best_value}")

[I 2024-06-07 15:04:04,690] A new study created in memory with name: no-name-c98597ee-3ee2-47e4-ae89-77082c87d2dc
[W 2024-06-07 15:04:04,693] Trial 0 failed with parameters: {'iterations': 12, 'depth': 4, 'learning_rate': 0.9} because of the following error: ValueError("Invalid parameter 'iterations' for estimator LinearRegression(). Valid parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive'].").
Traceback (most recent call last):
  File "/home/zef/miniforge3/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_24300/2746881525.py", line 18, in objective
    ml_pipeline.named_steps['model'].set_params(**model_params)
  File "/home/zef/miniforge3/lib/python3.10/site-packages/sklearn/base.py", line 279, in set_params
    raise ValueError(
ValueError: Invalid parameter 'iterations' for estimator LinearRegression(). Valid parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive'].
[W 202

ValueError: Invalid parameter 'iterations' for estimator LinearRegression(). Valid parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive'].

In [None]:
# cross_validation_result = cross_val_score(
#     ml_pipeline, 
#     df.drop('SalePrice', axis=1),
#     df['SalePrice'],
#     cv=KFold(n_splits=5, random_state=42, shuffle=True)
# )

# cross_validation_result