In [5]:
%run ../../common_utils.py

In [97]:
from catboost import CatBoostRegressor
import optuna
from optuna.samplers import TPESampler
from IPython.utils import io
import unidecode
from catboost import Pool

In [40]:
# !pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
Installing collected packages: unidecode
Successfully installed unidecode-1.3.2


In [3]:
random_state = 1
number_of_splits = 5

In [177]:
def load_data(val_data=False,path=None):
    print(path)
    train, test, metadata = load_all_data(path=path)
    nonCategorical, categorical = get_cat_and_non_cat_data(metadata)
    all_features = list(train.columns)
    all_features.remove('price')
    numerical_features = ['area_total','area_kitchen','area_living','floor','rooms','ceiling',
        'bathrooms_shared','bathrooms_private','balconies','loggias','phones','building_id','constructed','stories']
#     categorical_to_numerical(train, ['street','address'])
#     categorical_to_numerical(test, ['street','address'])
    if not val_data:
        X_train, y_train, test_labels = pre_process_numerical(features = all_features, numerical_features = numerical_features, train = train, test = test,
                            outliers_value=7, val_data=val_data, val_split=0.2, random_state=42, scaler="std",
                            add_R="True", add_rel_height="True", droptable=[],
                            one_hot_encode=False, cat_features=categorical, drop_old=True)
        y_train_log = np.log(y_train)
        return X_train, y_train, y_train_log, test_labels
    else:
        X_train, y_train, X_test, y_test, test_labels = pre_process_numerical(features = all_features, numerical_features = numerical_features, train = train, test = test,
                            outliers_value=7, val_data=val_data, val_split=0.2, random_state=42, scaler="std",
                            add_R="True", add_rel_height="True", droptable=[],
                            one_hot_encode=False, cat_features=categorical, drop_old=True)
        y_train_log = np.log(y_train)
        return X_train, y_train, y_train_log, X_test, y_test, test_labels
X_train, y_train, y_train_log, test_labels = load_data(path='../')

../
Std


In [178]:
selected_features_catboost=['building_id','area_kitchen', 'area_living',
       'rooms', 'ceiling', 'bathrooms_shared', 'bathrooms_private',
       'windows_court', 'windows_street', 'balconies', 'loggias', 'phones',
       'new', 'street', 'address', 'seller', 'layout', 'condition', 'district',
       'constructed', 'stories', 'elevator_without', 'elevator_passenger',
       'material', 'parking', 'heating', 'r', 'rel_height']

X_train = X_train[selected_features_catboost]

In [179]:
X_train['street']=X_train['street'].map(lambda row: unidecode.unidecode(row))
X_train['address']=X_train['address'].map(lambda row: unidecode.unidecode(row))

In [185]:
def objective(trial):
    
    param = {
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 1e-1),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-2, 1e0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 10),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),  
    }
    # Conditional Hyper-Parameters
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    scores = []
        
    cv = GroupKFold(n_splits=number_of_splits)
    groups = X_train["building_id"]

    for train_index, test_index in cv.split(X_train, y_train, groups):
        X_train2, X_test = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train2, y_test = y_train.iloc[train_index], y_train.iloc[test_index]
        X_train2.drop(['building_id'], axis=1,inplace=True)
        X_test.drop(['building_id'], axis=1,inplace=True)
        print("info")
        X_train2.info()
        print("cat",categorical_features_indices)
        model = CatBoostRegressor(**param, random_state=random_state,loss_function='RMSE', cat_features=categorical_features_indices)

        pool = Pool(X_train2.values, label=y_train2, cat_features=[0],feature_names=list(X_train2.columns))
        
        model.fit(
            pool,
            eval_set=[(X_test, y_test)],
            verbose=False,
            early_stopping_rounds=100,
        )
        prediction = np.exp(model.predict(X_test))
        score = root_mean_squared_log_error(prediction, np.exp(y_test))
        scores.append(score)
    return np.average(scores)

In [180]:
train, test, metadata = load_all_data(path='../')
nonCategorical, categorical = get_cat_and_non_cat_data(metadata)

categorical.append('street')
categorical.append('address')
X_train = X_train.astype({'street':'string','address':'string','seller':'string','layout':'string','condition':'string','district':'string','material':'string','parking':'string','heating':'string'})
X_train.drop(['area_kitchen', 'area_living', 'rooms', 'ceiling',
       'bathrooms_shared', 'bathrooms_private', 'windows_court', 'street',
       'windows_street', 'balconies', 'loggias', 'phones', 'new', 'address', 'layout', 'condition', 'district', 'constructed',
       'stories', 'elevator_without', 'elevator_passenger', 'material',
       'parking', 'heating', 'r', 'rel_height'],axis=1,inplace=True)

In [181]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23224 entries, 0 to 23284
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   building_id  23224 non-null  float64
 1   seller       23224 non-null  string 
dtypes: float64(1), string(1)
memory usage: 544.3 KB


In [186]:
categorical_features_indices = [0]

study = optuna.create_study(sampler=TPESampler(), direction="minimize")
study.optimize(objective, n_trials=10, timeout=600) # Run for 10 minutes
print("Number of completed trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial

print("\tBest Score: {}".format(trial.value))
print("\tBest Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2021-11-09 22:53:29,252][0m A new study created in memory with name: no-name-2141b0b5-68d8-4adb-b70a-65adad1fc45a[0m
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
[33m[W 2021-11-09 22:53:29,359][0m Trial 0 failed because of the following error: TypeError('Cannot convert StringArray to numpy.ndarray')[0m
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\optuna\study\_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-185-fe9510c53c5e>", line 35, in objective
    model.fit(
  File "C:\ProgramData\Anaconda3\lib\site-packages\catboost\core.py", line 5258, in fit
    return self._fit(X, y, cat_features, None, None, None, sample_weight, None, None, None, None, baseline,
  File "C:\ProgramData\Anaconda3\lib\site-packages\

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18579 entries, 0 to 23284
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   seller  18579 non-null  string
dtypes: string(1)
memory usage: 290.3 KB
info None
cat [0]


TypeError: Cannot convert StringArray to numpy.ndarray

In [113]:
categorical_features_indices

['seller',
 'layout',
 'condition',
 'district',
 'material',
 'parking',
 'heating',
 'street',
 'address']

In [136]:
X_train['layout']

0        1.0387486669036616
1        1.0387486669036616
2        1.0387486669036616
3        1.0387486669036616
4        1.0387486669036616
                ...        
23280                   1.0
23281    1.0387486669036616
23282                   2.0
23283    1.0387486669036616
23284    1.0387486669036616
Name: layout, Length: 23224, dtype: string

In [115]:
categorical_features_indices = np.where(X_train.dtypes != np.float)[0]


In [155]:
categorical

['seller',
 'layout',
 'condition',
 'district',
 'material',
 'parking',
 'heating',
 'street',
 'address']

In [156]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23224 entries, 0 to 23284
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   building_id         23224 non-null  float64
 1   area_kitchen        23224 non-null  float64
 2   area_living         23224 non-null  float64
 3   rooms               23224 non-null  float64
 4   ceiling             23224 non-null  float64
 5   bathrooms_shared    23224 non-null  float64
 6   bathrooms_private   23224 non-null  float64
 7   windows_court       23224 non-null  float64
 8   windows_street      23224 non-null  float64
 9   balconies           23224 non-null  float64
 10  loggias             23224 non-null  float64
 11  phones              23224 non-null  float64
 12  new                 23224 non-null  float64
 13  street              23224 non-null  string 
 14  address             23224 non-null  string 
 15  seller              23224 non-null  string 
 16  layo