In [43]:
import xgboost as xgb


In [44]:
def root_mean_squared_log_error(y_true, y_pred):
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    assert (y_true >= 0).all() 
    assert (y_pred >= 0).all()
    log_error = np.log1p(y_pred) - np.log1p(y_true)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5

In [45]:
%run ../common_utils.py

In [54]:
train, test, metadata = load_all_data()

In [1]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(train[['street']])

In [59]:
train['address']

0        к2.5/2
1        к2.5/2
2        к2.5/2
3        к2.5/2
4        к2.5/2
          ...  
23280         3
23281         9
23282      93К2
23283        75
23284      58К2
Name: address, Length: 23285, dtype: object

In [47]:
nonCategorical, categorical = get_cat_and_non_cat_data(metadata)
categorical.remove('district')
all_features = list(train.columns)
all_features.remove('price')
numerical_features = ['area_total','area_kitchen','area_living','floor','rooms','ceiling',
    'bathrooms_shared','bathrooms_private','balconies','loggias','phones','building_id','constructed','stories']

In [48]:
%%capture --no-display
X_train, y_train, X_test, y_test, test_labels = pre_process_numerical(features = all_features, Numerical_features = numerical_features, train = train, test = test,
                    outliers_value=7, val_split=0.2, random_state=42, scaler="std",
                    add_R="True", add_rel_height="True", droptable=[],
                    one_hot_encode=True, cat_features=categorical, drop_old=True)

In [49]:
from sklearn.preprocessing import LabelEncoder

def categorical_to_numerical(data, features):
    le = LabelEncoder()
    for feature in features:
        data[feature] = le.fit_transform(data[feature])

In [50]:
categorical_to_numerical(X_train, ['street','address'])
categorical_to_numerical(X_test, ['street','address'])

In [51]:
X_train.columns

Index(['apartment_id', 'area_total', 'area_kitchen', 'area_living', 'floor',
       'rooms', 'ceiling', 'bathrooms_shared', 'bathrooms_private',
       'windows_court', 'windows_street', 'balconies', 'loggias', 'phones',
       'building_id', 'new', 'latitude', 'longitude', 'district', 'street',
       'address', 'constructed', 'stories', 'elevator_without',
       'elevator_passenger', 'elevator_service', 'garbage_chute', 'seller_1',
       'seller_2', 'seller_3', 'seller_4', 'seller_5', 'layout_1', 'layout_2',
       'layout_3', 'layout_4', 'condition_1', 'condition_2', 'condition_3',
       'condition_4', 'condition_5', 'material_1', 'material_2', 'material_3',
       'material_4', 'material_5', 'material_6', 'material_7', 'material_8',
       'parking_1', 'parking_2', 'parking_3', 'parking_4', 'heating_1',
       'heating_2', 'heating_3', 'heating_4', 'heating_5', 'r', 'theta',
       'rel_height'],
      dtype='object')

# NOTE: XGBoost is able to handle missing values!

In [50]:
# from sklearn import metrics

# Parameter Tuning
# model = xgb.XGBClassifier()
# param_dist = {"max_depth": [10,30,50],
#               "min_child_weight" : [1,3,6],
#               "n_estimators": [200],
#               "learning_rate": [0.05, 0.1,0.16],}
# grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3, 
#                                    verbose=10, n_jobs=-1)
# grid_search.fit(train, y_train)

# grid_search.best_estimator_

model = xgb.XGBRegressor(eta = 0.1, max_depth=10, min_child_weight=1,  eval_metric = 'rmsle', n_estimators=500,\
                          n_jobs=-1 , verbose=1,learning_rate=0.16, seed=42)
model.fit(X_train,y_train)

Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             eta=0.1, eval_metric='rmsle', gamma=0, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.16, max_delta_step=0, max_depth=10,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=-1, num_parallel_tree=1, predictor='auto',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=42, subsample=1, tree_method='exact', validate_parameters=1, ...)

In [53]:
xgb_prediction = model.predict(X_test)
xgb_rmsle = root_mean_squared_log_error(xgb_prediction, y_test)
print('XGBoost RMSLE:', xgb_rmsle)

XGBoost RMSLE: 0.16932079045682918


# XGBoost submission on all features

In [69]:
X = train.drop(['price'], axis=1)
y = train['price']
categorical_to_numerical(X, ['street','address'])
categorical_to_numerical(test, ['street','address'])

In [67]:
model.fit(X, y)

Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             eta=0.1, eval_metric='rmsle', gamma=0, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.16, max_delta_step=0, max_depth=10,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=-1, num_parallel_tree=1, predictor='auto',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=42, subsample=1, tree_method='exact', validate_parameters=1, ...)

In [70]:
xgb_prediction = model.predict(test)
df = pd.DataFrame(xgb_prediction, columns = ['price_prediction'])
test.rename(columns={'apartment_id' : 'id'}, inplace=True)
result = pd.concat([test['id'], df], axis=1)
result[['id','price_prediction']].to_csv('.\submissions\XGB1.0.csv', index=False)

In [71]:
X.columns

Index(['apartment_id', 'seller', 'area_total', 'area_kitchen', 'area_living',
       'floor', 'rooms', 'layout', 'ceiling', 'bathrooms_shared',
       'bathrooms_private', 'windows_court', 'windows_street', 'balconies',
       'loggias', 'condition', 'phones', 'building_id', 'new', 'latitude',
       'longitude', 'district', 'street', 'address', 'constructed', 'material',
       'stories', 'elevator_without', 'elevator_passenger', 'elevator_service',
       'parking', 'garbage_chute', 'heating'],
      dtype='object')

# Submission with one hot encoding

In [21]:
X_train, y_train, X_test, y_test, test_labels = pre_process_numerical(features = all_features, Numerical_features = numerical_features, train = train, test = test,
                    outliers_value=7, val_split=0.00000001, random_state=42, scaler="std",
                    add_R="True", add_rel_height="True", droptable=[],
                    one_hot_encode=True, cat_features=categorical, drop_old=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [29]:
categorical_to_numerical(X_train, ['street','address'])
categorical_to_numerical(test_labels, ['street','address'])

In [27]:
model = xgb.XGBRegressor(eta = 0.1, max_depth=10, min_child_weight=1,  eval_metric = 'rmsle', n_estimators=500,\
                          n_jobs=-1 , verbose=1,learning_rate=0.16, seed=42)
model.fit(X_train,y_train)

Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             eta=0.1, eval_metric='rmsle', gamma=0, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.16, max_delta_step=0, max_depth=10,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=-1, num_parallel_tree=1, predictor='auto',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=42, subsample=1, tree_method='exact', validate_parameters=1, ...)

In [30]:
xgb_prediction = model.predict(test_labels)
df = pd.DataFrame(xgb_prediction, columns = ['price_prediction'])
test.rename(columns={'apartment_id' : 'id'}, inplace=True)
result = pd.concat([test['id'], df], axis=1)
result[['id','price_prediction']].to_csv('.\submissions\XGB1.1.csv', index=False)

In [36]:
result = result.sort_values(by='id')

In [39]:
result[['id','price_prediction']].to_csv('.\submissions\XGB1.2.csv', index=False)

In [42]:
test_labels.columns

Index(['apartment_id', 'area_total', 'area_kitchen', 'area_living', 'floor',
       'rooms', 'ceiling', 'bathrooms_shared', 'bathrooms_private',
       'windows_court', 'windows_street', 'balconies', 'loggias', 'phones',
       'building_id', 'new', 'latitude', 'longitude', 'district', 'street',
       'address', 'constructed', 'stories', 'elevator_without',
       'elevator_passenger', 'elevator_service', 'garbage_chute', 'seller_1',
       'seller_2', 'seller_3', 'seller_4', 'seller_5', 'layout_1', 'layout_2',
       'layout_3', 'layout_4', 'condition_1', 'condition_2', 'condition_3',
       'condition_4', 'condition_5', 'material_1', 'material_2', 'material_3',
       'material_4', 'material_5', 'material_6', 'material_7', 'material_8',
       'parking_1', 'parking_2', 'parking_3', 'parking_4', 'heating_1',
       'heating_2', 'heating_3', 'heating_4', 'heating_5', 'r', 'theta',
       'rel_height'],
      dtype='object')

In [53]:
np.shape(xgb_prediction)

(9937,)