In [1]:
#!pip install category_encoders
#!pip install pandas
#!pip install scikit-learn

In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_percentage_error,
    mean_absolute_error)

from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import os
# Move the working directory up one level; the data paths used later in the
# notebook are written relative to it ("./ml/data/raw/...").
# NOTE(review): this is not idempotent — every re-run of this cell climbs one
# more directory, so restart the kernel before re-running it.
os.chdir("..")

In [3]:
def load_data(train_path: str, test_path: str) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Load the train and test splits from CSV files.

    Parameters
    ----------
    train_path : str
        Path to the CSV file holding the training split.
    test_path : str
        Path to the CSV file holding the test split.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(train, test)`` DataFrames, in that order, exactly as parsed by
        ``pd.read_csv`` with its default options.

    Raises
    ------
    Whatever ``pd.read_csv`` raises for a missing or malformed file; no
    errors are caught here.
    """
    # The return annotation is quoted so it is a real type hint (not a tuple
    # object) while staying importable on interpreters older than 3.9.
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    return train, test

# Read both raw splits; the paths are relative to the current working directory.
train, test = load_data(
    train_path='./ml/data/raw/train.csv',
    test_path='./ml/data/raw/test.csv',
)

In [4]:
# Column roles: the target, the categorical features, and every remaining
# column of the training frame, which is treated as a numerical feature.
target = "price"
categorical_cols = ["type", "sector"]
numerical_cols = [
    col for col in train.columns
    if col != target and col not in categorical_cols
]
feature_cols = categorical_cols + numerical_cols

# Rows whose target is zero or negative are counted and then removed from
# both splits. The two count lines share one label: the first is the train
# split, the second is the test split.
zero_train = train.loc[train[target] <= 0]
zero_test = test.loc[test[target] <= 0]
print(f"N° of {target} = 0 rows: {len(zero_train)}")
print(f"N° of {target} = 0 rows: {len(zero_test)}")
train = train.loc[train[target] > 0]
test = test.loc[test[target] > 0]

# Summary of the column groups (heading on one line, value on the next).
print("All columns", train.columns.tolist(), sep="\n")
print("\nCategorical columns", categorical_cols, sep="\n")
print("\nNumerical columns", numerical_cols, sep="\n")
print("\nFeature columns", feature_cols, sep="\n")
print("\nTarget column", target, sep="\n")

N° of price = 0 rows: 1
N° of price = 0 rows: 0
All columns
['type', 'sector', 'net_usable_area', 'net_area', 'n_rooms', 'n_bathroom', 'latitude', 'longitude', 'price']

Categorical columns
['type', 'sector']

Numerical columns
['net_usable_area', 'net_area', 'n_rooms', 'n_bathroom', 'latitude', 'longitude']

Feature columns
['type', 'sector', 'net_usable_area', 'net_area', 'n_rooms', 'n_bathroom', 'latitude', 'longitude']

Target column
price


In [12]:
# Per-column preprocessing: a target encoder for the categorical features and
# a standard scaler for the numerical ones.
categorical_transformer = TargetEncoder()
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', categorical_transformer, categorical_cols),
        ('numerical', numerical_transformer, numerical_cols),
    ]
)

# Preprocessing followed by a gradient-boosting regressor.
steps = [
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(
        learning_rate=0.01,
        n_estimators=300,
        max_depth=5,
        loss="absolute_error",
        random_state=42,  # fixed seed for reproducibility
    )),
]

pipeline = Pipeline(steps=steps)
pipeline.fit(train[feature_cols], train[target])

In [13]:
# Score both splits with the fitted pipeline (train first, then test).
train_predictions, test_predictions = (
    pipeline.predict(frame[feature_cols]) for frame in (train, test)
)

# Matching ground-truth target values for each split.
train_target, test_target = train[target].values, test[target].values

In [14]:
def print_metrics(predictions, target):
    """
    Print RMSE, MAPE and MAE for a set of predictions.

    Note the argument order: predictions first, true values second. Each
    scikit-learn metric is called with the true values as its first argument.
    """
    metrics = (
        ("RMSE: ", lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))),
        ("MAPE: ", mean_absolute_percentage_error),
        ("MAE : ", mean_absolute_error),
    )
    # Compute and print one metric at a time, in the order listed.
    for label, metric in metrics:
        print(label, metric(target, predictions))

In [15]:
# First block of three lines: train split. Second block: test split.
print_metrics(predictions=train_predictions, target=train_target)
print_metrics(predictions=test_predictions, target=test_target)

RMSE:  6239.017699985428
MAPE:  0.5413446761161309
MAE :  2653.257563607936
RMSE:  5677.217551084991
MAPE:  0.6738648299584683
MAE :  2605.246069880555


In [16]:
# Categorical-only variant: the ColumnTransformer lists just the categorical
# columns, and with scikit-learn's default remainder='drop' every numerical
# feature is discarded before it reaches the model.
# NOTE(review): presumably kept as a baseline to compare against the pipeline
# that also scales the numerical columns — confirm this is intentional.
# This cell rebinds `categorical_transformer`, `preprocessor`, `steps` and
# `pipeline`, so the previously fitted pipeline is no longer reachable.
categorical_transformer = TargetEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('categorical',
          categorical_transformer,
          categorical_cols)
    ])

steps = [
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(**{
        "learning_rate":0.01,
        "n_estimators":300,
        "max_depth":5,
        "loss":"absolute_error",
        # Same seed as the other pipeline, so repeated fits are reproducible
        # and the two models are compared under identical randomness.
        "random_state": 42
    }))
]

pipeline = Pipeline(steps)
pipeline.fit(train[feature_cols], train[target])

In [17]:
# Re-score both splits with whatever `pipeline` is currently bound to
# (train first, then test); this overwrites the earlier prediction variables.
train_predictions, test_predictions = (
    pipeline.predict(split[feature_cols]) for split in (train, test)
)

# Matching ground-truth target values for each split.
train_target, test_target = train[target].values, test[target].values

In [18]:
# First block of three lines: train split. Second block: test split.
print_metrics(predictions=train_predictions, target=train_target)
print_metrics(predictions=test_predictions, target=test_target)

RMSE:  10892.194410333657
MAPE:  0.6827900734302479
MAE :  5985.008402188206
RMSE:  10254.155686652393
MAPE:  0.7481788509239793
MAE :  5859.374796053153
