# Imports

In [36]:
%precision 4

import os
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

# Scikit-learn has another version of gradient boosting, but XGBoost has some technical advantages.
from xgboost import XGBRegressor

# Dataset

In [2]:
dataset_path = os.path.join(
    os.getcwd(), 'datasets', 'melbourne_housing', 'data.csv'
)

melbourne_data = pd.read_csv(dataset_path)

melbourne_data.describe()
# count -> mostra o número de itens não vazios

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [3]:
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [4]:
def score_dataset(X_train, X_test, y_train, y_test, model=None, train=False):
    """
    Calculates the mean absolute error between target and predicted values
    """
    if model is None:
        model = RandomForestRegressor(
            n_estimators=100,
            random_state=0
        )
        train = True
    
    if train:
        model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    return mean_absolute_error(y_test, predictions)

# Kaggle Course 1: Intro to Machine Learning

https://www.kaggle.com/learn/intro-to-machine-learning

## Handle missing data by droping them from the dataset

In [9]:
# dropna drops missing values (think of na as "not available")
melbourne_data = melbourne_data.dropna(axis=0)

## Selecting features

In [10]:
y = melbourne_data.Price

melboune_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melboune_features]
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [11]:
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


## Splitting data into train and test

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)

## Building model

### Decision Tree

In [91]:
decision_tree_model = DecisionTreeRegressor(
    random_state=1
)

# Training the model
decision_tree_model.fit(X_train, y_train.values)

predictions = decision_tree_model.predict(X_test)

dt_acc = accuracy_score(y_test.values, predictions)
dt_loss = mean_absolute_error(y_test.values, predictions)

### Random Trees

In [98]:
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(X_train, y_train)
melb_preds = forest_model.predict(X_test)

### Resultados

In [96]:
print("Decision Tree")
print(f"Error: {dt_loss}")

Decision Tree
Error: 266553.3637096774


In [99]:
print("Random Forest")
print("Error: ", mean_absolute_error(y_test, melb_preds))

Random Forest
Error:  201129.4307089094


# Kaggle Course 2: Intermidiate Machine Learning

## Missing Values

Machine learn models are not capable to handle missing values. Instead we need to deal with them by:

1. Droping Columns with missing values
2. Imputing missing values (Fill missing values)
3. Both imputing missing values and creating a new column to mark them as missing.
    * ![image.png](./resources/ImputingExtended.png)

In [4]:
# Getting the target data
y = melbourne_data.Price

# Getting the rest of the dataset as input
X = melbourne_data.drop(['Price'], axis=1)
# Keeping things simple by taking only numerical values
X = X.select_dtypes(exclude=['object'])

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [7]:
# Getting columns with missing values
# It returns 'col' if 'col' has any missing values in 'X_train'
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]

print(f"Columns with missing values: {cols_with_missing}")

Columns with missing values: ['Car', 'BuildingArea', 'YearBuilt']


### Method 1 -> Dropping columns with missing values

In [8]:
X_train_reduced = X_train.drop(cols_with_missing, axis=1)
X_test_reduced = X_test.drop(cols_with_missing, axis=1)

score_dropped = score_dataset(X_train_reduced, X_test_reduced, y_train, y_test)
print(f"Error with dropped columns: {score_dropped}")

Error with dropped columns: 183550.22137772635


### Method 2 -> Imputing missing values

* We will do this by putting in the mean value of the column. It's a simple way that have shown good results. It may not be the best way to impute missing values, but it's a good start.

In [10]:
imputer = SimpleImputer()
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

score_imputed = score_dataset(X_train_imputed, X_test_imputed, y_train, y_test)
print(f"Error with imputed columns: {score_imputed}")

Error with imputed columns: 178166.46269899711


### Method 3 -> Imputing missing values and creating a new column to mark them as missing

In [16]:
# Make a copy to avoid overwriting original data (when imputing)
X_train_plus = X_train.copy()
X_test_plus = X_test.copy()

# Creating new column to make whether or not there was missing data
for col in cols_with_missing:
    # X_*_plus[col].isnull() return an array os booleans indicating if each position is missing or not
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_test_plus[col + '_was_missing'] = X_test_plus[col].isnull()

# Imputing data
imputer = SimpleImputer()
X_train_plus_imputed = imputer.fit_transform(X_train_plus)
X_test_plus_imputed = imputer.transform(X_test_plus)

# Scoring model
score_plus_imputed = score_dataset(X_train_plus_imputed, X_test_plus_imputed, y_train, y_test)
print(f"Error with imputed columns: {score_plus_imputed}")

Error with imputed columns: 178927.503183954


## Categorical Data

* To be considered categorical data, a feature must be part of a fixed set of categories. For example, in a survey about cars, people will answare "Honda", "Ford" or "Dodge". In this case, the variable would be considered categorical data.

* Three approaches to deal with this kind of data:
1. Drop Categorical Variables
2. Ordinal Encoder (0, 1, 2, 3, ...)
3. One Hot Encoder [[0, 0, 1], [0, 1, 0], [1, 0, 0]]

In [19]:
y = melbourne_data.Price
X = melbourne_data.drop(['Price'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Dropping missing values as it's not the focus of this section
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]
X_train.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train.columns if X_train[cname].nunique() < 10 and 
                        X_train[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numerical_cols
X_train_filtred = X_train[my_cols].copy()
X_test_filtred = X_test[my_cols].copy()

In [28]:
X_train_filtred.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [29]:
# Get list of categorival columns
s = (X_train_filtred.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
['Type', 'Method', 'Regionname']


### Dropping Categorical Variables

In [31]:
X_train_dropped = X_train_filtred.drop(object_cols, axis=1)
X_test_dropped = X_test_filtred.drop(object_cols, axis=1)

score_dropped_cat = score_dataset(X_train_dropped, X_test_dropped, y_train, y_test)
print(f"After dropping categorical variables MAE: {score_dropped_cat}")

After dropping categorical variables MAE: 183550.22137772635


### Ordinal Encoder

In [34]:
ordinal_encoder = OrdinalEncoder()

X_train_encoded = X_train_filtred.copy()
X_test_encoded = X_test_filtred.copy()

X_train_encoded[object_cols] = ordinal_encoder.fit_transform(X_train_filtred[object_cols])
X_test_encoded[object_cols] = ordinal_encoder.transform(X_test_filtred[object_cols])

score_encoded = score_dataset(X_train_encoded, X_test_encoded, y_train, y_test)
print(f"MAE with Ordinal Encoder: {score_encoded}")

MAE with Ordinal Encoder: 175062.2967599411


### One Hot Encoder

In [45]:
# Handle_unknown -> avoid errors when the validation data contains classes that aren't represented in the training data
# sparse -> ensures that the encoded columns are returned as a numpy array (instead of a sparse matrix).
one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

one_hot_cols_train = pd.DataFrame(one_hot_encoder.fit_transform(X_train_filtred[object_cols]))
one_hot_cols_test = pd.DataFrame(one_hot_encoder.transform(X_test_filtred[object_cols]))

# One-hot encoding removed index; put it back
one_hot_cols_train.index = X_train_filtred.index
one_hot_cols_test.index = X_test_filtred.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train_filtred.drop(object_cols, axis=1)
num_X_test = X_test_filtred.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
one_hot_X_train = pd.concat([num_X_train, one_hot_cols_train], axis=1)
one_hot_X_test = pd.concat([num_X_test, one_hot_cols_test], axis=1)

# Ensure all columns have string type
one_hot_X_train.columns = one_hot_X_train.columns.astype(str)
one_hot_X_test.columns = one_hot_X_test.columns.astype(str)

score_onehot = score_dataset(one_hot_X_train, one_hot_X_test, y_train, y_test)
print(f"MAE with One Hot Encoder: {score_onehot}")


MAE with One Hot Encoder: 176703.63810751104


## Pipelines

* A way to organize preprocessing with modeling code organized.

In [11]:
X = melbourne_data.drop(labels=["Price"], axis=1)
y = melbourne_data.Price

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [12]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


### Step 1 - Define Preprocessing Step

In [13]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

### Step 2 - Define Model

In [14]:
model = RandomForestRegressor(
    n_estimators=100,
    random_state=0
)

### Step 3 - Create and Evaluate Pipeline

In [15]:
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ],
    verbose=True
)

pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_valid)

score = mean_absolute_error(y_valid, predictions)
print(f"MAE: {score}")

[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=   5.2s
MAE: 160679.18917034855


In [17]:
pipeline.steps

[('preprocessor',
  ColumnTransformer(transformers=[('num', SimpleImputer(strategy='constant'),
                                   ['Rooms', 'Distance', 'Postcode', 'Bedroom2',
                                    'Bathroom', 'Car', 'Landsize', 'BuildingArea',
                                    'YearBuilt', 'Lattitude', 'Longtitude',
                                    'Propertycount']),
                                  ('cat',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(strategy='most_frequent')),
                                                   ('onehot',
                                                    OneHotEncoder(handle_unknown='ignore',
                                                                  sparse_output=False))]),
                                   ['Type', 'Method', 'Regionname'])])),
 ('model', RandomForestRegressor(random_state=0))]

## Cross-Validation

* Runs da modeling process on different subsets of the data, in order to get multiple measures of the modely quality.
* This avoids the problem of taking noise into account.

### When to use cross-validation

* For small datasets, where extra computational burden isn't a big deal, you should run cross-validation.
* For larger datasets, a single validation set is sufficient.
* There's no simple threshold for what constitutes a large vs. small dataset. But if your model takes a couple minutes or less to run, it's probably worth switching to cross-validation.

In [20]:
# Select subset of predictors
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']

X = melbourne_data[cols_to_use]
y = melbourne_data.Price

pipeline = Pipeline(
    steps=[
        ('preprocessor', SimpleImputer()),
        ('model', RandomForestRegressor(n_estimators=50, random_state=0))
    ]
)

# Multiply by -1 since sklearn calculates *negative* MAE
num_gropus = 5
scores = -1 * cross_val_score(pipeline, X, y,
                              cv=num_gropus,
                              scoring='neg_mean_absolute_error')

print("MAE scores:\n", scores)

MAE scores:
 [301628.7893587  303164.4782723  287298.331666   236061.84754543
 260383.45111427]


In [21]:
print("Average MAE score (across experiments):")
print(scores.mean())

Average MAE score (across experiments):
277707.3795913405


#### GridSearchCV

* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
* https://en.wikipedia.org/wiki/Hyperparameter_optimization

In [33]:
def create_pipeline():
    return Pipeline(
        steps=[
            ('preprocessor', SimpleImputer()),
            ('model', RandomForestRegressor(n_estimators=50, random_state=0))
        ]
    )

param_grid = {
    "model__n_estimators": [100, 200, 300, 400, 500],
    "model__criterion": ["poisson"],
    "model__max_depth": [None, 10, 16, 32, 64, 128, 256],
    "preprocessor__strategy": ["mean"]
}

num_gropus = 3

grid_search = GridSearchCV(
    estimator=create_pipeline(),
    param_grid=param_grid,
    cv=num_gropus,
    scoring='neg_mean_absolute_error',
    verbose=2
)

grid_search.fit(X, y)
print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
for param in grid_search.best_params_:
    print(param, grid_search.best_params_[param])


Fitting 3 folds for each of 35 candidates, totalling 105 fits
[CV] END model__criterion=poisson, model__max_depth=None, model__n_estimators=100, preprocessor__strategy=mean; total time=   1.9s
[CV] END model__criterion=poisson, model__max_depth=None, model__n_estimators=100, preprocessor__strategy=mean; total time=   2.0s
[CV] END model__criterion=poisson, model__max_depth=None, model__n_estimators=100, preprocessor__strategy=mean; total time=   1.9s
[CV] END model__criterion=poisson, model__max_depth=None, model__n_estimators=200, preprocessor__strategy=mean; total time=   3.9s
[CV] END model__criterion=poisson, model__max_depth=None, model__n_estimators=200, preprocessor__strategy=mean; total time=   3.9s
[CV] END model__criterion=poisson, model__max_depth=None, model__n_estimators=200, preprocessor__strategy=mean; total time=   3.9s
[CV] END model__criterion=poisson, model__max_depth=None, model__n_estimators=300, preprocessor__strategy=mean; total time=   6.0s
[CV] END model__crite

## XGBoost Library - Extreme Gradient Boosting

* The most accurate modeling technique for structured data.

### Gradient Booting

* Gradient boosting is a method that goes through cycles to iteratively add models into an ensemble.
![image.png](./resources/gradientBoosting.png)

In [34]:
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']

X = melbourne_data[cols_to_use]
y = melbourne_data.Price

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [37]:
model = XGBRegressor()
model.fit(X_train, y_train)

In [38]:
predictions = model.predict(X_test)
score = mean_absolute_error(y_test, predictions)
print(f"MAE: {score}")

MAE: 239806.40413119018


### Parameter Tunning

* n_estimators: It is equal to the number of models that we include in the ensemble.
    * Too low a value causes underfitting
    * Too high a value causes overfitting

In [41]:
model = XGBRegressor(n_estimators=500)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
score = mean_absolute_error(y_test, predictions)
print(f"MAE: {score}")

MAE: 251012.18044631352


* early_stopping_rounds:  Early stopping causes the model to stop iterating when the validation score stops improving
    *  We stop after 5 straight rounds of deteriorating validation scores.

In [44]:
model = XGBRegressor(n_estimators=500, early_stopping_rounds=5)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

predictions = model.predict(X_test)
score = mean_absolute_error(y_test, predictions)
print(f"MAE: {score}")

MAE: 242594.4020825663


* learning_rate: 
    * Instead of getting predictions by simply adding up the predictions from each component model, we can multiply the predictions from each model by a small number

In [49]:
model = XGBRegressor(n_estimators=500, early_stopping_rounds=5, learning_rate=0.05)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

predictions = model.predict(X_test)
score = mean_absolute_error(y_test, predictions)
print(f"MAE: {score}")

MAE: 238429.98981038292


* n_jobs: 
    * On larger datasets where runtime is a consideration, you can use parallelism to build your models faster.

In [50]:
model = XGBRegressor(n_estimators=500, early_stopping_rounds=5, learning_rate=0.05, n_jobs=2)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

predictions = model.predict(X_test)
score = mean_absolute_error(y_test, predictions)
print(f"MAE: {score}")

MAE: 238429.98981038292


## Data Leakage

* Happens when your training data contains information about the target, but similar data will not be available when the model is used for prediction. This leads to high performance on the training set (and possibly even the validation data), but the model will perform poorly in production.

### Target Leakage

* Occurs when your predictors include data that will not be available at the time you make predictions.
* To prevent this type of data leakage, any variable updated (or created) after the target value is realized should be excluded.


### Train-Test Leakage

* A different type of leak occurs when you aren't careful to distinguish training data from validation data.
* 