# Ensemble Learning
Group Challenge

***
by: Paul Bédier, Lukasz Pszenny, Lasse Schmidt

within: MS Data Sciences & Business Analytics

at: CentraleSupélec & ESSEC Business School
***

### 1. Import Packages

In [31]:
from importlib import reload
reload(prepData)

<module 'util.preprocess_data' from 'd:\\Dokumente\\2_Bildung\\2_MSc\\1_Classes\\Y2T2_Ensemble Learning\\4_challenge\\Ensemble-Learning-on-AirBnb-dataset\\util\\preprocess_data.py'>

In [1]:
# import own scripts
import util.preprocess_data as prepData

In [12]:
# parse & handle data
import csv
import numpy as np
import pandas as pd
from PIL import Image

# modeling tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


# visualization
from matplotlib import cm
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# set matplotlib and seaborn settings for nicer plots
%matplotlib inline

SMALL_SIZE = 6
MEDIUM_SIZE = 8
BIGGER_SIZE = 10

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

### 2. Retrieve Data

In [4]:
# Read data
airbnb = pd.read_csv('data/AB_NYC_2019.csv', index_col=0)

# perform train test split
airbnb_train, airbnb_test = train_test_split(airbnb, test_size = 0.2)

# validate what happened
print(airbnb.shape)
print(airbnb_train.shape)
print(airbnb_test.shape)

(48895, 15)
(39116, 15)
(9779, 15)


In [5]:
# which columns we want to drop
drop_cols = ["name", "host_id", "neighbourhood", "neighbourhood_group", "room_type", "last_review", "last_review_recency"]

# preprocess training data
airbnb_train, impute_threshs, encode_threshs = prepData.prep_pipeline(airbnb_train, drop_cols = drop_cols)

# preprocess test data (using same thresholds as for training data)
airbnb_test, _, _ = prepData.prep_pipeline(airbnb_test, drop_cols = drop_cols, impute_threshs = impute_threshs, encode_threshs = encode_threshs)

In [6]:
# split data into X & y
X_train, y_train = prepData.split_frame(airbnb_train)
X_test,  y_test  = prepData.split_frame(airbnb_test)

# validate
print(X_train.shape, y_train.shape)
print(X_test.shape,  y_test.shape)

(39116, 13) (39116,)
(9779, 13) (9779,)


In [7]:
# dataframe must contain no missing values!
X_train.isna().sum()

latitude                          0
longitude                         0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
last_review_recency_log_noise     0
room_type_enc                     0
distance_l1                       0
distance_l2                       0
l2_mean                           0
l2_sd                             0
dtype: int64

### 3. Start Modeling

In [11]:
# basic random forest model
clf = RandomForestRegressor(n_estimators = 100, criterion = "squared_error")
clf.fit(X_train, y_train)

# predict
y_pred = clf.predict(X_train)

# compute R^2 metric
r2_score(y_train, y_pred)

0.8714612201492138

In [27]:
# basic XGBoost
clf = XGBRegressor(n_estimators = 100, max_depth = 7, eta = 0.1, subsample = 0.7, colsample_bytree = 0.8)
clf.fit(X_train, y_train)

# predict
y_pred = clf.predict(X_train)

# compute R^2 metric
r2_score(y_train, y_pred)

0.7300918451244958