![](https://drive.google.com/uc?export=view&id=1rqx5wh7U-8OdKlr4L38QNGggEXSKyXHC)

---

<h1>5.3 Models</h1>

## Libraries

Importing all the libraries needed for the analysis, reading the file, and preparing the data for model training as per Final_Objective_5_2 (more details can be found in that file)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pickle
import warnings

In [9]:
warnings.filterwarnings('ignore')

In [10]:
df = pd.read_csv('listings_preprocessed.csv')

In [11]:
drop_columns = ['smoke_alarm','oven','patio_balcony','fire_extinguisher','shower_gel','bathroom_type_shared','price']
categorical_features = ['host_identity_verified','host_is_superhost','neighbourhood_cleansed','room_type','instant_bookable','bathroom_type','amenities']
numerical_features = ['host_listings_count','accommodates','beds','minimum_nights','maximum_nights','availability_365','number_of_reviews','reviews_per_month','bathroom']
amenities_features = ['smoke_alarm','patio_balcony','oven','fire_extiguinsher','shower_gel']

One-hot encoding of Categorical Features

In [12]:
df = pd.get_dummies(df)

Creating X and y dataframes

In [13]:
X = df.drop(drop_columns, axis=1)
y = df['price']

Train and Test dataset splitting

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Standardization and Scaling

In [15]:
sc = StandardScaler()

X_sc = sc.fit_transform(X_train[numerical_features])
X_sc_test = sc.transform(X_test[numerical_features])

X_train[numerical_features] = X_sc
X_test[numerical_features] = X_sc_test

PCA

In [16]:
pca = PCA().fit(X_train)
pca = PCA(0.9)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

## Models

### Lasso Regressor

All features

In [17]:
reg = reg = Lasso().fit(X_train, y_train)

y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("MAE is {}".format(mae))
print("MSE is {}".format(mse))
print("RMSE is {}".format(rmse))

MAE is 24.389867406952465
MSE is 1223.6986797098893
RMSE is 34.981404770390355


PCA Features

In [18]:
reg = reg = Lasso().fit(X_train_pca, y_train)

y_pred_pca = reg.predict(X_test_pca)

mae = mean_absolute_error(y_test, y_pred_pca)
mse = mean_squared_error(y_test, y_pred_pca)
rmse = mean_squared_error(y_test, y_pred_pca, squared=False)
print("MAE is {}".format(mae))
print("MSE is {}".format(mse))
print("RMSE is {}".format(rmse))

MAE is 25.114641310760344
MSE is 1266.5738046647475
RMSE is 35.58895621769129


### Support Vector Regressor (SVR)

All Features

In [21]:
svr = SVR(kernel = 'rbf')
svr.fit(X_train, y_train)

y_pred = svr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("MAE is {}".format(mae))
print("MSE is {}".format(mse))
print("RMSE is {}".format(rmse))  

MAE is 21.731394208170514
MSE is 1213.6077842218158
RMSE is 34.83687391574933


PCA Features

In [22]:
svr = SVR(kernel = 'rbf')
svr.fit(X_train_pca, y_train)

y_pred_pca = svr.predict(X_test_pca)

mae = mean_absolute_error(y_test, y_pred_pca)
mse = mean_squared_error(y_test, y_pred_pca)
rmse = mean_squared_error(y_test, y_pred_pca, squared=False)
print("MAE is {}".format(mae))
print("MSE is {}".format(mse))
print("RMSE is {}".format(rmse))  

MAE is 23.01729799529238
MSE is 1302.6075457917573
RMSE is 36.09165479431162


### Random Forest Regressor

All Features

In [19]:
rf = RandomForestRegressor(n_estimators=500, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("MAE is {}".format(mae))
print("MSE is {}".format(mse))
print("RMSE is {}".format(rmse))

MAE is 19.146785536216516
MSE is 856.8242122136011
RMSE is 29.271559784432416


PCA Features

In [20]:
rf_pca = RandomForestRegressor(n_estimators=500, random_state=42)
rf_pca.fit(X_train_pca, y_train)

y_pred_pca = rf_pca.predict(X_test_pca)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred_pca)
rmse = mean_squared_error(y_test, y_pred_pca, squared=False)
print("MAE is {}".format(mae))
print("MSE is {}".format(mse))
print("RMSE is {}".format(rmse))

MAE is 19.146785536216516
MSE is 1002.0270933848586
RMSE is 31.65481153608182


### XGBoost Regressor

Cross-Validation

In [26]:
param_grid = {'max_depth': [2, 5, 8], 'n_estimators': [100, 200, 300, 400]}
xgb = xgboost.XGBRegressor()
grid = GridSearchCV(xgb, param_grid, cv=3, n_jobs=3)
grid.fit(X_train, y_train)
grid.best_estimator_



XGBRegressor(max_depth=8)

All Features

In [27]:
xgb = xgboost.XGBRegressor(n_estimators=300, max_depth=8)
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("MAE is {}".format(mae))
print("MSE is {}".format(mse))
print("RMSE is {}".format(rmse))

MAE is 18.504515243940467
MSE is 817.8963414383003
RMSE is 28.598887066427956


PCA Features

In [28]:
xgb = xgboost.XGBRegressor(n_estimators=300, max_depth=8)
xgb.fit(X_train_pca, y_train)

y_pred_pca = xgb.predict(X_test_pca)

mae = mean_absolute_error(y_test, y_pred_pca)
mse = mean_squared_error(y_test, y_pred_pca)
rmse = mean_squared_error(y_test, y_pred_pca, squared=False)
print("MAE is {}".format(mae))
print("MSE is {}".format(mse))
print("RMSE is {}".format(rmse))

MAE is 21.566455250158338
MSE is 1022.8734512297178
RMSE is 31.982392831520876


Recursive Feature Elimination (RFE)

Selecting 10 features of the dataset the combination of which contributes the most to the prediction made by the model

In [29]:
xgb = xgboost.XGBRegressor(n_estimators=300, max_depth=8)
rfe = RFE(xgb, n_features_to_select=10)
X_rfe = rfe.fit(X_train, y_train)              
temp = pd.Series(rfe.support_,index = X.columns)
selected_features_rfe = temp[temp==True].index
print(selected_features_rfe)

y_pred = rfe.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("MAE is {}".format(mae))
print("MSE is {}".format(mse))
print("RMSE is {}".format(rmse))

Index(['bathroom', 'neighbourhood_cleansed_ΑΚΡΟΠΟΛΗ',
       'neighbourhood_cleansed_ΕΜΠΟΡΙΚΟ ΤΡΙΓΩΝΟ-ΠΛΑΚΑ',
       'neighbourhood_cleansed_ΖΑΠΠΕΙΟ', 'neighbourhood_cleansed_ΘΗΣΕΙΟ',
       'neighbourhood_cleansed_ΚΟΛΩΝΑΚΙ',
       'neighbourhood_cleansed_ΚΟΥΚΑΚΙ-ΜΑΚΡΥΓΙΑΝΝΗ',
       'room_type_Entire home/apt', 'room_type_Shared room',
       'bathroom_type_private'],
      dtype='object')
MAE is 23.44295066265218
MSE is 1176.557799582893
RMSE is 34.30098831787348


## Saving the Trained Model (<i>Pickle</i> library)

In [None]:
pickle.dump(xgb, open('trained_model.sav', 'wb'))