# 10. CHICAGO AIRBNB: MODEL TUNING

## 1. Recap

In [1]:
import pandas as pd
import numpy as np
# Notebook-wide pandas display settings: show up to 99 columns and 999 rows,
# and render floats with 3 decimal places.
pd.set_option("display.max_columns", 99)
pd.set_option("display.max_rows", 999)
# Use the fully-qualified option key. The bare pattern 'precision' is ambiguous
# on pandas versions that also define 'styler.format.precision', where it
# raises OptionError ("Pattern matched multiple keys").
pd.set_option("display.precision", 3)

# Load the training split from disk; X_train is the same frame with the
# 'price' column removed (presumably the regression target — confirm).
train = pd.read_csv('data/chicago_airbnb4_train')
X_train = train.drop(columns='price')

# Cell output: shapes of the full frame and of the feature-only frame.
(train.shape, X_train.shape)

((5240, 94), (5240, 93))

In [2]:
# Load the saved RMSE table and give its columns readable names: the first
# column holds the row labels (rmse_1 ... rmse_10 in the displayed output),
# the remaining five hold one score column per model.
model_labels = ['OLS_LR_all', 'OLS_LR_36', 'ElasNet_36', 'KNN_all', 'RandFor_all']
model_df = pd.read_csv('data/chicago_airbnb4_models')
model_df.columns = ['RMSE'] + model_labels
model_df

Unnamed: 0,RMSE,OLS_LR_all,OLS_LR_36,ElasNet_36,KNN_all,RandFor_all
0,rmse_1,425.931,552.859,552.819,417.219,422.306
1,rmse_2,183.527,210.802,210.359,185.514,184.979
2,rmse_3,134.728,123.628,118.106,123.416,134.961
3,rmse_4,138.456,128.244,123.103,135.151,136.021
4,rmse_5,413.153,590.703,590.501,419.743,421.211
5,rmse_6,447.291,463.421,461.63,450.736,449.103
6,rmse_7,568.169,143.516,138.461,566.424,572.348
7,rmse_8,140.921,123.251,119.284,134.056,150.106
8,rmse_9,444.477,129.995,123.141,442.413,447.136
9,rmse_10,130.011,412.477,412.35,113.691,114.374


## 2. Reducing Features Using Principal Components
We want to reduce the number of features while retaining most (here, 95%) of the variance in the data.

In [17]:
from sklearn.decomposition import PCA

# A float n_components between 0 and 1 asks PCA to keep just enough components
# to explain that share of the variance (95% here); whiten=True rescales the
# projected components to unit variance.
pca = PCA(whiten=True, n_components=0.95)
X_train_pca = pca.fit_transform(X_train)

# Report the feature count before and after the projection.
n_features_before = X_train.shape[1]
n_features_after = X_train_pca.shape[1]
print("Original number of features:", n_features_before)
print("Reduced number of features:", n_features_after)

Original number of features: 93
Reduced number of features: 26


The output shows that PCA let us reduce our dimensionality by a whopping 67 features (72%, from 93 down to 26) while still retaining 95% of the information (variance) in the feature matrix.

In [18]:
# Variance explained by each retained principal component (one value per
# component; the output below lists 26 values in decreasing order).
pca.explained_variance_

array([4.64938613, 3.48702325, 2.07064037, 1.51684252, 1.34904604,
       1.15115221, 1.09107001, 0.91861542, 0.84215224, 0.75376112,
       0.71785418, 0.56043914, 0.43234613, 0.38420376, 0.3724925 ,
       0.31875881, 0.3071287 , 0.28155298, 0.26640015, 0.22223678,
       0.1734344 , 0.16193853, 0.14135618, 0.13142378, 0.12928797,
       0.09760897])

In [19]:
# Loadings of the first principal component: one weight per original feature.
# Row 0 of components_ is the same vector as column 0 of its transpose.
pca.components_[0]

array([-6.25724132e-02, -8.39135910e-03, -7.87018314e-02,  4.79114077e-02,
       -5.89016256e-03, -1.30598361e-02, -6.52554379e-03, -1.65706434e-02,
       -2.91231825e-02,  2.57873318e-02, -3.68486711e-02, -4.24146329e-01,
       -3.97299963e-01, -3.59766074e-01, -3.61086371e-01, -3.90431761e-01,
       -2.88881380e-01, -3.83826243e-01, -4.75716506e-02, -1.02182087e-02,
        4.71940084e-03,  6.20788223e-03, -4.80824010e-04, -5.60364982e-04,
        8.01026700e-03, -1.31769602e-02, -8.92779869e-04,  8.59998029e-04,
       -1.72023244e-05,  1.02576610e-03, -2.20237371e-03, -2.72141174e-04,
       -1.26641820e-05,  4.87969532e-04,  5.53800280e-04,  5.01081817e-04,
        8.58444755e-04, -5.83797417e-04,  3.98290062e-03, -2.47193334e-03,
        1.74901836e-03, -2.42509565e-04, -2.57787305e-04,  1.51764201e-03,
        1.04677855e-03, -2.81725195e-04,  2.36307312e-04,  1.73591188e-03,
       -3.79143355e-03, -7.40026966e-04,  3.18999459e-04, -4.74928780e-03,
       -1.58432367e-03, -

In [15]:
# Number of loadings in a single component, i.e. the number of columns of
# components_ (93 in the output below, matching the original feature count).
pca.components_.shape[1]

93

In [20]:
# NOTE(review): this evaluates to the numpy.cumsum function object itself (the
# cell output is just its signature); nothing is actually computed here.
# Presumably a call such as np.cumsum(pca.explained_variance_ratio_) was
# intended to show cumulative explained variance — TODO confirm and complete.
np.cumsum

<function numpy.cumsum(a, axis=None, dtype=None, out=None)>