# 9. CHICAGO AIRBNB: MODEL TRAINING
---

## 1. The Data & Train-Test Split

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Notebook-wide pandas display settings: show wide/long frames in full and
# print floats with three decimal places.
pd.set_option("display.max_columns", 99)
pd.set_option("display.max_rows", 999)
# Use the fully-qualified key: the bare 'precision' pattern is ambiguous on
# recent pandas (it also matches 'styler.format.precision') and raises OptionError.
pd.set_option("display.precision", 3)

# Load the listings table produced by the earlier parts of this series.
chicago = pd.read_csv('data/chicago_airbnb3')

# Column groups consumed by the preprocessing pipeline further down:
# numeric predictors ...
num_cols = [
    'host_rr', 'host_ar', 'latitude', 'longitude', 'accommodates',
    'baths', 'bedrooms', 'beds', 'min_nights', 'max_nights', 'n_reviews',
    'rs_rating', 'rs_accuracy', 'rs_cleanliness', 'rs_check_in',
    'rs_communication', 'rs_location', 'rs_value', 'reviews_per_month',
]
# ... and categorical predictors.
cat_cols = ['host_rt', 'neighborhood', 'property_type', 'room_type']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(chicago, test_size=0.2, random_state=42)

# Separate the target ('price') from the predictors in both partitions.
X_train, y_train = train.drop(columns='price'), train['price']
X_test, y_test = test.drop(columns='price'), test['price']

# Display the four shapes as the cell output.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((5240, 25), (5240,), (1310, 25), (1310,))

## 2. Preprocessing Pipeline

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

class AttributeAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends two ratio features to a DataFrame.

    The added columns are ``beds_per_bdrm`` (``beds / bedrooms``) and
    ``baths_per_bed`` (``baths / beds``). Afterwards every infinite or missing
    value in the *whole* frame -- not only in the two new columns -- is
    replaced with 0.
    """

    def __init__(self):
        # No hyperparameters to store.
        pass

    def fit(self, df, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, df):
        """Return a new frame with the ratio columns added and inf/NaN zeroed.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the columns ``beds``, ``bedrooms`` and ``baths``.

        Returns
        -------
        pandas.DataFrame
            A new frame; the input ``df`` is not modified.
        """
        # `assign` builds a new frame, so the caller's data is left untouched.
        df_ = df.assign(
            beds_per_bdrm=df['beds'] / df['bedrooms'],
            baths_per_bed=df['baths'] / df['beds'],
        )
        # Division by zero yields +/-inf (or NaN for 0/0). Both signs of
        # infinity are zeroed so no non-finite value reaches later steps.
        # NOTE(review): this also turns pre-existing NaNs in every other column
        # into 0, i.e. it doubles as a zero-imputer -- confirm that is intended.
        return df_.replace([np.inf, -np.inf], 0).replace(np.nan, 0)


# Sanity check on the numeric training columns: the transformer should add
# exactly two columns to the ones selected by `num_cols`.
adder = AttributeAdder()
X_train_num = adder.fit_transform(train[num_cols])
print(X_train_num.shape)

(5240, 21)


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric branch: add the engineered ratio features, then standardise the
# resulting numeric columns.
num_pipe = Pipeline(steps=[
    ('add_features', AttributeAdder()),
    ('std_scaling', StandardScaler()),
])

# Full preprocessing: route the numeric columns through `num_pipe` and
# one-hot encode the categorical columns. Columns listed in neither group are
# dropped by ColumnTransformer's default `remainder='drop'`.
full_pipe = ColumnTransformer(
    transformers=[
        ('num_prep', num_pipe, num_cols),
        # handle_unknown='ignore': a category that appears only in held-out
        # data is encoded as all zeros instead of raising at transform time.
        # The encoding of the training data itself is unaffected.
        ('cat_prep', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ])

# Fit on the raw training frame rather than on `X_train` itself, so that this
# cell does not consume its own output and can safely be re-run. `X_train`
# still ends up bound to the transformed feature matrix.
X_train = full_pipe.fit_transform(train.drop('price', axis=1))
X_train

<5240x115 sparse matrix of type '<class 'numpy.float64'>'
	with 131000 stored elements in Compressed Sparse Row format>

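The result is a sparse matrix: `OneHotEncoder` produces sparse output by default, and `ColumnTransformer` keeps the combined result sparse when its overall density is below `sparse_threshold` (0.3 by default). We also have 115 features, the same number we got in the previous part when we used `get_dummies`.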

## 3. HTML Representation of Pipeline

In [4]:
from sklearn import set_config
# Switch scikit-learn's estimator display to the HTML diagram representation.
set_config(display='diagram')

# Bare last expression: the notebook renders the pipeline using that display setting.
full_pipe

## 4. Training a Linear Regression Model