# Predict AirBnb Prices in New York

Data source: 

Kaggle, New York City Airbnb Open Data (Airbnb listings and metrics in NYC, NY, USA (2019) )

## Imports

In [15]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd  # conda install pandas
import warnings
warnings.filterwarnings('ignore')

print(tf.__version__)

2.12.0


## Load data and take a look at it

In [16]:
data = pd.read_csv('AB_NYC_2019.csv').sample(frac=1)  # shaffle the instances
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
28747,22214395,Spacious Studio In Queens,162276183,Gary,Queens,Rego Park,40.71663,-73.85823,Entire home/apt,59,1,110,2019-06-24,5.92,1,147
20194,16162363,Quiet Private Room in Manhattan,104926837,Julia,Manhattan,Financial District,40.70551,-74.00735,Private room,113,3,131,2019-07-04,4.21,3,12
5150,3709013,Huge 1.5BR Artist Home in Brownstone Brooklyn :),7420906,Alix,Brooklyn,Bedford-Stuyvesant,40.68473,-73.95697,Entire home/apt,99,3,10,2018-11-13,0.28,1,66
2979,1735478,"Relaxing, serene room in NYC Apt",6790494,Paul,Queens,Long Island City,40.76081,-73.93163,Private room,100,20,52,2015-08-25,0.75,1,182
8022,6180762,BRAND NEW 1BD / STEPS CENTRAL PARK!,1475015,Mike,Manhattan,Upper West Side,40.76877,-73.9846,Entire home/apt,87,30,3,2017-11-30,0.1,52,275


In [51]:
data.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
mean,19017140.0,67620010.0,40.728949,-73.95217,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,10983110.0,78610970.0,0.05453,0.046157,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,2539.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
25%,9471945.0,7822033.0,40.6901,-73.98307,69.0,1.0,1.0,0.19,1.0,0.0
50%,19677280.0,30793820.0,40.72307,-73.95568,106.0,3.0,5.0,0.72,1.0,45.0
75%,29152180.0,107434400.0,40.763115,-73.936275,175.0,5.0,24.0,2.02,2.0,227.0
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


## Data preprocessing

In [17]:
# id, name, host_id, last_review are not usefull; neighbourhood column has a lot of disctinct values; 
# neighbourhood_group and room_type must be transformed to one-hot encoding

# Columns to keep:
features = data[['neighbourhood_group', 'room_type', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 
                 'calculated_host_listings_count', 'availability_365']]
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
28747,22214395,Spacious Studio In Queens,162276183,Gary,Queens,Rego Park,40.71663,-73.85823,Entire home/apt,59,1,110,2019-06-24,5.92,1,147
20194,16162363,Quiet Private Room in Manhattan,104926837,Julia,Manhattan,Financial District,40.70551,-74.00735,Private room,113,3,131,2019-07-04,4.21,3,12
5150,3709013,Huge 1.5BR Artist Home in Brownstone Brooklyn :),7420906,Alix,Brooklyn,Bedford-Stuyvesant,40.68473,-73.95697,Entire home/apt,99,3,10,2018-11-13,0.28,1,66
2979,1735478,"Relaxing, serene room in NYC Apt",6790494,Paul,Queens,Long Island City,40.76081,-73.93163,Private room,100,20,52,2015-08-25,0.75,1,182
8022,6180762,BRAND NEW 1BD / STEPS CENTRAL PARK!,1475015,Mike,Manhattan,Upper West Side,40.76877,-73.9846,Entire home/apt,87,30,3,2017-11-30,0.1,52,275


In [18]:
# Check if there are any missing values:
print(features.isna().sum())  # the number of missing values for each feature

neighbourhood_group                   0
room_type                             0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64


In [19]:
# Fill reviews_per_month with zero:
features['reviews_per_month'] = features['reviews_per_month'].fillna(0)
print(features.isna().sum())

neighbourhood_group               0
room_type                         0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64


In [20]:
# Turn categorical features to one-hot encodings:
onehot_neighbourhood_group = pd.get_dummies(features['neighbourhood_group'])
onehot_room_type = pd.get_dummies(features['room_type'])
print(onehot_room_type)

# Drop original categorical features:
features = features.drop(columns=['neighbourhood_group', 'room_type'])
print("1: ", features.head())

# Put one-hot encodings to the features vector:
features = pd.concat([features, onehot_neighbourhood_group, onehot_room_type], axis=1)
print("2: ", features.head())

       Entire home/apt  Private room  Shared room
28747             True         False        False
20194            False          True        False
5150              True         False        False
2979             False          True        False
8022              True         False        False
...                ...           ...          ...
9355             False          True        False
35254            False          True        False
48195             True         False        False
35137             True         False        False
33145             True         False        False

[48895 rows x 3 columns]
1:         minimum_nights  number_of_reviews  reviews_per_month  \
28747               1                110               5.92   
20194               3                131               4.21   
5150                3                 10               0.28   
2979               20                 52               0.75   
8022               30                  3               

In [44]:
# Select labels:
targets = data['price']

# Select  X (features) and y (labels) for training set and test set
train_size = int(0.7*len(data))  # 70%
X_train, X_test = features.values[:train_size, :], features.values[train_size:, :]
y_train, y_test = targets.values[:train_size], features.values[train_size:]

# Check how many features we have:
print(len(X_train[0]))  # e.g. take the first training instance

# Check the number of data instances and labels in training data
print(len(X_train)) 
print(len(y_train))

13
34226
34226


## Data visualization and analysis

## The Tensorflow 2 ML Approaches

In [49]:
class LinearModel:  # Linear Regression
    # it will generate y_pred = W*X + b (weights times input + bias)
    # the model learns the W and the b so that y_pred is close to real y in our data

    def __init__(self):
        # Goal: y_pred = W*X + b
        
        # self.W = tf.Variable(13.0)
        # self.b = tf.Variable(4.0)
        
        # Initialize random values:
        self.initializer = tf.keras.initializers.GlorotUniform()

    def loss(init, y, y_pred):
        # return tf.reduce_mean(tf.square(y - y_pred))  # mean square error between y and y_pred
        return tf.reduce_mean(tf.abs(y - y_pred))  # mean absolute error between y and y_pred (more human readable than square)

    def train(self, X, y, lr = 0.00001, epochs = 20, verbose=True):  # X = input feature vectors, y = data labels, 
                                                                    # lr = learning rate, epochs = # of epochs,
                                                                    # verbose = print intermediate results
                                                                    # if los is going up, the lr is too high
        # Create arrays for data:
        X = np.asarray(X, dtype=np.float32)
        y = np.asarray(y, dtype=np.float32).reshape((-1, 1))  # turn [1, 2, 3, 4] -> [[1], []]

        # declare W (weights) and b (bias) variables using the initializer we declared in the constructor;
        # shape as data is multidimensional (> 1 feature):
        self.W = tf.Variable(
            initial_value = self.initializer(shape=(len(X[0]),1), dtype='float32')
        )  # number of features by 1
        
        self.b = tf.Variable(
            initial_value = self.initializer(shape=(1,), dtype='float32')
        )
        
        
        def train_step():
            with tf.GradientTape() as t:  # compute gradients for some Python functions
                current_loss = self.loss(y, self.predict(X))  # calculate the current loss between the true label 
                                                                # and the current prediction of this label
            # get gradients of the weigts and the biases:
            dW, db = t.gradient(current_loss, [self.W, self.b]) # pass W and b as a list to t.gradient; get current loss
            # update W and b:
            self.W.assign_sub(lr * dW)  # substract: W -= lr * dW (not possible to use directly, as dW and db are tf variables)
            self.b.assign_sub(lr * db)

            return current_loss

        for epoch in range(epochs):
            current_loss = train_step()
            if verbose:
                print(f'Epoch {epoch}: Loss: {current_loss.numpy()}')  # eager excecution (numpy value of the tf tensor)
    
    def predict(self, X):
        # return self.W * X + self.b  # valid only for one-dimensional data
        # [a, b] x [b, a]
        # X -> [number_instances, number_features]; W must be [number_features, 1] as we want to end up with a simple value by predict
        return tf.matmul(X, self.W) + self.b  # matrix multiplication
        
      

In [50]:
model = LinearModel()
model.train(X_train, y_train, epochs=100)

Epoch 0: Loss: 188.72377014160156
Epoch 1: Loss: 188.58926391601562
Epoch 2: Loss: 188.4547119140625
Epoch 3: Loss: 188.3201904296875
Epoch 4: Loss: 188.18568420410156
Epoch 5: Loss: 188.05113220214844
Epoch 6: Loss: 187.91659545898438
Epoch 7: Loss: 187.78207397460938
Epoch 8: Loss: 187.64755249023438
Epoch 9: Loss: 187.51303100585938
Epoch 10: Loss: 187.3784942626953
Epoch 11: Loss: 187.2439727783203
Epoch 12: Loss: 187.1094512939453
Epoch 13: Loss: 186.97491455078125
Epoch 14: Loss: 186.84039306640625
Epoch 15: Loss: 186.70587158203125
Epoch 16: Loss: 186.57135009765625
Epoch 17: Loss: 186.4368133544922
Epoch 18: Loss: 186.3022918701172
Epoch 19: Loss: 186.16775512695312
Epoch 20: Loss: 186.0332489013672
Epoch 21: Loss: 185.89869689941406
Epoch 22: Loss: 185.76419067382812
Epoch 23: Loss: 185.62966918945312
Epoch 24: Loss: 185.49514770507812
Epoch 25: Loss: 185.36061096191406
Epoch 26: Loss: 185.22610473632812
Epoch 27: Loss: 185.09158325195312
Epoch 28: Loss: 184.95704650878906
Epo

## Conclusions