# Importing Relevant Python Libraries

In [86]:
# Data Analysis and Wrangling
import pandas as pd
import numpy as np
import random as rnd
import math

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Machine Learning
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Extracting Data

In [87]:
#list1 = pd.read_csv('listings.csv')
#list2 = pd.read_csv('listings2.csv')
list2 = pd.read_csv('nylistings.csv')

  interactivity=interactivity, compiler=compiler, result=result)


# Data Size Metrics

In [51]:
#print(list1.columns.values)
#print(list1.shape)

In [52]:
#list2.columns.values)
print(list2.shape)

(44317, 96)


In [53]:
list2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44317 entries, 0 to 44316
Data columns (total 96 columns):
id                                  44317 non-null int64
listing_url                         44317 non-null object
scrape_id                           44317 non-null int64
last_scraped                        44317 non-null object
name                                44289 non-null object
summary                             42809 non-null object
space                               29850 non-null object
description                         44292 non-null object
experiences_offered                 44317 non-null object
neighborhood_overview               25619 non-null object
notes                               16147 non-null object
transit                             27342 non-null object
access                              25360 non-null object
interaction                         24368 non-null object
house_rules                         27646 non-null object
thumbnail_url          

In [54]:
# Join list1 and list2 to create the x vector
# X = pd.concat([list1, list2], axis=1)
X = list2
X.shape # Original Data Frame

(44317, 96)

#### Converting Security Deposit Value from String to Numeric

In [55]:
X['Price'] = X['price'].replace(np.nan, '$0', regex=True)

In [56]:
X.drop(['price'], axis = 1, inplace = True)
X.shape

(44317, 96)

In [57]:
X['price']=X['Price'].replace('[\$,]', '', regex=True).astype(float)

In [58]:
X.drop(['Price'], axis = 1, inplace = True)
X.shape

(44317, 96)

# Feature Engineering

### Property Type

In [59]:
X=X[X['property_type'].notnull()]
X['property_type'].value_counts()

Apartment             37498
House                  3726
Loft                    879
Townhouse               817
Condominium             594
Other                   290
Bed & Breakfast         136
Timeshare                85
Guesthouse               58
Guest suite              54
Hostel                   39
Dorm                     28
Boutique hotel           25
Villa                    21
Bungalow                 19
Boat                      9
Vacation home             9
Serviced apartment        9
In-law                    6
Earth House               3
Chalet                    3
Cabin                     3
Castle                    1
Yurt                      1
Treehouse                 1
Cave                      1
Train                     1
Tent                      1
Name: property_type, dtype: int64

In [60]:
X.shape

(44317, 96)

In [61]:
# Increasing the feature space
i=0
for item in X['property_type'].unique():
    a=X['property_type'] == X['property_type'].unique()[i]
    X[item] = a.map(lambda x: 1 if x == True else -1)
    i=i+1
X.shape

(44317, 124)

### Bed Type

In [62]:
X=X[X['bed_type'].notnull()]
X['bed_type'].unique()

array(['Real Bed', 'Airbed', 'Pull-out Sofa', 'Futon', 'Couch'], dtype=object)

In [63]:
# Increasing the feature space
i=0
for item in X['bed_type'].unique():
    a=X['bed_type'] == X['bed_type'].unique()[i]
    X[item] = a.map(lambda x: 1 if x == True else -1)
    i=i+1
X.shape

(44317, 129)

### Extracting Numerical Values

In [64]:
Z1=X.loc[:, X.dtypes == np.float64] #Extracting columns with values of type float 
Z2=X.loc[:, X.dtypes == np.int64]   #Extracting columns with values of type int
X_numeric=pd.concat([Z1,Z2], axis=1)

In [65]:
# X_numeric has duplicate columns. The code below removes the duplicate columns
_, i = np.unique(X_numeric.columns, return_index=True)
X_Num_Cov=X_numeric.iloc[:, i]
X_Num_Cov.shape
X_Num_Cov.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44317 entries, 0 to 44316
Data columns (total 65 columns):
Airbed                            44317 non-null int64
Apartment                         44317 non-null int64
Bed & Breakfast                   44317 non-null int64
Boat                              44317 non-null int64
Boutique hotel                    44317 non-null int64
Bungalow                          44317 non-null int64
Cabin                             44317 non-null int64
Castle                            44317 non-null int64
Cave                              44317 non-null int64
Chalet                            44317 non-null int64
Condominium                       44317 non-null int64
Couch                             44317 non-null int64
Dorm                              44317 non-null int64
Earth House                       44317 non-null int64
Futon                             44317 non-null int64
Guest suite                       44317 non-null int64
Guesthouse 

### Latitude and Longitude

### Using KNN for local model fitting

In [66]:
# Defining the points for KNN algorithm
lat=np.linspace(47.55,47.70,3)
long=np.linspace(-122.3875,-122.2875,3)
encode=[]
for i in range(0,len(X_Num_Cov['latitude'])):
    delta_arr=[]
    for j in range(0,len(lat)):
        for k in range(0,len(long)):
            a=(X_Num_Cov['latitude'].iloc[i]-lat[j])**2
            b=(X_Num_Cov['longitude'].iloc[i]-long[k])**2
            c=a+b
            delta_arr.append(c)
    delta=min(delta_arr)
    encode.append((np.array(delta_arr)<=delta).astype(int))

In [68]:
knn1=[]
knn2=[]
knn3=[]
knn4=[]
knn5=[]
knn6=[]
knn7=[]
knn8=[]
knn9=[]
for i in range(0,len(encode)):
    knn1.append(encode[i][0])
    knn2.append(encode[i][1])
    knn3.append(encode[i][2])
    knn4.append(encode[i][3])
    knn5.append(encode[i][4])
    knn6.append(encode[i][5])
    knn7.append(encode[i][6])
    knn8.append(encode[i][7])
    knn9.append(encode[i][8])

In [69]:
knn1_s=pd.Series( (v for v in np.array(knn1)) ,name='knn1')
knn2_s=pd.Series( (v for v in np.array(knn2)),name='knn2' )
knn3_s=pd.Series( (v for v in np.array(knn3)),name='knn3' )
knn4_s=pd.Series( (v for v in np.array(knn4)) ,name='knn4')
knn5_s=pd.Series( (v for v in np.array(knn5)) ,name='knn5')
knn6_s=pd.Series( (v for v in np.array(knn6)) ,name='knn6')
knn7_s=pd.Series( (v for v in np.array(knn7)) ,name='knn7')
knn8_s=pd.Series( (v for v in np.array(knn8)) ,name='knn8')
knn9_s=pd.Series( (v for v in np.array(knn9)) ,name='knn9')
X_Num_Cov=pd.concat([X_Num_Cov,knn1_s,knn2_s,knn3_s,knn4_s,knn5_s,knn6_s,knn7_s,knn8_s,knn9_s],axis=1)
X_Num_Cov.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44317 entries, 0 to 44316
Data columns (total 74 columns):
Airbed                            44317 non-null int64
Apartment                         44317 non-null int64
Bed & Breakfast                   44317 non-null int64
Boat                              44317 non-null int64
Boutique hotel                    44317 non-null int64
Bungalow                          44317 non-null int64
Cabin                             44317 non-null int64
Castle                            44317 non-null int64
Cave                              44317 non-null int64
Chalet                            44317 non-null int64
Condominium                       44317 non-null int64
Couch                             44317 non-null int64
Dorm                              44317 non-null int64
Earth House                       44317 non-null int64
Futon                             44317 non-null int64
Guest suite                       44317 non-null int64
Guesthouse 

In [72]:
# This cell constructs the new feature space
space=['availability_30','accommodates','bathrooms','bedrooms','beds','price','reviews_per_month','review_scores_value','knn1',
      'knn2','knn3','knn4','knn5','knn6','knn7','knn8','knn9']
for item in X['property_type'].unique().tolist():
    space.append(item)
for item in X['bed_type'].unique().tolist():
    space.append(item)

In [73]:
# Selecting certian numerical features
X_select = X_Num_Cov[space]
# Filling 0 for missing values in reviews per month column
values={'reviews_per_month':0,'review_scores_value':0}
X_new=X_select.fillna(value=values) # New Dataframe with missing values substituted

In [74]:
X_new

Unnamed: 0,availability_30,accommodates,bathrooms,bedrooms,beds,price,reviews_per_month,review_scores_value,knn1,knn2,...,Cabin,Boutique hotel,Cave,Castle,Train,Real Bed,Airbed,Pull-out Sofa,Futon,Couch
0,0,2,1.0,0.0,1.0,110.0,0.00,0.0,0,0,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
1,19,2,1.0,1.0,1.0,50.0,2.00,10.0,0,0,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
2,28,3,1.0,1.0,1.0,125.0,0.77,10.0,0,0,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
3,30,4,1.0,1.0,3.0,100.0,0.00,0.0,0,0,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
4,30,4,3.0,3.0,3.0,300.0,0.00,0.0,0,0,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
5,0,4,1.0,1.0,1.0,69.0,3.27,10.0,0,0,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
6,24,7,1.0,2.0,5.0,150.0,1.58,10.0,0,0,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
7,29,2,1.0,1.0,1.0,101.0,1.49,10.0,0,0,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
8,29,4,1.5,2.0,2.0,100.0,0.00,0.0,0,0,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
9,30,4,1.0,1.0,1.0,200.0,0.00,0.0,0,0,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1


In [75]:
# Dropping the missing values from the feature space
X_new = X_new.dropna()
# Creating the output space
Y = X_new['reviews_per_month']
X_new = X_new.drop(['reviews_per_month'], axis = 1)
print(X_new.info())
X_new.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44058 entries, 0 to 44316
Data columns (total 49 columns):
availability_30        44058 non-null int64
accommodates           44058 non-null int64
bathrooms              44058 non-null float64
bedrooms               44058 non-null float64
beds                   44058 non-null float64
price                  44058 non-null float64
review_scores_value    44058 non-null float64
knn1                   44058 non-null int64
knn2                   44058 non-null int64
knn3                   44058 non-null int64
knn4                   44058 non-null int64
knn5                   44058 non-null int64
knn6                   44058 non-null int64
knn7                   44058 non-null int64
knn8                   44058 non-null int64
knn9                   44058 non-null int64
Apartment              44058 non-null int64
House                  44058 non-null int64
Boat                   44058 non-null int64
Townhouse              44058 non-null int64
C

Unnamed: 0,availability_30,accommodates,bathrooms,bedrooms,beds,price,review_scores_value,knn1,knn2,knn3,...,Cabin,Boutique hotel,Cave,Castle,Train,Real Bed,Airbed,Pull-out Sofa,Futon,Couch
0,0,2,1.0,0.0,1.0,110.0,0.0,0,0,1,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
1,19,2,1.0,1.0,1.0,50.0,10.0,0,0,1,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
2,28,3,1.0,1.0,1.0,125.0,10.0,0,0,1,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
3,30,4,1.0,1.0,3.0,100.0,0.0,0,0,1,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
4,30,4,3.0,3.0,3.0,300.0,0.0,0,0,1,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1


# Preliminary Model Analysis

### Splitting data into training and testing sets

In [76]:
from sklearn.model_selection import cross_val_score
num_test = 0.20 #20% of the data set will be used for testing the final model
X_train, X_test, y_train, y_test = train_test_split(X_new, Y, test_size=num_test, random_state=100)

## Ridge regression + Hyperparameter Tuning via Cross Validation

In [77]:
from sklearn.linear_model import Ridge
clf = Ridge(alpha=5.0)
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.19 (+/- 0.01)


#### Fitting the selected model 

In [78]:
clf.fit(X_train,y_train)

Ridge(alpha=5.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

#### Making Predictions

In [79]:
y_test_predict=clf.predict(X_test)

In [80]:
y_train_predict=clf.predict(X_train)

#### Model Performance Evaluation

In [81]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms_test = sqrt(mean_squared_error(y_test, y_test_predict))
rms_test

1.5069602690958916

In [82]:
rms_train = sqrt(mean_squared_error(y_train, y_train_predict))
rms_train

1.390095317345554

In [83]:
clf.score(X_train, y_train)

0.19106419088893389

In [84]:
clf.score(X_test, y_test)

0.18348727914850896