# Importing Relevant Python Libraries

In [93]:
# Data Analysis and Wrangling
import pandas as pd
import numpy as np
import random as rnd
import math

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Machine Learning
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Extracting Data

In [18]:
list1 = pd.read_csv('listings.csv')
list2 = pd.read_csv('listings2.csv')

# Data Size Metrics

In [19]:
#print(list1.columns.values)
print(list1.shape)

(3818, 16)


In [20]:
#list2.columns.values)
print(list2.shape)

(3818, 92)


In [65]:
# Join list1 and list2 to create the x vector
X = pd.concat([list1, list2], axis=1)
X.shape # Original Data Frame

(3818, 108)

# Feature Engineering

### Property Type

In [66]:
X=X[X['property_type'].notnull()]
X['property_type'].value_counts()

House              1733
Apartment          1708
Townhouse           118
Condominium          91
Loft                 40
Bed & Breakfast      37
Other                22
Cabin                21
Camper/RV            13
Bungalow             13
Boat                  8
Tent                  5
Treehouse             3
Chalet                2
Dorm                  2
Yurt                  1
Name: property_type, dtype: int64

In [67]:
X.shape

(3817, 108)

In [68]:
# Increasing the feature space
i=0
for item in X['property_type'].unique():
    a=X['property_type'] == X['property_type'].unique()[i]
    X[item] = a.map(lambda x: 1 if x == True else -1)
    i=i+1
X.shape

(3817, 124)

### Bed Type

In [69]:
X=X[X['bed_type'].notnull()]
X['bed_type'].unique()

array(['Real Bed', 'Futon', 'Pull-out Sofa', 'Airbed', 'Couch'], dtype=object)

In [109]:
# Increasing the feature space
i=0
for item in X['bed_type'].unique():
    a=X['bed_type'] == X['bed_type'].unique()[i]
    X[item] = a.map(lambda x: 1 if x == True else -1)
    i=i+1
X.shape

(3817, 132)

### Extracting Numerical Values

In [210]:
Z1=X.loc[:, X.dtypes == np.float64] #Extracting columns with values of type float 
Z2=X.loc[:, X.dtypes == np.int64]   #Extracting columns with values of type int
X_numeric=pd.concat([Z1,Z2], axis=1)

In [231]:
# X_numeric has duplicate columns. The code below removes the duplicate columns
_, i = np.unique(X_numeric.columns, return_index=True)
X_Num_Cov=X_numeric.iloc[:, i]
X_Num_Cov.to_csv('Numerical_FS.csv')
X_Num_Cov.shape
X_Num_Cov.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3817 entries, 0 to 3817
Data columns (total 55 columns):
Airbed                            3817 non-null int64
Apartment                         3817 non-null int64
Bed & Breakfast                   3817 non-null int64
Boat                              3817 non-null int64
Bungalow                          3817 non-null int64
Cabin                             3817 non-null int64
Camper/RV                         3817 non-null int64
Chalet                            3817 non-null int64
Condominium                       3817 non-null int64
Couch                             3817 non-null int64
Dorm                              3817 non-null int64
Futon                             3817 non-null int64
House                             3817 non-null int64
Loft                              3817 non-null int64
Other                             3817 non-null int64
Pull-out Sofa                     3817 non-null int64
Real Bed                     

### Latitude and Longitude

### Using KNN for local model fitting

In [232]:
# Defining the points for KNN algorithm
lat=np.linspace(47.55,47.70,3)
long=np.linspace(-122.3875,-122.2875,3)
encode=[]
for i in range(0,len(X_Num_Cov['latitude'])):
    delta_arr=[]
    for j in range(0,len(lat)):
        for k in range(0,len(long)):
            a=(X_Num_Cov['latitude'].iloc[i]-lat[j])**2
            b=(X_Num_Cov['longitude'].iloc[i]-long[k])**2
            c=a+b
            delta_arr.append(c)
    delta=min(delta_arr)
    encode.append((np.array(delta_arr)<=delta).astype(int))

In [235]:
knn1=[]
knn2=[]
knn3=[]
knn4=[]
knn5=[]
knn6=[]
knn7=[]
knn8=[]
knn9=[]
for i in range(0,len(encode)):
    knn1.append(encode[i][0])
    knn2.append(encode[i][1])
    knn3.append(encode[i][2])
    knn4.append(encode[i][3])
    knn5.append(encode[i][4])
    knn6.append(encode[i][5])
    knn7.append(encode[i][6])
    knn8.append(encode[i][7])
    knn9.append(encode[i][8])

In [236]:
knn1_s=pd.Series( (v for v in np.array(knn1)) ,name='knn1')
knn2_s=pd.Series( (v for v in np.array(knn2)),name='knn2' )
knn3_s=pd.Series( (v for v in np.array(knn3)),name='knn3' )
knn4_s=pd.Series( (v for v in np.array(knn4)) ,name='knn4')
knn5_s=pd.Series( (v for v in np.array(knn5)) ,name='knn5')
knn6_s=pd.Series( (v for v in np.array(knn6)) ,name='knn6')
knn7_s=pd.Series( (v for v in np.array(knn7)) ,name='knn7')
knn8_s=pd.Series( (v for v in np.array(knn8)) ,name='knn8')
knn9_s=pd.Series( (v for v in np.array(knn9)) ,name='knn9')
X_Num_Cov=pd.concat([X_Num_Cov,knn1_s,knn2_s,knn3_s,knn4_s,knn5_s,knn6_s,knn7_s,knn8_s,knn9_s],axis=1)
X_Num_Cov.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3818 entries, 0 to 3817
Data columns (total 64 columns):
Airbed                            3817 non-null float64
Apartment                         3817 non-null float64
Bed & Breakfast                   3817 non-null float64
Boat                              3817 non-null float64
Bungalow                          3817 non-null float64
Cabin                             3817 non-null float64
Camper/RV                         3817 non-null float64
Chalet                            3817 non-null float64
Condominium                       3817 non-null float64
Couch                             3817 non-null float64
Dorm                              3817 non-null float64
Futon                             3817 non-null float64
House                             3817 non-null float64
Loft                              3817 non-null float64
Other                             3817 non-null float64
Pull-out Sofa                     3817 non-null float

In [248]:
# This cell constructs the new feature space
space=['accommodates','bathrooms','bedrooms','beds','guests_included','price','reviews_per_month','review_scores_value','knn1',
      'knn2','knn3','knn4','knn5','knn6','knn7','knn8','knn9']
for item in X['property_type'].unique().tolist():
    space.append(item)
for item in X['bed_type'].unique().tolist():
    space.append(item)
for item in X['cancellation_policy'].unique().tolist():
    space.append(item)

In [269]:
# Selecting certian numerical features
X_select = X_Num_Cov[space]
# Dropping the missing values from the feature space
X_select = X_select.dropna()
# Creating the output space
Y = X_select['reviews_per_month']
X_select = X_select.drop(['reviews_per_month'], axis = 1)
print(X_select.info())
X_select.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3144 entries, 0 to 3814
Data columns (total 40 columns):
accommodates           3144 non-null float64
bathrooms              3144 non-null float64
bedrooms               3144 non-null float64
beds                   3144 non-null float64
guests_included        3144 non-null float64
price                  3144 non-null float64
review_scores_value    3144 non-null float64
knn1                   3144 non-null float64
knn2                   3144 non-null float64
knn3                   3144 non-null float64
knn4                   3144 non-null float64
knn5                   3144 non-null float64
knn6                   3144 non-null float64
knn7                   3144 non-null float64
knn8                   3144 non-null float64
knn9                   3144 non-null float64
Apartment              3144 non-null float64
House                  3144 non-null float64
Cabin                  3144 non-null float64
Condominium            3144 non-null f

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,guests_included,price,review_scores_value,knn1,knn2,knn3,...,Chalet,Tent,Real Bed,Futon,Pull-out Sofa,Airbed,Couch,moderate,strict,flexible
0,4.0,1.0,1.0,1.0,2.0,85.0,10.0,0.0,0.0,0.0,...,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0
1,4.0,1.0,1.0,1.0,1.0,150.0,10.0,0.0,0.0,0.0,...,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
2,11.0,4.5,5.0,7.0,10.0,975.0,10.0,0.0,0.0,0.0,...,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
4,6.0,2.0,3.0,3.0,6.0,450.0,9.0,0.0,0.0,0.0,...,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
5,2.0,1.0,1.0,1.0,1.0,120.0,10.0,0.0,0.0,0.0,...,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0


# Preliminary Model Analysis

### Splitting data into training and testing sets

In [260]:
from sklearn.model_selection import cross_val_score
num_test = 0.20 #20% of the data set will be used for testing the final model
X_train, X_test, y_train, y_test = train_test_split(X_select, Y, test_size=num_test, random_state=100)

## Ridge regression + Hyperparameter Tuning via Cross Validation

In [261]:
from sklearn.linear_model import Ridge
clf = Ridge(alpha=5.0)
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.53 (+/- 0.15)


#### Fitting the selected model 

In [262]:
clf.fit(X_train,y_train)

Ridge(alpha=5.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

#### Making Predictions

In [263]:
y_test_predict=clf.predict(X_test)

In [264]:
y_train_predict=clf.predict(X_train)

#### Model Performance Evaluation

In [265]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms_test = sqrt(mean_squared_error(y_test, y_test_predict))
rms_test

61.81386392226642

In [266]:
rms_train = sqrt(mean_squared_error(y_train, y_train_predict))
rms_train

60.266496119052796

In [267]:
clf.score(X_train, y_train)

0.56147800615003873

In [268]:
clf.score(X_test, y_test)

0.49680017050872149