# Importing Relevant Python Libraries

In [3]:
# Data Analysis and Wrangling
import pandas as pd
import numpy as np
import random as rnd
import math

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Machine Learning
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Extracting Data

In [18]:
# Load the two listing exports that together make up the raw data set.
list1, list2 = (
    pd.read_csv(csv_name) for csv_name in ('listings.csv', 'listings2.csv')
)

# Data Size Metrics

In [19]:
# Dimensions (rows, columns) of the first listings table.
print(list1.shape)

(3818, 16)


In [20]:
# Dimensions (rows, columns) of the second listings table.
print(list2.shape)

(3818, 92)


In [21]:
# Place the columns of list1 and list2 side by side to form the raw feature table.
# NOTE(review): axis=1 aligns rows on the index, so this presumes both files list
# the same listings in the same order - confirm, or merge on a shared key instead.
X = pd.concat(objs=[list1, list2], axis='columns')
X.shape  # dimensions of the combined frame

(3818, 108)

# Feature Engineering

### Property Type

In [22]:
# Keep only the listings with a known property type. The explicit .copy() makes X
# an independent frame, so the indicator columns assigned to X in later cells are
# written to X itself and pandas does not raise SettingWithCopyWarning.
X = X[X['property_type'].notnull()].copy()
X['property_type'].value_counts()

House              1733
Apartment          1708
Townhouse           118
Condominium          91
Loft                 40
Bed & Breakfast      37
Other                22
Cabin                21
Camper/RV            13
Bungalow             13
Boat                  8
Tent                  5
Treehouse             3
Chalet                2
Dorm                  2
Yurt                  1
Name: property_type, dtype: int64

In [23]:
# Dimensions of X after the rows with a missing property_type were removed.
X.shape

(3817, 108)

In [24]:
# Increase the feature space: one-hot encode property_type by adding one 0/1
# indicator column per distinct value, named after that value.
for category in X['property_type'].unique():
    # Cast to int64 explicitly so the indicators are picked up by the later
    # dtype-based selection of numeric columns.
    X[category] = (X['property_type'] == category).astype(np.int64)
X.shape

(3817, 124)

### Bed Type

In [25]:
# Keep only the listings with a known bed type. The explicit .copy() makes X an
# independent frame, so later column assignments never hit a flagged slice
# (SettingWithCopyWarning) when this filter actually drops rows.
X = X[X['bed_type'].notnull()].copy()
X['bed_type'].unique()

array(['Real Bed', 'Futon', 'Pull-out Sofa', 'Airbed', 'Couch'], dtype=object)

In [26]:
# Increase the feature space: one-hot encode bed_type by adding one 0/1
# indicator column per distinct value, named after that value.
for category in X['bed_type'].unique():
    # Cast to int64 explicitly so the indicators are picked up by the later
    # dtype-based selection of numeric columns.
    X[category] = (X['bed_type'] == category).astype(np.int64)
X.shape

(3817, 129)

### Cancellation Policy

In [27]:
# Keep only the listings with a known cancellation policy. The explicit .copy()
# makes X an independent frame, so later column assignments never hit a flagged
# slice (SettingWithCopyWarning) when this filter actually drops rows.
X = X[X['cancellation_policy'].notnull()].copy()
X['cancellation_policy'].unique()

array(['moderate', 'strict', 'flexible'], dtype=object)

In [28]:
# Increase the feature space: one-hot encode cancellation_policy by adding one
# 0/1 indicator column per distinct value, named after that value.
for category in X['cancellation_policy'].unique():
    # Cast to int64 explicitly so the indicators are picked up by the later
    # dtype-based selection of numeric columns.
    X[category] = (X['cancellation_policy'] == category).astype(np.int64)
X.shape

(3817, 132)

In [29]:
# Gather the purely numeric columns of X: float64 columns first, then int64 columns.
column_dtypes = X.dtypes
Z1 = X.loc[:, column_dtypes == np.float64]  # float-valued columns
Z2 = X.loc[:, column_dtypes == np.int64]    # integer-valued columns
X_numeric = pd.concat(objs=[Z1, Z2], axis='columns')

In [30]:
# X_numeric holds some column names more than once. np.unique returns, for every
# distinct name (in sorted order), the position of its first occurrence; those
# positions select exactly one column per name.
# The result is indexed directly instead of being unpacked into `_` and `i`:
# assigning to `_` stops IPython from tracking the last output, and `i` is the
# counter name used by other cells.
first_positions = np.unique(X_numeric.columns, return_index=True)[1]
X_Num_Cov = X_numeric.iloc[:, first_positions]
X_Num_Cov.to_csv('Numerical_FS.csv')  # persist the de-duplicated numeric features
X_Num_Cov.shape

(3817, 55)

In [32]:
# Assemble the column names that make up the modelling feature space: a fixed
# set of numeric columns followed by the indicator columns of each encoded
# categorical variable.
space = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'guests_included',
         'price', 'reviews_per_month', 'review_scores_value']
for categorical_column in ('property_type', 'bed_type', 'cancellation_policy'):
    space.extend(X[categorical_column].unique().tolist())

In [33]:
# Select the chosen numerical and indicator features.
X_select = X_Num_Cov[space]
# Drop every row that has a missing value in any of the selected columns.
X_select = X_select.dropna()
# Create the output space: reviews_per_month is the regression target, so it is
# split off and removed from the feature matrix.
Y = X_select['reviews_per_month']
X_select = X_select.drop(['reviews_per_month'], axis=1)
# info() prints its summary itself and returns None; wrapping it in print()
# appended a stray "None" line to the output.
X_select.info()
X_select.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3144 entries, 0 to 3814
Data columns (total 31 columns):
accommodates           3144 non-null int64
bathrooms              3144 non-null float64
bedrooms               3144 non-null float64
beds                   3144 non-null float64
guests_included        3144 non-null int64
price                  3144 non-null int64
review_scores_value    3144 non-null float64
Apartment              3144 non-null int64
House                  3144 non-null int64
Cabin                  3144 non-null int64
Condominium            3144 non-null int64
Camper/RV              3144 non-null int64
Bungalow               3144 non-null int64
Townhouse              3144 non-null int64
Loft                   3144 non-null int64
Boat                   3144 non-null int64
Bed & Breakfast        3144 non-null int64
Other                  3144 non-null int64
Dorm                   3144 non-null int64
Treehouse              3144 non-null int64
Yurt                   31

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,guests_included,price,review_scores_value,Apartment,House,Cabin,...,Chalet,Tent,Real Bed,Futon,Pull-out Sofa,Airbed,Couch,moderate,strict,flexible
0,4,1.0,1.0,1.0,2,85,10.0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
1,4,1.0,1.0,1.0,1,150,10.0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
2,11,4.5,5.0,7.0,10,975,10.0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
4,6,2.0,3.0,3.0,6,450,9.0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
5,2,1.0,1.0,1.0,1,120,10.0,0,1,0,...,0,0,1,0,0,0,0,0,1,0


# Preliminary Model Analysis

### Splitting data into training and testing sets

In [34]:
from sklearn.model_selection import cross_val_score

# Hold out a fixed fraction of the rows for testing; the fixed seed keeps the
# split reproducible between runs.
num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(
    X_select,
    Y,
    test_size=num_test,
    random_state=100,
)

### Ridge regression

In [35]:
from sklearn.linear_model import Ridge

# Ridge (L2-regularised linear) regression with alpha=10.0, fitted on the
# training set. fit() returns the estimator itself, so construction and fitting
# are chained; the bare name on the last line displays the fitted model.
clf = Ridge(alpha=10.0).fit(X_train, y_train)
clf

Ridge(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

#### Cross Validation

In [36]:
# 5-fold cross-validation on the training set. With no `scoring` argument the
# estimator's own score() is used, which for a regressor such as Ridge is the
# coefficient of determination (R^2), not accuracy - so the summary is labelled
# accordingly. The printed spread is two standard deviations of the fold scores.
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("R^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.09 (+/- 0.01)


#### Evaluating the fitted model

In [37]:
# Predict the target for the held-out test rows and show how many predictions
# were produced.
y_test_predict = clf.predict(X_test)
y_test_predict.size

629

In [38]:
# Predict the target for the training rows and show how many predictions were
# produced.
y_train_predict = clf.predict(X_train)
y_train_predict.size

2515

In [39]:
from math import sqrt
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the predictions on the test set.
mse_test = mean_squared_error(y_test, y_test_predict)
rms_test = sqrt(mse_test)
rms_test

1.7524505402074309

In [40]:
# Root-mean-squared error of the predictions on the training set.
mse_train = mean_squared_error(y_train, y_train_predict)
rms_train = sqrt(mse_train)
rms_train

1.7122202869934844

In [41]:
# Score the ridge model on the held-out test set (for a regressor, score() is R^2).
clf.score(X_test, y_test)

0.11465129713819211

In [42]:
# Score the ridge model on the training set (for a regressor, score() is R^2).
clf.score(X_train, y_train)

0.10918336064610545

### Support Vector Machine

In [98]:
from sklearn import svm

# Support vector regression with a linear kernel, fitted on the training set.
# fit() returns the estimator itself, so construction and fitting are chained;
# the bare name on the last line displays the fitted model.
clf_svm = svm.SVR(kernel='linear').fit(X_train, y_train)
clf_svm

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [127]:
# Score the support vector regressor on the held-out test set. `clf` is the
# ridge model, so scoring it in this section would only repeat the ridge result;
# the SVR fitted in this section is `clf_svm`.
clf_svm.score(X_test, y_test)

0.088525007308193571