In [24]:
import pandas as pd
import numpy as np
import re
import datetime
from sklearn.metrics import median_absolute_error

## Read preprocessed csv data

In [25]:
data_features = pd.read_csv('preprocessed_data/preprocessed.csv')
data_features[0:10]

Unnamed: 0,host_neighbourhood,host_identity_verified,neighbourhood,city,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,...,availability_365,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,cancellation_policy
0,City Island,t,City Island,Bronx,10464,House,Private room,2,1.0,1.0,...,170,25,100.0,10.0,10.0,10.0,10.0,10.0,10.0,moderate
1,City Island,f,City Island,Bronx,10464,Apartment,Private room,4,1.0,1.0,...,180,0,93.004379,9.531624,9.19608,9.709184,9.738259,9.417991,9.338282,flexible
2,City Island,f,City Island,Bronx,10464,House,Entire home/apt,4,3.0,3.0,...,365,0,93.004379,9.531624,9.19608,9.709184,9.738259,9.417991,9.338282,strict
3,City Island,t,City Island,City Island,10464,Apartment,Entire home/apt,3,1.0,1.0,...,335,12,93.0,10.0,10.0,10.0,10.0,10.0,10.0,strict
4,City Island,t,City Island,Bronx,10464,House,Private room,4,1.0,1.0,...,352,86,97.0,10.0,10.0,10.0,10.0,10.0,10.0,moderate
5,Williamsburg,f,Williamsburg,Bronx,10464,House,Entire home/apt,2,1.0,0.0,...,129,41,97.0,10.0,10.0,10.0,10.0,10.0,10.0,moderate
6,City Island,t,City Island,City Island,10464,House,Entire home/apt,4,1.0,1.0,...,306,74,98.0,10.0,10.0,10.0,10.0,10.0,10.0,flexible
7,Allerton,t,Allerton,Bronx,10467,Apartment,Private room,3,1.0,1.0,...,306,114,90.0,9.0,9.0,9.0,9.0,9.0,9.0,strict
8,Baychester,t,Baychester,Bronx,10469,House,Entire home/apt,5,1.0,1.0,...,144,5,100.0,10.0,10.0,10.0,10.0,10.0,10.0,moderate
9,Baychester,t,Baychester,Bronx,10469,Apartment,Entire home/apt,8,1.0,1.0,...,106,206,92.0,9.0,9.0,10.0,10.0,9.0,9.0,strict


## Read Class Labels (Prices)

In [26]:
data_values = pd.read_csv('data/airbnb_class.csv')
prices = []
def removeDollar(col):
    for i in range(len(data_values[col])):
        price = data_values['price'][i]
        price = re.sub('[,$]', '', price)
        prices.append(float(price))
    print prices[0:3] , len(prices)

### Converting Price values to float [Class Labels]

In [27]:
print data_values[0:3]
removeDollar('price')

     price
0   $99.00
1  $200.00
2  $300.00
[99.0, 200.0, 300.0] 40753


### Convert Features into matrix

In [28]:
from collections import defaultdict
dd = defaultdict(list)
dict_data_features = data_features.to_dict('records')
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
data_features_array = vec.fit_transform(dict_data_features).toarray()
print data_features_array.shape

(40753, 1247)


### Linear Regression model

In [41]:
from sklearn import linear_model
reg = linear_model.LinearRegression()
start = datetime.datetime.now()
reg.fit(data_features_array, prices)
end = datetime.datetime.now()
print  "Time taken: ", end - start
# predict = reg.predict(data_features_array[32000:])
# print "MAE: ", median_absolute_error(prices[32000:], predict)

Time taken:  0:00:10.550760


### SGDRegressor

In [8]:
from sklearn.linear_model import SGDClassifier
start = datetime.datetime.now()
clf = SGDClassifier(loss="log", penalty="l1")
clf.fit(data_features_array[0:32000], prices[0:32000])
end = datetime.datetime.now()
print "Time taken: ", end - start
predict1 = clf.predict(data_features_array[32000:])
print "MAE: ", median_absolute_error(prices[32000:], predict1)

Time taken:  0:06:46.252325
MAE:  55.0


### Ridge Regressor

In [9]:
from sklearn import linear_model
start = datetime.datetime.now()
reg = linear_model.Ridge(alpha = 10000.0)
reg.fit (data_features_array[0:32000], prices[0:32000]) 
end = datetime.datetime.now()
print  "Time taken: ", end - start
predict1 = reg.predict(data_features_array[32000:])
print median_absolute_error(prices[32000:], predict1)

Time taken:  0:00:01.779617
33.6268552082


### Lasso Regression

In [44]:
from sklearn import linear_model
start = datetime.datetime.now()
reg = linear_model.LassoLars(alpha=0.06)
reg.fit (data_features_array, prices) 
end = datetime.datetime.now()
print  "Time taken: ", end - start
# predict1 = reg.predict(data_features_array[30000:])
# print "MAE: ", median_absolute_error(prices[30000:], predict1)

Time taken:  0:00:02.357835


### Bayesian Ridge

In [11]:
start = datetime.datetime.now()
reg = linear_model.BayesianRidge(alpha_1=0.01)
reg.fit (data_features_array[0:32000], prices[0:32000]) 
end = datetime.datetime.now()
print  "Time taken: ", end - start
predict1 = reg.predict(data_features_array[32000:])
print "MAE: ", median_absolute_error(prices[32000:], predict1)

Time taken:  0:00:19.999200
MAE:  37.2212375009


### Cross Validation

In [41]:
from sklearn.model_selection import train_test_split
start = datetime.datetime.now()
X_train, X_test, y_train, y_test = train_test_split(data_features_array, prices, test_size=0.2, random_state=1)
print X_train.shape, X_test.shape
reg = linear_model.LassoLars(alpha=0.06)
reg.fit (X_train, y_train) 
end = datetime.datetime.now()
print  "Time taken: ", end - start
predict1 = reg.predict(X_test)
print "MAE: ", median_absolute_error(y_test, predict1)

(32602, 1247) (8151, 1247)
Time taken:  0:00:03.436472
MAE:  32.9548312327


In [77]:
prices = np.asarray(prices)
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10)
lass = []
rid = []
avg = []
for train, test in skf.split(data_features_array, prices):
    X_train = data_features_array[train]
    X_test = data_features_array[test]
    y_train = prices[train]
    y_test = prices[test]
    start = datetime.datetime.now()
    reg = linear_model.LassoLars(alpha=0.06)
    reg.fit (X_train, y_train) 
    end = datetime.datetime.now()
#     print  "Time taken for lasso lars: ", end - start
    predict1 = reg.predict(X_test)
    lassoLars = median_absolute_error(y_test, predict1)
    lass.append(lassoLars)
    print "MAE for Lasso Lars: ", lassoLars
    start = datetime.datetime.now()
    reg = linear_model.Ridge(alpha = 10000.0)
    reg.fit (X_train, y_train) 
    end = datetime.datetime.now()
#     print  "Time taken for ridge regression: ", end - start
    predict2 = reg.predict(X_test)
    ridge = median_absolute_error(y_test, predict2)
    rid.append(ridge)
    print "MAE for ridge regression: ", median_absolute_error(y_test, predict2)
#     print "Difference: ", lassoLars - ridge
    print "Average MAE: ", median_absolute_error(y_test, ((predict1 + predict2)/2.0))
    avg.append(median_absolute_error(y_test, ((predict1 + predict2)/2.0)))
total = 0.00
for i in lass:
    total = total + float(i)
print total/len(lass)

total = 0.00
for i in rid:
    total = total + float(i)
print total/len(rid)

total = 0.00
for i in avg:
    total = total + float(i)
print total/len(rid)

MAE for Lasso Lars:  39.306808148
MAE for ridge regression:  36.544407858
Average MAE:  37.0204441904
MAE for Lasso Lars:  32.7895924448
MAE for ridge regression:  32.1564943616
Average MAE:  31.0625547037
MAE for Lasso Lars:  32.4217889081
MAE for ridge regression:  31.3945768599
Average MAE:  31.1728524602
MAE for Lasso Lars:  31.1751953029
MAE for ridge regression:  32.0858791629
Average MAE:  31.1940653946
MAE for Lasso Lars:  34.2776138501
MAE for ridge regression:  34.836324218
Average MAE:  33.7989505148
MAE for Lasso Lars:  35.5811504072
MAE for ridge regression:  34.3203256556
Average MAE:  34.5168072154
MAE for Lasso Lars:  38.8037584856
MAE for ridge regression:  38.4497025879
Average MAE:  38.5979451961
MAE for Lasso Lars:  30.8118269691
MAE for ridge regression:  33.7843448039
Average MAE:  31.7641073589
MAE for Lasso Lars:  37.1445133709
MAE for ridge regression:  37.6227261167
Average MAE:  36.8543284589
MAE for Lasso Lars:  28.6970984109
MAE for ridge regression:  30.18

In [81]:
prices = np.asarray(prices)
from sklearn.model_selection import ShuffleSplit
ss = ShuffleSplit(n_splits=10, test_size=0.15,random_state=0)
lass = []
rid = []
for train, test in ss.split(data_features_array, prices):
    X_train = data_features_array[train]
    X_test = data_features_array[test]
    y_train = prices[train]
    y_test = prices[test]
    start = datetime.datetime.now()
    reg = linear_model.LassoLars(alpha=0.06)
    reg.fit (X_train, y_train) 
    end = datetime.datetime.now()
#     print  "Time taken for lasso lars: ", end - start
    predict1 = reg.predict(X_test)
    lassoLars = median_absolute_error(y_test, predict1)
    lass.append(lassoLars)
#     print "MAE for Lasso Lars: ", lassoLars
    start = datetime.datetime.now()
    reg = linear_model.Ridge(alpha = 10000.0)
    reg.fit (X_train, y_train) 
    end = datetime.datetime.now()
#     print  "Time taken for ridge regression: ", end - start
    predict2 = reg.predict(X_test)
    ridge = median_absolute_error(y_test, predict2)
    rid.append(ridge)
#     print "MAE for ridge regression: ", median_absolute_error(y_test, predict1)
#     print "Difference: ", lassoLars - ridge
    print "Average MAE: ", median_absolute_error(y_test, ((predict1 + predict2)/2.0))
    avg.append(median_absolute_error(y_test, ((predict1 + predict2)/2.0)))
total = 0.00
for i in lass:
    total = total + float(i)
print total/len(lass)

total = 0.00
for i in rid:
    total = total + float(i)
print total/len(rid)

total = 0.00
for i in avg:
    total = total + float(i)
print total/len(avg)

Average MAE:  33.289631295
Average MAE:  32.5428065512
Average MAE:  32.4387827291
Average MAE:  32.8746086015
Average MAE:  31.6009678666
Average MAE:  32.5827044872
Average MAE:  32.0415968239
Average MAE:  32.5603808214
Average MAE:  32.9030823542
Average MAE:  32.8311087529
33.1608828322
33.2978034858
32.9011406905


In [79]:
total = 0.00
for i in avg:
    total = total + float(i)
print total/len(avg)

33.1411628079


In [47]:
predictValues = [{
  'accommodates': 2,
  'availability_30': 24,
  'availability_365': 170,
  'availability_60': 54,
  'availability_90': 80,
  'bathrooms': 1.0,
  'bed_type': 'Real Bed',
  'bedrooms': 1.0,
  'beds': 1.0,
  'cancellation_policy': 'moderate',
  'city': 'Bronx',
  'cleaning_fee': '$50.00',
  'extra_people': '$20.00',
  'guests_included': 1.0,
  'host_identity_verified': 't',
  'host_neighbourhood': 'City Island',
  'neighbourhood': 'City Island',
  'number_of_reviews': 25,
  'property_type': 'House',
  'review_scores_accuracy': 10.0,
  'review_scores_checkin': 10.0,
  'review_scores_cleanliness': 10.0,
  'review_scores_communication': 10.0,
  'review_scores_location': 10.0,
  'review_scores_rating': 100.0,
  'review_scores_value': 10.0,
  'room_type': 'Private room',
  'security_deposit': '$100.00',
  'square_feet': 718.1781305110001,
  'zipcode': '10464'
}]

In [48]:
hello = vec.transform(predictValues)
print "hello, array: ", hello.toarray()
super_predict = reg.predict(hello.toarray())
print "Predicted value is:-->", super_predict
price = str(super_predict[0])
print price

hello, array:  [[   2.   24.  170. ...,    0.    0.    0.]]
Predicted value is:--> [ 51.06176505]
51.0617650538


In [52]:
data_features['room_type'].unique

<bound method Series.unique of 0           Private room
1           Private room
2        Entire home/apt
3        Entire home/apt
4           Private room
5        Entire home/apt
6        Entire home/apt
7           Private room
8        Entire home/apt
9        Entire home/apt
10          Private room
11          Private room
12       Entire home/apt
13          Private room
14          Private room
15          Private room
16          Private room
17          Private room
18          Private room
19       Entire home/apt
20       Entire home/apt
21       Entire home/apt
22          Private room
23       Entire home/apt
24          Private room
25          Private room
26          Private room
27          Private room
28          Private room
29          Private room
              ...       
40723       Private room
40724    Entire home/apt
40725    Entire home/apt
40726    Entire home/apt
40727       Private room
40728    Entire home/apt
40729    Entire home/apt
40730    Entire hom