In [46]:
import pandas as pd
import numpy as np
dc_listings = pd.read_csv("dc_airbnb.csv")
stripped_commas = dc_listings['price'].str.replace(',', '')
stripped_dollars = stripped_commas.str.replace('$', '')
dc_listings['price'] = stripped_dollars.astype('float')
train_df = dc_listings.iloc[0:2792]
test_df = dc_listings.iloc[2792:]

def predict_price(new_listing):
    temp_df = train_df.copy()
    temp_df['distance'] = temp_df['accommodates'].apply(lambda x: np.abs(x - new_listing))
    temp_df = temp_df.sort_values('distance')
    nearest_neighbor_prices = temp_df.iloc[0:5]['price']
    predicted_price = nearest_neighbor_prices.mean()
    return(predicted_price)

test_df['predicted_price'] = test_df['accommodates'].apply(predict_price)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [47]:
MAE = np.absolute(test_df["predicted_price"] -  test_df["price"]).mean()
print("MAE: ", MAE)

MAE:  56.29001074113876


In [48]:
MSE = sum((test_df["price"] - test_df["predicted_price"])**2)/test_df.shape[0]
print("MSE: ", MSE)

MSE:  18646.525370569325


**bathroom**

In [49]:
train_df = dc_listings.iloc[0:2792]
test_df = dc_listings.iloc[2792:]

def predict_price(new_listing):
    temp_df = train_df.copy()
    temp_df['distance'] = temp_df['bathrooms'].apply(lambda x: np.abs(x - new_listing))
    temp_df = temp_df.sort_values('distance')
    nearest_neighbors_prices = temp_df.iloc[0:5]['price']
    predicted_price = nearest_neighbors_prices.mean()
    return(predicted_price)

## Multivariate KNN

In [57]:
import pandas as pd
import numpy as np
np.random.seed(1)

dc_listings = pd.read_csv('dc_airbnb.csv')
dc_listings = dc_listings.loc[np.random.permutation(len(dc_listings))]
stripped_commas = dc_listings['price'].str.replace(',', '')
stripped_dollars = stripped_commas.str.replace('$', '')
dc_listings['price'] = stripped_dollars.astype('float')

**Column values**

In [58]:
dc_listings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3723 entries, 574 to 1061
Data columns (total 19 columns):
host_response_rate      3289 non-null object
host_acceptance_rate    3109 non-null object
host_listings_count     3723 non-null int64
accommodates            3723 non-null int64
room_type               3723 non-null object
bedrooms                3702 non-null float64
bathrooms               3696 non-null float64
beds                    3712 non-null float64
price                   3723 non-null float64
cleaning_fee            2335 non-null object
security_deposit        1426 non-null object
minimum_nights          3723 non-null int64
maximum_nights          3723 non-null int64
number_of_reviews       3723 non-null int64
latitude                3723 non-null float64
longitude               3723 non-null float64
city                    3723 non-null object
zipcode                 3714 non-null object
state                   3723 non-null object
dtypes: float64(6), int64(5), objec

**Erasing non ordinal and nun-numerical columns**

In [59]:
print("Shape before: ", dc_listings.shape)
values = ["room_type", "city", "state", "latitude", "longitude", "zipcode", "host_response_rate", "host_acceptance_rate", "host_listings_count"]
dc_listings = dc_listings.drop(values, axis=1)
print("Shape after: ", dc_listings.shape)

Shape before:  (3723, 19)
Shape after:  (3723, 10)


* "cleaning_fee" and "security_deposit" have many null values
* drop remaining null values

In [60]:
high_na = ["cleaning_fee", "security_deposit"]
dc_listings = dc_listings.drop(high_na, axis=1)

**Normalize Listing**

In [61]:
normalized_listings = (dc_listings - dc_listings.mean())/dc_listings.std()
normalized_listings["price"] = dc_listings["price"]
normalized_listings.head(3)

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
574,-0.593875,-0.250231,-0.437816,-0.544209,125.0,-0.345048,-0.016488,4.509719
1593,-0.593875,-0.250231,0.416099,-0.544209,85.0,-0.345048,-0.016487,1.136535
3091,-1.090839,-0.250231,-1.291731,-0.544209,50.0,-0.345048,-0.016456,-0.482593


**Euclidean Distance**

* Finding the Euclidean distance of *"accomodates"* and *"bathroom"* columns between rows 1 and 5 from the normalized listings dataset

In [70]:
from scipy.spatial import distance
listing_first = normalized_listings[["accommodates", "bathrooms"]].iloc[0]
listing_fifth = normalized_listings[["accommodates", "bathrooms"]].iloc[4]
first_fifth_distance = distance.euclidean(listing_first, listing_fifth)
print(first_fifth_distance)

5.254907944983598


In [2]:
list(range(1,21))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [3]:
[x for x in range(1,21)]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [6]:
numbers= [26926.755403868032,
 20520.37087599545,
 16105.879155606117,
 15341.923065984072,
 14790.314266211606,
 15067.653836430289,
 15550.246035615612,
 15469.115152161547,
 15309.432885293332,
 15215.682081911264,
 16786.523133914387,
 16418.107010807737,
 16329.89425853747,
 16433.576849852572,
 16762.924095563143,
 16754.500071103528,
 16914.910727430903,
 16930.557086475932,
 17675.784103693757,
 17602.475116609785]

In [13]:
index = numbers.index(min(numbers))
value = min(numbers)

In [14]:
index

4

In [15]:
value

14790.314266211606

In [18]:
dict({index: value})

{4: 14790.314266211606}