# 1. Listings information preprocessing

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, collections

## 1.1 Inspect the data

### Load listings data

In [31]:
# Read the full listings export into memory. low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk,
# which avoids mixed-type columns.
raw_listings = pd.read_csv('../data/2018_04_14/listings.csv', low_memory=False)

### Show attributes

In [32]:
# Pair every column name with its position: [(0, first_column), (1, second_column), ...].
attributes = list(enumerate(raw_listings.columns))
# To inspect the pairs: for entry in attributes: print(entry)

In [35]:
# NOTE(review): `list_info` is only assigned in the "Pick listings information
# columns" cell further down, so this display cell raises NameError on a fresh
# kernel (Restart & Run All) — TODO move it below that cell.
list_info

['id',
 'host_since',
 'neighbourhood_cleansed',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',
 'price',
 'minimum_nights',
 'maximum_nights',
 'number_of_reviews',
 'review_scores_accuracy',
 'cancellation_policy']

## 1.2 Clean listing information

### Pick listings information columns

In [36]:
# Columns of the raw listings table that are kept for feature extraction.
list_info = [
    # identification and host
    'id', 'host_since',
    # location and kind of accommodation
    'neighbourhood_cleansed', 'property_type', 'room_type',
    # capacity
    'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type',
    # offer details
    'amenities', 'price', 'minimum_nights', 'maximum_nights',
    # reviews and booking policy
    'number_of_reviews', 'review_scores_accuracy', 'cancellation_policy',
]

In [37]:
# Work on an independent copy of the selected columns so that later
# modifications never touch raw_listings.
listings = raw_listings[list_info].copy()

### Prune the listings that lack significant listing information

In [38]:
# Parse the textual price column into floats: remove every character that
# cannot belong to a number (anything but digits, sign and decimal point).
# regex=True is required explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would strip nothing and make the float
# conversion fail.
listings['price'] = (
    listings['price']
    .str.replace(r'[^-+\d.]', '', regex=True)
    .astype(float)
)

In [39]:
# Drop listings that miss any field the feature extraction depends on.
# 'bedrooms' has to be part of this subset: NaN != 0 evaluates to True, so
# listings with a missing bedroom count would otherwise pass the zero filter
# below and carry NaN into the standardized features.
required_columns = ['id', 'property_type', 'neighbourhood_cleansed',
                    'bathrooms', 'bedrooms', 'beds', 'price']
listings = listings.dropna(how='any', subset=required_columns)

# Discard listings that report zero for a quantity that must be positive.
for column in ['beds', 'bedrooms', 'bathrooms', 'price', 'accommodates']:
    listings = listings[listings[column] != 0]

# Renumber the surviving rows 0..n-1; later cells address rows by these labels.
listings = listings.reset_index(drop=True)

In [40]:
# Ids of the listings that survived pruning (not used again in this section;
# presumably consumed by a later step — TODO confirm).
id_left = listings['id']

## 1.3 Extract features from listings information

### Encode categorical attributes

In [41]:
# Listings without a review score hold NaN here. Give them an explicit
# 'No Review' label so they form their own category when the column is
# one-hot encoded. The column is cast to object first so that mixing numeric
# scores with a string label is explicit rather than an implicit upcast, and
# the fill is done in one vectorized step instead of a per-row .loc loop
# (which also depended on the index labels being exactly 0..n-1).
listings['review_scores_accuracy'] = (
    listings['review_scores_accuracy'].astype(object).fillna('No Review')
)

In [None]:
# Columns whose values are discrete labels; each one is expanded into
# indicator columns by the encoding loop that follows.
categorical_attributes = [
    'neighbourhood_cleansed',
    'property_type', 'room_type', 'bed_type',
    'review_scores_accuracy',
    'cancellation_policy',
]

In [69]:
# One-hot encode each categorical column: remove the original column and
# append one indicator column per distinct value (named after the value).
for column_name in categorical_attributes:
    indicators = pd.get_dummies(listings[column_name])
    remaining = listings.drop(columns=[column_name])
    listings = pd.concat([remaining, indicators], axis=1)

### Encode non-categorical attributes

In [70]:
def standardize(col):
    """Rescale a numeric Series to zero mean and unit standard deviation.

    Parameters
    ----------
    col : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(col - mean) / std`` with the same index as ``col``, where the
        statistics come from ``np.mean`` and ``np.std`` (population standard
        deviation, ddof=0). If the standard deviation is exactly zero the
        values are only centred, so a constant column becomes all zeros
        instead of NaN/inf from a division by zero.
    """
    mean = np.mean(col)
    std = np.std(col)
    if std == 0:
        # Constant column: nothing to scale, and dividing would yield NaN.
        return col - mean
    # Vectorized arithmetic replaces the per-element apply.
    return (col - mean) / std

In [71]:
# Numerical columns that are standardized rather than one-hot encoded.
noncategorical_attributes = [
    'host_since',
    'accommodates',
    'bedrooms',
    'beds',
    'bathrooms',
    'number_of_reviews',
    'minimum_nights',
    'maximum_nights',
]

In [72]:
# Put every numerical attribute on a zero-mean, unit-variance scale.
for attr in noncategorical_attributes:
    if attr == 'host_since':
        # Express the date as days elapsed since the earliest host date so the
        # feature is linear in time. Reading the dash-stripped text as the
        # number YYYYMMDD is not: Dec 31 -> Jan 1 would be a jump of 8870
        # while Jan 1 -> Jan 2 is a jump of 1. The reference date itself does
        # not matter because standardization removes any constant offset.
        host_since = pd.to_datetime(listings[attr])
        numeric = (host_since - host_since.min()).dt.days.astype(float)
    else:
        numeric = listings[attr].astype(float)
    listings[attr] = standardize(numeric)

## 1.4 Extract features from amenities

### Count amenity frequencies

In [73]:
# Number of most frequent amenities that are kept as indicator features.
numa = 20

In [74]:
def reformat(col):
    """Convert raw amenity strings into lists of amenity names.

    Each entry is expected to look like ``{TV,"Cable TV",Wifi}``: the
    surrounding braces and all double quotes are removed and the rest is split
    on commas.

    Parameters
    ----------
    col : pandas.Series
        Raw amenity strings.

    Returns
    -------
    pandas.Series
        One list of amenity names per entry. An empty set (``{}``) or a
        non-string entry (e.g. a missing value) yields an empty list, so no
        empty-string "amenity" is produced.
    """
    def parse(raw):
        if not isinstance(raw, str):
            # Missing value: treat as "no amenities" instead of failing on .strip.
            return []
        names = raw.strip('{}').replace('"', '').split(',')
        # ''.split(',') returns [''], so discard empty names.
        return [name for name in names if name]

    return col.apply(parse)

In [75]:
def sort_by_value(d):
    """Return the keys of ``d`` ordered by value, largest first.

    Keys that share the same value are ordered among themselves from largest
    to smallest key.
    """
    return sorted(d, key=lambda key: (d[key], key), reverse=True)

In [76]:
# Replace each raw amenities string with the list of amenity names it contains.
listings['amenities'] = reformat(listings['amenities'])

In [77]:
# Flatten the per-listing amenity lists into one long list of names.
# Iterating over the column directly avoids a scalar .loc lookup per row and
# does not depend on the index labels being exactly 0..n-1.
amenity_lists = [
    amenity
    for listing_amenities in listings['amenities']
    for amenity in listing_amenities
]

In [78]:
# Tally how many times each amenity name occurs across all listings.
frequency = collections.defaultdict(int)
for name in amenity_lists:
    frequency[name] = frequency[name] + 1

In [79]:
# Keep the numa most frequent amenity names (fewer if there are not that many).
amenities_picked = sort_by_value(frequency)[0:numa]

In [80]:
# All-zero integer table with one row per listing and one column per selected
# amenity. The width follows amenities_picked itself, so the frame is still
# well-formed when fewer than numa distinct amenities exist, and sharing the
# listings index keeps the rows aligned for the later column-wise concat.
new_cols = pd.DataFrame(0, index=listings.index, columns=amenities_picked, dtype=int)

In [81]:
# Mark, for every selected amenity, which listings offer it (1) or not (0).
# Each column is built in one pass and assigned outright, which avoids one
# scalar .iloc write per cell, stays correct if this cell is re-run (the
# previous `+= 1` would count twice), and iterates over amenities_picked
# directly so it cannot index past its end.
for amenity in amenities_picked:
    new_cols[amenity] = [
        int(amenity in listing_amenities)
        for listing_amenities in listings['amenities']
    ]

In [82]:
# Swap the raw amenity lists for the per-amenity indicator columns.
listings = pd.concat(
    [listings.drop(columns=['amenities']), new_cols],
    axis=1,
)

## 1.5 Save listings

In [56]:
# Persist the feature table. With to_csv's defaults the row index is written
# as the first, unnamed column.
listings.to_csv('../save/listings.csv')