# 1. Listings information preprocessing

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, collections

## 1.1 Touch data

### Load listings data

In [59]:
# Read the raw listings export. The path is relative to the notebook's working
# directory (the folder name suggests a 2018-05-17 snapshot — confirm).
raw_listings = pd.read_csv('../data/2018_05_17/listings.csv')

### Show attributes

In [60]:
# Pair every column name with its position so columns can be selected by index.
attributes = list(enumerate(raw_listings.columns))
# To inspect the pairs: for pair in attributes: print(pair)

## 1.2 Clean listing information

### Pick listings information columns

In [61]:
# Positional indices (into raw_listings.columns) of the columns kept as listing
# information.
# NOTE(review): these numbers are tied to the column order of this particular
# listings.csv export; a different export may shift them — confirm before reuse.
list_info_indices = [0, 22, 39, 51, 52, 53, 54, 55, 56, 57, 58, 60, 67, 68, 76, 80, 91]

In [62]:
# Translate the chosen positions into column names, preserving file order.
list_info = [name for position, name in attributes if position in list_info_indices]
# To inspect the selection: for name in list_info: print(name)

In [63]:
# Work on an independent copy of the selected columns so the raw frame stays untouched.
listings = raw_listings.loc[:, list_info].copy()

### Prune the listings that lack significant listing information

In [64]:
# Strip everything except sign, digits and the decimal point from the price
# strings, then convert to float.
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern as a
# literal string by default, which would leave the currency formatting in place
# and make the float conversion fail.
listings['price'] = (
    listings['price']
    .str.replace(r'[^-+\d.]', '', regex=True)
    .astype(float)
)

In [65]:
# Columns that must be present (non-null) for a listing to be usable.
# 'bedrooms' replaces a duplicated 'bathrooms' entry: without it, rows with a
# missing bedroom count slipped through, because NaN != 0 evaluates to True in
# the zero filter below.
required_columns = ['id', 'property_type', 'neighbourhood_cleansed',
                    'bathrooms', 'bedrooms', 'beds', 'price']
# Columns where a value of exactly 0 disqualifies the listing.
nonzero_columns = ['beds', 'bedrooms', 'bathrooms', 'price', 'accommodates']

listings = listings.dropna(how='any', subset=required_columns)
# Keep only rows in which every one of the columns above is different from 0.
listings = listings[(listings[nonzero_columns] != 0).all(axis=1)]
# Later cells address rows by the labels 0..n-1, so rebuild a clean index.
listings = listings.reset_index(drop=True)

In [66]:
# Keep a handle on the `id` column of the listings that survived pruning.
# NOTE(review): not referenced again in the visible cells — presumably consumed
# elsewhere; confirm before removing.
id_left = listings.id

## 1.3 Extract features from listings information

### Encode categorical attributes

In [67]:
# Label listings without an accuracy score as 'No Review' so the score can be
# one-hot encoded together with the real score values.
# The column is cast to object first: writing a string into a float column relies
# on implicit upcasting, which pandas deprecates (PDEP-6), and doing it row by row
# through `.loc` is needlessly slow.
review_accuracy = listings['review_scores_accuracy'].astype(object)
review_accuracy[review_accuracy.isna()] = 'No Review'
listings['review_scores_accuracy'] = review_accuracy

In [68]:
# Columns that the encoding cell below replaces with one indicator column per
# distinct value.
categorical_attributes = ['neighbourhood_cleansed', 
                          'property_type',
                          'room_type',
                          'bed_type',
                          'review_scores_accuracy',
                          'cancellation_policy']

In [69]:
# One-hot encode each categorical column. The indicator columns are named after
# the raw values (no prefix) and are appended after the remaining columns, in the
# order given by `categorical_attributes`.
dummy_frames = [pd.get_dummies(listings[attr]) for attr in categorical_attributes]
listings = pd.concat(
    [listings.drop(categorical_attributes, axis=1)] + dummy_frames,
    axis=1,
)

### Encode non-categorical attributes

In [70]:
def standardize(col):
    """Rescale a numeric Series to zero mean and unit (population) variance.

    Parameters
    ----------
    col : pandas.Series
        Numeric values; missing entries are ignored when computing the
        statistics and stay missing in the result.

    Returns
    -------
    pandas.Series
        ``(col - mean) / std`` with the same index as ``col``. When the column
        has zero variance the centered values (all zeros) are returned instead
        of dividing by zero.
    """
    mean = col.mean()
    # ddof=0 -> population standard deviation, matching np.std's default.
    std = col.std(ddof=0)
    centered = col - mean
    if std == 0:
        # Constant column: dividing would produce NaN/inf for every row.
        return centered
    return centered / std

In [71]:
# Columns rescaled with `standardize` in the next cell. `host_since` is a string
# containing '-' separators (presumably an ISO date) and is converted to a number
# there before scaling.
noncategorical_attributes = ['host_since', 'accommodates', 
                             'bedrooms', 'beds', 'bathrooms', 'number_of_reviews',
                            'minimum_nights', 'maximum_nights']

In [72]:
# Standardize every numeric column in place.
# `host_since` has its '-' separators removed first, so the remaining digits are
# read as a single number before scaling.
# NOTE(review): if these are dates, the resulting YYYYMMDD numbers are not evenly
# spaced across month/year boundaries — confirm this encoding is intended.
for attr in noncategorical_attributes:
    values = listings[attr]
    if attr == 'host_since':
        values = values.str.replace(r'-', '')
    listings[attr] = standardize(values.astype(float))

## 1.4 Extract features from amenities

### Count amenity frequencies

In [73]:
# Number of most frequent amenities that get their own indicator column.
numa = 20

In [74]:
def reformat(col):
    """Parse brace-wrapped, comma-separated amenity strings into lists.

    Parameters
    ----------
    col : pandas.Series
        Strings such as ``'{TV,"Hot water"}'``; entries may be missing
        (None / NaN).

    Returns
    -------
    pandas.Series
        One list of amenity names per row. Surrounding braces and all double
        quotes are removed. An empty set (``'{}'``) or a missing entry yields
        an empty list, so the empty string is never reported as an amenity.
    """
    def _parse(cell):
        # Missing value (None or float NaN): no amenities.
        if cell is None or (isinstance(cell, float) and cell != cell):
            return []
        tokens = cell.strip('{}').replace('"', '').split(',')
        # ''.split(',') returns [''], so drop empty tokens explicitly.
        return [token for token in tokens if token]

    return col.apply(_parse)

In [75]:
def sort_by_value(d):
    """Return the keys of ``d`` ordered by value, largest first.

    Keys that share a value are ordered by the key itself, also descending.
    """
    ranked = sorted(d.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
    return [key for key, _ in ranked]

In [76]:
# Replace each raw amenities string with a list of amenity names.
# Not idempotent: once the column holds lists, `reformat` (which calls str.strip
# on every value) fails, so re-run from the cell that builds `listings`.
listings['amenities'] = reformat(listings['amenities'])

In [77]:
# Flatten every listing's amenity list into one long list (duplicates kept),
# in row order.
amenity_lists = [
    amenity
    for listing_amenities in listings['amenities']
    for amenity in listing_amenities
]

In [78]:
# Count how often each amenity occurs across all listings; the result stays a
# defaultdict(int) so unseen amenities read as 0.
frequency = collections.defaultdict(int, collections.Counter(amenity_lists))

In [79]:
# Keep the `numa` most frequent amenities (order and tie-breaking come from
# `sort_by_value`).
amenities_picked = sort_by_value(frequency)[0:numa]

In [80]:
# Integer indicator table: one row per listing, one column per picked amenity,
# initialised to 0.
new_cols = pd.DataFrame(
    np.zeros((len(listings), numa), dtype=int),
    columns=amenities_picked,
)

In [81]:
# Mark, per picked amenity, the listings whose amenity list contains it.
for amenity in amenities_picked:
    has_amenity = np.array(
        [amenity in listing_amenities for listing_amenities in listings['amenities']],
        dtype=bool,
    )
    # Increment (rather than assign) to mirror the counter semantics of the table.
    new_cols.loc[has_amenity, amenity] += 1

In [82]:
# Swap the list-valued `amenities` column for the indicator columns. The concat
# aligns on the row index; both frames are expected to carry the same 0..n-1
# index at this point.
listings = pd.concat((listings.drop('amenities', axis=1),new_cols), axis=1)

## 1.5 Save listings

In [83]:
# Preview only the first rows; displaying the whole frame bloats the notebook
# without adding information.
listings.head(10)

Unnamed: 0,id,host_since,accommodates,bathrooms,bedrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews,...,Carbon monoxide detector,TV,Laptop friendly workspace,Iron,Fire extinguisher,Family/kid friendly,First aid kit,Free parking on premises,Hot water,Self check-in
0,2318,-3.195666,1.905351,1.801389,2.668650,1.538311,296.0,0.111291,-0.252808,-0.346458,...,1,0,1,1,1,1,1,1,0,0
1,4291,-2.677602,-0.755337,-0.505077,-0.601433,-0.711196,82.0,-0.081809,-0.434162,0.333615,...,0,1,0,0,1,0,0,1,0,0
2,6606,-2.702076,-0.755337,-0.505077,-0.601433,-0.711196,90.0,-0.081809,0.370597,1.335829,...,1,1,1,1,1,0,0,1,0,0
3,9419,-2.682384,-0.755337,2.570211,-0.601433,-0.711196,70.0,-0.081809,-0.456831,1.514795,...,0,0,1,1,1,0,1,1,1,0
4,9460,-2.682332,-0.755337,-0.505077,-0.601433,-0.711196,80.0,0.014741,-0.457587,5.917376,...,1,1,1,1,1,1,1,1,1,0
5,9531,-2.682178,0.131559,-0.505077,0.488594,2.288147,165.0,0.014741,-0.203691,-0.024318,...,1,1,1,1,1,0,1,0,1,0
6,9534,-2.682178,-0.311889,-0.505077,0.488594,0.038640,125.0,-0.081809,-0.203691,-0.060111,...,1,1,1,1,1,0,1,1,1,0
7,9596,-2.702076,0.131559,-0.505077,-0.601433,3.037982,120.0,-0.081809,-0.434162,0.458892,...,1,1,1,1,1,1,0,1,0,0
8,9909,-2.681664,0.131559,-0.505077,0.488594,0.038640,125.0,-0.081809,-0.388823,0.226235,...,1,1,1,1,1,1,0,1,1,1
9,10209,-2.682178,-1.198785,0.263745,-0.601433,-0.711196,48.0,0.207841,0.072119,-0.006421,...,0,0,0,0,0,0,0,1,0,1


In [56]:
# Persist the engineered feature table.
# NOTE(review): with the default index=True, to_csv also writes the row index as
# an unnamed first column; pass index=False if downstream readers should not see
# it — confirm with the consumers of this file first.
listings.to_csv('../save/listings.csv')

## * Functions

In [None]:
def listings_preprocessing(path):
    """Unfinished stub, presumably meant to wrap this notebook's preprocessing.

    NOTE(review): the body only calls `load(path)`, and no `load` function is
    defined in the visible part of the notebook — calling this as-is would
    raise NameError unless `load` is provided elsewhere.
    """
    raw_listings = load(path)
    