In [93]:
import pandas as pd
from pymongo import MongoClient


# Clean up categorical columns
def clean_categorical_columns(inpt):
    """Normalise a raw categorical / multi-label string.

    Wrapper characters (``{``, ``}``, double and single quotes) are removed,
    the two-character sequence backslash-slash and every ``-`` are replaced
    by ``_``, the value is split on ``,``, each label is stripped of
    surrounding whitespace, remaining spaces become ``_``, and the labels
    are re-joined with ``,`` and lower-cased.

    Args:
        inpt: Raw string value, or a falsy value (``None``, ``""``) when the
            field is missing.

    Returns:
        The normalised comma-separated string, or an empty list when ``inpt``
        is falsy (kept as-is so existing callers see the same sentinel).
    """
    if not inpt:
        return []

    # Remove wrapper and quote characters, in the same order as before.
    for junk in ("{", "}", "\"", "'"):
        inpt = inpt.replace(junk, "")

    # "\\/" is the explicit spelling of backslash + slash; the previous "\/"
    # relied on an invalid escape sequence that newer Pythons warn about.
    inpt = inpt.replace("\\/", "_")
    inpt = inpt.replace("-", "_")

    # strip() rather than lstrip(): a trailing space would otherwise be
    # turned into a trailing "_" and produce a distinct, spurious category.
    labels = [label.strip().replace(" ", "_") for label in inpt.split(",")]

    return ",".join(labels).lower()


# One-hot encode categorical columns
def one_hot_encode(dataset, id_col, val_col, prfx, multi_label_splt=','):
    """One-hot encode a (possibly multi-label) string column.

    Every value of ``val_col`` is split on ``multi_label_splt``; each distinct
    label becomes an indicator column named ``<prfx>_<label>``.

    Args:
        dataset: DataFrame containing ``id_col`` and ``val_col``.
        id_col: Name of the column identifying each row.
        val_col: Name of the string column holding the label(s).
        prfx: Prefix for the generated indicator column names.
        multi_label_splt: Separator between labels inside one value.

    Returns:
        DataFrame indexed by ``id_col`` with one indicator column per label,
        plus ``id_col`` repeated as a regular column. Ids whose value yields
        no labels (missing or non-string) do not appear in the result.
    """
    # Use the frame that was passed in; the previous body read the global
    # `data`, so the `dataset` argument was silently ignored.
    labels = (
        dataset.set_index(id_col)[val_col]
        .str.split(multi_label_splt, expand=True)
        .stack()
        # Explicitly discard the padding produced by expand=True so the result
        # does not depend on stack()'s version-specific NaN handling.
        .dropna()
    )
    # Level 0 of the stacked index is id_col; max() collapses the one-row-per-
    # label indicators into a single row per id.
    cleaned = pd.get_dummies(labels, prefix=prfx).groupby(level=0).max()
    cleaned[id_col] = cleaned.index
    return cleaned

# We do not need these columns as features, at least initially
omit = [
    'scrape_id', 'host_picture_url', 'host_thumbnail_url', 'host_url',
    'listing_url', 'medium_url', 'picture_url', 'thumbnail_url', 'xl_picture_url',
    'state', 'city', 'smart_location', 'country', 'country_code', 'jurisdiction_names',
]

# We need to one_hot encode categorical columns
cat_columns = [
    'amenities', 'bed_type', 'room_type', 'experiences_offered',
    'zipcode', 'host_verifications', 'neighbourhood_cleansed',
    # Was 'calendar_las_updated', which matches no column in the listings data
    # and only produced an empty placeholder column. The column list printed
    # further down contains 'calendar_updated' -- NOTE(review): confirm this is
    # the intended field.
    'calendar_updated',
]


# Open the raw listings collection on the local MongoDB instance.
client = MongoClient('localhost', 27017)
db = client['airbnb']
coll = db['listings']
cursor = coll.find({})

# Turn every listing document into one cleaned record.
samples = []
for sample in cursor:
    # Discard fields that are not used as features.
    for unused_key in omit:
        sample.pop(unused_key, None)

    # Swap each categorical value for its normalised form; a missing field
    # is passed to the cleaner as None.
    for cat_key in cat_columns:
        sample[cat_key] = clean_categorical_columns(sample.pop(cat_key, None))

    samples.append(sample)

data = pd.DataFrame(samples)
# print data.sample(10)
# print len(data.columns)

In [94]:
# Show one randomly chosen row as a quick look at the cleaned frame.
data.sample(1)

Unnamed: 0,_id,access,accommodates,amenities,availability_30,availability_365,availability_60,availability_90,bathrooms,bed_type,...,reviews_per_month,room_type,security_deposit,space,square_feet,street,summary,transit,weekly_price,zipcode
117135,5b23318fa38514595340bb6a,Guests will also enjoy access to the 12th Floo...,4,"tv,cable_tv,internet,wireless_internet,air_con...",0.0,0.0,0.0,0.0,2.5,real_bed,...,0.11,entire_home/apt,,Guests may take advantage of the many services...,,"Financial District, San Francisco, CA 94104, U...",I am renting out a 2 bedroom luxury condo at t...,"During your stay, you'll be within walking dis...",,94104


In [95]:
# For every categorical column, build its indicator columns and attach them
# to `data`, matching rows on the listing `id`.
for cc in cat_columns:
    cc_encoded = one_hot_encode(
        dataset=data,
        id_col='id',
        val_col=cc,
        prfx=cc,
        multi_label_splt=',',
    )
    data = data.join(other=cc_encoded.set_index('id'), on='id')

In [96]:
# Inspect the column index after the indicator columns have been joined in.
data.columns

Index([                        u'_id',                      u'access',
                      u'accommodates',                   u'amenities',
                   u'availability_30',            u'availability_365',
                   u'availability_60',             u'availability_90',
                         u'bathrooms',                    u'bed_type',
       ...
       u'amenities_wireless_internet',             u'bed_type_airbed',
                    u'bed_type_couch',              u'bed_type_futon',
            u'bed_type_pull_out_sofa',           u'bed_type_real_bed',
         u'room_type_entire_home/apt',      u'room_type_private_room',
             u'room_type_shared_room',    u'experiences_offered_none'],
      dtype='object', length=286)

In [97]:
# Print one column name per line; the Index repr above elides the middle.
# print(c) with a single argument behaves the same on Python 2 and Python 3,
# whereas the `print c` statement is a SyntaxError on Python 3.
for c in data.columns:
    print(c)

_id
access
accommodates
amenities
availability_30
availability_365
availability_60
availability_90
bathrooms
bed_type
bedrooms
beds
calculated_host_listings_count
calendar_last_scraped
calendar_updated
cancellation_policy
cleaning_fee
description
experiences_offered
extra_people
first_review
guests_included
has_availability
host_about
host_acceptance_rate
host_has_profile_pic
host_id
host_identity_verified
host_is_superhost
host_listings_count
host_location
host_name
host_neighbourhood
host_response_rate
host_response_time
host_since
host_total_listings_count
host_verifications
house_rules
id
instant_bookable
interaction
is_business_travel_ready
is_location_exact
last_review
last_scraped
latitude
license
longitude
market
maximum_nights
minimum_nights
monthly_price
name
neighborhood_overview
neighbourhood
neighbourhood_cleansed
neighbourhood_group_cleansed
notes
number_of_reviews
price
property_type
require_guest_phone_verification
require_guest_profile_picture
requires_license
review_s

In [99]:
# Connect to the local MongoDB again and point `coll` at the collection
# named 'review_features'.
client = MongoClient(host='localhost', port=27017)
db = client['airbnb']
coll = db['review_features']

In [120]:
# Remove the MongoDB `_id` field from the feature frame. errors='ignore'
# keeps the cell re-runnable: without it a second execution raises KeyError
# because the column is already gone.
data = data.drop('_id', axis=1, errors='ignore')

In [121]:
# Drop the original categorical columns now that `data` carries their
# indicator columns. errors='ignore' makes the cell safe to re-run and
# tolerant of a listed name that is not present in the frame.
data = data.drop(cat_columns, axis=1, errors='ignore')

In [1]:
# Depends on `data` built by the cells above; run on a fresh kernel (execution
# count 1) it fails with the NameError recorded below.
data.columns

NameError: name 'data' is not defined

In [123]:
# Persist the feature frame as UTF-8 CSV under ./data (relative to the
# notebook's working directory).
data.to_csv('./data/review_features.csv', encoding='utf-8')

In [125]:
# payload = json.loads(data.to_json(orient='records', force_ascii=False))
# coll.insert_many(payload)