In [2]:
import re
import pandas as pd
from pymongo import MongoClient


# Clean up categorical columns
def clean_list_columns(inpt, regex_pat=r'[a-zA-Z]'):
    """Normalise a comma-separated list value into lowercase tokens.

    The value is coerced to the interpreter's native ``str`` type, split on
    commas, and each piece is reduced to the characters matched by
    ``regex_pat`` (ASCII letters by default) and lowercased.  The cleaned
    pieces are joined back together with commas.

    Args:
        inpt: Raw cell value.  Text is used as-is, other objects are
            converted with ``str()``.
        regex_pat: Pattern (string or compiled) selecting the characters to
            keep from every piece.

    Returns:
        The cleaned, comma-joined string.  A falsy ``inpt`` (``None``,
        ``''``, ``0``, ...) returns an empty list ``[]`` instead; that
        historical return value is preserved for existing callers.

    Note:
        Pieces without any matching character become empty strings and stay
        in the result, so ``"1,2"`` yields ``","`` with the default pattern.
    """
    if not inpt:
        return []

    # Coerce to the native ``str`` without referencing ``basestring``, which
    # does not exist on Python 3.  ``type(u'')`` is ``unicode`` on Python 2
    # and ``str`` on Python 3.
    if isinstance(inpt, type(u'')) and not isinstance(inpt, str):
        # Python 2 ``unicode`` -> UTF-8 encoded byte string.
        inpt = inpt.encode('utf-8')
    elif isinstance(inpt, bytes) and not isinstance(inpt, str):
        # Python 3 ``bytes`` -> text.
        inpt = inpt.decode('utf-8')
    elif not isinstance(inpt, str):
        inpt = str(inpt)

    # Compile once instead of looking the pattern up for every piece.
    pattern = re.compile(regex_pat)
    cleaned = [''.join(pattern.findall(piece)).lower()
               for piece in inpt.split(",")]

    # Every piece is already lowercase, so no further lower() is needed.
    return ",".join(cleaned)


def clean_numeric_columns(inpt, rexp=re.compile(r'\d+\.\d+')):
    """Extract the first number matched by ``rexp`` from a raw value.

    The value is coerced to the interpreter's native ``str`` type and
    searched with ``rexp``; the first match is converted with ``float()``.

    Args:
        inpt: Raw cell value.  Text is used as-is, other objects are
            converted with ``str()``.
        rexp: Compiled pattern used to locate the number.  The default only
            matches digits on both sides of a decimal point, so text such
            as ``"3"`` produces no match.

    Returns:
        The first match as a ``float``, or ``None`` when the value is
        missing (``None`` or an empty non-numeric value) or nothing matches.
    """
    if inpt is None:
        return None

    # Zero is a legitimate measurement, not a missing value, so numeric
    # inputs skip the emptiness check (``not 0.0`` would otherwise be True).
    if not isinstance(inpt, (int, float)) and not inpt:
        return None

    # Coerce to the native ``str`` without referencing ``basestring``, which
    # does not exist on Python 3.  ``type(u'')`` is ``unicode`` on Python 2
    # and ``str`` on Python 3.
    if isinstance(inpt, type(u'')) and not isinstance(inpt, str):
        # Python 2 ``unicode`` -> UTF-8 encoded byte string.
        inpt = inpt.encode('utf-8')
    elif isinstance(inpt, bytes) and not isinstance(inpt, str):
        # Python 3 ``bytes`` -> text.
        inpt = inpt.decode('utf-8')
    elif not isinstance(inpt, str):
        inpt = str(inpt)

    matches = rexp.findall(inpt)
    if not matches:
        return None

    return float(matches[0])

In [10]:
# Columns that are not needed as features, at least initially.
omit_original = [
    'Unnamed: 0', 'access', 'calendar_las_updated',
    'calendar_last_scraped', 'calendar_updated', 'city',
    'country', 'country_code', 'description', 'first_review',
    'host_about', 'host_location', 'host_name', 'host_picture_url',
    'host_thumbnail_url', 'host_url', 'host_verifications',
    'house_rules', 'interaction', 'jurisdiction_names',
    'last_review', 'latitude', 'license',
    'listing_url', 'longitude', 'market', 'medium_url',
    'neighborhood_overview', 'neighbourhood',
    'neighbourhood_group_cleansed', 'notes', 'picture_url',
    'review_scores_accuracy', 'review_scores_checkin',
    'review_scores_cleanliness', 'review_scores_communication',
    'review_scores_location', 'scrape_id', 'smart_location',
    'space', 'state', 'street', 'summary', 'thumbnail_url',
    'transit', 'xl_picture_url', 'zipcode',
]

# Columns this notebook keeps.  'review_scores_rating' is not part of
# omit_original, so listing it here has no effect on `omit`.
text_columns = [
    'description', 'host_about', 'house_rules',
    'neighborhood_overview', 'notes', 'summary',
    'review_scores_rating',
]

# Final drop list: everything in omit_original except the kept columns.
omit = [column for column in omit_original if column not in text_columns]

# Column groups for other feature types, currently disabled.
# Categorical columns would need one-hot encoding.
# cat_columns = ['bed_type', 'room_type',
#                'neighbourhood_cleansed',
#                'property_type', 'cancellation_policy',
#                'host_response_time']

# list_columns = ['amenities',]

# numeric_columns = ['bathrooms', 'bedrooms', 'beds', 'cleaning_fee',
#                    'guests_included',
#                    'host_listings_count',
#                    'host_total_listings_count']

# percent_columns = ['host_acceptance_rate', 'host_response_rate']

# date_columns = ['host_since', 'last_scraped']

# bool_columns = ['host_has_profile_pic', 'host_identity_verified']

In [8]:
# Load every listing document from the local MongoDB instance.
# The whole collection ends up in memory; this should be batch processed.
client = MongoClient('localhost', 27017)
db = client['airbnb']
coll = db['listings']
cursor = coll.find({})


def _without_omitted(document):
    """Remove every field named in `omit` from `document` and return it."""
    for field in omit:
        # pop() with a default ignores fields the document does not have.
        document.pop(field, None)
    return document


data = pd.DataFrame([_without_omitted(sample) for sample in cursor])

In [13]:
# Select the columns named in text_columns into their own frame;
# .copy() keeps it independent of `data`.
text_cols_df = data[text_columns].copy()

text_cols_df

Unnamed: 0,description,host_about,house_rules,neighborhood_overview,notes,summary,review_scores_rating
0,Three-bedroom in exclusive Seacliff neighborho...,"We are a family of four, all avid travelers. ...",,,,Three-bedroom in exclusive Seacliff neighborho...,
1,"Charming, private, peaceful hideaway 2 blocks ...","I LOVE San Francisco!! I mean, I am IN LOVE wi...",Absolutely NO SMOKING or drugs on property - p...,I'm excited for you to discover why I love liv...,"The studio is not equipped for cooking, nor is...","Charming, private, peaceful hideaway 2 blocks ...",98.0
2,"My place is close to Golden Gate Bridge, Park ...",,,,,"My place is close to Golden Gate Bridge, Park ...",100.0
3,"Cozy, sunny single occupancy bedroom in a spac...",,,,,"Cozy, sunny single occupancy bedroom in a spac...",92.0
4,"My place is close to Golden Gate Bridge, Lands...",Retired Accounting Professional. Down to earth...,"- 1, Front door of the house need to be lock a...",This is a Muti culture neighborhood that inclu...,"1, Since your room is on the top floor of the ...","My place is close to Golden Gate Bridge, Lands...",
5,Welcome to our cozy apartment located in the h...,,"No shoes inside, no loud music after 10pm out ...",,,Welcome to our cozy apartment located in the h...,100.0
6,"Quite house, clean room, nice roommates.",Hello!\r\n\r\nI am currently in grad school pu...,,,,"Quite house, clean room, nice roommates.",
7,Newly built cottage that comfortably sleeps 2....,San Francisco natives who own this home. Alexi...,Please treat house respectfully and keep in mi...,Quiet neighborhood that is half a block from ...,"there is great surfing, hiking, and golf cours...",Newly built cottage that comfortably sleeps 2....,92.0
8,My two story house is located in the quite sid...,Retired Accounting Professional. Down to earth...,"1. No pets, no smoking inside the house. 2. F...",It is in the city and close to everything. Par...,The summer in San Francisco is cooler than any...,My two story house is located in the quite sid...,90.0
9,Welcome to one of the coziest and charming hom...,"Teresa is passionate about leadership, kaizen ...",,Three blocks away on Clement Street are plenty...,,Welcome to one of the coziest and charming hom...,


In [16]:
# Column used as the prediction target.
dependent_variable = 'review_scores_rating'

# Rows without a target value cannot be used, so drop them.
required_columns = [dependent_variable]
text_cols_df = text_cols_df.dropna(subset=required_columns)

In [17]:
# (rows, columns) left after dropping rows without a target value.
# The parenthesised form prints the same on Python 2 and also runs on Python 3.
print(text_cols_df.shape)

(190004, 7)


In [22]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state makes the
# split repeatable.
Train, Test = train_test_split(text_cols_df,
                               test_size=0.33,
                               random_state=1234)

# Feature column names: every column except the target.
train_colnames = [c for c in Train.columns if c != dependent_variable]
test_colnames = [c for c in Test.columns if c != dependent_variable]

# The parenthesised form prints the same on Python 2 and also runs on Python 3.
print(Train.shape)
print(Test.shape)

(127302, 7)
(62702, 7)


In [23]:
def _split_features_target(frame, target, feature_names):
    """Split `frame` into a feature frame and a single-column target frame.

    Both results are rebuilt from raw `.values`, so the row index of
    `frame` is not carried over.
    """
    target_frame = pd.DataFrame(frame[target].values, columns=[target])
    feature_values = frame.drop([target], axis=1).values
    feature_frame = pd.DataFrame(feature_values, columns=feature_names)
    return feature_frame, target_frame


X_train, y_train = _split_features_target(Train, dependent_variable,
                                          train_colnames)
X_test, y_test = _split_features_target(Test, dependent_variable,
                                        test_colnames)

In [None]:
# Write the processed train and test frames to CSV.
# NOTE(review): the X (feature) frames go to the '*_deps' files and the
# y (target) frames to the '*_indeps' files, and the prefix says
# 'categorical' although these frames hold the text columns -- confirm the
# file names are intended before running this cell.
csv_outputs = [
    (X_train, './data/categorical_train_deps.csv'),
    (y_train, './data/categorical_train_indeps.csv'),
    (X_test, './data/categorical_test_deps.csv'),
    (y_test, './data/categorical_test_indeps.csv'),
]
for frame, path in csv_outputs:
    frame.to_csv(path, encoding='utf8')