In [60]:
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import matplotlib.pyplot as plt
from datetime import date
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
% matplotlib inline

In [40]:
WORD_PATTERN = re.compile("(^|\s+)([A-Za-z]+)")
STEMMER = SnowballStemmer("english")
LEMMER = WordNetLemmatizer()

#Combine two sets of stop words
STOPWORDS = stopwords.words('english')

#Define tokenizers that do stemming only, lemmatizing only, and both 
def tokenize_stem(s):
    return  [STEMMER.stem(match.group(2)) \
             for match in WORD_PATTERN.finditer(s) \
             if match.group(2) not in STOPWORDS and len(match.group(2)) >= 2]

def tokenize_lem(s):
    return [LEMMER.lemmatize(match.group(2)) \
            for match in WORD_PATTERN.finditer(s) \
            if match.group(2) not in STOPWORDS and len(match.group(2)) >= 2]

def tokenize_stem_lem(s):
    return [STEMMER.stem(LEMMER.lemmatize(match.group(2))) \
            for match in WORD_PATTERN.finditer(s) \
            if match.group(2) not in STOPWORDS and len(match.group(2)) >= 2]

In [27]:
def create_dataframe_webapp(zip_codes,
                            non_text_features,
                            additional_stop_words,
                            input_text,
                            zip_code,
                            home_size_sq_ft,
                            number_of_bedrooms,
                            number_of_bathrooms,
                            vectorizer):
    '''Prepare a pandas dataframe with the input information with the right format for inserting into
    a trained Random Forest model to predict the sale_price to askling_price ratio.
    input:
    zip_codes:  a LIST of zip codes that are in random forest model.
    non_text_features: a LIST of text of the names of the non-text features, e.g. 'home_size', 'number_of_bedrooms'.
    additional_stop_words:  a LIST of tokeized_expression that will be removed from the final input matrix.

    (The following pertains specific to the home whose price_ratio will be predicted )
    input_text: The agent's description of the home.
    zip_code:  The 5-digit zip_code of the home (text).
    home_size_sq_ft: home size in square feet (integer).
    number_of_bedrooms: number of bedrooms (float).
    number_of_bathrooms: number of bathrooms (float)

    vectorizer: vectorizer to convert input text of the agent's description to the vector of tekenized expressions.

    '''
    df_zip = pd.DataFrame(index = [1], columns = zip_codes)
    df_zip.fillna(0, inplace = True)

    X_stem_lem = vectorizer.transform([input_text])
    features = vectorizer.get_feature_names()
    df_text = pd.DataFrame(X_stem_lem.toarray(), index = [1], columns = features)
    df_text.drop(additional_stop_words, axis = 1, inplace = True)

    df_nontxt = pd.DataFrame(index = [1], columns = non_text_features)
    df_nontxt.set_value(1, non_text_features, [home_size_sq_ft, number_of_bedrooms, number_of_bathrooms])
    df_nontxt

    df = pd.concat([df_zip, df_text, df_nontxt],
                  axis = 1)

    return df

In [73]:
filepath = 'list_of_zip_codes'
zip_codes = pickle.load(open(filepath, 'rb'))

zip_code = input('Input the 5-digit zip code of the property:')
zip_code = str(zip_code)
number_of_bedrooms = input('Input the number of bedrooms of the property: ')
number_of_bathrooms = input('Input the number of bathrooms of the property: ')
home_size_sq_ft = input('Input the size of the home in square feet (interior): ')
print



Input the 5-digit zip code of the property:94110
Input the number of bedrooms of the property: 3
Input the number of bathrooms of the property: 3
Input the size of the home in square feet (interior): 1000



In [29]:
New almost everything. New roof, plumbing and electrical systems. 2nd floor has a spacious master suite with walk in closet and 2 other bedrooms, full bathroom, and  laundry closet. 1st floor has living room, dining room, powder room, and kitchen. A walk out rear yard makes this home great for BBQs and parties.  Large attic is accessed by pull-down ladder but see it and imagine what you could do with that space.

SyntaxError: invalid syntax (<ipython-input-29-9a230d3ec0fb>, line 1)

In [61]:
filepath = 'agent_desc_vectorizer.p'
vectorizer = pickle.load(open(filepath, 'rb'))


In [74]:
input_text = raw_input("Input the text of the agent's description")

Input the text of the agent's descriptionNew almost everything. New roof, plumbing and electrical systems. 2nd floor has a spacious master suite with walk in closet and 2 other bedrooms, full bathroom, and  laundry closet. 1st floor has living room, dining room, powder room, and kitchen. A walk out rear yard makes this home great for BBQs and parties.  Large attic is accessed by pull-down ladder but see it and imagine what you could do with that space.


In [63]:
'New almost everything. New roof, plumbing and electrical systems. 2nd floor has a spacious master suite with walk in closet and 2 other bedrooms, full bathroom, and  laundry closet. 1st floor has living room, dining room, powder room, and kitchen. A walk out rear yard makes this home great for BBQs and parties.  Large attic is accessed by pull-down ladder but see it and imagine what you could do with that space.'

TypeError: 'CountVectorizer' object is not callable

In [77]:
X_stem_lem = vectorizer.transform([input_text])

In [78]:
X_stem_lem.sum()

57

In [79]:
additional_stop_words  =  ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'offer',
                           'account', 'disclosur', 'due', 'date', 'sfar', 'broker tour', 'pm', 'offer review',
                           'tour', 'offer date', 'offer due', 'pre', 'accept', 'call', 'close', 'pleas call',
                           'noon', 'open', 'price', 'zestim', 'zestim accur', 'zestim forecast', u'zestim forecastcr',
                           'zestim home', 'zestim owner', 'zestim rent', 'zestim see', 'zestim valu','zestim zestim',
                           'zestim zillow', 'zillow', 'zillow estim', 'zillow valu'
                           ]

In [80]:
non_text_features = ['number_of_bedrooms', 'number_of_bathrooms', 'home_size']

In [82]:
df = create_dataframe_webapp(zip_codes,
                            non_text_features,
                            additional_stop_words,
                            input_text,
                            zip_code,
                            home_size_sq_ft,
                            number_of_bedrooms,
                            number_of_bathrooms,
                            vectorizer)

In [96]:
filepath = 'rf_zip_txt_nontxt_model.p'
rf_final_model_100_50_7 = pickle.load(open(filepath, 'rb'))

In [104]:

price_ratio = rf_final_model_100_50_7.predict(df.values)
print ('This model estimates the sale price will be ' + str(price_ratio[0])[0:6] + ' times the asking price.')

This model estimates the sale price will be 1.0646 times the asking price.


In [98]:
rf_final_model_100_50_7

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.8, max_leaf_nodes=None, min_samples_leaf=7,
           min_samples_split=50, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [107]:
sorted(zip(df.columns, rf_final_model_100_50_7.feature_importances_), key = lambda x: x[1], reverse = True)

[('home_size', 0.17497023638465645),
 ('number_of_bathrooms', 0.047250617733946287),
 ('94110', 0.023039243411272805),
 ('94019', 0.016786446473091118),
 ('94002', 0.010074988879216797),
 ('94070', 0.0096997097684996206),
 ('number_of_bedrooms', 0.0086631005570184955),
 (u'bedroom', 0.0083323646151604366),
 ('94131', 0.0080983525773195807),
 (u'home', 0.0079278044872418355),
 ('94114', 0.0078561415855693331),
 (u'court confirm', 0.0073669510060068895),
 (u'reduc', 0.006963523685857901),
 (u'review', 0.0065766185619264179),
 ('94020', 0.0058250278882577607),
 (u'escrow', 0.0053143283706041889),
 (u'room', 0.0051252016484333064),
 (u'owner', 0.0051207247372046013),
 (u'develop', 0.0049050926742888242),
 (u'bath', 0.0048857477095615208),
 (u'level lawn', 0.0048650235508313324),
 (u'dwood', 0.0047925699324357553),
 ('94116', 0.0046932389916191398),
 (u'hous', 0.0046260866527578015),
 (u'great', 0.0044476122514936871),
 (u'broker', 0.0043141618584317202),
 (u'view', 0.0040307448091175197),


In [108]:
df

Unnamed: 0,94002,94005,94010,94014,94015,94018,94019,94020,94021,94025,...,yard perfect,year,year old,year see,yet,yr,zone,number_of_bedrooms,number_of_bathrooms,home_size
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1000,3,3
