In [1]:
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import matplotlib.pyplot as plt
from datetime import date
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import gzip
% matplotlib inline




In [2]:
def zip(*iterables):
    # zip('ABCD', 'xy') --> Ax By
    sentinel = object()
    iterators = [iter(it) for it in iterables]
    while iterators:
        result = []
        for it in iterators:
            elem = next(it, sentinel)
            if elem is sentinel:
                return
            result.append(elem)
        yield tuple(result)

In [3]:
WORD_PATTERN = re.compile("(^|\s+)([A-Za-z]+)")
STEMMER = SnowballStemmer("english")
LEMMER = WordNetLemmatizer()

#Combine two sets of stop words
STOPWORDS = stopwords.words('english')

#Define tokenizers that do stemming only, lemmatizing only, and both 
def tokenize_stem(s):
    return  [STEMMER.stem(match.group(2)) \
             for match in WORD_PATTERN.finditer(s) \
             if match.group(2) not in STOPWORDS and len(match.group(2)) >= 2]

def tokenize_lem(s):
    return [LEMMER.lemmatize(match.group(2)) \
            for match in WORD_PATTERN.finditer(s) \
            if match.group(2) not in STOPWORDS and len(match.group(2)) >= 2]

def tokenize_stem_lem(s):
    return [STEMMER.stem(LEMMER.lemmatize(match.group(2))) \
            for match in WORD_PATTERN.finditer(s) \
            if match.group(2) not in STOPWORDS and len(match.group(2)) >= 2]

In [4]:
###
def create_dataframe_webapp(zip_codes,
                            non_text_features,
                            additional_stop_words,
                            input_text,
                            zip_code,
                            home_size_sq_ft,
                            number_of_bedrooms,
                            number_of_bathrooms,
                            current_listing_price,
                            vectorizer):
    '''Prepare a pandas dataframe with the input information with the right format for inserting into
    a trained Random Forest model to predict the sale_price to askling_price ratio.
    input:
    zip_codes:  a LIST of zip codes that are in random forest model.
    non_text_features: a LIST of text of the names of the non-text features, e.g. 'home_size', 'number_of_bedrooms'.
    additional_stop_words:  a LIST of tokeized_expression that will be removed from the final input matrix.

    (The following pertains specific to the home whose price_ratio will be predicted )
    input_text: The agent's description of the home.
    zip_code:  The 5-digit zip_code of the home (text).
    home_size_sq_ft: home size in square feet (integer).
    number_of_bedrooms: number of bedrooms (float).
    number_of_bathrooms: number of bathrooms (float)

    vectorizer: vectorizer to convert input text of the agent's description to the vector of tekenized expressions.

    '''
    df_zip = pd.DataFrame(index = [1], columns = zip_codes)
    df_zip.fillna(0, inplace = True)

    X_stem_lem = vectorizer.transform([input_text])
    features = vectorizer.get_feature_names()
    df_text = pd.DataFrame(X_stem_lem.toarray(), index = [1], columns = features)
    df_text.drop(additional_stop_words, axis = 1, inplace = True)

    df_nontxt = pd.DataFrame(index = [1], columns = non_text_features)
    df_nontxt.set_value(1, non_text_features, [home_size_sq_ft, 
                                               number_of_bedrooms, 
                                               number_of_bathrooms, 
                                               current_listing_price]
                       )
    df_nontxt

    df = pd.concat([df_zip, df_text, df_nontxt],
                  axis = 1)

    return df

In [7]:
filepath = 'list_of_zip_codes'
zip_codes = pickle.load(open(filepath, 'rb'))

zip_code = input('Input the 5-digit zip code of the property:')
zip_code = str(zip_code)
number_of_bedrooms = input('Input the number of bedrooms of the property: ')
number_of_bathrooms = input('Input the number of bathrooms of the property: ')
home_size_sq_ft = input('Input the size of the home in square feet (interior): ')
current_listing_price = input('Input the listing price: ')
print



Input the 5-digit zip code of the property:94030
Input the number of bedrooms of the property: 4
Input the number of bathrooms of the property: 5
Input the size of the home in square feet (interior): 3000
Input the listing price: 2600000



In [36]:
New almost everything. New roof, plumbing and electrical systems. 2nd floor has a spacious master suite with walk in closet and 2 other bedrooms, full bathroom, and  laundry closet. 1st floor has living room, dining room, powder room, and kitchen. A walk out rear yard makes this home great for BBQs and parties.  Large attic is accessed by pull-down ladder but see it and imagine what you could do with that space.


SyntaxError: invalid syntax (<ipython-input-36-9a230d3ec0fb>, line 1)

In [8]:
filepath = 'agent_desc_vectorizer.p'
vectorizer = pickle.load(open(filepath, 'rb'))


In [9]:
input_text = raw_input("Input the text of the agent's description:  ")

Input the text of the agent's description:  Distinctive elements of this Mediterranean design begin at the entrance. The allure begins at the welcoming entry and continues into the refinely finished living areas. Through the door is a surprisingly spacious interior, which spans one level providing approximately 3,000 square feet of living space on a 9,265 sq ft lot. Beyond the living room and dining room with a wide Bay view, an updated kitchen, is certain to be a central gathering spot for relaxed living off the living room. Gleaming hardwood extends throughout. Included in the appeal are fine features that include designer stone counter tops, imported cabinets, crown molding, and a formal dining area with Bay view to complete the appeal. The home's 4 bedroom suites are comfortably arranged over the one level, while a family room off the kitchen provides easy access out to the deck with a view of the Bay and canyon views. Welcome Home


In [10]:
X_stem_lem = vectorizer.transform([input_text])

In [11]:
X_stem_lem.sum()

109

In [12]:
additional_stop_words  =  ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'offer',
                           'account', 'disclosur', 'due', 'date', 'sfar', 'broker tour', 'pm', 'offer review',
                           'tour', 'offer date', 'offer due', 'pre', 'accept', 'call', 'close', 'pleas call',
                           'noon', 'open', 'price', 'zestim', 'zestim accur', 'zestim forecast', u'zestim forecastcr',
                           'zestim home', 'zestim owner', 'zestim rent', 'zestim see', 'zestim valu','zestim zestim',
                           'zestim zillow', 'zillow', 'zillow estim', 'zillow valu'
                           ]

In [13]:
non_text_features = ['number_of_bedrooms', 'number_of_bathrooms', 'home_size', 'current_listing_price']

In [14]:
vectorizer

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.9, max_features=None, min_df=75,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize_stem_lem at 0x10dbd3320>,
        vocabulary=None)

In [15]:
# def create_dataframe_webapp(zip_codes,
#                             non_text_features,
#                             additional_stop_words,
#                             input_text,
#                             zip_code,
#                             home_size_sq_ft,
#                             number_of_bedrooms,
#                             number_of_bathrooms,
#                             vectorizer):
#     '''Prepare a pandas dataframe with the input information with the right format for inserting into
#     a trained Random Forest model to predict the sale_price to askling_price ratio.
#     input:
#     zip_codes:  a LIST of zip codes that are in random forest model.
#     non_text_features: a LIST of text of the names of the non-text features, e.g. 'home_size', 'number_of_bedrooms'.
#     additional_stop_words:  a LIST of tokeized_expression that will be removed from the final input matrix.

#     (The following pertains specific to the home whose price_ratio will be predicted )
#     input_text: The agent's description of the home.
#     zip_code:  The 5-digit zip_code of the home (text).
#     home_size_sq_ft: home size in square feet (integer).
#     number_of_bedrooms: number of bedrooms (float).
#     number_of_bathrooms: number of bathrooms (float)

#     vectorizer: vectorizer to convert input text of the agent's description to the vector of tekenized expressions.

#     '''
#     df_zip = pd.DataFrame(index = [1], columns = zip_codes)
#     df_zip.fillna(0, inplace = True)

#     X_stem_lem = vectorizer.transform([input_text])
#     features = vectorizer.get_feature_names()
#     df_text = pd.DataFrame(X_stem_lem.toarray(), index = [1], columns = features)
#     df_text.drop(additional_stop_words, axis = 1, inplace = True)

#     df_nontxt = pd.DataFrame(index = [1], columns = non_text_features)
#     df_nontxt.set_value(1, non_text_features, [home_size_sq_ft, number_of_bedrooms, number_of_bathrooms])
#     df_nontxt

#     df = pd.concat([df_zip, df_text, df_nontxt],
#                   axis = 1)

#     return df

In [16]:
df = create_dataframe_webapp(zip_codes,
                            non_text_features,
                            additional_stop_words,
                            input_text,
                            zip_code,
                            home_size_sq_ft,
                            number_of_bedrooms,
                            number_of_bathrooms,
                            current_listing_price,
                            vectorizer)

In [17]:
####
filepath = 'rf_zip_full_dict_400_50_7.gzip'
rf_zip_full_dict_400_50_7 = pickle.load(gzip.open(filepath, 'rb'))

In [19]:
####
predicted_price_ratio = rf_zip_full_dict_400_50_7.predict(df.values)
sale_price = predicted_price_ratio[0] * current_listing_price
print ('Our model estimates the sale price will be $' + str(int(sale_price)) + \
       ', which is ' + str(predicted_price_ratio[0])[0:6] + ' times the asking price.')

Our model estimates the sale price will be $2834854, which is 1.0903 times the asking price.


In [20]:
sorted(zip(df.columns, rf_zip_full_dict_400_50_7.feature_importances_), key = lambda x: x[1], reverse = True)

[('home_size', 0.16217482923765389),
 ('current_listing_price', 0.11703695951629119),
 ('number_of_bathrooms', 0.053786605157416741),
 ('94110', 0.016766655778479665),
 ('94019', 0.012198042797852089),
 (u'review', 0.0080976020736380841),
 ('number_of_bedrooms', 0.0076758379632740391),
 (u'home', 0.0066181366182726832),
 ('94131', 0.0060815845338860267),
 (u'escrow', 0.005554335182357528),
 ('94122', 0.0053068991657568287),
 ('94114', 0.005004316437588534),
 (u'bath', 0.0048286534388946355),
 (u'court confirm', 0.0047965402473989747),
 (u'bedroom', 0.0046649754416636432),
 (u'ocean', 0.0045072524115642486),
 (u'dwood', 0.004397391124227012),
 ('94070', 0.0041229822982802294),
 (u'properti', 0.0041100828167609967),
 (u'opportun', 0.0041004672135450133),
 (u'acr', 0.0040103016890943883),
 (u'develop', 0.0038385975448414606),
 (u'room', 0.0037236821203394561),
 (u'escrow open', 0.0035980796125376232),
 ('94002', 0.0035272411076612874),
 (u'broker', 0.0034787274243222588),
 (u'great opport

In [65]:
df_X_regr_all['price_ratio'].mean()

1.065199404753559

In [22]:
print vectorizer.transform([input_text]).toarray().tolist()

[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,

In [23]:
transformed_text = vectorizer.transform([input_text])

In [24]:
a =transformed_text.toarray() > 1

In [25]:
vectorizer.get_feature_names()

[u'abound',
 u'absolut',
 u'abund',
 u'abund natur',
 u'ac',
 u'accent',
 u'accept',
 u'access',
 u'access freeway',
 u'access highway',
 u'access hwi',
 u'accommod',
 u'account',
 u'accur',
 u'accur plus',
 u'accur rent',
 u'accur zestim',
 u'accuraci',
 u'accuraci tabl',
 u'accuraci tabledon',
 u'acr',
 u'acr lot',
 u'across',
 u'activ',
 u'ad',
 u'add',
 u'add owner',
 u'addit',
 u'addit bedroom',
 u'adjac',
 u'adjoin',
 u'ador',
 u'afford',
 u'agent',
 u'agre',
 u'agre home',
 u'air',
 u'air condit',
 u'airi',
 u'airport',
 u'allow',
 u'almost',
 u'along',
 u'also',
 u'alto',
 u'amaz',
 u'amaz view',
 u'ambianc',
 u'amen',
 u'amen includ',
 u'ampl',
 u'ampl storag',
 u'anoth',
 u'apart',
 u'apart home',
 u'appeal',
 u'appl',
 u'applianc',
 u'applianc granit',
 u'applianc includ',
 u'applianc new',
 u'appoint',
 u'apprais',
 u'apprais use',
 u'appreci',
 u'approv',
 u'approx',
 u'approx sq',
 u'approxim',
 u'april',
 u'arch',
 u'architect',
 u'architectur',
 u'architectur detail',
 

In [26]:
features_present= sorted(zip(vectorizer.get_feature_names(), vectorizer.transform([input_text]).toarray().tolist()[0]), 
                             key = lambda x: x[1],
                             reverse = True
                            )

In [27]:
def features_present(vectorizer, input_text):
    '''Accepts a trained vectorizer and a text to be transformed into an array by the vectorizer.  
    Returns a list of tuples of tokenized text features and each of their occurrences in the text arranged in 
    descending order by occurrences.'''
    return sorted(zip(vectorizer.get_feature_names(), vectorizer.transform([input_text]).toarray().tolist()[0]), 
                  key = lambda x: x[1],
                  reverse = True
                 )

In [28]:
features = features_present(vectorizer, input_text)

In [29]:
rf_zip_full_dict_400_50_7.feature_importances_.shape

(1996,)

In [30]:
df.shape

(1, 1996)

In [31]:
def feature_importance_dict(df, model):
    '''Accepts a dataframe which is used to extract the feature names (tokenized text features) and a model 
    which is used to extract the feature_importances.  Combines the two to form a list of feature importances 
    with feature names ranked by the feature importance.  Then uses this list to generate a dictionary of the feature'''
    feature_importances_dict = {}
    
    #This creates a preliminary list of [rank of feature_imp, (feature_name, feature_imp_value)] that is sorted by rank.
    feature_importances_prelim = zip(range(df.shape[1]),
                                     sorted(zip(df.columns, model.feature_importances_),
                                            key = lambda x: x[1],
                                            reverse = True
                                           )
                                    )
    
    #This flattens the preliminary list to create a list of [feature_name, rank of feature_imp, feature_imp_value].
    feature_importances_list = [(str(x[1][0]), x[0] + 1, x[1][1]) for x in feature_importances_prelim]

    #From this list, create a dictionary in which the key is the feature_name and the values are 
    #rank of feature_importance and the value of feature_importance
    for feature in feature_importances_list:                        
        feature_importances_dict[feature[0]] = feature[1], feature[2]  

    return  feature_importances_dict, feature_importances_list

In [32]:
####
feature_importance_dict(df, rf_zip_full_dict_400_50_7)

({'sleek': (225, 0.00079805987603602047),
  'four': (1079, 5.2900118512722983e-05),
  'level yard': (927, 9.0462729918704501e-05),
  'regular sale': (1425, 1.2616974068967513e-05),
  'lot potenti': (635, 0.00021839116336078415),
  'origin owner': (896, 9.887769550517584e-05),
  'accur': (1652, 0.0),
  'chef kitchen': (300, 0.00059842223799815258),
  'refinish': (103, 0.0014665752919459057),
  'call agent': (1147, 4.2798576513882477e-05),
  'concret': (1347, 1.8222100075658663e-05),
  'shop center': (431, 0.00039611725204295842),
  'home perfect': (1798, 0.0),
  'separ dine': (397, 0.00043490844750361491),
  'pride': (1478, 9.3334080134353855e-06),
  'data coverag': (1714, 0.0),
  'everi': (627, 0.00022245793340186791),
  'kitchen new': (617, 0.00022946752512535755),
  'tenni': (1002, 6.6823920700277812e-05),
  'level featur': (1012, 6.5109297688187715e-05),
  'must see': (293, 0.00061219616892801672),
  'showcas': (1160, 4.1210888761350959e-05),
  'back patio': (1575, 4.025903565474238

In [33]:
feature_importances_dict = {}
feature_importances_list = zip(range(df.shape[1]), 
                               sorted(zip(df.columns, rf_zip_full_dict_400_50_7.feature_importances_),
                                      key = lambda x: x[1],
                                      reverse = True
                                      )
                               )
    
feature_importances_list = [(str(x[1][0]), x[0] + 1, x[1][1]) for x in feature_importances_list]
                               
for feature in feature_importances_list:                        
    feature_importances_dict[feature[0]] = feature[1], feature[2]  
feature_importances_dict

{'sleek': (225, 0.00079805987603602047),
 'four': (1079, 5.2900118512722983e-05),
 'level yard': (927, 9.0462729918704501e-05),
 'regular sale': (1425, 1.2616974068967513e-05),
 'lot potenti': (635, 0.00021839116336078415),
 'origin owner': (896, 9.887769550517584e-05),
 'accur': (1652, 0.0),
 'chef kitchen': (300, 0.00059842223799815258),
 'refinish': (103, 0.0014665752919459057),
 'call agent': (1147, 4.2798576513882477e-05),
 'concret': (1347, 1.8222100075658663e-05),
 'shop center': (431, 0.00039611725204295842),
 'home perfect': (1798, 0.0),
 'separ dine': (397, 0.00043490844750361491),
 'pride': (1478, 9.3334080134353855e-06),
 'data coverag': (1714, 0.0),
 'everi': (627, 0.00022245793340186791),
 'kitchen new': (617, 0.00022946752512535755),
 'tenni': (1002, 6.6823920700277812e-05),
 'level featur': (1012, 6.5109297688187715e-05),
 'must see': (293, 0.00061219616892801672),
 'showcas': (1160, 4.1210888761350959e-05),
 'back patio': (1575, 4.0259035654742385e-06),
 'enjoy': (141,

In [34]:
features

[(u'live', 5),
 (u'room', 4),
 (u'view', 4),
 (u'bay', 3),
 (u'appeal', 2),
 (u'area', 2),
 (u'bay view', 2),
 (u'design', 2),
 (u'dine', 2),
 (u'home', 2),
 (u'includ', 2),
 (u'kitchen', 2),
 (u'level', 2),
 (u'live room', 2),
 (u'one', 2),
 (u'one level', 2),
 (u'provid', 2),
 (u'welcom', 2),
 (u'access', 1),
 (u'approxim', 1),
 (u'bedroom', 1),
 (u'bedroom suit', 1),
 (u'beyond', 1),
 (u'cabinet', 1),
 (u'canyon', 1),
 (u'canyon view', 1),
 (u'central', 1),
 (u'comfort', 1),
 (u'complet', 1),
 (u'counter', 1),
 (u'counter top', 1),
 (u'crown', 1),
 (u'crown mold', 1),
 (u'deck', 1),
 (u'deck view', 1),
 (u'dine area', 1),
 (u'dine room', 1),
 (u'door', 1),
 (u'easi', 1),
 (u'easi access', 1),
 (u'entranc', 1),
 (u'entri', 1),
 (u'extend', 1),
 (u'famili', 1),
 (u'famili room', 1),
 (u'featur', 1),
 (u'featur includ', 1),
 (u'fine', 1),
 (u'finish', 1),
 (u'foot', 1),
 (u'formal', 1),
 (u'formal dine', 1),
 (u'ft', 1),
 (u'ft lot', 1),
 (u'gather', 1),
 (u'gleam', 1),
 (u'gleam hardw

In [35]:
sorted(feature_importances_dict.keys())

['94002',
 '94005',
 '94010',
 '94014',
 '94015',
 '94018',
 '94019',
 '94020',
 '94021',
 '94025',
 '94027',
 '94028',
 '94030',
 '94037',
 '94038',
 '94044',
 '94060',
 '94061',
 '94062',
 '94063',
 '94064',
 '94065',
 '94066',
 '94070',
 '94074',
 '94080',
 '94102',
 '94103',
 '94105',
 '94107',
 '94108',
 '94109',
 '94110',
 '94111',
 '94112',
 '94114',
 '94115',
 '94116',
 '94117',
 '94118',
 '94121',
 '94122',
 '94123',
 '94124',
 '94127',
 '94131',
 '94132',
 '94133',
 '94134',
 '94303',
 '94401',
 '94402',
 '94403',
 '94404',
 '94901',
 '94903',
 '94904',
 '94920',
 '94925',
 '94929',
 '94930',
 '94937',
 '94939',
 '94941',
 '94945',
 '94947',
 '94949',
 '94950',
 '94960',
 '94965',
 '94973',
 'abound',
 'absolut',
 'abund',
 'abund natur',
 'ac',
 'accent',
 'access',
 'access freeway',
 'access highway',
 'access hwi',
 'accommod',
 'accur',
 'accur plus',
 'accur rent',
 'accur zestim',
 'accuraci',
 'accuraci tabl',
 'accuraci tabledon',
 'acr',
 'acr lot',
 'across',
 'act

In [36]:
[feature_importances_dict[str(feature[0])] for feature in features]

KeyError: 'accept'

In [37]:
for feature_present in features_present: 
    feature_present[0], feature_importances_dict[feature_present[0]]

TypeError: 'function' object is not iterable

In [38]:
feature_importances_dict['accept']

KeyError: 'accept'

In [39]:
'accept' in df.columns

False

In [40]:
print sorted(df.columns)

['94002', '94005', '94010', '94014', '94015', '94018', '94019', '94020', '94021', '94025', '94027', '94028', '94030', '94037', '94038', '94044', '94060', '94061', '94062', '94063', '94064', '94065', '94066', '94070', '94074', '94080', '94102', '94103', '94105', '94107', '94108', '94109', '94110', '94111', '94112', '94114', '94115', '94116', '94117', '94118', '94121', '94122', '94123', '94124', '94127', '94131', '94132', '94133', '94134', '94303', '94401', '94402', '94403', '94404', '94901', '94903', '94904', '94920', '94925', '94929', '94930', '94937', '94939', '94941', '94945', '94947', '94949', '94950', '94960', '94965', '94973', u'abound', u'absolut', u'abund', u'abund natur', u'ac', u'accent', u'access', u'access freeway', u'access highway', u'access hwi', u'accommod', u'accur', u'accur plus', u'accur rent', u'accur zestim', u'accuraci', u'accuraci tabl', u'accuraci tabledon', u'acr', u'acr lot', u'across', u'activ', u'ad', u'add', u'add owner', u'addit', u'addit bedroom', u'adjac'

In [41]:
feature_importances_dict['bay window']

(1201, 3.5447251478593186e-05)

In [42]:
feature_importances

NameError: name 'feature_importances' is not defined

In [43]:
filepath = 'df_sum.p'
f = open(filepath, 'rb')
df_feature_occurrence = pickle.load(f)
f.close()

def feature_occurrence(feature_name, df_feature_occurrence):
    '''This function accepts a feature name and returns the occurrences for in the training data'''
    return df_feature_occurrence[feature_name]

    

In [44]:
feature_occurrence('wood burn', df_feature_occurrence)

512

In [45]:
df_feature_occurrence['wood burn']

512

In [46]:
df_X_regr_all = pd.read_csv('df_X_regr_all.csv')

In [47]:
df_X_regr_all.set_index('original_index', inplace = True)

In [48]:
def find_mean_ratio_feature(feature, df):
    return df[df[feature] > 0]['price_ratio'].mean()

In [49]:
find_mean_ratio_feature('94010', df_X_regr_all)

1.0392486783450985

In [50]:
features_present

<function __main__.features_present>

In [51]:
features_present_list = features_present(vectorizer, input_text)
features_present_list = [str(feature[0]) for feature in features_present_list if feature[1] > 0]

In [52]:
def create_feature_importance_dict(df, model):
    '''Accepts a dataframe which is used to extract the feature names (tokenized text features) and a model
    which is used to extract the feature_importances.  Combines the two to form a list of feature importances
    with feature names ranked by the feature importance.  Then uses this list to generate a dictionary of the feature'''
    feature_importances_dict = {}

    #This creates a preliminary list of [rank of feature_imp, (feature_name, feature_imp_value)] that is sorted by rank.
    feature_importances_prelim = zip(range(df.shape[1]),
                                     sorted(zip(df.columns, model.feature_importances_),
                                            key = lambda x: x[1],
                                            reverse = True
                                           )
                                    )

    #This flattens the preliminary list to create a list of [feature_name, rank of feature_imp, feature_imp_value].
    feature_importances_list = [(str(x[1][0]), x[0] + 1, x[1][1]) for x in feature_importances_prelim]

    #From this list, create a dictionary in which the key is the feature_name and the values are
    #rank of feature_importance and the value of feature_importance
    for feature in feature_importances_list:
        feature_importances_dict[feature[0]] = feature[1], feature[2]

    return  feature_importances_dict, feature_importances_list

In [53]:
def find_mean_ratio_feature(feature, df):
    return df[df[feature] > 0]['price_ratio'].mean()

In [54]:
def feature_occurrence(feature_name, df_feature_occurrence):
    '''This function accepts a feature name and returns the occurrences for in the training data'''
    return df_feature_occurrence[feature_name]

In [55]:
df_feature_occurrence['94002']

525

In [56]:
feature_importances_dict, feature_importances_list = create_feature_importance_dict(df, rf_zip_full_dict_400_50_7)

In [57]:
feature_importances_dict

{'sleek': (225, 0.00079805987603602047),
 'four': (1079, 5.2900118512722983e-05),
 'level yard': (927, 9.0462729918704501e-05),
 'regular sale': (1425, 1.2616974068967513e-05),
 'lot potenti': (635, 0.00021839116336078415),
 'origin owner': (896, 9.887769550517584e-05),
 'accur': (1652, 0.0),
 'chef kitchen': (300, 0.00059842223799815258),
 'refinish': (103, 0.0014665752919459057),
 'call agent': (1147, 4.2798576513882477e-05),
 'concret': (1347, 1.8222100075658663e-05),
 'shop center': (431, 0.00039611725204295842),
 'home perfect': (1798, 0.0),
 'separ dine': (397, 0.00043490844750361491),
 'pride': (1478, 9.3334080134353855e-06),
 'data coverag': (1714, 0.0),
 'everi': (627, 0.00022245793340186791),
 'kitchen new': (617, 0.00022946752512535755),
 'tenni': (1002, 6.6823920700277812e-05),
 'level featur': (1012, 6.5109297688187715e-05),
 'must see': (293, 0.00061219616892801672),
 'showcas': (1160, 4.1210888761350959e-05),
 'back patio': (1575, 4.0259035654742385e-06),
 'enjoy': (141,

In [71]:

def Create_text_feat_imp_Dataframe(features_present_list):
    '''Create Dataframe of Text_features_present, Feature_importances, average sale-price/asking-price ratio for a 
    particular text_feature, occurrences.
    Input:
    features_present_list: A list of tokenized texts that are present in the input text.
    
    Output:
    df_feature_present: a Dataframe of Text_features_present, Feature_importances, average sale-price/asking-price ratio for a 
    particular text_feature, occurrences.
    '''
        
    features_dict_list = []
    #Names of the columns in dataframe.
    cols = ['Text feature',
            'Feature importance (10^-3)',
            'Sale-price/asking-price ratio (Avg. = 1.065)',
            'Occurrences in the training data (13,335 properties)']

    for feature in features_present_list:
        #Create a dictionary of values for a given tokenized text feature
        dict_temp = {cols[0]: feature.upper(),
                     cols[1]: feature_importances_dict[feature][1] * 1000,
                     cols[2]: str(find_mean_ratio_feature(feature, df_X_regr_all))[0:8],
                     cols[3]:  str(feature_occurrence(feature, df_feature_occurrence))
                    }
        #Append the dictionary to a list which will be used to create the dataframe
        features_dict_list.append(dict_temp)
    
    #Create dataframe, sort order by Feature_importance
    df_feature_present = pd.DataFrame(features_dict_list, columns = cols )
    df_feature_present.set_index('Text feature',   inplace = True)
    df_feature_present.sort_values('Feature importance (10^-3)',
                                    axis = 0,
                                   inplace = True,
                                   ascending = False       
                                  )
    return df_feature_present


In [72]:
Distinctive elements of this Mediterranean design begin at the entrance. 
The allure begins at the welcoming entry and continues into the refinely finished living areas. 
Through the door is a surprisingly spacious interior, 
which spans one level providing approximately 3,000 square feet of living space on a 9,265 sq ft lot. 
Beyond the living room and dining room with a wide Bay view, an updated kitchen, 
is certain to be a central gathering spot for relaxed living off the living room. 
Gleaming hardwood extends throughout. 
Included in the appeal are fine features that include designer stone counter tops, imported cabinets, 
crown molding, and a formal dining area with Bay view to complete the appeal. 
The home's 4 bedroom suites are comfortably arranged over the one level, 
while a family room off the kitchen provides easy access out to the deck with a view of the Bay and canyon views. 
Welcome Home

SyntaxError: invalid syntax (<ipython-input-72-18e033874a3d>, line 1)

In [73]:
print 'Below are a table of expressions tokenized from your input text.'
print 'Note that the "importance" of an expression is a combination of how'
print 'far the price_ratio with the expression is different from the average'
print 'price_ratio for all data (1.065) and how often the expression occurs '
print 'in all data (13,335).'

Below are a table of expressions tokenized from your input text.
Note that the "importance" of an expression is a combination of how
far the price_ratio with the expression is different from the average
price_ratio for all data (1.065) and how often the expression occurs 
in all data (13,335).


In [74]:

Create_text_feat_imp_Dataframe(features_present_list)

Unnamed: 0_level_0,Feature importance (10^-3),Sale-price/asking-price ratio (Avg. = 1.065),"Occurrences in the training data (13,335 properties)"
Text feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HOME,6.618137,1.062796,8148
BEDROOM,4.664975,1.057499,5140
ROOM,3.723682,1.062767,6530
LOT,3.345187,1.066200,2656
VIEW,3.142817,1.052142,2845
KITCHEN,2.766586,1.063107,6866
LEVEL,2.194115,1.062940,2104
CANYON,2.167044,1.077562,220
LIVE,2.165804,1.065101,4792
DINE,2.132710,1.069346,2952


In [60]:
for feature in features_present_list:
    print feature.upper(), 
    print 'has an importance of ',
    print str(feature_importances_dict[feature][1]) + '.'
    print 'The average sale/asking price ratio for properties with this feature is ',
    print str(find_mean_ratio_feature(feature, df_X_regr_all))[0:8] + '.'
    print 'It occurred in' + str(feature_occurrence(feature, df_feature_occurrence)) + ' rows in 13,335 rows of the training data.'
    print 
    print 

LIVE has an importance of  0.00216580435495.
The average sale/asking price ratio for properties with this feature is  1.065101.
It occurred in4792 rows in 13,335 rows of the training data.


ROOM has an importance of  0.00372368212034.
The average sale/asking price ratio for properties with this feature is  1.062767.
It occurred in6530 rows in 13,335 rows of the training data.


VIEW has an importance of  0.0031428169273.
The average sale/asking price ratio for properties with this feature is  1.052142.
It occurred in2845 rows in 13,335 rows of the training data.


BAY has an importance of  0.00083281153181.
The average sale/asking price ratio for properties with this feature is  1.060792.
It occurred in1104 rows in 13,335 rows of the training data.


APPEAL has an importance of  0.000388912637818.
The average sale/asking price ratio for properties with this feature is  1.083402.
It occurred in244 rows in 13,335 rows of the training data.


AREA has an importance of  0.00174623878254.


In [61]:
print 'It occurred ' + feature_occurrence(feature, df_feature_occurrence) + 'in the training data.'

TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('S21') dtype('S21') dtype('S21')

In [110]:
feature_importances_dict['almost']

(1354, 1.7850430808521591e-05)

In [111]:
features_present_list = [feature for feature in features_present_list if feature not in additional_stop_words]

In [112]:
features_present_list

['room',
 'closet',
 'floor',
 'new',
 'walk',
 'access',
 'almost',
 'attic',
 'bathroom',
 'bedroom',
 'bedroom full',
 'could',
 'dine',
 'dine room',
 'electr',
 'everyth',
 'floor live',
 'floor spacious',
 'full',
 'full bathroom',
 'great',
 'home',
 'home great',
 'imagin',
 'kitchen',
 'larg',
 'laundri',
 'live',
 'live room',
 'make',
 'make home',
 'master',
 'master suit',
 'new roof',
 'parti',
 'plumb',
 'powder',
 'rear',
 'rear yard',
 'roof',
 'room dine',
 'room kitchen',
 'see',
 'space',
 'spacious',
 'spacious master',
 'suit',
 'suit walk',
 'system',
 'walk closet',
 'yard']