# Text Features

## Prep

In [308]:
# General libraries.
import re
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from scipy.stats import itemfreq

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [309]:
train_df = pd.read_json("../input/train.json")
test_df = pd.read_json("../input/test.json")

In [310]:
from sklearn.model_selection import train_test_split

y = train_df['interest_level']
y2 = y.replace({'low':1,'medium':2,'high':3})
X = train_df.drop('interest_level', 1)
X_train, X_dev, y_train, y_dev = train_test_split(X, y2, test_size=0.2, random_state=1)

In [311]:
def accuracy_report(name, X_train, y_train, X_dev, y_dev):

    #LR
    #lr = LogisticRegression()
    #lr.fit(X_train, y_train)

    print('===============================')
    #print(name)
    
    #print('LR Training Accuracy: %.2f%% \t Test Accuracy: %.2f%%' % (
    #    lr.score(X_train, y_train)*100,                                                         
    #    lr.score(X_dev, y_dev)*100))
 
    ##precision (% select relevent), recall (% relevant selected)
    #print ('LR dev: \n{}'.format(classification_report(y_dev, lr.predict(X_dev))))
    #print itemfreq(lr.predict(X_dev))
    
    #RFC
    rfc = RandomForestClassifier(n_estimators=20, n_jobs=-1) #-1 means use all available cores
    rfc.fit(X_train, y_train)
    print('RF Training Accuracy: %.2f%% \t Test Accuracy: %.2f%%' % (
        rfc.score(X_train, y_train)*100,                                                         
        rfc.score(X_dev, y_dev)*100))

    print ('RFC dev: \n{}'.format(classification_report(y_dev, rfc.predict(X_dev))))
    print itemfreq(rfc.predict(X_dev))

    importances = rfc.feature_importances_
    features = X_train.columns

    sort_indices = np.argsort(importances)[::-1]
    sorted_features = []
    for i in sort_indices:
        sorted_features.append(features[i])

    print('\nfeatures')
    print(sorted_features)
    print importances[sort_indices]

## Text Features

In [312]:
from string import punctuation
from sklearn.ensemble import RandomForestClassifier

#using dif var in case user wants to keep original df.
def add_txt_features(orig):
    dat = orig
    dat.loc[:,'strlen'] = [len(x) for x in dat['description']] #can ignore warnings because making new column
    dat.loc[:,'numwords'] = [len(x.split()) for x in dat['description']]
    dat.loc[:,'numcaps'] = [sum(1 for c in x if c.upper()) for x in dat['description']]
    dat.loc[:,'numpunct'] = [sum(1 for c in x if c in punctuation) for x in dat['description']]
    dat.loc[:,'richness'] = [len(set(x)) / (len(x)+0.001) for x in dat['description']]
    return dat

X_train = add_txt_features(X_train)
X_dev =   add_txt_features(X_dev)
#X_train.head()

feature_list = ['strlen','numwords','numcaps','numpunct','richness']
X_train_limited = X_train[feature_list]
X_dev_limited = X_dev[feature_list]

accuracy_report('text features', X_train_limited, y_train, X_dev_limited, y_dev)

X_train_limited = X_train[['richness']]
X_dev_limited = X_dev[['richness']]
accuracy_report('richness only', X_train_limited, y_train, X_dev_limited, y_dev)

RF Training Accuracy: 94.66% 	 Test Accuracy: 67.12%
RFC dev: 
             precision    recall  f1-score   support

          1       0.73      0.89      0.80      6797
          2       0.37      0.20      0.26      2334
          3       0.28      0.10      0.15       740

avg / total       0.61      0.67      0.63      9871

[[   1 8311]
 [   2 1296]
 [   3  264]]

features
['richness', 'numwords', 'strlen', 'numcaps', 'numpunct']
[ 0.25921804  0.18805102  0.18546005  0.18545572  0.18181517]
RF Training Accuracy: 81.14% 	 Test Accuracy: 60.96%
RFC dev: 
             precision    recall  f1-score   support

          1       0.71      0.80      0.75      6797
          2       0.29      0.22      0.25      2334
          3       0.14      0.10      0.11       740

avg / total       0.57      0.61      0.59      9871

[[   1 7616]
 [   2 1750]
 [   3  505]]

features
['richness']
[ 1.]


RFC does better job with recall, but given these signalls, is still too heavy in assigning to 1 (Low)

## Price Features

In [313]:
def add_price_features(orig):
    dat = orig
    dat.loc[:,'price_per_bed'] = dat['price'] / (dat['bedrooms']+0.00001)
    dat.loc[:,'price_per_bath'] = dat['price'] / (dat['bathrooms']+0.00001)
    dat.loc[:,'price_per_room'] = dat['price'] / (dat['bathrooms'] + dat['bedrooms'] +0.00001)
    return dat

X_train = add_price_features(X_train)
X_dev =   add_price_features(X_dev)
#X_train.head()

priceplus_list = ['price_per_bed','price_per_bath','price_per_room',
                'price','bedrooms','bathrooms']
X_train_limited = X_train[priceplus_list]
X_dev_limited =     X_dev[priceplus_list]

feature_list = ['price_per_bed','price_per_bath','price_per_room',
                'price','bedrooms','bathrooms',
                'strlen','numwords','numcaps','numpunct','richness']
X_train_limited = X_train[feature_list]
X_dev_limited =     X_dev[feature_list]
accuracy_report('text features', X_train_limited, y_train, X_dev_limited, y_dev)


RF Training Accuracy: 96.68% 	 Test Accuracy: 69.35%
RFC dev: 
             precision    recall  f1-score   support

          1       0.76      0.89      0.82      6797
          2       0.42      0.27      0.33      2334
          3       0.41      0.25      0.31       740

avg / total       0.65      0.69      0.67      9871

[[   1 7909]
 [   2 1503]
 [   3  459]]

features
['richness', 'strlen', 'numwords', 'numpunct', 'numcaps', u'price', 'price_per_room', 'price_per_bed', 'price_per_bath', u'bedrooms', u'bathrooms']
[ 0.1378492   0.12406399  0.12270829  0.12066397  0.12052267  0.09630815
  0.09596195  0.08597939  0.07745668  0.01104811  0.0074376 ]


## Let's take a look at the number of photos

In [314]:
def get_num_photos(orig):
    dat = orig
    dat.loc[:,'numphotos'] = [len(x) for x in dat['photos'].values]
    return dat

X_train = get_num_photos(X_train)
X_dev =   get_num_photos(X_dev)

X_train_ph = X_train[['numphotos']]
X_dev_ph = X_dev[['numphotos']]

X_train_limited = X_train[['numphotos']]
X_dev_limited = X_dev[['numphotos']]
accuracy_report('photos', X_train_limited, y_train, X_dev_limited, y_dev)

feature_list = ['numphotos', 'price_per_bed','price_per_bath','price_per_room',
                'price','bedrooms','bathrooms',
                'strlen','numwords','numcaps','numpunct','richness']

X_train_limited = X_train[feature_list]
X_dev_limited = X_dev[feature_list]
accuracy_report('full', X_train_limited, y_train, X_dev_limited, y_dev)

X_train_limited = X_train[['numphotos','richness']]
X_dev_limited = X_dev[['numphotos', 'richness']]
accuracy_report('num photos+richness', X_train_limited, y_train, X_dev_limited, y_dev)

RF Training Accuracy: 69.62% 	 Test Accuracy: 68.86%
RFC dev: 
             precision    recall  f1-score   support

          1       0.69      1.00      0.82      6797
          2       0.00      0.00      0.00      2334
          3       0.00      0.00      0.00       740

avg / total       0.47      0.69      0.56      9871

[[   1 9871]]

features
['numphotos']
[ 1.]
RF Training Accuracy: 97.12% 	 Test Accuracy: 69.54%
RFC dev: 
             precision    recall  f1-score   support

          1       0.76      0.89      0.82      6797
          2       0.42      0.28      0.33      2334
          3       0.41      0.24      0.30       740

avg / total       0.66      0.70      0.67      9871

[[   1 7910]
 [   2 1537]
 [   3  424]]

features
['richness', 'numcaps', 'numpunct', 'strlen', 'numwords', 'price_per_room', u'price', 'numphotos', 'price_per_bed', 'price_per_bath', u'bedrooms', u'bathrooms']
[ 0.1226749   0.10890446  0.10869676  0.10814959  0.10496556  0.08961571
  0.086623

## Time Info

In [315]:
def add_time_features(orig):  #(x,"%Y-%m-%d %H:%M:%S")
    dat = orig
    #dat['created2'] = dat['created'].str[0:10].astype("datetime64");
    dat.loc[:,"created2"] = dat['created'].astype("datetime64");
     
    dat.loc[:,'year']   = dat['created2'].dt.year
    dat.loc[:,'month']  = dat['created2'].dt.month
    dat.loc[:,'day']    = dat['created2'].dt.day
    dat.loc[:,'weekday']= dat['created2'].dt.dayofweek
    dat.loc[:,'hour']   = dat['created2'].dt.hour
    dat.loc[:,'minute'] = dat['created2'].dt.minute
    dat.loc[:,'second'] = dat['created2'].dt.second
    dat.loc[:,'hr_min'] = dat['created2'].dt.hour.multiply(100).add(dat['created2'].dt.minute)
    return dat

X_train = add_time_features(X_train)
X_dev =   add_time_features(X_dev)

time_list = ['year', 'month', 'day', 'weekday', 'hour', 'minute', 'second', 'hr_min']
X_train_limited = X_train[time_list]
X_dev_limited =     X_dev[time_list]
accuracy_report('text features', X_train_limited, y_train, X_dev_limited, y_dev)


feature_list = ['year', 'month', 'day', 'weekday', 'hour', 'minute', 'second', 'hr_min',
                'numphotos', 'price_per_bed','price_per_bath','price_per_room',
                'price','bedrooms','bathrooms',
                'strlen','numwords','numcaps','numpunct','richness']
X_train_limited = X_train[feature_list]
X_dev_limited =     X_dev[feature_list]
accuracy_report('text features', X_train_limited, y_train, X_dev_limited, y_dev)


RF Training Accuracy: 98.71% 	 Test Accuracy: 64.02%
RFC dev: 
             precision    recall  f1-score   support

          1       0.70      0.87      0.78      6797
          2       0.30      0.15      0.20      2334
          3       0.14      0.05      0.07       740

avg / total       0.57      0.64      0.59      9871

[[   1 8421]
 [   2 1197]
 [   3  253]]

features
['second', 'hr_min', 'minute', 'day', 'weekday', 'hour', 'month', 'year']
[ 0.33112838  0.22354764  0.19247798  0.11637448  0.06102392  0.04676748
  0.02868011  0.        ]
RF Training Accuracy: 99.49% 	 Test Accuracy: 70.71%
RFC dev: 
             precision    recall  f1-score   support

          1       0.76      0.92      0.83      6797
          2       0.45      0.26      0.33      2334
          3       0.45      0.18      0.26       740

avg / total       0.66      0.71      0.67      9871

[[   1 8239]
 [   2 1332]
 [   3  300]]

features
['hr_min', u'price', 'price_per_bed', 'price_per_room', 'price_pe

## Add in num features

In [316]:
#todo add in cleaning

def get_num_features(orig):
    dat = orig
    dat.loc[:,'numfeatures'] = [len(x) for x in dat['features'].values]
    return dat

X_train = get_num_features(X_train)
X_dev =   get_num_features(X_dev)

X_train_limited = X_train[['numfeatures']]
X_dev_limited = X_dev[['numfeatures']]
accuracy_report('numfeatures', X_train_limited, y_train, X_dev_limited, y_dev)

feature_list = ['year', 'month', 'day', 'weekday', 'hour', 'minute', 'second', 'hr_min',
                'numphotos', 'numfeatures',
                'price_per_bed','price_per_bath','price_per_room',
                'price','bedrooms','bathrooms',
                'strlen','numwords','numcaps','numpunct','richness']
X_train_limited = X_train[feature_list]
X_dev_limited =     X_dev[feature_list]
accuracy_report('numfeatures+phtotos+text', X_train_limited, y_train, X_dev_limited, y_dev)

RF Training Accuracy: 69.63% 	 Test Accuracy: 68.84%
RFC dev: 
             precision    recall  f1-score   support

          1       0.69      1.00      0.82      6797
          2       0.00      0.00      0.00      2334
          3       0.00      0.00      0.00       740

avg / total       0.47      0.69      0.56      9871

[[   1 9869]
 [   2    1]
 [   3    1]]

features
['numfeatures']
[ 1.]
RF Training Accuracy: 99.55% 	 Test Accuracy: 71.06%
RFC dev: 
             precision    recall  f1-score   support

          1       0.76      0.92      0.83      6797
          2       0.46      0.26      0.33      2334
          3       0.48      0.20      0.28       740

avg / total       0.67      0.71      0.67      9871

[[   1 8236]
 [   2 1328]
 [   3  307]]

features
['hr_min', 'price_per_room', u'price', 'price_per_bed', 'richness', 'numpunct', 'price_per_bath', 'second', 'strlen', 'minute', 'numcaps', 'numwords', 'day', 'numfeatures', 'numphotos', 'hour', 'weekday', 'month', u'

## Compare addresses

In [317]:
def get_address_dif(orig):
    dat = orig
    street_addr_len = [len(sa) for sa in dat['street_address']]
    display_addr_len = [len(da) for da in dat['display_address']]
    dat.loc[:,'addr_dif'] = np.subtract(street_addr_len,display_addr_len)
    return dat

X_train = get_address_dif(X_train)
X_dev =   get_address_dif(X_dev)


X_train_limited = X_train[['addr_dif']]
X_dev_limited = X_dev[['addr_dif']]
accuracy_report('addrdiff', X_train_limited, y_train, X_dev_limited, y_dev)

feature_list = ['year', 'month', 'day', 'weekday', 'hour', 'minute', 'second', 'hr_min',
                'numphotos', 'numfeatures','addr_dif',
                'price_per_bed','price_per_bath','price_per_room',
                'price','bedrooms','bathrooms',
                'strlen','numwords','numcaps','numpunct','richness']

X_train_limited = X_train[feature_list]
X_dev_limited = X_dev[feature_list]
accuracy_report('full', X_train_limited, y_train, X_dev_limited, y_dev)


RF Training Accuracy: 69.74% 	 Test Accuracy: 68.85%
RFC dev: 
             precision    recall  f1-score   support

          1       0.69      0.99      0.82      6797
          2       0.43      0.02      0.03      2334
          3       0.17      0.00      0.00       740

avg / total       0.59      0.69      0.57      9871

[[   1 9778]
 [   2   87]
 [   3    6]]

features
['addr_dif']
[ 1.]
RF Training Accuracy: 99.50% 	 Test Accuracy: 71.16%
RFC dev: 
             precision    recall  f1-score   support

          1       0.76      0.92      0.83      6797
          2       0.46      0.26      0.33      2334
          3       0.48      0.19      0.27       740

avg / total       0.67      0.71      0.67      9871

[[   1 8237]
 [   2 1337]
 [   3  297]]

features
['hr_min', 'price_per_room', u'price', 'price_per_bed', 'numpunct', 'richness', 'price_per_bath', 'numcaps', 'second', 'minute', 'strlen', 'numwords', 'day', 'numfeatures', 'numphotos', 'hour', 'weekday', 'addr_dif', 'm

## Adding in Neighborhoods

In [318]:
from sklearn.cluster import KMeans
np.random.seed(0)

#from scipy import stats
#remvoe outliers
def remove_outliers(orig):
    dat = orig
    dat = dat[((dat.latitude - dat.latitude.mean()) / dat.latitude.std()).abs() < 3]
    dat = dat[((dat.longitude - dat.longitude.mean()) / dat.longitude.std()).abs() < 3]
    return dat

X_train_no_outliers = remove_outliers(X_train)

def make_neighborhoods(orig, num_clusters):
    #returns a km, wth which we can classify other points
    dat = orig[['latitude', 'longitude']].copy()
    km = KMeans(num_clusters, random_state=1).fit(dat)
    return km

def fit_neighborhoods(orig, km):
    dat = orig[['latitude', 'longitude']].copy()
    dat2 = orig
    dat2.loc[:,'neighborhood'] = km.predict(dat)
    return dat2

neighborhoods = make_neighborhoods(X_train, 50)
X_train = fit_neighborhoods(X_train, neighborhoods)
X_dev = fit_neighborhoods(X_dev, neighborhoods)

X_train_limited = X_train[['neighborhood']]
X_dev_limited = X_dev[['neighborhood']]
accuracy_report('neighborhood', X_train_limited, y_train, X_dev_limited, y_dev)

feature_list = ['year', 'month', 'day', 'weekday', 'hour', 'minute', 'second', 'hr_min',
                'numphotos', 'numfeatures','addr_dif','neighborhood',
                'price_per_bed','price_per_bath','price_per_room',
                'price','bedrooms','bathrooms',
                'strlen','numwords','numcaps','numpunct','richness']

X_train_limited = X_train[feature_list]
X_dev_limited = X_dev[feature_list]
accuracy_report('full', X_train_limited, y_train, X_dev_limited, y_dev)


RF Training Accuracy: 69.72% 	 Test Accuracy: 69.05%
RFC dev: 
             precision    recall  f1-score   support

          1       0.69      1.00      0.82      6797
          2       0.39      0.01      0.02      2334
          3       0.76      0.03      0.05       740

avg / total       0.63      0.69      0.57      9871

[[   1 9790]
 [   2   56]
 [   3   25]]

features
['neighborhood']
[ 1.]
RF Training Accuracy: 99.51% 	 Test Accuracy: 71.08%
RFC dev: 
             precision    recall  f1-score   support

          1       0.76      0.92      0.83      6797
          2       0.45      0.26      0.33      2334
          3       0.48      0.20      0.28       740

avg / total       0.67      0.71      0.67      9871

[[   1 8201]
 [   2 1366]
 [   3  304]]

features
['hr_min', u'price', 'price_per_room', 'price_per_bed', 'price_per_bath', 'numpunct', 'richness', 'second', 'strlen', 'minute', 'numcaps', 'numwords', 'day', 'numfeatures', 'neighborhood', 'numphotos', 'hour', 'week

In [319]:
#mean clustering wasn't working as cleanly. trying w/o first
k = '''def make_neighborhoods(orig, num_clusters):
    #returns a km, wth which we can classify other points
    dat = orig[['latitude', 'longitude']].copy()
    lat_mean = dat['latitude'].mean() 
    lon_mean = dat['longitude'].mean()
    lat_std = dat['latitude'].std() 
    lon_std = dat['longitude'].std()
    
    dat['lat_norm'] = (dat['latitude'] - lat_mean)/lat_std
    dat['long_norm'] = (dat['longitude'] - lon_mean)/lon_std 

    km = KMeans(num_clusters, random_state=1).fit(dat[['lat_norm','long_norm']])
    return km, lat_mean, lon_mean, lat_std, lon_std

def fit_neighborhoods(orig, km, lat_mean, lon_mean, lat_std, lon_std):
    dat = orig[['latitude', 'longitude']].copy()
    
    lat_mean = dat['latitude'].mean() 
    lon_mean = dat['longitude'].mean()
    lat_std = dat['latitude'].std() 
    lon_std = dat['longitude'].std()
    
    #normalize
    dat['lat_norm'] = (dat['latitude'] - lat_mean)/lat_std
    dat['long_norm'] = (dat['longitude'] - lon_mean)/lon_std 

    dat2 = orig
    dat2['neighborhood'] = km.predict(dat[['lat_norm','long_norm']])
    return dat2
    
#need to get these beofre running others
km, lat_mean, lon_mean, lat_std, lon_std = make_neighborhoods(X_train_no_outliers, 100) #todo justify choice, show graph num v reward

X_train = fit_neighborhoods(X_train, km, lat_mean, lon_mean, lat_std, lon_std)
X_dev = fit_neighborhoods(X_dev, km, lat_mean, lon_mean, lat_std, lon_std)
print X_train[['latitude','longitude','neighborhood']].head(50)
print X_dev[['latitude','longitude','neighborhood']].head(50)'''

## Description Preprocessing

In [320]:
#Preprocessor
def pre_proc(s,
              word_length_range=(3,7),
              remove_stop_words=True,
              scale_capitals=1,
              set_to_lower=True,
              remove_numbers=False
             ):
   
    s2 = re.sub(ur"\p{P}+","",s) #strip punctuation
    s2 = re.sub(ur"[^\w ]+"," ",s2) #remove punctuation2
    s2 = re.sub(ur"\_","",s2) #remove underscores (ignored by w)
    
    #http://stackoverflow.com/questions/8745821/find-words-with-capital-letters-not-at-start-of-a-sentence-with-regex
    #doesn't matter if at start of sentence, often it's the key NP. If a stopword, those get stripped anyway
    names = " "+" ".join(re.findall(ur'\b[A-Z][A-Za-z0-9]*\b',s2))
    for i in range(0,scale_capitals):
        s2 = s2 + names
        
    if set_to_lower:
        s2 = s2.lower() #lower case

    s2 = re.sub(ur"\s+", " ",s2) #remove mult spaces (avoids cases with double spaces for look behind)
    
    if remove_numbers:
        s2 = re.sub(ur"\d", " ",s2) #remove all numbers

    truncation_re = ur"(?<=(\s\w{"+ur"{}".format(word_length_range[1])+ur"}))(\w*\s)"
    s2 = re.sub(truncation_re,"\1 ",s2) #truncate words > n char

    short_elim_re = ur"\b\w{1,"+ur"{}".format(word_length_range[0])+ur"}\b"
    s2 = re.sub(short_elim_re, "", s2) #removes all words/numbers < n in length
    
    #remomve stop words
    if remove_stop_words:
        s2_split = s2.split()
        s3_split = s2.split()
        for key in s2_split:
            if key.lower in stop_words:
                s3_split.remove(key)
        s2 =' '.join(s3_split)
    return s2



In [321]:
pre_proc_custom = lambda x: pre_proc(x, 
                                      word_length_range = (3,8), 
                                      remove_stop_words = False, 
                                      scale_capitals = 1, 
                                      set_to_lower = True,
                                      remove_numbers = False
                                     )

mytv = TfidfVectorizer(ngram_range=(1,1), 
                       analyzer='word', 
                       preprocessor=pre_proc_custom)
mytv.fit_transform(X_train['description'].values)
train_words = mytv.get_feature_names()

In [322]:
def get_desc_mnb(X, y):    
    mytv_dev = TfidfVectorizer(ngram_range=(1,1), 
                           analyzer='word', 
                           preprocessor=pre_proc_custom,  #set above
                           vocabulary=train_words) #also set above
    
    X_dev_words = mytv_dev.fit_transform(X['description'].values) 
    
    mnb = MultinomialNB(alpha = 0.009)
    mnb.fit(X_dev_words, y)
    return mnb

my_desc_mnb = get_mnb(X_train, y_train)

def get_description_scores(X, mnb):
    mytv_dev = TfidfVectorizer(ngram_range=(1,1), 
                           analyzer='word', 
                           preprocessor=pre_proc_custom,  #set above
                           vocabulary=train_words) #also set above
    
    X_dev_words = mytv_dev.fit_transform(X['description'].values) 
    
    pred_train = mnb.predict_proba(X_dev_words)
    #print pred_train
    #print mnb.classes_
    
    dat = X
    dat.loc[:,'desc_1'] = pred_train[:,0]
    dat.loc[:,'desc_2'] = pred_train[:,1]
    dat.loc[:,'desc_3'] = pred_train[:,2]
    return dat

X_train = get_description_scores(X_train, my_desc_mnb)
X_dev = get_description_scores(X_dev, my_desc_mnb)

desc_list = ['desc_1','desc_2','desc_3']

X_train_limited = X_train[desc_list]
X_dev_limited = X_dev[desc_list]
accuracy_report('desc', X_train_limited, y_train, X_dev_limited, y_dev)

feature_list = ['desc_1','desc_2','desc_3',
                'year', 'month', 'day', 'weekday', 'hour', 'minute', 'second', 'hr_min',
                'numphotos', 'numfeatures','addr_dif','neighborhood',
                'price_per_bed','price_per_bath','price_per_room',
                'price','bedrooms','bathrooms',
                'strlen','numwords','numcaps','numpunct','richness']

X_train_limited = X_train[feature_list]
X_dev_limited = X_dev[feature_list]
accuracy_report('full', X_train_limited, y_train, X_dev_limited, y_dev)


[[  7.21062996e-01   1.99006002e-01   7.99310019e-02]
 [  7.79460626e-01   1.72158974e-01   4.83803993e-02]
 [  9.01390775e-01   9.36675750e-02   4.94164982e-03]
 ..., 
 [  9.99422901e-01   2.48061018e-04   3.29038233e-04]
 [  5.37700225e-01   2.51754772e-01   2.10545003e-01]
 [  7.00036656e-01   2.09229421e-01   9.07339223e-02]]
[1 2 3]
[[  5.86560001e-01   3.57145185e-01   5.62948145e-02]
 [  9.71526797e-01   2.79161608e-02   5.57042004e-04]
 [  7.90276689e-01   1.77630121e-01   3.20931894e-02]
 ..., 
 [  8.02553942e-01   1.86559286e-01   1.08867714e-02]
 [  6.98961800e-01   2.71136433e-01   2.99017676e-02]
 [  6.96208303e-01   2.25298245e-01   7.84934525e-02]]
[1 2 3]
RF Training Accuracy: 94.36% 	 Test Accuracy: 67.12%
RFC dev: 
             precision    recall  f1-score   support

          1       0.76      0.85      0.80      6797
          2       0.39      0.29      0.33      2334
          3       0.30      0.20      0.24       740

avg / total       0.64      0.67      0.65 

## Feature Cleanup and Text Processing

In [323]:
def clean(s):
    for i,x in enumerate(s):
        x = x.lower()
        x = x.strip()
        x = x.replace("-", "")
        x = x.replace(" ", "")
        x = x.replace("twenty four hour", "24")
        x = x.replace("24/7", "24")
        x = x.replace("24hr", "24")
        x = x.replace("24-hour", "24")
        x = x.replace("24hour", "24")
        x = x.replace("24 hour", "24")
        x = x.replace("common", "cm")
        x = x.replace("concierge", "doorman")
        x = x.replace("bicycle", "bike")
        x = x.replace("private", "pv")
        x = x.replace("deco", "dc")
        x = x.replace("decorative", "dc")
        x = x.replace("onsite", "os")
        x = x.replace("outdoor", "od")
        x = x.replace("ss appliances", "stainless")
        s[i] = x
    return s

def clean_features(orig):
    dat = orig
    dat.loc[:,'cleaned_features'] = [' '.join(clean(f)) for f in dat['features'].values]
    return dat

X_train = clean_features(X_train)

## Run Analysis on Features

In [324]:
mytv = TfidfVectorizer(ngram_range=(1,1), 
                       analyzer='word', 
                      ) #preprocessor=pre_proc_custom) #no preprocessor

mytv.fit_transform(X_train['cleaned_features'].values)
train_words = mytv.get_feature_names()

def get_feature_mnb(X, y):
    X = clean_features(X)
    mytv_dev = TfidfVectorizer(ngram_range=(1,1), 
                           analyzer='word', 
                           #preprocessor=pre_proc_custom,  #set above
                           vocabulary=train_words) #also set above
    
    X_dev_words = mytv_dev.fit_transform(X['cleaned_features'].values) 
    
    mnb = MultinomialNB(alpha = 0.009)
    mnb.fit(X_dev_words, y)
    return mnb

def get_feature_scores(X, mnb):
    X = clean_features(X)
    mytv_dev = TfidfVectorizer(ngram_range=(1,1), 
                           analyzer='word', 
                           preprocessor=pre_proc_custom,  #set above
                           vocabulary=train_words) #also set above
    
    X_dev_words = mytv_dev.fit_transform(X['cleaned_features'].values) 
    
    pred_train = mnb.predict_proba(X_dev_words)
    #print pred_train
    #print mnb.classes_
    
    dat = X
    dat.loc[:,'feat_1'] = pred_train[:,0]
    dat.loc[:,'feat_2'] = pred_train[:,1]
    dat.loc[:,'feat_3'] = pred_train[:,2]
    return dat

my_feature_mnb = get_feature_mnb(X_train, y_train)
X_train = get_feature_scores(X_train, my_feature_mnb)
X_dev = get_feature_scores(X_dev, my_feature_mnb)

feat_list = ['feat_1','feat_2','feat_3']

X_train_limited = X_train[feat_list]
X_dev_limited = X_dev[feat_list]
accuracy_report('feat', X_train_limited, y_train, X_dev_limited, y_dev)

feature_list = ['feat_1','feat_2','feat_3', 'desc_1','desc_2','desc_3',
                'year', 'month', 'day', 'weekday', 'hour', 'minute', 'second', 'hr_min',
                'numphotos', 'numfeatures','addr_dif','neighborhood',
                'price_per_bed','price_per_bath','price_per_room',
                'price','bedrooms','bathrooms',
                'strlen','numwords','numcaps','numpunct','richness']

X_train_limited = X_train[feature_list]
X_dev_limited = X_dev[feature_list]
accuracy_report('full', X_train_limited, y_train, X_dev_limited, y_dev)


[[  6.96208303e-01   2.25298245e-01   7.84934525e-02]
 [  9.62971519e-01   3.65733618e-02   4.55118858e-04]
 [  7.43953767e-01   1.95998537e-01   6.00476962e-02]
 ..., 
 [  7.65951199e-01   1.82963634e-01   5.10851670e-02]
 [  7.23370843e-01   2.07659553e-01   6.89696041e-02]
 [  6.94147110e-01   1.76421227e-01   1.29431664e-01]]
[1 2 3]
[[ 0.7502805   0.19220378  0.05751571]
 [ 0.75438007  0.17615451  0.06946542]
 [ 0.61085932  0.28133982  0.10780086]
 ..., 
 [ 0.64538548  0.25929188  0.09532264]
 [ 0.77222915  0.17373251  0.05403834]
 [ 0.75615518  0.19272626  0.05111856]]
[1 2 3]
RF Training Accuracy: 71.89% 	 Test Accuracy: 68.18%
RFC dev: 
             precision    recall  f1-score   support

          1       0.70      0.96      0.81      6797
          2       0.37      0.07      0.12      2334
          3       0.23      0.02      0.04       740

avg / total       0.59      0.68      0.59      9871

[[   1 9348]
 [   2  453]
 [   3   70]]

features
['feat_2', 'feat_3', 'feat_1'

In [347]:
X_test = test_df
X_test = add_txt_features(X_test)
X_test = add_price_features(X_test)
X_test = get_num_photos(X_test)
X_test = add_time_features(X_test)
X_test = get_num_features(X_test)
X_test = get_address_dif(X_test)

neighborhoods = make_neighborhoods(X_train, 50)
X_dev = fit_neighborhoods(X_test, neighborhoods)

my_desc_mnb = get_desc_mnb(X_train, y_train)
X_test = get_description_scores(X_test,my_desc_mnb)

my_feature_mnb = get_feature_mnb(X_train, y_train)
X_test = get_feature_scores(X_test, my_feature_mnb)

[[ 0.68481778  0.23213846  0.08304376]
 [ 0.6558193   0.24792455  0.09625615]
 [ 0.7117906   0.22182928  0.06638012]
 ..., 
 [ 0.73700453  0.21226676  0.05072871]
 [ 0.67905114  0.21879433  0.10215453]
 [ 0.65318451  0.25515624  0.09165925]]
[1 2 3]
[[ 0.72083648  0.20943693  0.06972659]
 [ 0.77222915  0.17373251  0.05403834]
 [ 0.70498828  0.22496152  0.07005021]
 ..., 
 [ 0.83996454  0.1245549   0.03548056]
 [ 0.75438007  0.17615451  0.06946542]
 [ 0.71777751  0.21108092  0.07114156]]
[1 2 3]


In [372]:
feature_list = ['feat_1','feat_2','feat_3', 'desc_1','desc_2','desc_3',
                'year', 'month', 'day', 'weekday', 'hour', 'minute', 'second', 'hr_min',
                'numphotos', 'numfeatures','addr_dif','neighborhood',
                'price_per_bed','price_per_bath','price_per_room',
                'price','bedrooms','bathrooms',
                'strlen','numwords','numcaps','numpunct','richness','listing_id']

X_test_limited = X_test[feature_list]
X_test_limited = X_test_limited.set_index('listing_id')
print X_test_limited.shape

(74659, 29)


In [373]:
X_train_limited = X_train[feature_list]
X_train_limited = X_train_limited.set_index('listing_id')

rfc = RandomForestClassifier(n_estimators=20, n_jobs=-1) #-1 means use all available cores
rfc.fit(X_train_limited, y_train)
print X_train_limited.shape
labels = rfc.classes_

predictions = rfc.predict_proba(X_test_limited)
print predictions.shape

temp = X_test_limited
temp.loc[:,'high'] = predictions[:,2]
temp.loc[:,'medium'] = predictions[:,1]
temp.loc[:,'low'] = predictions[:,0]

final_table = temp[['high','medium', 'low']]

print final_table

final_table.to_csv("sumbmission002.csv", index=True)

(39481, 29)
(74659, 3)
            high  medium   low
listing_id                    
7142618     0.00    0.25  0.75
7210040     0.25    0.25  0.50
7103890     0.05    0.00  0.95
7143442     0.15    0.15  0.70
6860601     0.05    0.20  0.75
6840081     0.00    0.05  0.95
6922337     0.10    0.25  0.65
6913616     0.25    0.50  0.25
6937820     0.15    0.20  0.65
6893933     0.05    0.15  0.80
6832604     0.00    0.00  1.00
6915282     0.25    0.20  0.55
7127565     0.05    0.10  0.85
6827899     0.00    0.00  1.00
6934855     0.00    0.05  0.95
6861826     0.05    0.20  0.75
6871643     0.30    0.25  0.45
6842542     0.00    0.20  0.80
6934145     0.00    0.20  0.80
6829365     0.25    0.10  0.65
7167858     0.00    0.30  0.70
6859483     0.00    0.45  0.55
6861377     0.10    0.50  0.40
6848960     0.05    0.15  0.80
6918850     0.00    0.00  1.00
6916867     0.15    0.60  0.25
6895840     0.00    0.15  0.85
6813539     0.55    0.35  0.10
7116900     0.15    0.20  0.65
6890328     0.30