# Text Features

## Prep

In [313]:
# General libraries.
import re
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt


# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
#from sklearn.naive_bayes import BernoulliNB
#from sklearn.naive_bayesf import MultinomialNB
#from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from scipy.stats import itemfreq

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [314]:
train_df = pd.read_json("../input/train.json")
test_df = pd.read_json("../input/test.json")

In [315]:
from sklearn.model_selection import train_test_split

y = train_df['interest_level']
y2 = y.replace({'low':1,'medium':2,'high':3})

X = train_df.drop('interest_level', 1)

X_train, X_dev, y_train, y_dev = train_test_split(X, y2, test_size=0.2, random_state=1)

In [316]:
from string import punctuation

#using dif var in case user wants to keep original df.
def add_txt_features(orig):
    dat = orig
    dat.loc[:,'strlen'] = [len(x) for x in dat['description']] #can ignore warnings because making new column
    dat.loc[:,'numwords'] = [len(x.split()) for x in dat['description']]
    dat.loc[:,'numcaps'] = [sum(1 for c in x if c.upper()) for x in dat['description']]
    dat.loc[:,'numpunct'] = [sum(1 for c in x if c in punctuation) for x in dat['description']]
    dat.loc[:,'richness'] = [len(set(x)) / (len(x)+0.001) for x in dat['description']]
    return dat

X_train = add_txt_features(X_train)
X_dev =   add_txt_features(X_dev)
#X_train.head()

In [317]:
from sklearn.ensemble import RandomForestClassifier

X_train_limited = X_train[['strlen','numwords','numcaps','numpunct','richness']]
X_dev_limited = X_dev[['strlen','numwords','numcaps','numpunct','richness']]

rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1) #-1 means use all available cores
rfc.fit(X_train_limited, y_train)

print('Training Score: %.2f%%' % (rfc.score(X_train_limited, y_train) * 100))
print('Training Score: %.2f%%' % (rfc.score(X_dev_limited, y_dev) * 100))

Training Score: 95.16%
Training Score: 68.36%


In [318]:
importances = rfc.feature_importances_
features = X_train_limited.columns

sort_indices = np.argsort(importances)[::-1]
sorted_features = []
for i in sort_indices:
    sorted_features.append(features[i])

print(sorted_features)
print importances[sort_indices]

['richness', 'numwords', 'numcaps', 'strlen', 'numpunct']
[ 0.2594755   0.19082518  0.18518305  0.18305032  0.18146596]


## Let's take a look at # photos

In [319]:
#pd.options.display.max_colwidth = 1000
pd.options.display.max_colwidth = 50

def get_num_photos(orig):
    dat = orig
    dat.loc[:,'numphotos'] = [len(x) for x in dat['photos'].values]
    return dat

X_train = get_num_photos(X_train)
X_dev =   get_num_photos(X_dev)

X_train_ph = X_train[['numphotos']]
X_dev_ph = X_dev[['numphotos']]

#print X_train_ph.head()

lr = LogisticRegression()
lr.fit(X_train_ph, y_train)

print('\t Training Accuracy: %.2f%% \t Test Accuracy: %.2f%%' % (lr.score(X_train_ph, y_train)*100, 
                                                                 lr.score(X_dev_ph, y_dev)*100))


	 Training Accuracy: 69.62% 	 Test Accuracy: 68.85%


## Words, Photos

In [320]:
X_train_combo = X_train[['strlen','numwords','numcaps','numpunct','richness','numphotos']]
X_dev_combo = X_dev[['strlen','numwords','numcaps','numpunct','richness','numphotos']]

rfc2 = RandomForestClassifier(n_estimators=100, n_jobs=-1) #-1 means use all available cores
rfc2.fit(X_train_combo, y_train)

print('Training Score: %.2f%% \t Training Score: %.2f%%' % (
        rfc2.score(X_train_combo, y_train) * 100,
        rfc2.score(X_dev_combo, y_dev) * 100))

importances = rfc2.feature_importances_
features = X_train_combo.columns

sort_indices = np.argsort(importances)[::-1]
sorted_features = []
for i in sort_indices:
    sorted_features.append(features[i])

print(sorted_features)
print importances[sort_indices]


Training Score: 95.78% 	 Training Score: 68.95%
['richness', 'strlen', 'numcaps', 'numwords', 'numpunct', 'numphotos']
[ 0.21972227  0.16989367  0.16817354  0.1681099   0.1588827   0.11521791]


## Add in to features

In [324]:
def get_num_features(orig):
    dat = orig
    dat.loc[:,'numfeatures'] = [len(x) for x in dat['features'].values]
    return dat

X_train = get_num_features(X_train)
X_dev = get_num_features(X_dev)

X_train_f = X_train[['numfeatures']]
X_dev_f = X_dev[['numfeatures']]


lr = LogisticRegression()
lr.fit(X_train_f, y_train)

print('\t Training Accuracy: %.2f%% \t Test Accuracy: %.2f%%' % (lr.score(X_train_f, y_train)*100, 
                                                                 lr.score(X_dev_f, y_dev)*100))


	 Training Accuracy: 69.62% 	 Test Accuracy: 68.86%


## What about comparing addresses

In [118]:
def get_address_combo(orig):
    dat = orig
    dat.loc[:,'numfeatures'] = [len(x) for x in dat['features'].values]
    return dat

################## rewriting in progress ################## bb1:20170420

sa_len = [len(sa) for sa in X_train['street_address']]
da_len = [len(da) for da in X_train['display_address']]
dif_len = np.subtract(sa_len,da_len)

dd_t = pd.DataFrame()
dd_t['dif_len'] = dif_len

sa_len_dev = [len(sa) for sa in X_dev['street_address']]
da_len_dev = [len(da) for da in X_dev['display_address']]
dif_len_dev = np.subtract(sa_len_dev,da_len_dev)


dd_d = pd.DataFrame()
dd_d['dif_len'] = dif_len_dev


lr = LogisticRegression()
lr.fit(dd_t, y_train)

print('LR \t Training Accuracy: %.2f%% \t Dev Accuracy: %.2f%%' % (lr.score(dd_t, y_train)*100, 
                                                                 lr.score(dd_d, y_dev)*100))


LR 	 Training Accuracy: 69.58% 	 Dev Accuracy: 69.23%


In [98]:
ddf_tr['num_photos'] = num_photos
ddf_de['num_photos'] = num_photos_dev

rfc = RandomForestClassifier(n_estimators=10, n_jobs=-1) #-1 means use all available cores
rfc.fit(ddf_tr, y_train)
print('\nRF\tTraining Score:%.2f%% \
       \tDev Score: %.2f%%' % (rfc.score(ddf_tr, y_train) * 100, rfc.score(ddf_de, y_dev) * 100))


importances = rfc.feature_importances_
features = ddf_tr.columns

sort_indices = np.argsort(importances)[::-1]
sorted_features = []
for i in sort_indices:
    sorted_features.append(features[i])

vals = importances[sort_indices]
labels = sorted_features

lr = LogisticRegression()
lr.fit(ddf_tr, y_train)
print('LR \t Training Accuracy: %.2f%% \t Dev Accuracy: %.2f%%' % (lr.score(ddf_tr, y_train)*100, 
                                                                 lr.score(ddf_de, y_dev)*100))



print labels
print vals




RF	Training Score:94.15%        	Dev Score: 67.10%
LR 	 Training Accuracy: 69.53% 	 Dev Accuracy: 69.15%
['richness', 'numcaps', 'strlen', 'numwords', 'numpunct', 'num_photos']
[ 0.21859142  0.17003016  0.16858574  0.1677809   0.16266951  0.11234227]


In [98]:
ddf_tr['num_photos'] = num_photos
ddf_de['num_photos'] = num_photos_dev

rfc = RandomForestClassifier(n_estimators=10, n_jobs=-1) #-1 means use all available cores
rfc.fit(ddf_tr, y_train)
print('\nRF\tTraining Score:%.2f%% \
       \tDev Score: %.2f%%' % (rfc.score(ddf_tr, y_train) * 100, rfc.score(ddf_de, y_dev) * 100))


importances = rfc.feature_importances_
features = ddf_tr.columns

sort_indices = np.argsort(importances)[::-1]
sorted_features = []
for i in sort_indices:
    sorted_features.append(features[i])

vals = importances[sort_indices]
labels = sorted_features

lr = LogisticRegression()
lr.fit(ddf_tr, y_train)
print('LR \t Training Accuracy: %.2f%% \t Dev Accuracy: %.2f%%' % (lr.score(ddf_tr, y_train)*100, 
                                                                 lr.score(ddf_de, y_dev)*100))



print labels
print vals




RF	Training Score:94.15%        	Dev Score: 67.10%
LR 	 Training Accuracy: 69.53% 	 Dev Accuracy: 69.15%
['richness', 'numcaps', 'strlen', 'numwords', 'numpunct', 'num_photos']
[ 0.21859142  0.17003016  0.16858574  0.1677809   0.16266951  0.11234227]


In [98]:
ddf_tr['num_photos'] = num_photos
ddf_de['num_photos'] = num_photos_dev

rfc = RandomForestClassifier(n_estimators=10, n_jobs=-1) #-1 means use all available cores
rfc.fit(ddf_tr, y_train)
print('\nRF\tTraining Score:%.2f%% \
       \tDev Score: %.2f%%' % (rfc.score(ddf_tr, y_train) * 100, rfc.score(ddf_de, y_dev) * 100))


importances = rfc.feature_importances_
features = ddf_tr.columns

sort_indices = np.argsort(importances)[::-1]
sorted_features = []
for i in sort_indices:
    sorted_features.append(features[i])

vals = importances[sort_indices]
labels = sorted_features

lr = LogisticRegression()
lr.fit(ddf_tr, y_train)
print('LR \t Training Accuracy: %.2f%% \t Dev Accuracy: %.2f%%' % (lr.score(ddf_tr, y_train)*100, 
                                                                 lr.score(ddf_de, y_dev)*100))



print labels
print vals




RF	Training Score:94.15%        	Dev Score: 67.10%
LR 	 Training Accuracy: 69.53% 	 Dev Accuracy: 69.15%
['richness', 'numcaps', 'strlen', 'numwords', 'numpunct', 'num_photos']
[ 0.21859142  0.17003016  0.16858574  0.1677809   0.16266951  0.11234227]


add in address dif to bigger set

In [119]:
ddf_tr['addr_diff'] = dif_len
ddf_de['addr_diff'] = dif_len_dev

rfc = RandomForestClassifier(n_estimators=10, n_jobs=-1) #-1 means use all available cores
rfc.fit(ddf_tr, y_train)
print('\nRF\tTraining Score:%.2f%% \
       \tDev Score: %.2f%%' % (rfc.score(ddf_tr, y_train) * 100, rfc.score(ddf_de, y_dev) * 100))

lr = LogisticRegression()
lr.fit(ddf_tr, y_train)
print('LR \t Training Accuracy: %.2f%% \t Dev Accuracy: %.2f%%' % (lr.score(ddf_tr, y_train)*100, 
                                                                 lr.score(ddf_de, y_dev)*100))

importances = rfc.feature_importances_
features = ddf_tr.columns

sort_indices = np.argsort(importances)[::-1]
sorted_features = []
for i in sort_indices:
    sorted_features.append(features[i])

vals = importances[sort_indices]
labels = sorted_features

print labels
print vals




RF	Training Score:94.88%        	Dev Score: 67.85%
LR 	 Training Accuracy: 69.53% 	 Dev Accuracy: 69.17%
['richness', 'strlen', 'numwords', 'numcaps', 'numpunct', 'num_features', 'num_photos', 'addr_diff']
[ 0.16974545  0.1423083   0.14183043  0.14065956  0.13946252  0.10279839
  0.10262046  0.06057489]


In [129]:

'''labels = lr.classes_
print(labels)
predictions = lr.predict_proba(ddf_tr)
print(predictions, predictions.shape)

indexes = ['low','medium','high']
print(indexes)

my_df = pd.DataFrame(data=predictions, index=indexes, columns=labels)  
my_df.index.names = ['listing_id']

cols = my_df.columns.tolist()
print(cols)
cols = [cols[2], cols[1], cols[0]]
print(cols)

print(my_df[cols])'''

[1 2 3]
(array([[ 0.63026916,  0.28405181,  0.08567904],
       [ 0.75915932,  0.17011315,  0.07072753],
       [ 0.74824329,  0.16984734,  0.08190937],
       ..., 
       [ 0.71844147,  0.2064246 ,  0.07513394],
       [ 0.66258059,  0.25896006,  0.07845935],
       [ 0.73225698,  0.19140256,  0.07634047]]), (33065L, 3L))


AttributeError: 'list' object has no attribute 'shape'