In [1]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing, model_selection
import string
from sklearn.feature_extraction.text import  CountVectorizer
from scipy.stats import boxcox
from scipy import stats
from scipy import sparse



In [2]:
train_file = 'data/train.json'
test_file = 'data/test.json'
train = pd.read_json(train_file)
test = pd.read_json(test_file)
y_map = {'low': 2, 'medium': 1, 'high': 0}

In [3]:
%%time
def get_manager_worth(manager_id):
    interests = train[train.manager_id == manager_id].interest_level.apply(lambda x: 2 - y_map[x])
    return sum(interests) / len(interests) if len(interests) > 5 else 0

manager_ids = np.concatenate((np.unique(train.manager_id), np.unique(test.manager_id)), axis=0)
manager_worths = {}
for manager_id in manager_ids:
    manager_worths[manager_id] = get_manager_worth(manager_id)

CPU times: user 27.1 s, sys: 417 ms, total: 27.5 s
Wall time: 29.8 s


In [4]:
%%time
def get_building_worth(building_id):
    interests = train[train.building_id == building_id].interest_level.apply(lambda x: 2 - y_map[x])
    return sum(interests) / len(interests) if len(interests) > 5 else 0

building_ids = np.concatenate((np.unique(train.building_id), np.unique(test.building_id)), axis=0)
building_worths = {}
for building_id in building_ids:
    building_worths[building_id] = get_building_worth(building_id)

CPU times: user 56.9 s, sys: 687 ms, total: 57.6 s
Wall time: 58.5 s


In [5]:
%%time
listing_id = test.listing_id.values
train['interest_level'] = train['interest_level'].apply(lambda x: y_map[x])
y_train = train.interest_level.values

train = train.drop(['listing_id', 'interest_level'], axis=1)
test = test.drop('listing_id', axis=1)

ntrain = train.shape[0]

train_test = pd.concat((train, test), axis=0).reset_index(drop=True)

CPU times: user 184 ms, sys: 72.7 ms, total: 257 ms
Wall time: 256 ms


In [6]:
%%time
lat_long_price = train_test[['latitude', 'longitude', 'price']]
remove_outliers = (np.abs(stats.zscore(lat_long_price)) < 0.15).all(axis=1)
lat_long_price = lat_long_price[remove_outliers]
lat_max, lat_min = max(lat_long_price.latitude), min(lat_long_price.latitude)
long_max, long_min = max(lat_long_price.longitude), min(lat_long_price.longitude)
lat_scale, long_scale = lat_max - lat_min, long_max - long_min
costs = np.zeros((100,100))
num_listings = np.zeros((100,100))
for lat, long, price in lat_long_price.values:
    scaled_lat, scaled_long = int((lat - lat_min) * 99 / lat_scale), int((long - long_min) * 99 / long_scale)
    costs[scaled_lat][scaled_long] += price
    num_listings[scaled_lat][scaled_long] += 1

price_by_area = []
for lat, long, price in train_test[['latitude', 'longitude', 'price']].values:
    scaled_lat, scaled_long = int((lat - lat_min) * 99 / lat_scale), int((long - long_min) * 99 / long_scale)
    if scaled_lat < 0 or scaled_lat >= 100 or scaled_long < 0 or scaled_long >= 100:
        price_by_area.append(0)
    elif num_listings[scaled_lat][scaled_long] > 5:
        price_by_area.append(price - (costs[scaled_lat][scaled_long] / num_listings[scaled_lat][scaled_long]))
    else:
        cost = 0
        num = 0
        for i in range(scaled_lat - 1, scaled_lat + 2):
            for j in range(scaled_long - 1, scaled_long + 2):
                if i > 0 and i < 100 and j >= 0 and j < 100:
                    cost += costs[i][j]
                    num += num_listings[i][j]
        if num > 5:
            price_by_area.append(price - (cost / num))
        else:
            price_by_area.append(0)

#train_test['price_by_area'] = price_by_area

CPU times: user 1.01 s, sys: 9.43 ms, total: 1.02 s
Wall time: 1.03 s


In [7]:
%%time
# convert date to be more useable
train_test['Date'] = pd.to_datetime(train_test['created'])
# year is all the same
#train_test['Year'] = train_test['Date'].dt.year
train_test['Month'] = train_test['Date'].dt.month
train_test['Day'] = train_test['Date'].dt.day
train_test['Wday'] = train_test['Date'].dt.dayofweek
train_test['Yday'] = train_test['Date'].dt.dayofyear
train_test['hour'] = train_test['Date'].dt.hour
train_test = train_test.drop(['Date', 'created'], axis=1)


# check if this building id/manager id and add worths is 0
train_test['Zero_building_id'] = train_test['building_id'].apply(lambda x: 1 if x == '0' else 0)
train_test['Zero_manager_id'] = train_test['manager_id'].apply(lambda x: 1 if x == '0' else 0)
train_test['manager_worths'] = train_test['manager_id'].apply(lambda x: manager_worths[x] if x in manager_worths else 0)
train_test['building_worths'] = train_test['building_id'].apply(lambda x: building_worths[x] if x in building_worths else 0)


# mess with description meta data
train_test['desc'] = train_test['description']
train_test['desc'] = train_test['desc'].apply(lambda x: x.replace('<p><a  website_redacted ', ''))
train_test['desc'] = train_test['desc'].apply(lambda x: x.replace('!<br /><br />', ''))

string.punctuation.__add__('!!')
string.punctuation.__add__('(')
string.punctuation.__add__(')')
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

train_test['desc'] = train_test['desc'].apply(lambda x: x.translate(remove_punct_map))
train_test['desc_letters_count'] = train_test['description'].apply(lambda x: len(x.strip()))
train_test['desc_words_count'] = train_test['desc'].apply(lambda x: 0 if len(x.strip()) == 0 else len(x.split(' ')))
train_test['desc_words_length'] = (train_test['desc_letters_count'] / train_test['desc_words_count']).apply(lambda x: 0 if math.isnan(x) or math.isinf(x) else x)

train_test.drop(['description', 'desc'], axis=1, inplace=True)


# address
train_test['address1'] = train_test['display_address']
train_test['address1'] = train_test['address1'].apply(lambda x: x.lower())

address_map = {
    'w': 'west',
    'st.': 'street',
    'ave': 'avenue',
    'st': 'street',
    'e': 'east',
    'n': 'north',
    's': 'south'
}

def address_map_func(s):
    s = s.split(' ')
    out = []
    for x in s:
        if x in address_map:
            out.append(address_map[x])
        else:
            out.append(x)
    return ' '.join(out)


train_test['address1'] = train_test['address1'].apply(lambda x: x.translate(remove_punct_map))
train_test['address1'] = train_test['address1'].apply(lambda x: address_map_func(x))

new_cols = ['street', 'avenue', 'east', 'west', 'north', 'south']
for col in new_cols:
    train_test[col] = train_test['address1'].apply(lambda x: 1 if col in x else 0)

train_test['other_address'] = train_test[new_cols].apply(lambda x: 1 if x.sum() == 0 else 0, axis=1)
train_test.drop(['display_address', 'street_address'], axis=1, inplace=True)


# features (better not to lower)
train_test['features_count'] = train_test['features'].apply(lambda x: len(x))
train_test['features2'] = train_test['features']
#train_test['features2'] = train_test['features2'].apply(lambda x: ' '.join([''.join(i.replace('_',' ').replace('-',' ').split(' ')) for i in x]).lower())
train_test['features2'] = train_test['features2'].apply(lambda x: ' '.join([''.join(i.replace('_',' ').replace('-',' ').split(' ')) for i in x]))

c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1))
c_vect.fit(train_test['features2'])

c_vect_sparse_1 = c_vect.transform(train_test['features2'])
c_vect_sparse1_cols = c_vect.get_feature_names()
train_test.drop(['features', 'features2'], axis=1, inplace=True)


# photos
train_test['photos_count'] = train_test['photos'].apply(lambda x: len(x))
train_test.drop(['photos'], axis=1, inplace=True)


# convert ['building_id', 'manager_id', 'address1'] to enumerated labels
categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']
for feat in categoricals:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_test[feat].values))
    train_test[feat] = lbl.transform(list(train_test[feat].values))


# convert bed/bath number to enumeration
train_test['bathrooms_cat'] = train_test['bathrooms'].apply(lambda x: str(x))
train_test['bathrooms_cat'], labels = pd.factorize(train_test['bathrooms_cat'].values, sort=True)
train_test.drop('bathrooms', axis=1, inplace=True)
train_test['bedroom_cat'], labels = pd.factorize(train_test['bedrooms'].values, sort=True)
train_test.drop('bedrooms', axis=1, inplace=True)

    
# try adding real - predicted price
# Try to predict price for a listing and add real_price - expected_price as a feature
# train on [exclusive, byowner, nofee] and #bed, #bath, display address
features_to_use = ['bathrooms_cat', 'bedroom_cat', 'street', 'avenue', 'east', 'west', 'north', 'south', 'address1', 'other_address', 'building_worths', 'manager_worths']
feature_to_predict = 'price'
params = {
    'objective': 'reg:linear',
    'booster':'gblinear',
    'lambda': 0,
    'lambda_bias' : 0,
    'alpha': 0.2
}
prices = train_test[feature_to_predict]
remove_outliers = np.abs(prices-prices.mean())<=(3*prices.std())
dtrain = xgb.DMatrix(data=train_test[remove_outliers][features_to_use], label=train_test[remove_outliers][feature_to_predict])

bst = xgb.cv(params, dtrain, 10000, 4, early_stopping_rounds=50, verbose_eval=200)
best_rounds = np.argmin(bst['test-rmse-mean'])
print (bst['test-rmse-mean'][best_rounds])
bst = xgb.train(params, dtrain, best_rounds)
dtrain = xgb.DMatrix(data=train_test[features_to_use])
expected_price = bst.predict(dtrain)
train_test['real_minus_expected_price'] = train_test[feature_to_predict] - expected_price
train_test['real_over_expected_price'] = train_test[feature_to_predict] / expected_price


# transform price
bc_price, tmp = boxcox(train_test.price)
train_test['bc_price'] = bc_price
train_test.drop('price', axis=1, inplace=True)


# add sparse
train_test_cv1_sparse = sparse.hstack((train_test, c_vect_sparse_1)).tocsr()
#x_train = train_test_cv1_sparse[:20000, :]
#y_train = y_train[:20000]
x_train = train_test_cv1_sparse[:ntrain, :]
x_test = train_test_cv1_sparse[ntrain:, :]


# add feature names
features = list(train_test.columns) + ['sparse_' + vect_name for vect_name in c_vect_sparse1_cols]

SEED = 777
NFOLDS = 5

params = {
    'eta':.01,
    'colsample_bytree':.8,
    'subsample':.8,
    'seed':0,
    'nthread':16,
    'objective':'multi:softprob',
    'eval_metric':'mlogloss',
    'num_class':3,
    'silent':1
}


dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test)

[0]	train-rmse:1915.83+15.558	test-rmse:1915.57+51.9921
[200]	train-rmse:1600.69+13.7083	test-rmse:1600.63+41.3602
1600.6278075
CPU times: user 38 s, sys: 2.3 s, total: 40.3 s
Wall time: 25.1 s


In [8]:
%%time
bst = xgb.cv(params, dtrain, 10000, NFOLDS, early_stopping_rounds=50, verbose_eval=200)
best_rounds = np.argmin(bst['test-mlogloss-mean'])
print (bst['test-mlogloss-mean'][best_rounds])
#bst_cv_scores.append(bst['test-mlogloss-mean'][best_rounds])

[0]	train-mlogloss:1.09137+1.48647e-05	test-mlogloss:1.09152+3.15062e-05
[200]	train-mlogloss:0.598842+0.000920458	test-mlogloss:0.620982+0.00177241
[400]	train-mlogloss:0.52221+0.00111015	test-mlogloss:0.562176+0.0029349
[600]	train-mlogloss:0.489104+0.00121371	test-mlogloss:0.544833+0.00357821
[800]	train-mlogloss:0.466682+0.00149326	test-mlogloss:0.536698+0.00378331
[1000]	train-mlogloss:0.448672+0.0014917	test-mlogloss:0.531847+0.00409619
[1200]	train-mlogloss:0.433125+0.00162193	test-mlogloss:0.528762+0.00432083
[1400]	train-mlogloss:0.419082+0.00171988	test-mlogloss:0.52656+0.00448819
[1600]	train-mlogloss:0.406143+0.0017383	test-mlogloss:0.525117+0.00461924
[1800]	train-mlogloss:0.393869+0.00157158	test-mlogloss:0.524121+0.00473042
[2000]	train-mlogloss:0.382159+0.00171889	test-mlogloss:0.523456+0.00487192
[2200]	train-mlogloss:0.371075+0.001714	test-mlogloss:0.522928+0.004976
[2400]	train-mlogloss:0.360703+0.00168349	test-mlogloss:0.522747+0.00507555
0.5227132
CPU times: user 1

In [9]:
bst = xgb.train(params, dtrain, best_rounds)
preds = bst.predict(dtest)
preds = pd.DataFrame(preds)
cols = ['high', 'medium', 'low']
preds.columns = cols
preds['listing_id'] = listing_id
preds.to_csv('my_preds.csv', index=None)

In [10]:
importance = bst.get_fscore()
feature_importance = [(feature, (importance['f{}'.format(i)]) if 'f{}'.format(i) in importance else 0) for i, feature in enumerate(features)]
sorted(feature_importance, key=lambda x: -x[1])

[('bc_price', 24760),
 ('manager_worths', 22637),
 ('latitude', 20639),
 ('longitude', 19604),
 ('desc_words_length', 19528),
 ('building_id', 18451),
 ('building_worths', 18138),
 ('address1', 17624),
 ('real_over_expected_price', 16943),
 ('real_minus_expected_price', 16589),
 ('desc_letters_count', 14857),
 ('manager_id', 14114),
 ('Yday', 14099),
 ('desc_words_count', 12208),
 ('Day', 11191),
 ('hour', 10315),
 ('bedroom_cat', 9406),
 ('features_count', 9377),
 ('photos_count', 9329),
 ('Wday', 8251),
 ('sparse_nofee', 3417),
 ('bathrooms_cat', 3176),
 ('sparse_laundryinbuilding', 2010),
 ('sparse_furnished', 1966),
 ('sparse_hardwoodfloors', 1341),
 ('sparse_laundryinunit', 1312),
 ('Month', 1174),
 ('sparse_catsallowed', 957),
 ('street', 955),
 ('sparse_dogsallowed', 944),
 ('sparse_exclusive', 937),
 ('sparse_prewar', 919),
 ('sparse_elevator', 875),
 ('sparse_reducedfee', 845),
 ('avenue', 824),
 ('east', 756),
 ('sparse_doorman', 732),
 ('sparse_dishwasher', 720),
 ('sparse_f