In [1]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import random
import xgboost as xgb
from sklearn import preprocessing, model_selection
from sklearn.metrics import log_loss
import string
from sklearn.feature_extraction.text import  CountVectorizer
from scipy.stats import boxcox
from scipy import stats
from scipy import sparse



In [3]:
def fpreproc_leaky(dtrain, dtest, param):
    train, test = dtrain.data, dtest.data
    ntrain = train.shape[0]
    train_test = pd.concat((train, test), axis=0).reset_index(drop=True)
    y_train = dtrain.labels
    
    
    # add manager worths
    manager_ids = np.unique(train.manager_id)
    manager_worths = {}
    for manager_id in manager_ids:
        interests = y_train[train.manager_id == manager_id].apply(lambda x: 2 - x)
        if len(interests) > 5:
            manager_worths[manager_id] = sum(interests) / len(interests)
    average_interest = y_train.apply(lambda x: 2 - x)
    average_manager_worth = 0 #sum(average_interest) / len(average_interest)
    train_test['manager_worths'] = train_test['manager_id'].apply(lambda x: manager_worths[x] if x in manager_worths else average_manager_worth)

    
    # add building worths
    building_ids = np.unique(train.building_id)
    building_worths = {}
    for building_id in building_ids:
        interests = y_train[train.building_id == building_id].apply(lambda x: 2 - x)
        if len(interests) > 5:
            building_worths[building_id] = sum(interests) / len(interests)
    average_interest = y_train.apply(lambda x: 2 - x)
    average_building_worth = 0 #sum(average_interest) / len(average_interest)
    train_test['building_worths'] = train_test['building_id'].apply(lambda x: building_worths[x] if x in building_worths else average_building_worth)

    
    # add price by area
    lat_long_price = train_test[['latitude', 'longitude', 'price', 'bedroom_cat']]
    remove_outliers = (np.abs(stats.zscore(lat_long_price)) < 0.15).all(axis=1)
    lat_long_price = lat_long_price[remove_outliers]
    lat_max, lat_min = max(lat_long_price.latitude), min(lat_long_price.latitude)
    long_max, long_min = max(lat_long_price.longitude), min(lat_long_price.longitude)
    lat_scale, long_scale = lat_max - lat_min, long_max - long_min
    costs = np.zeros((100,100))
    num_listings = np.zeros((100,100))
    for lat, long, price, beds in lat_long_price.values:
        scaled_lat, scaled_long = int((lat - lat_min) * 99 / lat_scale), int((long - long_min) * 99 / long_scale)
        costs[scaled_lat][scaled_long] += price / beds
        num_listings[scaled_lat][scaled_long] += 1

    price_by_area = []
    for lat, long, price, beds in train_test[['latitude', 'longitude', 'price', 'bedroom_cat']].values:
        scaled_lat, scaled_long = int((lat - lat_min) * 99 / lat_scale), int((long - long_min) * 99 / long_scale)
        if scaled_lat < 0 or scaled_lat >= 100 or scaled_long < 0 or scaled_long >= 100:
            price_by_area.append(0)
        elif num_listings[scaled_lat][scaled_long] > 8:
            price_by_area.append((price / beds) - (costs[scaled_lat][scaled_long] / num_listings[scaled_lat][scaled_long]))
        else:
            cost = 0
            num = 0
            for i in range(scaled_lat - 1, scaled_lat + 2):
                for j in range(scaled_long - 1, scaled_long + 2):
                    if i > 0 and i < 100 and j >= 0 and j < 100:
                        cost += costs[i][j]
                        num += num_listings[i][j]
            if num > 8:
                price_by_area.append((price / beds) - (cost / num))
            else:
                price_by_area.append(0)

    train_test['price_by_area'] = price_by_area
    
    
    # try adding real - predicted price
    # Try to predict price for a listing and add real_price - expected_price as a feature
    # train on [exclusive, byowner, nofee] and #bed, #bath, display address
    features_to_use = ['bathrooms', 'bedroom', 'street', 'avenue', 'east', 'west', 'north', 'south', 'address1', 'other_address', 'building_worths', 'manager_worths']
    feature_to_predict = 'price'
    params = {
        'objective': 'reg:linear',
        'booster':'gblinear',
        'lambda': 0,
        'lambda_bias' : 0,
        'alpha': 0.2
    }
    prices = train_test[feature_to_predict]
    remove_outliers = np.abs(prices-prices.mean())<=(3*prices.std())
    dtrain = xgb.DMatrix(data=train_test[remove_outliers][features_to_use], label=train_test[remove_outliers][feature_to_predict])

    bst = xgb.cv(params, dtrain, 10000, 4, early_stopping_rounds=50, verbose_eval=200)
    best_rounds = np.argmin(bst['test-rmse-mean'])
    print (bst['test-rmse-mean'][best_rounds])
    bst = xgb.train(params, dtrain, best_rounds)
    dtrain = xgb.DMatrix(data=train_test[features_to_use])
    expected_price = bst.predict(dtrain)
    train_test['real_minus_expected_price'] = train_test[feature_to_predict] - expected_price
    train_test['real_over_expected_price'] = train_test[feature_to_predict] / expected_price
    
    
    # enumerated streets / price / price per street (/ num bedrooms) / price / price per street (/ num bedrooms + 0.1*bath)
    
    address1
    BoroCode
    NTACode
    
    # enumerated neighborhoods / same as above
    
    
    
    # remove extra
    train_test.drop('price', axis=1, inplace=True)
    
    dtrain = xgb.DMatrix(data=train_test[:ntrain], label=y_train)
    dtest = xgb.DMatrix(data=train_test[ntrain:], label=dtest.labels)
    return dtrain, dtest, param

In [60]:
current_filter = ['bc_price', 'price_per_bedroom', 'price_per_bathroom', 'price_per_bed_and_bath', 'price', 'latitude', 'price_by_area', 'BoroCode', 'NTACode', 'longitude', 'manager_worths', 'address1', 'real_over_expected_price', 'desc_words_length', 'real_minus_expected_price', 'manager_id', 'Yday', 'building_id', 'building_worths', 'desc_letters_count', 'Day', 'desc_words_count', 'hour', 'bedrooms', 'photos_count', 'features_count', 'Wday', 'sparse_nofee', 'bathrooms', 'sparse_furnished', 'sparse_laundryinbuilding', 'sparse_hardwoodfloors', 'sparse_laundryinunit', 'Month', 'sparse_catsallowed', 'sparse_exclusive', 'street', 'sparse_elevator', 'sparse_prewar', 'sparse_dogsallowed', 'sparse_reducedfee', 'avenue', 'east', 'sparse_doorman', 'sparse_dishwasher', 'other_address', 'sparse_fitnesscenter', 'sparse_privateoutdoorspace', 'sparse_commonoutdoorspace', 'sparse_outdoorspace', 'sparse_loft', 'sparse_diningroom', 'sparse_balcony', 'sparse_highspeedinternet', 'sparse_parkingspace', 'sparse_terrace', 'sparse_swimmingpool', 'west', 'sparse_roofdeck', 'sparse_actualapt', 'sparse_wheelchairaccess', 'sparse_newconstruction', 'Zero_building_id', 'sparse_simplex', 'sparse_patio', 'sparse_garden', 'sparse_multilevel', 'sparse_hardwood', 'sparse_shorttermallowed', 'south', 'sparse_stainlesssteelappliances', 'sparse_fireplace', 'sparse_highceiling', 'sparse_renovated', 'sparse_liveinsuper', 'sparse_storage', 'sparse_garage', 'sparse_dryerinunit', 'sparse_outdoorareas', 'sparse_petsok', 'sparse_lndrybldg', 'sparse_concierge', 'sparse_new', 'sparse_highceilings', 'sparse_onsitelaundry', 'sparse_centrala', 'sparse_flex3', 'sparse_photos', 'sparse_view', 'sparse_publicoutdoor', 'sparse_allutilitiesincluded', 'sparse_residentslounge', 'sparse_newlyrenovated', 'sparse_washerinunit', 'sparse_onsitegarage', 'sparse_assignedparkingspace', 'north', 'sparse_washer', 'sparse_light', 'sparse_dryer', 'sparse_lowrise', 'sparse_sublet', 'sparse_granitekitchen', 'sparse_elev', 'sparse_virtualdoorman', 'sparse_sundeck', 'sparse_rooftopdeck', 'sparse_wallsofwindows', 'sparse_sharesok', 'sparse_duplex', 'sparse_nopets', 'sparse_cable', 'sparse_microwave', 'sparse_wifiaccess', 'sparse_walkincloset', 'sparse_petsonapproval', 'sparse_pool', 'sparse_eatinkitchen', 'sparse_marblebath', 'sparse_live', 'sparse_sauna', 'sparse_greenbuilding', 'sparse_exposedbrick', 'sparse_largelivingroom', 'sparse_bikeroom', 'sparse_highrise', 'sparse_laundry', 'sparse_privateroofdeck', 'sparse_laundryroom', 'sparse_commonbackyard', 'sparse_privatebackyard', 'sparse_parking', 'sparse_privateparking', 'sparse_childrensplayroom', 'sparse_privatebalcony', 'sparse_indoorpool']
leaky = ['manager_worths', 'building_worths', 'price_by_area', 'real_minus_expected_price', 'real_over_expected_price']
def filter_data_to_columns(data):
    return data[[col for col in current_filter if col not in leaky]]

# Run CV

In [5]:
%%time
class FakeDMatrix:
    def __init__(self, data, labels=None):
        self.data = data
        self.labels = labels
        self.num = len(data)

    def num_row(self):
        return self.num

    def slice(self, rindex):
        indices = np.zeros(self.num, dtype=np.bool)
        for index in rindex:
            indices[index] = True
        return FakeDMatrix(data=self.data[indices], labels=self.labels[indices])
    
SEED = 777
NFOLDS = 3
y_map = {'low': 2, 'medium': 1, 'high': 0}

params = {
    'eta':.01,
    'colsample_bytree':.8,
    'subsample':.8,
    'seed':0,
    'nthread':16,
    'objective':'multi:softprob',
    'eval_metric':'mlogloss',
    'num_class':3,
    'silent':1
}

processed_train = filter_data_to_columns(pd.read_json('data/train.json'))
processed_test = filter_data_to_columns(pd.read_json('data/test.json'))
y_train = pd.read_json('data/train_interest.json', typ='series')
listing_id = pd.read_json('data/test_ids.json', typ='series').values


dtrain = FakeDMatrix(data=processed_train, labels=y_train)
processed_train = None
y_train = None

bst = xgb.cv(params, dtrain, 10000, NFOLDS, early_stopping_rounds=50, verbose_eval=200, fpreproc=fpreproc_leaky)
best_rounds = np.argmin(bst['test-mlogloss-mean'])
print (bst['test-mlogloss-mean'][best_rounds])
print (best_rounds)

[0]	train-rmse:1950.96+34.4251	test-rmse:1949.5+53.2433
[200]	train-rmse:1645.32+20.4463	test-rmse:1645.08+61.1426
1645.081482
[0]	train-rmse:1935.76+52.4955	test-rmse:1934.8+64.7097
[200]	train-rmse:1645.64+22.1719	test-rmse:1645.95+66.6588
1645.953247
[0]	train-rmse:1966.36+22.6638	test-rmse:1966.06+67.9858
[200]	train-rmse:1645.48+22.4534	test-rmse:1645.44+67.2012
1645.440918
[0]	train-mlogloss:1.09143+2.00499e-05	test-mlogloss:1.09176+1.41657e-05
[200]	train-mlogloss:0.598601+0.000625148	test-mlogloss:0.641504+0.00183947
[400]	train-mlogloss:0.519138+0.000594581	test-mlogloss:0.587296+0.00274033
[600]	train-mlogloss:0.482462+0.000448078	test-mlogloss:0.569714+0.0029264
[800]	train-mlogloss:0.457343+0.000351523	test-mlogloss:0.561291+0.00289459
[1000]	train-mlogloss:0.437719+0.000530272	test-mlogloss:0.556462+0.00286698
[1200]	train-mlogloss:0.420989+0.000573644	test-mlogloss:0.553342+0.00277451
[1400]	train-mlogloss:0.405876+0.000678381	test-mlogloss:0.551211+0.00266778
[1600]	trai

# Run To Submit

In [7]:
%%time
dtest = FakeDMatrix(data=processed_test)
processed_test = None

dtrain_final, dtest_final, _ = fpreproc_leaky(dtrain, dtest, None)
dtrain = None
dtest = None

bst = xgb.train(params, dtrain_final, best_rounds)
dtrain_final = None

preds = bst.predict(dtest_final)
# save for column names
#dtest_final = None

preds = pd.DataFrame(preds)
preds.columns = ['high', 'medium', 'low']
preds['listing_id'] = listing_id
preds.to_csv('data/my_preds.csv', index=None)

importance = bst.get_fscore()
feature_importance = [(feature, (importance[feature])) for feature in importance]
print (sorted(feature_importance, key=lambda x: -x[1]))

[('bc_price', 22052), ('latitude', 20582), ('price_by_area', 20205), ('longitude', 19794), ('manager_worths', 18563), ('address1', 17124), ('real_over_expected_price', 16990), ('desc_words_length', 16945), ('real_minus_expected_price', 16153), ('manager_id', 15467), ('Yday', 14915), ('building_id', 14768), ('building_worths', 14457), ('desc_letters_count', 13911), ('Day', 11938), ('desc_words_count', 10407), ('hour', 10283), ('bedroom_cat', 8818), ('photos_count', 8595), ('features_count', 7736), ('Wday', 6037), ('sparse_nofee', 3430), ('bathrooms_cat', 2698), ('sparse_furnished', 1896), ('sparse_laundryinbuilding', 1860), ('sparse_hardwoodfloors', 1452), ('sparse_laundryinunit', 1402), ('Month', 1303), ('sparse_catsallowed', 1091), ('sparse_exclusive', 1067), ('street', 1064), ('sparse_elevator', 1006), ('sparse_prewar', 997), ('sparse_dogsallowed', 983), ('sparse_reducedfee', 915), ('avenue', 906), ('east', 885), ('sparse_doorman', 878), ('sparse_dishwasher', 828), ('other_address', 