In [1]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import random
import xgboost as xgb
from sklearn import preprocessing, model_selection
from sklearn.metrics import log_loss
import string
from sklearn.feature_extraction.text import  CountVectorizer
from scipy.stats import boxcox
from scipy import stats
from scipy import sparse



In [2]:
train_file = 'data/train.json'
train = pd.read_json(train_file)
test_file = 'data/test.json'
test = pd.read_json(test_file)

listing_id = test.listing_id.values

y_map = {'low': 2, 'medium': 1, 'high': 0}
train['interest_level'] = train['interest_level'].apply(lambda x: y_map[x])
y_train = train.interest_level.reset_index(drop=True)

train = train.drop(['listing_id', 'interest_level'], axis=1)
test = test.drop('listing_id', axis=1)

In [3]:
def fpreproc_safe(train, test):
    ntrain = train.shape[0]
    train_test = pd.concat((train, test), axis=0).reset_index(drop=True)
    
    
    # convert date to be more useable
    train_test['Date'] = pd.to_datetime(train_test['created'])
    # year is all the same
    #train_test['Year'] = train_test['Date'].dt.year
    train_test['Month'] = train_test['Date'].dt.month
    train_test['Day'] = train_test['Date'].dt.day
    train_test['Wday'] = train_test['Date'].dt.dayofweek
    train_test['Yday'] = train_test['Date'].dt.dayofyear
    train_test['hour'] = train_test['Date'].dt.hour
    train_test = train_test.drop(['Date', 'created'], axis=1)


    # check if this building id/manager id and add worths is 0
    train_test['Zero_building_id'] = train_test['building_id'].apply(lambda x: 1 if x == '0' else 0)
    train_test['Zero_manager_id'] = train_test['manager_id'].apply(lambda x: 1 if x == '0' else 0)
    

    # mess with description meta data
    train_test['desc'] = train_test['description']
    train_test['desc'] = train_test['desc'].apply(lambda x: x.replace('<p><a  website_redacted ', ''))
    train_test['desc'] = train_test['desc'].apply(lambda x: x.replace('!<br /><br />', ''))

    string.punctuation.__add__('!!')
    string.punctuation.__add__('(')
    string.punctuation.__add__(')')
    remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

    train_test['desc'] = train_test['desc'].apply(lambda x: x.translate(remove_punct_map))
    train_test['desc_letters_count'] = train_test['description'].apply(lambda x: len(x.strip()))
    train_test['desc_words_count'] = train_test['desc'].apply(lambda x: 0 if len(x.strip()) == 0 else len(x.split(' ')))
    train_test['desc_words_length'] = (train_test['desc_letters_count'] / train_test['desc_words_count']).apply(lambda x: 0 if math.isnan(x) or math.isinf(x) else x)

    train_test.drop(['description', 'desc'], axis=1, inplace=True)


    # address
    train_test['address1'] = train_test['display_address']
    train_test['address1'] = train_test['address1'].apply(lambda x: x.lower())

    address_map = {
        'w': 'west',
        'st.': 'street',
        'ave': 'avenue',
        'st': 'street',
        'e': 'east',
        'n': 'north',
        's': 'south'
    }

    def address_map_func(s):
        s = s.split(' ')
        out = []
        for x in s:
            if x in address_map:
                out.append(address_map[x])
            else:
                out.append(x)
        return ' '.join(out)


    train_test['address1'] = train_test['address1'].apply(lambda x: x.translate(remove_punct_map))
    train_test['address1'] = train_test['address1'].apply(lambda x: address_map_func(x))

    new_cols = ['street', 'avenue', 'east', 'west', 'north', 'south']
    for col in new_cols:
        train_test[col] = train_test['address1'].apply(lambda x: 1 if col in x else 0)

    train_test['other_address'] = train_test[new_cols].apply(lambda x: 1 if x.sum() == 0 else 0, axis=1)
    train_test.drop(['display_address', 'street_address'], axis=1, inplace=True)


    # features (better not to lower)
    train_test['features_count'] = train_test['features'].apply(lambda x: len(x))
    train_test['features2'] = train_test['features']
    train_test['features2'] = train_test['features2'].apply(lambda x: ' '.join([''.join(i.replace('_',' ').replace('-',' ').split(' ')) for i in x]))

    c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1))
    c_vect.fit(train_test['features2'])

    c_vect_sparse_1 = c_vect.transform(train_test['features2'])
    c_vect_sparse1_cols = c_vect.get_feature_names()
    train_test.drop(['features', 'features2'], axis=1, inplace=True)


    # photos
    train_test['photos_count'] = train_test['photos'].apply(lambda x: len(x))
    train_test.drop(['photos'], axis=1, inplace=True)


    # convert ['building_id', 'manager_id', 'address1'] to enumerated labels
    categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']
    for feat in categoricals:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_test[feat].values))
        train_test[feat] = lbl.transform(list(train_test[feat].values))


    # convert bed/bath number to enumeration
    train_test['bathrooms_cat'] = train_test['bathrooms'].apply(lambda x: str(x))
    train_test['bathrooms_cat'], labels = pd.factorize(train_test['bathrooms_cat'].values, sort=True)
    train_test.drop('bathrooms', axis=1, inplace=True)
    train_test['bedroom_cat'], labels = pd.factorize(train_test['bedrooms'].values, sort=True)
    train_test.drop('bedrooms', axis=1, inplace=True)

    
    # transform managers and building ids
    train_test['manager_id'] = train_test['manager_id'].apply(lambda x: str(x))
    train_test['manager_id'], labels = pd.factorize(train_test['manager_id'].values, sort=True)
    train_test['building_id'] = train_test['building_id'].apply(lambda x: str(x))
    train_test['building_id'], labels = pd.factorize(train_test['building_id'].values, sort=True)
    

    # transform price
    bc_price, tmp = boxcox(train_test.price)
    train_test['bc_price'] = bc_price


    # add sparse
    train_test_cv1_sparse = sparse.hstack((train_test, c_vect_sparse_1)).tocsr()
    x_train = train_test_cv1_sparse[:ntrain, :]
    x_test = train_test_cv1_sparse[ntrain:, :]


    # add feature names
    features = list(train_test.columns) + ['sparse_' + vect_name for vect_name in c_vect_sparse1_cols]
    dtrain_data = pd.DataFrame(np.array(x_train.todense()), columns=features)
    dtest_data = pd.DataFrame(np.array(x_test.todense()), columns=features)
    return dtrain_data, dtest_data

In [38]:
def fpreproc_leaky(dtrain, dtest, param):
    train, test = dtrain.data, dtest.data
    ntrain = train.shape[0]
    train_test = pd.concat((train, test), axis=0).reset_index(drop=True)
    y_train = dtrain.labels
    
    # add manager worths
    manager_ids = np.unique(train.manager_id)
    manager_worths = {}
    for manager_id in manager_ids:
        interests = y_train[train.manager_id == manager_id].apply(lambda x: 2 - x)
        if len(interests) > 5:
            manager_worths[manager_id] = sum(interests) / len(interests)
    average_interest = y_train.apply(lambda x: 2 - x)
    average_manager_worth = 0 #sum(average_interest) / len(average_interest)
    train_test['manager_worths'] = train_test['manager_id'].apply(lambda x: manager_worths[x] if x in manager_worths else average_manager_worth)

    
    # add building worths
    building_ids = np.unique(train.building_id)
    building_worths = {}
    for building_id in building_ids:
        interests = y_train[train.building_id == building_id].apply(lambda x: 2 - x)
        if len(interests) > 5:
            building_worths[building_id] = sum(interests) / len(interests)
    average_interest = y_train.apply(lambda x: 2 - x)
    average_building_worth = 0 #sum(average_interest) / len(average_interest)
    train_test['building_worths'] = train_test['building_id'].apply(lambda x: building_worths[x] if x in building_worths else average_building_worth)

    
    # add price by area
    lat_long_price = train_test[['latitude', 'longitude', 'price']]
    remove_outliers = (np.abs(stats.zscore(lat_long_price)) < 0.15).all(axis=1)
    lat_long_price = lat_long_price[remove_outliers]
    lat_max, lat_min = max(lat_long_price.latitude), min(lat_long_price.latitude)
    long_max, long_min = max(lat_long_price.longitude), min(lat_long_price.longitude)
    lat_scale, long_scale = lat_max - lat_min, long_max - long_min
    costs = np.zeros((100,100))
    num_listings = np.zeros((100,100))
    for lat, long, price in lat_long_price.values:
        scaled_lat, scaled_long = int((lat - lat_min) * 99 / lat_scale), int((long - long_min) * 99 / long_scale)
        costs[scaled_lat][scaled_long] += price
        num_listings[scaled_lat][scaled_long] += 1

    price_by_area = []
    for lat, long, price in train_test[['latitude', 'longitude', 'price']].values:
        scaled_lat, scaled_long = int((lat - lat_min) * 99 / lat_scale), int((long - long_min) * 99 / long_scale)
        if scaled_lat < 0 or scaled_lat >= 100 or scaled_long < 0 or scaled_long >= 100:
            price_by_area.append(0)
        elif num_listings[scaled_lat][scaled_long] > 8:
            price_by_area.append(price - (costs[scaled_lat][scaled_long] / num_listings[scaled_lat][scaled_long]))
        else:
            cost = 0
            num = 0
            for i in range(scaled_lat - 1, scaled_lat + 2):
                for j in range(scaled_long - 1, scaled_long + 2):
                    if i > 0 and i < 100 and j >= 0 and j < 100:
                        cost += costs[i][j]
                        num += num_listings[i][j]
            if num > 8:
                price_by_area.append(price - (cost / num))
            else:
                price_by_area.append(0)

    train_test['price_by_area'] = price_by_area
    
    
    # try adding real - predicted price
    # Try to predict price for a listing and add real_price - expected_price as a feature
    # train on [exclusive, byowner, nofee] and #bed, #bath, display address
    features_to_use = ['bathrooms_cat', 'bedroom_cat', 'street', 'avenue', 'east', 'west', 'north', 'south', 'address1', 'other_address', 'building_worths', 'manager_worths']
    feature_to_predict = 'price'
    params = {
        'objective': 'reg:linear',
        'booster':'gblinear',
        'lambda': 0,
        'lambda_bias' : 0,
        'alpha': 0.2
    }
    prices = train_test[feature_to_predict]
    remove_outliers = np.abs(prices-prices.mean())<=(3*prices.std())
    dtrain = xgb.DMatrix(data=train_test[remove_outliers][features_to_use], label=train_test[remove_outliers][feature_to_predict])

    bst = xgb.cv(params, dtrain, 10000, 4, early_stopping_rounds=50, verbose_eval=200)
    best_rounds = np.argmin(bst['test-rmse-mean'])
    print (bst['test-rmse-mean'][best_rounds])
    bst = xgb.train(params, dtrain, best_rounds)
    dtrain = xgb.DMatrix(data=train_test[features_to_use])
    expected_price = bst.predict(dtrain)
    train_test['real_minus_expected_price'] = train_test[feature_to_predict] - expected_price
    train_test['real_over_expected_price'] = train_test[feature_to_predict] / expected_price
    
    
    # remove extra
    train_test.drop('price', axis=1, inplace=True)
    
    dtrain = xgb.DMatrix(data=train_test[:ntrain], label=y_train)
    dtest = xgb.DMatrix(data=train_test[ntrain:], label=dtest.labels)
    return dtrain, dtest, param

In [5]:
class FakeDMatrix:
    def __init__(self, data, labels=None):
        self.data = data
        self.labels = labels
        self.num = len(data)

    def num_row(self):
       return self.num

    def slice(self, rindex):
        indices = np.zeros(self.num, dtype=np.bool)
        for index in rindex:
            indices[index] = True
        return FakeDMatrix(data=self.data[indices], labels=self.labels[indices])
    
SEED = 777
NFOLDS = 5

params = {
    'eta':.01,
    'colsample_bytree':.8,
    'subsample':.8,
    'seed':0,
    'nthread':16,
    'objective':'multi:softprob',
    'eval_metric':'mlogloss',
    'num_class':3,
    'silent':1
}

dtrain_data, dtest_data = fpreproc_safe(train, test)
dtrain = FakeDMatrix(data=dtrain_data, labels=y_train)
dtest = FakeDMatrix(data=dtest_data)

In [None]:
bst = xgb.cv(params, dtrain, 10000, NFOLDS, early_stopping_rounds=50, verbose_eval=200, fpreproc=fpreproc_leaky)
best_rounds = np.argmin(bst['test-mlogloss-mean'])
print (bst['test-mlogloss-mean'][best_rounds])

[0]	train-rmse:2028.32+126.79	test-rmse:2028.72+98.4538
[200]	train-rmse:1643.14+18.1906	test-rmse:1643.01+54.2032
1643.00567625
[0]	train-rmse:2036.09+93.4448	test-rmse:2038.92+79.9489
1644.18063375
[0]	train-rmse:1977.34+200.909	test-rmse:1984.64+162.267
[200]	train-rmse:1644.02+30.0261	test-rmse:1642.68+87.4003
1642.67880225
[0]	train-rmse:1958.08+91.497	test-rmse:1957.9+74.6673
1643.44158925
[0]	train-rmse:1935.69+74.271	test-rmse:1929.68+77.2772
1642.98703025
[0]	train-mlogloss:1.0917+2.33838e-05	test-mlogloss:1.09194+6.62468e-05
[20]	train-mlogloss:0.97181+0.000262769	test-mlogloss:0.977435+0.000729137
[40]	train-mlogloss:0.882608+0.000580355	test-mlogloss:0.893044+0.000855938


In [None]:
dtrain_final, dtest_final = fpreproc_leaky(dtrain, dtest)
bst = xgb.train(params, dtrain_final, best_rounds)
preds = bst.predict(dtest_final)

preds = pd.DataFrame(preds)
cols = ['high', 'medium', 'low']
preds.columns = cols
preds['listing_id'] = listing_id
preds.to_csv('my_preds.csv', index=None)

importance = bst.get_fscore()
feature_importance = [(feature, (importance['f{}'.format(i)]) if 'f{}'.format(i) in importance else 0) for i, feature in enumerate(features)]
sorted(feature_importance, key=lambda x: -x[1])

In [10]:
%%time
manager_ids = np.unique(train.manager_id)
manager_worths = {}
for manager_id in manager_ids:
    interests = y_train[train.manager_id == manager_id].apply(lambda x: 2 - x)
    if len(interests) > 5:
        manager_worths[manager_id] = sum(interests) / len(interests)
average_interest = y_train.apply(lambda x: 2 - x)
average_manager_worth = 0 #sum(average_interest) / len(average_interest)

CPU times: user 19.8 s, sys: 266 ms, total: 20.1 s
Wall time: 20.9 s


In [11]:
%%time
building_ids = np.unique(train.building_id)
building_worths = {}
for building_id in building_ids:
    interests = y_train[train.building_id == building_id].apply(lambda x: 2 - x)
    if len(interests) > 5:
        building_worths[building_id] = sum(interests) / len(interests)
average_interest = y_train.apply(lambda x: 2 - x)
average_building_worth = 0 #sum(average_interest) / len(average_interest)

CPU times: user 39.2 s, sys: 515 ms, total: 39.7 s
Wall time: 41.5 s


In [12]:
%%time
lat_long_price = train_test[['latitude', 'longitude', 'price']]
remove_outliers = (np.abs(stats.zscore(lat_long_price)) < 0.15).all(axis=1)
lat_long_price = lat_long_price[remove_outliers]
lat_max, lat_min = max(lat_long_price.latitude), min(lat_long_price.latitude)
long_max, long_min = max(lat_long_price.longitude), min(lat_long_price.longitude)
lat_scale, long_scale = lat_max - lat_min, long_max - long_min
costs = np.zeros((100,100))
num_listings = np.zeros((100,100))
for lat, long, price in lat_long_price.values:
    scaled_lat, scaled_long = int((lat - lat_min) * 99 / lat_scale), int((long - long_min) * 99 / long_scale)
    costs[scaled_lat][scaled_long] += price
    num_listings[scaled_lat][scaled_long] += 1

price_by_area = []
for lat, long, price in train_test[['latitude', 'longitude', 'price']].values:
    scaled_lat, scaled_long = int((lat - lat_min) * 99 / lat_scale), int((long - long_min) * 99 / long_scale)
    if scaled_lat < 0 or scaled_lat >= 100 or scaled_long < 0 or scaled_long >= 100:
        price_by_area.append(0)
    elif num_listings[scaled_lat][scaled_long] > 8:
        price_by_area.append(price - (costs[scaled_lat][scaled_long] / num_listings[scaled_lat][scaled_long]))
    else:
        cost = 0
        num = 0
        for i in range(scaled_lat - 1, scaled_lat + 2):
            for j in range(scaled_long - 1, scaled_long + 2):
                if i > 0 and i < 100 and j >= 0 and j < 100:
                    cost += costs[i][j]
                    num += num_listings[i][j]
        if num > 8:
            price_by_area.append(price - (cost / num))
        else:
            price_by_area.append(0)

train_test['price_by_area'] = price_by_area

CPU times: user 447 ms, sys: 3.3 ms, total: 450 ms
Wall time: 451 ms


In [13]:
%%time
# convert date to be more useable
train_test['Date'] = pd.to_datetime(train_test['created'])
# year is all the same
#train_test['Year'] = train_test['Date'].dt.year
train_test['Month'] = train_test['Date'].dt.month
train_test['Day'] = train_test['Date'].dt.day
train_test['Wday'] = train_test['Date'].dt.dayofweek
train_test['Yday'] = train_test['Date'].dt.dayofyear
train_test['hour'] = train_test['Date'].dt.hour
train_test = train_test.drop(['Date', 'created'], axis=1)


# check if this building id/manager id and add worths is 0
train_test['Zero_building_id'] = train_test['building_id'].apply(lambda x: 1 if x == '0' else 0)
train_test['Zero_manager_id'] = train_test['manager_id'].apply(lambda x: 1 if x == '0' else 0)
train_test['manager_worths'] = train_test['manager_id'].apply(lambda x: manager_worths[x] if x in manager_worths else average_manager_worth)
train_test['building_worths'] = train_test['building_id'].apply(lambda x: building_worths[x] if x in building_worths else average_building_worth)


# mess with description meta data
train_test['desc'] = train_test['description']
train_test['desc'] = train_test['desc'].apply(lambda x: x.replace('<p><a  website_redacted ', ''))
train_test['desc'] = train_test['desc'].apply(lambda x: x.replace('!<br /><br />', ''))

string.punctuation.__add__('!!')
string.punctuation.__add__('(')
string.punctuation.__add__(')')
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

train_test['desc'] = train_test['desc'].apply(lambda x: x.translate(remove_punct_map))
train_test['desc_letters_count'] = train_test['description'].apply(lambda x: len(x.strip()))
train_test['desc_words_count'] = train_test['desc'].apply(lambda x: 0 if len(x.strip()) == 0 else len(x.split(' ')))
train_test['desc_words_length'] = (train_test['desc_letters_count'] / train_test['desc_words_count']).apply(lambda x: 0 if math.isnan(x) or math.isinf(x) else x)

train_test.drop(['description', 'desc'], axis=1, inplace=True)


# address
train_test['address1'] = train_test['display_address']
train_test['address1'] = train_test['address1'].apply(lambda x: x.lower())

address_map = {
    'w': 'west',
    'st.': 'street',
    'ave': 'avenue',
    'st': 'street',
    'e': 'east',
    'n': 'north',
    's': 'south'
}

def address_map_func(s):
    s = s.split(' ')
    out = []
    for x in s:
        if x in address_map:
            out.append(address_map[x])
        else:
            out.append(x)
    return ' '.join(out)


train_test['address1'] = train_test['address1'].apply(lambda x: x.translate(remove_punct_map))
train_test['address1'] = train_test['address1'].apply(lambda x: address_map_func(x))

new_cols = ['street', 'avenue', 'east', 'west', 'north', 'south']
for col in new_cols:
    train_test[col] = train_test['address1'].apply(lambda x: 1 if col in x else 0)

train_test['other_address'] = train_test[new_cols].apply(lambda x: 1 if x.sum() == 0 else 0, axis=1)
train_test.drop(['display_address', 'street_address'], axis=1, inplace=True)


# features (better not to lower)
train_test['features_count'] = train_test['features'].apply(lambda x: len(x))
train_test['features2'] = train_test['features']
train_test['features2'] = train_test['features2'].apply(lambda x: ' '.join([''.join(i.replace('_',' ').replace('-',' ').split(' ')) for i in x]))

c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1))
c_vect.fit(train_test['features2'])

c_vect_sparse_1 = c_vect.transform(train_test['features2'])
c_vect_sparse1_cols = c_vect.get_feature_names()
train_test.drop(['features', 'features2'], axis=1, inplace=True)


# photos
train_test['photos_count'] = train_test['photos'].apply(lambda x: len(x))
train_test.drop(['photos'], axis=1, inplace=True)


# convert ['building_id', 'manager_id', 'address1'] to enumerated labels
categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']
for feat in categoricals:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_test[feat].values))
    train_test[feat] = lbl.transform(list(train_test[feat].values))


# convert bed/bath number to enumeration
train_test['bathrooms_cat'] = train_test['bathrooms'].apply(lambda x: str(x))
train_test['bathrooms_cat'], labels = pd.factorize(train_test['bathrooms_cat'].values, sort=True)
train_test.drop('bathrooms', axis=1, inplace=True)
train_test['bedroom_cat'], labels = pd.factorize(train_test['bedrooms'].values, sort=True)
train_test.drop('bedrooms', axis=1, inplace=True)

    
# try adding real - predicted price
# Try to predict price for a listing and add real_price - expected_price as a feature
# train on [exclusive, byowner, nofee] and #bed, #bath, display address
features_to_use = ['bathrooms_cat', 'bedroom_cat', 'street', 'avenue', 'east', 'west', 'north', 'south', 'address1', 'other_address', 'building_worths', 'manager_worths']
feature_to_predict = 'price'
params = {
    'objective': 'reg:linear',
    'booster':'gblinear',
    'lambda': 0,
    'lambda_bias' : 0,
    'alpha': 0.2
}
prices = train_test[feature_to_predict]
remove_outliers = np.abs(prices-prices.mean())<=(3*prices.std())
dtrain = xgb.DMatrix(data=train_test[remove_outliers][features_to_use], label=train_test[remove_outliers][feature_to_predict])

bst = xgb.cv(params, dtrain, 10000, 4, early_stopping_rounds=50, verbose_eval=200)
best_rounds = np.argmin(bst['test-rmse-mean'])
print (bst['test-rmse-mean'][best_rounds])
bst = xgb.train(params, dtrain, best_rounds)
dtrain = xgb.DMatrix(data=train_test[features_to_use])
expected_price = bst.predict(dtrain)
train_test['real_minus_expected_price'] = train_test[feature_to_predict] - expected_price
train_test['real_over_expected_price'] = train_test[feature_to_predict] / expected_price


# transform price
bc_price, tmp = boxcox(train_test.price)
train_test['bc_price'] = bc_price
train_test.drop('price', axis=1, inplace=True)


# add sparse
train_test_cv1_sparse = sparse.hstack((train_test, c_vect_sparse_1)).tocsr()
#x_train = train_test_cv1_sparse[:20000, :]
#y_train = y_train[:20000]
x_train = train_test_cv1_sparse[:ntrain, :]
x_test = train_test_cv1_sparse[ntrain:, :]


# add feature names
features = list(train_test.columns) + ['sparse_' + vect_name for vect_name in c_vect_sparse1_cols]

[0]	train-rmse:2005.81+18.7791	test-rmse:2005.4+62.831
[200]	train-rmse:1638.46+20.0772	test-rmse:1638.04+60.2721
1638.0352785
CPU times: user 17.3 s, sys: 1.25 s, total: 18.6 s
Wall time: 11.3 s


In [16]:
%%time
SEED = 777
NFOLDS = 5

params = {
    'eta':.01,
    'colsample_bytree':.8,
    'subsample':.8,
    'seed':0,
    'nthread':16,
    'objective':'multi:softprob',
    'eval_metric':'mlogloss',
    'num_class':3,
    'silent':1
}

dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test)

bst = xgb.cv(params, dtrain, 10000, NFOLDS, early_stopping_rounds=50, verbose_eval=200)
best_rounds = np.argmin(bst['test-mlogloss-mean'])
print (bst['test-mlogloss-mean'][best_rounds])
#bst_cv_scores.append(bst['test-mlogloss-mean'][best_rounds])

[0]	train-mlogloss:1.09152+1.28062e-05	test-mlogloss:1.09169+3.0288e-05
[200]	train-mlogloss:0.592627+0.000429155	test-mlogloss:0.621589+0.00117747
[400]	train-mlogloss:0.511057+0.000422405	test-mlogloss:0.562877+0.00159155
[600]	train-mlogloss:0.473634+0.000149098	test-mlogloss:0.545578+0.00183562
[800]	train-mlogloss:0.447848+0.000361501	test-mlogloss:0.537699+0.00185401
[1000]	train-mlogloss:0.426777+0.000583778	test-mlogloss:0.533297+0.00170786
[1200]	train-mlogloss:0.408599+0.000783595	test-mlogloss:0.530551+0.00151466
[1400]	train-mlogloss:0.392322+0.000898563	test-mlogloss:0.528749+0.0015258
[1600]	train-mlogloss:0.377225+0.000903515	test-mlogloss:0.527561+0.00145174
[1800]	train-mlogloss:0.363205+0.00088881	test-mlogloss:0.526868+0.00141634
[2000]	train-mlogloss:0.349847+0.000872462	test-mlogloss:0.526541+0.00144961
0.5265242
CPU times: user 51min 56s, sys: 5min 46s, total: 57min 42s
Wall time: 21min 22s


In [26]:
bst = xgb.train(params, dtrain, best_rounds)
preds = bst.predict(dtest)
print (log_loss(x_test, preds))

#preds = pd.DataFrame(preds)
#cols = ['high', 'medium', 'low']
#preds.columns = cols
#preds['listing_id'] = listing_id
#preds.to_csv('my_preds.csv', index=None)

0.553719894183


In [None]:
importance = bst.get_fscore()
feature_importance = [(feature, (importance['f{}'.format(i)]) if 'f{}'.format(i) in importance else 0) for i, feature in enumerate(features)]
sorted(feature_importance, key=lambda x: -x[1])