In [1]:
import xgboost as xgb
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing



In [2]:
# Read both input files, then concatenate them (train rows first) so the
# id encoders below can be fitted on every listing at once.
train_path = './input/train.json'
test_path = './input/test.json'

train = pd.read_json(train_path)
test = pd.read_json(test_path)
combined = pd.concat(objs=[train, test])

In [3]:
# One LabelEncoder per id column, each fitted on the values found in
# `combined`.
building_enc = preprocessing.LabelEncoder()
manager_enc = preprocessing.LabelEncoder()

building_enc.fit(combined['building_id'])
manager_enc.fit(combined['manager_id'])

LabelEncoder()

In [4]:
# Turn the raw id columns into integer codes with the fitted encoders and
# store the codes as new columns on `combined`.
building_codes = building_enc.transform(combined['building_id'])
manager_codes = manager_enc.transform(combined['manager_id'])

combined['building_enc'] = building_codes
combined['manager_enc'] = manager_codes

In [5]:
# Columns used as model inputs; the two *_enc entries are the label-encoded
# building and manager ids.
feature_list = [
    'bathrooms',
    'bedrooms',
    'price',
    'latitude',
    'longitude',
    'building_enc',
    'manager_enc',
]

In [6]:
# Split `combined` back apart: rows that have an interest_level go to
# `train`, rows where it is null go to `test`.
has_label = combined['interest_level'].notnull()
train = combined[has_label]
test = combined[~has_label]

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the labelled rows for validation; the fixed random_state
# keeps the split the same on every run.
features = train[feature_list]
labels = train[['interest_level']].values  # 2-D array with a single column

X_train, X_validate, y_train, y_validate = train_test_split(
    features, labels, test_size=0.33, random_state=42)

In [8]:
from sklearn import preprocessing

# Map the interest_level labels to integer class codes. The encoder is
# fitted on the training labels only and then reused for validation.
le = preprocessing.LabelEncoder()
y_train_encoded = le.fit_transform(y_train[:, 0])
y_validate_encoded = le.transform(y_validate[:, 0])

In [9]:
# Wrap both splits, with their encoded labels, in DMatrix objects.
dtrain = xgb.DMatrix(data=X_train, label=y_train_encoded)
dvalidate = xgb.DMatrix(data=X_validate, label=y_validate_encoded)

# (DMatrix, name) pairs handed to xgb.train as its evaluation list.
evallist = [
    (dvalidate, 'eval'),
    (dtrain, 'train'),
]

In [10]:
# Booster configuration.
# The keys are the plain XGBoost parameter names. The earlier 'bst:'-prefixed
# spellings ('bst:max_depth', 'bst:eta') are not recognised by current
# XGBoost releases, so the requested tree depth and learning rate were not
# applied and the library defaults were used instead.
param = {
    'max_depth': 8,                 # maximum depth of each tree
    'eta': 0.3,                     # learning rate (step-size shrinkage)
    'silent': 1,
    'objective': 'multi:softprob',  # predict one probability per class
    'num_class': 3,
}

# Number of boosting rounds; metrics for every entry of evallist are
# reported after each round.
num_round = 100
bst = xgb.train(param, dtrain, num_round, evallist)

[0]	eval-merror:0.293486	train-merror:0.281385
[1]	eval-merror:0.290354	train-merror:0.278724
[2]	eval-merror:0.28974	train-merror:0.277998
[3]	eval-merror:0.289924	train-merror:0.278119
[4]	eval-merror:0.288881	train-merror:0.277212
[5]	eval-merror:0.28931	train-merror:0.277333
[6]	eval-merror:0.288696	train-merror:0.27576
[7]	eval-merror:0.288758	train-merror:0.275336
[8]	eval-merror:0.286855	train-merror:0.273794
[9]	eval-merror:0.286977	train-merror:0.272433
[10]	eval-merror:0.284951	train-merror:0.26956
[11]	eval-merror:0.283907	train-merror:0.268199
[12]	eval-merror:0.285135	train-merror:0.267534
[13]	eval-merror:0.284092	train-merror:0.267352
[14]	eval-merror:0.282986	train-merror:0.265296
[15]	eval-merror:0.283416	train-merror:0.263844
[16]	eval-merror:0.282004	train-merror:0.262271
[17]	eval-merror:0.280899	train-merror:0.260578
[18]	eval-merror:0.28053	train-merror:0.259459
[19]	eval-merror:0.280469	train-merror:0.257584
[20]	eval-merror:0.280223	train-merror:0.256465
[21]	ev

In [11]:
from sklearn.metrics import log_loss

# Predicted class probabilities for the validation rows, scored with log
# loss against the original (un-encoded) labels.
predict_validate = bst.predict(dvalidate)
validate_loss = log_loss(y_validate, predict_validate)
print(validate_loss)

0.601153511921


In [12]:
# Predict class probabilities for the unlabelled test rows, using the same
# feature columns the model was trained on.
test_features = test[feature_list]
dtest = xgb.DMatrix(test_features)
test_predict = bst.predict(dtest)
test_predict

array([[ 0.03644914,  0.65130293,  0.31224793],
       [ 0.10931576,  0.82827348,  0.06241076],
       [ 0.02048757,  0.85013294,  0.12937947],
       ..., 
       [ 0.08967831,  0.666933  ,  0.24338867],
       [ 0.22634688,  0.44596496,  0.32768816],
       [ 0.03851657,  0.77377886,  0.18770459]], dtype=float32)

In [13]:
# One row per test listing: the probability columns are named after the
# encoder's classes, and listing_id is attached by position.
result_frame = pd.DataFrame(test_predict, columns=le.classes_)
result_frame['listing_id'] = test['listing_id'].reset_index(drop=True)
result_frame

Unnamed: 0,high,low,medium,listing_id
0,0.036449,0.651303,0.312248,7142618
1,0.109316,0.828273,0.062411,7210040
2,0.020488,0.850133,0.129379,7103890
3,0.081615,0.516072,0.402313,7143442
4,0.010779,0.894749,0.094472,6860601
5,0.000697,0.977479,0.021824,6840081
6,0.097035,0.620584,0.282381,6922337
7,0.028749,0.362738,0.608513,6913616
8,0.124624,0.437174,0.438202,6937820
9,0.233752,0.336261,0.429987,6893933


In [15]:
# Not submitted
# Write the predictions to CSV with listing_id first and without the index.
submission_columns = ['listing_id', 'high', 'medium', 'low']
result_frame[submission_columns].to_csv('ManagerBuilding-result.csv', index=False)