In [1]:
import xgboost as xgb
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing



In [2]:
# Read the train and test listings from ./input and stack them row-wise into
# a single frame, so later cells can build features on both sets at once.
train, test = (
    pd.read_json(path)
    for path in ('./input/train.json', './input/test.json')
)
combined = pd.concat(objs=[train, test])

In [3]:
# One LabelEncoder per ID column, fitted on the combined frame so that every
# ID value present in either input file gets an integer code.
building_enc = preprocessing.LabelEncoder()
manager_enc = preprocessing.LabelEncoder()

building_enc.fit(combined['building_id'])
manager_enc.fit(combined['manager_id'])

LabelEncoder()

In [4]:
# Attach the integer-coded ID columns produced by the encoders fitted above.
combined = combined.assign(
    building_enc=building_enc.transform(combined['building_id']),
    manager_enc=manager_enc.transform(combined['manager_id']),
)

In [5]:
# Number of entries in each listing's `photos` value.
combined['n_photo'] = combined['photos'].map(len)

In [6]:
# Columns fed to the model: raw numeric fields first, then the engineered ones.
feature_list = [
    'bathrooms',
    'bedrooms',
    'price',
    'latitude',
    'longitude',
    'building_enc',
    'manager_enc',
    'n_photo',
]

In [7]:
# Split the combined frame back apart: rows with an `interest_level` value
# become the training set, rows where it is null become the test set.
has_label = combined['interest_level'].notnull()
train = combined[has_label]
test = combined[~has_label]

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the labelled rows for validation, with a fixed seed.
train_features = train[feature_list]
# Double-bracket selection keeps the target 2-D (one column); the encoding
# cell below indexes it as [:, 0].
train_targets = train[['interest_level']].values

X_train, X_validate, y_train, y_validate = train_test_split(
    train_features,
    train_targets,
    test_size=0.33,
    random_state=42,
)

In [9]:
# Map the string targets to integer class ids. The encoder is fitted on the
# training targets only and then applied to both splits.
# (`preprocessing` is already imported in the notebook's first cell.)
le = preprocessing.LabelEncoder()
le.fit(y_train[:, 0])
y_train_encoded, y_validate_encoded = [
    le.transform(targets[:, 0]) for targets in (y_train, y_validate)
]

In [10]:
# Wrap both splits in XGBoost's DMatrix container with their encoded labels.
dvalidate = xgb.DMatrix(data=X_validate, label=y_validate_encoded)
dtrain = xgb.DMatrix(data=X_train, label=y_train_encoded)

# Datasets to evaluate after each boosting round, reported as 'eval' then 'train'.
evallist = [(dvalidate, 'eval'), (dtrain, 'train')]

In [11]:
# Booster configuration for a 3-class model that outputs per-class probabilities.
#
# The tree depth and learning rate use XGBoost's actual parameter names
# (`max_depth`, `eta`). The previous `bst:`-prefixed keys are not parameter
# names in the XGBoost parameter reference, so the requested depth of 8 could
# be dropped in favour of the library default.
param = {
    'max_depth': 8,
    'eta': 0.3,
    # NOTE(review): `silent` is kept for compatibility with the XGBoost version
    # this notebook was run on; newer releases use `verbosity` instead — confirm
    # against the installed version.
    'silent': 1,
    'objective': 'multi:softprob',
    'num_class': 3,
}

num_round = 100
# Pass the round count and watch-list by keyword; newer XGBoost releases expect
# `evals` to be given by name rather than positionally.
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=evallist)

[0]	eval-merror:0.294222	train-merror:0.283472
[1]	eval-merror:0.288635	train-merror:0.277937
[2]	eval-merror:0.290047	train-merror:0.278663
[3]	eval-merror:0.288144	train-merror:0.277272
[4]	eval-merror:0.288267	train-merror:0.276879
[5]	eval-merror:0.287898	train-merror:0.27697
[6]	eval-merror:0.287776	train-merror:0.275941
[7]	eval-merror:0.287284	train-merror:0.274974
[8]	eval-merror:0.287039	train-merror:0.274792
[9]	eval-merror:0.287162	train-merror:0.273129
[10]	eval-merror:0.285197	train-merror:0.271012
[11]	eval-merror:0.283907	train-merror:0.26962
[12]	eval-merror:0.284706	train-merror:0.268471
[13]	eval-merror:0.284583	train-merror:0.266959
[14]	eval-merror:0.284092	train-merror:0.266203
[15]	eval-merror:0.283171	train-merror:0.265084
[16]	eval-merror:0.282188	train-merror:0.263421
[17]	eval-merror:0.281513	train-merror:0.261364
[18]	eval-merror:0.281267	train-merror:0.260275
[19]	eval-merror:0.281022	train-merror:0.259126
[20]	eval-merror:0.280837	train-merror:0.256646
[21]

In [12]:
from sklearn.metrics import log_loss

# Per-class probabilities for the validation split; column k is the
# probability of encoded class k.
predict_validate = bst.predict(dvalidate)

# Score against the 1-D integer-encoded targets and state the label set
# explicitly. That ties each probability column to its class id and keeps the
# metric computable even if a class is absent from the validation split
# (without `labels`, log_loss infers the classes from y_true alone).
class_ids = list(range(len(le.classes_)))
print(log_loss(y_validate_encoded, predict_validate, labels=class_ids))

0.596195830965


In [13]:
# Score the unlabelled rows using the same feature columns the model was trained on.
test_features = test[feature_list]
dtest = xgb.DMatrix(test_features)
test_predict = bst.predict(dtest)
test_predict

array([[ 0.06094936,  0.54898244,  0.3900682 ],
       [ 0.04651337,  0.85233468,  0.10115199],
       [ 0.02391572,  0.79791516,  0.17816912],
       ..., 
       [ 0.06407378,  0.80704117,  0.128885  ],
       [ 0.22393315,  0.42850924,  0.3475576 ],
       [ 0.04447626,  0.74955338,  0.20597033]], dtype=float32)

In [14]:
# One row per test listing; probability columns are named after the encoder's
# classes, in the encoder's own order.
result_frame = pd.DataFrame(test_predict, columns=le.classes_)
# Attach listing ids positionally (row i of the predictions <-> row i of `test`).
result_frame['listing_id'] = test['listing_id'].values
result_frame

Unnamed: 0,high,low,medium,listing_id
0,0.060949,0.548982,0.390068,7142618
1,0.046513,0.852335,0.101152,7210040
2,0.023916,0.797915,0.178169,7103890
3,0.060225,0.541236,0.398539,7143442
4,0.010852,0.894415,0.094733,6860601
5,0.000511,0.964856,0.034633,6840081
6,0.100374,0.524810,0.374816,6922337
7,0.032448,0.474580,0.492972,6913616
8,0.123219,0.487968,0.388813,6937820
9,0.209257,0.261908,0.528836,6893933


In [15]:
# Not submitted
# Write the predictions with `listing_id` first, followed by the three class columns.
submission_columns = ['listing_id', 'high', 'medium', 'low']
result_frame[submission_columns].to_csv('Photos-result.csv', index=False)