In [17]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing, model_selection
import string
from sklearn.feature_extraction.text import  CountVectorizer
from scipy.stats import boxcox
from scipy import sparse


import pickle
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier , ExtraTreesClassifier , AdaBoostClassifier



In [2]:
train_file = 'train.json'
test_file = 'test.json'
train = pd.read_json(train_file)
test = pd.read_json(test_file)
listing_id = test.listing_id.values

In [3]:
y_map = {'low': 2, 'medium': 1, 'high': 0}
train['interest_level'] = train['interest_level'].apply(lambda x: y_map[x])
y_train = train.interest_level.values

train = train.drop(['listing_id', 'interest_level'], axis=1)
test = test.drop('listing_id', axis=1)

ntrain = train.shape[0]

train_test = pd.concat((train, test), axis=0).reset_index(drop=True)

In [4]:
train_test['Date'] = pd.to_datetime(train_test['created'])
train_test['Year'] = train_test['Date'].dt.year
train_test['Month'] = train_test['Date'].dt.month
train_test['Day'] = train_test['Date'].dt.day
train_test['Wday'] = train_test['Date'].dt.dayofweek
train_test['Yday'] = train_test['Date'].dt.dayofyear
train_test['hour'] = train_test['Date'].dt.hour

train_test = train_test.drop(['Date', 'created'], axis=1)

train_test['Zero_building_id'] = train_test['building_id'].apply(lambda x: 1 if x == '0' else 0)

In [5]:
train_test['desc'] = train_test['description']
train_test['desc'] = train_test['desc'].apply(lambda x: x.replace('<p><a  website_redacted ', ''))
train_test['desc'] = train_test['desc'].apply(lambda x: x.replace('!<br /><br />', ''))

string.punctuation.__add__('!!')
string.punctuation.__add__('(')
string.punctuation.__add__(')')

remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

train_test['desc'] = train_test['desc'].apply(lambda x: x.translate(remove_punct_map))
train_test['desc_letters_count'] = train_test['description'].apply(lambda x: len(x.strip()))
train_test['desc_words_count'] = train_test['desc'].apply(lambda x: 0 if len(x.strip()) == 0 else len(x.split(' ')))

train_test.drop(['description', 'desc'], axis=1, inplace=True)

In [6]:
train_test['address1'] = train_test['display_address']
train_test['address1'] = train_test['address1'].apply(lambda x: x.lower())

address_map = {
    'w': 'west',
    'st.': 'street',
    'ave': 'avenue',
    'st': 'street',
    'e': 'east',
    'n': 'north',
    's': 'south'
}


def address_map_func(s):
    s = s.split(' ')
    out = []
    for x in s:
        if x in address_map:
            out.append(address_map[x])
        else:
            out.append(x)
    return ' '.join(out)


train_test['address1'] = train_test['address1'].apply(lambda x: x.translate(remove_punct_map))
train_test['address1'] = train_test['address1'].apply(lambda x: address_map_func(x))

In [7]:
new_cols = ['street', 'avenue', 'east', 'west', 'north', 'south']

for col in new_cols:
    train_test[col] = train_test['address1'].apply(lambda x: 1 if col in x else 0)

train_test['other_address'] = train_test[new_cols].apply(lambda x: 1 if x.sum() == 0 else 0, axis=1)

train_test['features_count'] = train_test['features'].apply(lambda x: len(x))
train_test['features2'] = train_test['features']
train_test['features2'] = train_test['features2'].apply(lambda x: ' '.join(x))

c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1))
c_vect.fit(train_test['features2'])

c_vect_sparse_1 = c_vect.transform(train_test['features2'])
c_vect_sparse1_cols = c_vect.get_feature_names()

train_test.drop(['features', 'features2'], axis=1, inplace=True)

In [8]:
managers_count = train_test['manager_id'].value_counts()

train_test['top_10_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 90)] else 0)
train_test['top_25_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 75)] else 0)
train_test['top_5_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 95)] else 0)
train_test['top_50_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 50)] else 0)
train_test['top_1_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 99)] else 0)
train_test['top_2_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 98)] else 0)
train_test['top_15_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 85)] else 0)
train_test['top_20_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 80)] else 0)
train_test['top_30_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 70)] else 0)

In [9]:
buildings_count = train_test['building_id'].value_counts()

train_test['top_10_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 90)] else 0)
train_test['top_25_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 75)] else 0)
train_test['top_5_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 95)] else 0)
train_test['top_50_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 50)] else 0)
train_test['top_1_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 99)] else 0)
train_test['top_2_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 98)] else 0)
train_test['top_15_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 85)] else 0)
train_test['top_20_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 80)] else 0)
train_test['top_30_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 70)] else 0)

In [10]:
train_test['photos_count'] = train_test['photos'].apply(lambda x: len(x))
train_test.drop(['photos', 'display_address', 'street_address'], axis=1, inplace=True)

categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']

for feat in categoricals:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_test[feat].values))
    train_test[feat] = lbl.transform(list(train_test[feat].values))

In [11]:
bc_price, tmp = boxcox(train_test.price)
train_test['bc_price'] = bc_price

train_test.drop('price', axis=1, inplace=True)
train_test['bathrooms_cat'] = train_test['bathrooms'].apply(lambda x: str(x))

train_test['bathrooms_cat'], labels = pd.factorize(train_test['bathrooms_cat'].values, sort=True)
train_test.drop('bathrooms', axis=1, inplace=True)

train_test['bedroom_cat'], labels = pd.factorize(train_test['bedrooms'].values, sort=True)
train_test.drop('bedrooms', axis=1, inplace=True)

In [12]:
features = list(train_test.columns)
train_test_cv1_sparse = sparse.hstack((train_test, c_vect_sparse_1)).tocsr()
x_train = train_test_cv1_sparse[:ntrain, :]
x_test = train_test_cv1_sparse[ntrain:, :]
features += c_vect_sparse1_cols

In [13]:
SEED = 777
NFOLDS = 5

params = {
    'eta':.01,
    'colsample_bytree':.8,
    'subsample':.8,
    'seed':0,
    'nthread':16,
    'objective':'multi:softprob',
    'eval_metric':'mlogloss',
    'num_class':3,
    'silent':1
}


dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test)

In [16]:
# bst = xgb.cv(params, dtrain, 10000, NFOLDS, early_stopping_rounds=50, verbose_eval=25)
# best_rounds = np.argmin(bst['test-mlogloss-mean']
# bst = xgb.train(params, dtrain, best_rounds)
# preds = bst.predict(dtest)
# preds = pd.DataFrame(preds)
# cols = ['high', 'medium', 'low']
# preds.columns = cols
# preds['listing_id'] = listing_id
# preds.to_csv('my_preds.csv', index=None)

#instead load the pickle model
with open('xgboost.pkl', 'rb') as xgboost:
    bst = pickle.load(xgboost)

In [19]:
train_pred1= bst.predict(dtrain)
train_pred1 = pd.DataFrame(train_pred1)
cols = ['high', 'medium', 'low']
train_pred1.columns = cols
train_loss = log_loss(y_train,train_pred1)
train_loss

0.47050505216533894

In [20]:
# rf1 = RandomForestClassifier(n_estimators=1000, criterion='entropy',  n_jobs = -1)
# rf1.fit(x_train, y_train)
# y_train_pred2 = rf1.predict_proba(x_train)
# log_loss(y_train, y_train_pred2)

#load the second model rf with entropy
with open('rf1.pkl', 'rb') as rf1:
    rf1 = pickle.load(rf1)
y_train_pred2 = rf1.predict_proba(x_train)
log_loss(y_train, y_train_pred2)

0.37498744907016018

In [21]:
# rf2 = RandomForestClassifier(n_estimators=1000, criterion='gini',  n_jobs = -1)
# rf2.fit(x_train, y_train)
# y_train_pred2 = rf2.predict_proba(x_train)
# log_loss(y_train, y_train_pred2)


#load the third model rf with gini
with open('rf2.pkl', 'rb') as rf2:
    rf2 = pickle.load(rf2)
y_train_pred3 = rf2.predict_proba(x_train)
log_loss(y_train, y_train_pred2)

0.37498744907016018

In [22]:
# gbc = GradientBoostingClassifier(n_estimators=1000)
# gbc.fit(x_train, y_train)

#load the fourth model gbc
with open('gbc.pkl', 'rb') as gbc:
    gbc = pickle.load(gbc)


x_train1 = x_train.toarray()
y_train_pred4 = gbc.predict_proba(x_train1)
log_loss(y_train, y_train_pred4)

0.52720866312146875

In [24]:
# etc = ExtraTreesClassifier(n_estimators=1100, min_samples_leaf=100,max_depth=4, n_jobs=-1, random_state=104743, verbose=0)
# etc.fit(x_train, y_train)

# Model 5 extra trees classifier
with open('etc.pkl', 'rb') as etc:
    etc = pickle.load(etc)

y_train_pred5 = etc.predict_proba(x_train)
log_loss(y_train, y_train_pred5)

0.74201032684774915

In [25]:
# ada = AdaBoostClassifier(n_estimators=1100)
# ada.fit(x_train,y_train)

#model 6 ada boost

with open('ada.pkl', 'rb') as ada:
    ada = pickle.load(ada)


y_train_pred6 = ada.predict_proba(x_train)
log_loss(y_train, y_train_pred6)

1.096411162128982

In [27]:
results = np.concatenate((train_pred1,y_train_pred2,y_train_pred3,y_train_pred4,y_train_pred5,y_train_pred6),axis=1)


In [28]:
NFOLDS = 5

params = {
    'eta':.01,
    'colsample_bytree':.8,
    'subsample':.8,
    'seed':0,
    'nthread':16,
    'objective':'multi:softprob',
    'eval_metric':'mlogloss',
    'num_class':3,
    'silent':1
}

dtrain_res = xgb.DMatrix(data=results, label=y_train)



In [29]:
final = xgb.cv(params,dtrain_res, 10000, NFOLDS, early_stopping_rounds=50, verbose_eval=25)


[0]	train-mlogloss:1.08474+6.21611e-06	test-mlogloss:1.08483+5.6356e-06
[25]	train-mlogloss:0.805688+0.000286049	test-mlogloss:0.807839+0.000329037
[50]	train-mlogloss:0.615109+0.000224199	test-mlogloss:0.619027+0.000322942
[75]	train-mlogloss:0.47912+0.000261068	test-mlogloss:0.484572+0.000393588
[100]	train-mlogloss:0.379352+0.000274115	test-mlogloss:0.386227+0.000485102
[125]	train-mlogloss:0.304762+0.000279085	test-mlogloss:0.312932+0.000609099
[150]	train-mlogloss:0.248219+0.000282713	test-mlogloss:0.257628+0.000745643
[175]	train-mlogloss:0.204942+0.000288424	test-mlogloss:0.215504+0.000844449
[200]	train-mlogloss:0.171446+0.000312214	test-mlogloss:0.183132+0.000926455
[225]	train-mlogloss:0.145369+0.000358824	test-mlogloss:0.158076+0.000983585
[250]	train-mlogloss:0.124882+0.000368016	test-mlogloss:0.138622+0.00106131
[275]	train-mlogloss:0.108648+0.000391561	test-mlogloss:0.123385+0.00113041
[300]	train-mlogloss:0.0957126+0.00038936	test-mlogloss:0.1114+0.00121355
[325]	train-m

In [34]:
best_rounds_f = np.argmin(final['test-mlogloss-mean'])
# best_rounds_f
final_round_f = xgb.train(params, dtrain, best_rounds_f)

NameError: name 'bst' is not defined

In [36]:
preds_f = final_round_f.predict(dtest)
preds_f = pd.DataFrame(preds_f)
cols = ['high', 'medium', 'low']
preds_f.columns = cols
preds_f['listing_id'] = listing_id


In [38]:
preds_f.to_csv('my_preds_ens.csv', index=None)