In [1]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import random
import xgboost as xgb
from sklearn import preprocessing, model_selection
from sklearn.metrics import log_loss
import string
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import  CountVectorizer
from scipy.stats import boxcox
from scipy import stats
from scipy import sparse



In [9]:
def fpreproc_leaky(dtrain, dtest, param):
    train, test = dtrain.data, dtest.data
    ntrain = train.shape[0]
    train_test = pd.concat((train, test), axis=0).reset_index(drop=True)
    y_train = dtrain.labels
    
    # average listing worth
    average_listing_worth = 0 #sum(average_interest) / len(average_interest)
    
    # add manager worths
    manager_ids = np.unique(train.manager_id)
    manager_worths = {}
    for manager_id in manager_ids:
        interests = y_train[train.manager_id == manager_id].apply(lambda x: 2 - x)
        if len(interests) > 5:
            manager_worths[manager_id] = sum(interests.values)[0] / len(interests)
    average_interest = y_train.apply(lambda x: 2 - x)
    train_test['manager_worths'] = train_test['manager_id'].apply(lambda x: manager_worths[x] if x in manager_worths else average_listing_worth)

    
    # add building worths
    building_ids = np.unique(train.building_id)
    building_worths = {}
    for building_id in building_ids:
        interests = y_train[train.building_id == building_id].apply(lambda x: 2 - x)
        if len(interests) > 5:
            building_worths[building_id] = sum(interests.values)[0] / len(interests)
    average_interest = y_train.apply(lambda x: 2 - x)
    train_test['building_worths'] = train_test['building_id'].apply(lambda x: building_worths[x] if x in building_worths else average_listing_worth)

    
    # add price by area
    lat_long_price = train_test[['latitude', 'longitude', 'price_per_bedroom']]
    remove_outliers = (np.abs(stats.zscore(lat_long_price)) < 0.15).all(axis=1)
    lat_long_price = lat_long_price[remove_outliers]
    lat_max, lat_min = max(lat_long_price.latitude), min(lat_long_price.latitude)
    long_max, long_min = max(lat_long_price.longitude), min(lat_long_price.longitude)
    lat_scale, long_scale = lat_max - lat_min, long_max - long_min
    costs = np.zeros((100,100))
    num_listings = np.zeros((100,100))
    for lat, long, price_per_bedroom in lat_long_price.values:
        scaled_lat, scaled_long = int((lat - lat_min) * 99 / lat_scale), int((long - long_min) * 99 / long_scale)
        costs[scaled_lat][scaled_long] += price_per_bedroom
        num_listings[scaled_lat][scaled_long] += 1

    price_by_area = []
    for lat, long, price_per_bedroom in train_test[['latitude', 'longitude', 'price_per_bedroom']].values:
        scaled_lat, scaled_long = int((lat - lat_min) * 99 / lat_scale), int((long - long_min) * 99 / long_scale)
        if scaled_lat < 0 or scaled_lat >= 100 or scaled_long < 0 or scaled_long >= 100:
            price_by_area.append(1)
        elif num_listings[scaled_lat][scaled_long] > 8:
            price_by_area.append(price_per_bedroom / (costs[scaled_lat][scaled_long] / num_listings[scaled_lat][scaled_long]))
        else:
            cost = 0
            num = 0
            for i in range(scaled_lat - 1, scaled_lat + 2):
                for j in range(scaled_long - 1, scaled_long + 2):
                    if i > 0 and i < 100 and j >= 0 and j < 100:
                        cost += costs[i][j]
                        num += num_listings[i][j]
            if num > 8:
                price_by_area.append(price_per_bedroom / (cost / num))
            else:
                price_by_area.append(1)

    train_test['price_by_area'] = price_by_area
    
    
    # try adding real - predicted price
    # Try to predict price for a listing and add real_price - expected_price as a feature
    features_to_use = ['BoroCode', 'NTACode', 'bathrooms', 'bedrooms', 'address1', 'other_address', 'building_worths', 'manager_worths']
    feature_to_predict = 'price'
    params = {
        'objective': 'reg:linear',
        'booster':'gblinear',
        'lambda': 0,
        'lambda_bias' : 0,
        'alpha': 0.2
    }
    prices = train_test[feature_to_predict]
    remove_outliers = np.abs(prices-prices.mean())<=(3*prices.std())
    dtrain = xgb.DMatrix(data=train_test[remove_outliers][features_to_use], label=train_test[remove_outliers][feature_to_predict])

    bst = xgb.cv(params, dtrain, 10000, 4, early_stopping_rounds=50, verbose_eval=200)
    best_rounds = np.argmin(bst['test-rmse-mean'])
    print (bst['test-rmse-mean'][best_rounds])
    bst = xgb.train(params, dtrain, best_rounds)
    dtrain = xgb.DMatrix(data=train_test[features_to_use])
    expected_price = bst.predict(dtrain)
    train_test['real_minus_expected_price'] = train_test[feature_to_predict] - expected_price
    train_test['real_over_expected_price'] = train_test[feature_to_predict] / expected_price
    
    
    # enumerated streets / price / price per street (/ num bedrooms) / price / price per street (/ num bedrooms + 0.1*bath)
    reasonable_prices = train_test[np.abs(prices-prices.mean())<=(3*prices.std())]
    for split in ['address1', 'BoroCode', 'NTACode']:
        unique_labels = np.unique(reasonable_prices[split])
        label_to_price = {}
        label_to_interest = {}
        for label in unique_labels:
            listings = reasonable_prices[reasonable_prices[split] == label]
            if len(listings) > 10:
                label_to_price[label] = sum(listings.price_per_bedroom) / len(listings)
                
            interests = y_train[train[split] == label].apply(lambda x: 2 - x)
            if len(interests) > 5:
                label_to_interest[label] = sum(interests.values)[0] / len(interests)
                
        train_test['price_by_{}'.format(split)] = train_test.apply(lambda x: (x.price_per_bedroom / label_to_price[x[split]]) if x[split] in label_to_price else 1, axis=1)
        train_test['worth_by_{}'.format(split)] = train_test[split].apply(lambda x: label_to_interest[x] if x in label_to_interest else average_listing_worth)
    
    # remove extra
    train_test.drop('price', axis=1, inplace=True)
    
    dtrain = xgb.DMatrix(data=train_test[:ntrain], label=y_train)
    dtest = xgb.DMatrix(data=train_test[ntrain:], label=dtest.labels)
    return dtrain, dtest, param

In [3]:
current_filter = ['bc_price', 'price_per_bedroom', 'price_per_bathroom', 'price_per_bed_and_bath', 'BoroCode', 'NTACode', 'price', 'latitude', 'price_by_area', 'longitude', 'manager_worths', 'address1', 'real_over_expected_price', 'desc_words_length', 'real_minus_expected_price', 'manager_id', 'Yday', 'building_id', 'building_worths', 'desc_letters_count', 'Day', 'desc_words_count', 'hour', 'bedrooms', 'photos_count', 'features_count', 'Wday', 'sparse_nofee', 'bathrooms', 'sparse_furnished', 'sparse_laundryinbuilding', 'sparse_hardwoodfloors', 'sparse_laundryinunit', 'Month', 'sparse_catsallowed', 'sparse_exclusive', 'street', 'sparse_elevator', 'sparse_prewar', 'sparse_dogsallowed', 'sparse_reducedfee', 'avenue', 'east', 'sparse_doorman', 'sparse_dishwasher', 'other_address', 'sparse_fitnesscenter', 'sparse_privateoutdoorspace', 'sparse_commonoutdoorspace', 'sparse_outdoorspace', 'sparse_loft', 'sparse_diningroom', 'sparse_balcony', 'sparse_highspeedinternet', 'sparse_parkingspace', 'sparse_terrace', 'sparse_swimmingpool', 'west', 'sparse_roofdeck', 'sparse_actualapt', 'sparse_wheelchairaccess', 'sparse_newconstruction', 'Zero_building_id', 'sparse_simplex', 'sparse_patio', 'sparse_garden', 'sparse_multilevel', 'sparse_hardwood', 'sparse_shorttermallowed', 'south', 'sparse_stainlesssteelappliances', 'sparse_fireplace', 'sparse_highceiling', 'sparse_renovated', 'sparse_liveinsuper', 'sparse_storage', 'sparse_garage', 'sparse_dryerinunit', 'sparse_outdoorareas', 'sparse_petsok', 'sparse_lndrybldg', 'sparse_concierge', 'sparse_new', 'sparse_highceilings', 'sparse_onsitelaundry', 'sparse_centrala', 'sparse_flex3', 'sparse_photos', 'sparse_view', 'sparse_publicoutdoor', 'sparse_allutilitiesincluded', 'sparse_residentslounge', 'sparse_newlyrenovated', 'sparse_washerinunit', 'sparse_onsitegarage', 'sparse_assignedparkingspace', 'north', 'sparse_washer', 'sparse_light', 'sparse_dryer', 'sparse_lowrise', 'sparse_sublet', 'sparse_granitekitchen', 'sparse_elev', 'sparse_virtualdoorman', 'sparse_sundeck', 'sparse_rooftopdeck', 'sparse_wallsofwindows', 'sparse_sharesok', 'sparse_duplex', 'sparse_nopets', 'sparse_cable', 'sparse_microwave', 'sparse_wifiaccess', 'sparse_walkincloset', 'sparse_petsonapproval', 'sparse_pool', 'sparse_eatinkitchen', 'sparse_marblebath', 'sparse_live', 'sparse_sauna', 'sparse_greenbuilding', 'sparse_exposedbrick', 'sparse_largelivingroom', 'sparse_bikeroom', 'sparse_highrise', 'sparse_laundry', 'sparse_privateroofdeck', 'sparse_laundryroom', 'sparse_commonbackyard', 'sparse_privatebackyard', 'sparse_parking', 'sparse_privateparking', 'sparse_childrensplayroom', 'sparse_privatebalcony', 'sparse_indoorpool']
leaky = ['manager_worths', 'building_worths', 'price_by_area', 'real_minus_expected_price', 'real_over_expected_price']
excluded = [] #'BoroCode', 'NTACode'] #, 'price_per_bedroom', 'price_per_bathroom', 'price_per_bed_and_bath']
def filter_data_to_columns(data):
    return data #[[col for col in current_filter if col not in leaky and col not in excluded]]

# Run CV

In [4]:
%%time
class FakeDMatrix:
    def __init__(self, data, labels=None):
        self.data = data
        self.labels = labels
        self.num = len(data)

    def num_row(self):
        return self.num

    def slice(self, rindex):
        indices = np.zeros(self.num, dtype=np.bool)
        for index in rindex:
            indices[index] = True
        return FakeDMatrix(data=self.data[indices], labels=self.labels[indices])
    
SEED = 777
NFOLDS = 3
y_map = {'low': 2, 'medium': 1, 'high': 0}

params = {
    'eta':.01,
    'colsample_bytree':.8,
    'subsample':.8,
    'seed':0,
    'nthread':16,
    'objective':'multi:softprob',
    'eval_metric':'mlogloss',
    'num_class':3,
    'silent':1
}

processed_train = filter_data_to_columns(pd.read_json('data/train.json'))
y_train = pd.read_json('data/train_interest.json')

dtrain = FakeDMatrix(data=processed_train, labels=y_train)
processed_train = None
y_train = None

bst = xgb.cv(params, dtrain, 10000, NFOLDS, early_stopping_rounds=50, verbose_eval=200, fpreproc=fpreproc_leaky)
best_rounds = np.argmin(bst['test-mlogloss-mean'])
print (bst['test-mlogloss-mean'][best_rounds])
print (best_rounds)

[0]	train-rmse:2028.02+4.95931	test-rmse:2026.61+63.9387
1577.82290675
[0]	train-rmse:2042.58+48.7314	test-rmse:2040.43+112.343
1572.832428
[0]	train-rmse:2040.83+24.4325	test-rmse:2040.08+66.6658
[200]	train-rmse:1576.46+25.7335	test-rmse:1575.47+74.8311
1575.4721375
[0]	train-mlogloss:1.09149+4.76189e-05	test-mlogloss:1.09179+3.25201e-05
[200]	train-mlogloss:0.590529+0.00206352	test-mlogloss:0.632937+0.00437172
[400]	train-mlogloss:0.509094+0.00245771	test-mlogloss:0.577507+0.00577111
[600]	train-mlogloss:0.473041+0.00278344	test-mlogloss:0.5614+0.00599762
[800]	train-mlogloss:0.448223+0.00272998	test-mlogloss:0.553912+0.00611605
[1000]	train-mlogloss:0.428439+0.00247294	test-mlogloss:0.549505+0.00607799
[1200]	train-mlogloss:0.411654+0.00215984	test-mlogloss:0.546701+0.00602279
[1400]	train-mlogloss:0.396108+0.00176339	test-mlogloss:0.544981+0.00610467
[1600]	train-mlogloss:0.382032+0.00165274	test-mlogloss:0.543897+0.00606365
[1800]	train-mlogloss:0.368611+0.001446	test-mlogloss:0.

- 0.548561666667 for all features (lb 0.54993)
- 0.542629666667 for all features after fixing from all worths being 0

# Run To Submit

In [5]:
%%time
processed_test = filter_data_to_columns(pd.read_json('data/test.json'))
listing_id = pd.read_json('data/test_ids.json').values

dtest = FakeDMatrix(data=processed_test)
processed_test = None

dtrain_final, dtest_final, _ = fpreproc_leaky(dtrain, dtest, None)
dtrain = None
dtest = None

bst = xgb.train(params, dtrain_final, best_rounds)
dtrain_final = None

preds = bst.predict(dtest_final)
# save for column names
#dtest_final = None

preds = pd.DataFrame(preds)
preds.columns = ['high', 'medium', 'low']
preds['listing_id'] = listing_id
preds.to_csv('data/my_preds.csv', index=None)

importance = bst.get_fscore()
feature_importance = [(feature, (importance[feature])) for feature in importance]
print (sorted(feature_importance, key=lambda x: -x[1]))

[0]	train-rmse:2001.04+15.3439	test-rmse:1999.82+26.0661
1772.45883175
[('bc_price', 16597), ('manager_worths', 14684), ('price_by_NTACode', 13343), ('latitude', 12299), ('longitude', 12051), ('desc_words_length', 11477), ('manager_id', 11136), ('price_by_address1', 11125), ('Yday', 10884), ('address1', 10458), ('building_worths', 10298), ('building_id', 10142), ('desc_letters_count', 9832), ('real_minus_expected_price', 9815), ('price_by_area', 9742), ('real_over_expected_price', 9374), ('Day', 9066), ('worth_by_address1', 8782), ('hour', 7722), ('price_per_bathroom', 7590), ('desc_words_count', 7481), ('price_per_bed_and_bath', 7387), ('worth_by_NTACode', 6993), ('photos_count', 6730), ('NTACode', 6324), ('price_by_BoroCode', 6098), ('features_count', 5668), ('Wday', 4445), ('price_per_bedroom', 3903), ('sparse_nofee', 3056), ('bedrooms', 2833), ('sparse_furnished', 1865), ('bathrooms', 1702), ('sparse_laundryinbuilding', 1425), ('sparse_hardwoodfloors', 1132), ('sparse_laundryinunit

In [8]:
#best_preds = pd.read_csv('data/my_best_preds.csv')
#truths = np.argmax(best_preds[['high', 'medium', 'low']].values, axis=1)
#should be < 0.5 for a good submission
#log_loss(truths, preds[['high', 'medium', 'low']].values)

1.0488940684429542

In [10]:
processed_train = filter_data_to_columns(pd.read_json('data/train.json'))
y_train = pd.read_json('data/train_interest.json')
dtrain = FakeDMatrix(data=processed_train, labels=y_train)

processed_test = filter_data_to_columns(pd.read_json('data/test.json'))
listing_id = pd.read_json('data/test_ids.json').values
dtest = FakeDMatrix(data=processed_test)

dtrain_final, dtest_final, _ = fpreproc_leaky(dtrain, dtest, None)

[0]	train-rmse:2095.17+87.0239	test-rmse:2091.08+95.2272
1770.60357675


In [147]:
all_columns = list(dtrain_final.columns)
to_remove = [] #['BoroCode', 'NTACode', 'Day', 'Month', 'address1', 'building_id', 'manager_id', 'hour', 'latitude', 'longitude']
final_columns = [x for x in all_columns if x not in to_remove]

train_X = dtrain_final[final_columns]
train_y = y_train[0]

In [None]:
train_X_scaled = StandardScaler().fit_transform(train_X[nn_columns])

In [148]:
from sklearn.linear_model import RidgeClassifier
clf = RidgeClassifier()
clf.fit(train_X_scaled[ridge_columns][:30000], train_y[:30000])
print (clf.score(train_X_scaled[ridge_columns][30000:], train_y[30000:]))
predictions = clf.predict(train_X_scaled[ridge_columns][30000:])
print (log_loss(train_y[30000:], np.eye(3)[predictions]))

0.69465688301
10.5461776415


In [149]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
las = Lasso()
enet = ElasticNet()
las.fit(train_X_scaled[:30000], train_y[:30000])
enet.fit(train_X_scaled[:30000], train_y[:30000])



ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [150]:
print (las.score(train_X_scaled[30000:], train_y[30000:]))
predictions = las.predict(train_X_scaled[30000:])
print (log_loss(train_y[30000:], np.eye(3)[np.round(predictions).astype(int)]))

0.0352097843285
11.809791412


In [151]:
enet.score(train_X_scaled[30000:], train_y[30000:])
predictions = enet.predict(train_X_scaled[30000:])
print (log_loss(train_y[30000:], np.eye(3)[np.round(predictions).astype(int)]))

11.8222847685


In [152]:
for coef, column in zip(enet.coef_, train_X.columns):
    if coef != 0:
        print (column, ": ", coef)

address1 :  2.32365491057e-05
building_id :  -1.84829459252e-05
desc_letters_count :  -6.79551087636e-05
manager_id :  -2.40155330067e-05
price_per_bathroom :  1.34252912617e-05
price_per_bed_and_bath :  5.86961411328e-06
price_per_bedroom :  1.73797748797e-06
real_minus_expected_price :  -1.71338175675e-05


In [153]:
for coef, column in zip(las.coef_, train_X.columns):
    if coef != 0:
        print (column, ": ", coef)

address1 :  2.31898448989e-05
building_id :  -1.84795507727e-05
desc_letters_count :  -6.48799177199e-05
manager_id :  -2.37266506205e-05
price_per_bathroom :  1.29454934733e-05
price_per_bed_and_bath :  5.61099255267e-06
price_per_bedroom :  1.93396750553e-06
real_minus_expected_price :  -1.66276671859e-05


In [107]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [108]:
clf = AdaBoostClassifier(
    ExtraTreesClassifier(),
    n_estimators=600,
    learning_rate=1.5,
    algorithm="SAMME.R")

In [112]:
%%time
clf.fit(train_X[:30000], train_y[0][:30000])

AdaBoostClassifier(algorithm='SAMME',
          base_estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
          learning_rate=1.5, n_estimators=600, random_state=None)

In [114]:
clf.score(train_X[30000:], train_y[0][30000:])

0.72307771806531629

In [119]:
%%time
a = clf.predict_proba(train_X[30000:])
print (log_loss(train_y[30000:], a))

1.07066192075
CPU times: user 20.2 s, sys: 3.21 s, total: 23.4 s
Wall time: 25.8 s


In [120]:
clf.feature_importances_

array([             nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,   0.00000000e+00,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,   2.31225636e-06,
                    nan,              nan,              nan,
                    nan,              nan,   5.02411074e-06,
         3.55567116e-07,   8.76669585e-08,   2.35113652e-06,
         6.74598557e-07,   2.20465599e-07,   2.08069778e-07,
         2.10954890e-08,   3.16375940e-07,   4.37580078e-10,
                    nan,   8.96049351e-08,   7.56719487e-07,
         2.58871252e-07,   3.51489228e-08,   3.57170115e-09,
                    nan,   5.85453690e-08,   5.37793560e-07,
                    nan,

In [33]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using Theano backend.


In [176]:
current_filter = ['bc_price', 'price_per_bedroom', 'price_per_bathroom', 'price_per_bed_and_bath', 'BoroCode', 'NTACode', 'price', 'latitude', 'price_by_area', 'longitude', 'manager_worths', 'address1', 'real_over_expected_price', 'desc_words_length', 'real_minus_expected_price', 'manager_id', 'Yday', 'building_id', 'building_worths', 'desc_letters_count', 'Day', 'desc_words_count', 'hour', 'bedrooms', 'photos_count', 'features_count', 'Wday', 'sparse_nofee', 'bathrooms', 'sparse_furnished', 'sparse_laundryinbuilding', 'sparse_hardwoodfloors', 'sparse_laundryinunit', 'Month', 'sparse_catsallowed', 'sparse_exclusive', 'street', 'sparse_elevator', 'sparse_prewar', 'sparse_dogsallowed', 'sparse_reducedfee', 'avenue', 'east', 'sparse_doorman', 'sparse_dishwasher', 'other_address', 'sparse_fitnesscenter', 'sparse_privateoutdoorspace', 'sparse_commonoutdoorspace', 'sparse_outdoorspace', 'sparse_loft', 'sparse_diningroom', 'sparse_balcony', 'sparse_highspeedinternet', 'sparse_parkingspace', 'sparse_terrace', 'sparse_swimmingpool', 'west', 'sparse_roofdeck', 'sparse_actualapt', 'sparse_wheelchairaccess', 'sparse_newconstruction', 'Zero_building_id', 'sparse_simplex', 'sparse_patio', 'sparse_garden', 'sparse_multilevel', 'sparse_hardwood', 'sparse_shorttermallowed', 'south', 'sparse_stainlesssteelappliances', 'sparse_fireplace', 'sparse_highceiling', 'sparse_renovated', 'sparse_liveinsuper', 'sparse_storage', 'sparse_garage', 'sparse_dryerinunit', 'sparse_outdoorareas', 'sparse_petsok', 'sparse_lndrybldg', 'sparse_concierge', 'sparse_new', 'sparse_highceilings', 'sparse_onsitelaundry', 'sparse_centrala', 'sparse_flex3', 'sparse_photos', 'sparse_view', 'sparse_publicoutdoor', 'sparse_allutilitiesincluded', 'sparse_residentslounge', 'sparse_newlyrenovated', 'sparse_washerinunit', 'sparse_onsitegarage', 'sparse_assignedparkingspace', 'north', 'sparse_washer', 'sparse_light', 'sparse_dryer', 'sparse_lowrise', 'sparse_sublet', 'sparse_granitekitchen', 'sparse_elev', 'sparse_virtualdoorman', 'sparse_sundeck', 'sparse_rooftopdeck', 'sparse_wallsofwindows', 'sparse_sharesok', 'sparse_duplex', 'sparse_nopets', 'sparse_cable', 'sparse_microwave', 'sparse_wifiaccess', 'sparse_walkincloset', 'sparse_petsonapproval', 'sparse_pool', 'sparse_eatinkitchen', 'sparse_marblebath', 'sparse_live', 'sparse_sauna', 'sparse_greenbuilding', 'sparse_exposedbrick', 'sparse_largelivingroom', 'sparse_bikeroom', 'sparse_highrise', 'sparse_laundry', 'sparse_privateroofdeck', 'sparse_laundryroom', 'sparse_commonbackyard', 'sparse_privatebackyard', 'sparse_parking', 'sparse_privateparking', 'sparse_childrensplayroom', 'sparse_privatebalcony', 'sparse_indoorpool']
leaky = ['manager_worths', 'building_worths', 'price_by_area', 'real_minus_expected_price', 'real_over_expected_price', 'price_by_address1', 'worth_by_address1', 'price_by_BoroCode', 'worth_by_BoroCode', 'price_by_NTACode', 'worth_by_NTACode']
excluded = ['price']
nn_columns = [col for col in train_X.columns if col not in leaky and col not in excluded]

In [190]:
from sklearn.preprocessing import StandardScaler
processed_train_scaled = StandardScaler().fit_transform(processed_train)

xtr = processed_train_scaled[:30000]
ytr = np.eye(3)[train_y[:30000].values]
xte = processed_train_scaled[30000:]
yte = np.eye(3)[train_y[30000:].values]

In [194]:
def create_model():
    model = Sequential()
    
    model.add(Dense(500, input_dim = processed_train.shape[1], init = 'he_normal', activation='sigmoid'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(PReLU())

    model.add(Dense(50, init = 'he_normal', activation='sigmoid'))
    model.add(BatchNormalization())    
    model.add(Dropout(0.5))
    model.add(PReLU())

    model.add(Dense(3, init = 'he_normal', activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
    return model

In [195]:
from keras.callbacks import Callback
filepath = 'data/nn_weights'

class NBatchLogger(Callback):
    def __init__(self):
        self.seen = 0

    def on_epoch_end(self, batch, logs={}):
        self.seen += 1
        if self.seen % 20 == 0:
            print('epoch {}: loss = {}, val = {}'.format(self.seen, logs['loss'], logs['val_loss'])) 

logger = NBatchLogger()
early_stop = EarlyStopping(monitor='val_loss', patience=40, verbose=0)
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True)
#model.fit(xtr, ytr, nb_epoch = 1200, batch_size=1024, verbose = 0, validation_data=[xte, yte], callbacks=[NBatchLogger()])
model = create_model()
model.fit(xtr, ytr, nb_epoch = 1200, batch_size=1000, verbose = 0, validation_data=[xte, yte], callbacks=[logger, early_stop, checkpoint])
model = create_model()
model.load_weights(filepath)

INFO (theano.gof.compilelock): Refreshing lock /Users/jgzuke/.theano/compiledir_Darwin-15.6.0-x86_64-i386-64bit-i386-3.5.3-64/lock_dir/lock


epoch 20: loss = 0.6220818996429444, val = 0.6195430807852459
epoch 40: loss = 0.5851057330767314, val = 0.6069866230340968
epoch 60: loss = 0.561422390739123, val = 0.607030198513944
epoch 80: loss = 0.5377430597941081, val = 0.6089193273694519


In [193]:
log_loss(yte, model.predict(xte))

0.60965967797495035