# Sberbank Russian Housing Market - Kaggle
### 25 June, 2017
# Light GBM

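Model the price per square metre of `full_sq` (`price_doc / full_sq`, log-transformed) instead of the total price; predictions are multiplied by `full_sq` to recover `price_doc`.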

In [1]:
## Competition-specific settings

# Run tag: becomes part of the name of submit files and other output files
DATE = '170626'

# Column names used throughout, and the root directory of the project
ID = 'id'
TARGET = 'price_doc'
LOGTARGET = 'log_price_doc'
DIRECTORY = 'D:/J/Kaggle/6_Sberbank_HousingPrices_2017/'

# Files holding the final training, validation and local test sets:
TRAIN_PREPROCESSED = '{}data/train_preprocessed.csv'.format(DIRECTORY)
VALID_PREPROCESSED = '{}data/valid_preprocessed.csv'.format(DIRECTORY)
LOCALTEST_PREPROCESSED = '{}data/localtest_preprocessed.csv'.format(DIRECTORY)

# Dataset the submitted predictions are made for:
TEST_PREPROCESSED = '{}data/test_preprocessed.csv'.format(DIRECTORY)

# Prefix of the submission file. In the notebook cells the full name is
# SUBMISSION_FILE + 'validscore_num.csv', where 'validscore' and 'num' refer to
# the score on the validation set and the prediction number respectively.
SUBMISSION_FILE = '{}submissions/submit_lgbpy_{}'.format(DIRECTORY, DATE)

# Prefixes of the files that store predictions on the valid and local test sets
# (completed the same way as the submission name):
VALIDPRED_FILE = '{}predictions/valid_lgbpy_{}'.format(DIRECTORY, DATE)
LOCALTESTPRED_FILE = '{}predictions/localtest_lgbpy_{}'.format(DIRECTORY, DATE)

# Text file the validation scores are appended to, and the submission name
# prefix written next to each score:
SCORES_FILE = '{}records/validscores_lgbpy_{}.txt'.format(DIRECTORY, DATE)
RECORD_SUBMISSION = 'submit_lgbpy_{}'.format(DATE)

In [2]:
# Names of the two derived target columns: the price divided by full_sq, and
# the log-transformed version of that ratio.
TARGET_PER_FULLSQ = 'price_per_fullsq'
LOG_TARGET_PER_FULLSQ = 'log_of_' + TARGET_PER_FULLSQ

In [3]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import math

# Seed NumPy's global random generator with the run date converted to an integer.
np.random.seed(int(DATE))

In [4]:
def rmsle(logtarget, logpred):
    """Return the root of the mean squared difference between two log-scale vectors.

    logtarget -- the log-transformed target, i.e. np.log(target + 1)
    logpred   -- the prediction of logtarget

    The squared differences are summed and divided by logtarget.shape[0]
    before the square root is taken.
    """
    squared_error = np.square(logpred - logtarget)
    n_obs = logtarget.shape[0]
    return np.sqrt(np.sum(squared_error) / n_obs)

In [5]:
# Read the feature names from the first column of a header-less CSV
# (presumably ordered from most to least important -- confirm against the
# script that wrote the file) and preview the first six.
features_importance = pd.read_csv(
    DIRECTORY + 'data/important_features_fullsq.csv',
    header = None,
)[0].tolist()
features_restr = features_importance[:6]
features_restr

['cafe_count_3000_price_1500',
 'cafe_count_2000',
 'full_sq',
 'cafe_count_5000',
 'cafe_count_3000',
 'sport_count_2000']

In [6]:
# Load the preprocessed splits, build the log(price / full_sq) target, encode
# the text columns and keep only the selected features for modelling.
train = pd.read_csv(TRAIN_PREPROCESSED)
valid = pd.read_csv(VALID_PREPROCESSED)
localtest = pd.read_csv(LOCALTEST_PREPROCESSED)
test = pd.read_csv(TEST_PREPROCESSED)

# Impute full_sq for three test rows: two with fixed values (per the original
# note, taken from life_sq) and one with the median full_sq of the test rows
# that share its apartment_name.
test.loc[464, 'full_sq'] = 37.8
test.loc[5383, 'full_sq'] = 42.07
test.loc[6350, 'full_sq'] = test[test['apartment_name'] == test.loc[6350, 'apartment_name']]['full_sq'].median()


def _add_price_per_fullsq_target(frame):
    """Add TARGET_PER_FULLSQ and LOG_TARGET_PER_FULLSQ to frame in place.

    Rows whose log target cannot be used for training are dropped: NaN values
    and also infinite values, which arise when full_sq is 0 and which dropna
    alone would keep.
    """
    frame[TARGET_PER_FULLSQ] = frame[TARGET] / frame['full_sq']
    log_ratio = np.log(frame[TARGET_PER_FULLSQ] + 1)
    frame[LOG_TARGET_PER_FULLSQ] = log_ratio.replace([np.inf, -np.inf], np.nan)
    # drop rows with nan in column LOG_TARGET_PER_FULLSQ
    frame.dropna(subset = [LOG_TARGET_PER_FULLSQ], inplace = True)


_add_price_per_fullsq_target(train)
_add_price_per_fullsq_target(valid)
_add_price_per_fullsq_target(localtest)

# Keep the targets and ids before they are removed from the feature frames
train_y = train[LOG_TARGET_PER_FULLSQ]
valid_id = valid[ID].copy()
valid_y = valid[LOG_TARGET_PER_FULLSQ].copy()
localtest_id = localtest[ID]
localtest_y = localtest[LOG_TARGET_PER_FULLSQ]
alltrain_y = pd.concat([train_y, valid_y, localtest_y], ignore_index = True)
test_id = test[ID]

# Remove id, every target-derived column and the timestamp from the features
label_columns = [ID, TARGET, LOGTARGET, TARGET_PER_FULLSQ, LOG_TARGET_PER_FULLSQ, 'timestamp']
train.drop(label_columns, axis = 1, inplace = True)
valid.drop(label_columns, axis = 1, inplace = True)
localtest.drop(label_columns, axis = 1, inplace = True)
test.drop([ID, 'timestamp'], axis = 1, inplace = True)
print("Data: X_train: {}, X_valid: {}, X_localtest: {}, X_test: {}".format(train.shape, valid.shape,
                                                                           localtest.shape, test.shape))

# Row offsets of the split boundaries inside the stacked frame built below
pos1 = train.shape[0]
pos2 = train.shape[0] + valid.shape[0]
pos3 = train.shape[0] + valid.shape[0] + localtest.shape[0]

# Stack all splits so that the text columns get one common integer encoding
data = pd.concat([train, valid, localtest, test], ignore_index = True)
del train, valid, localtest, test

data_num = data.select_dtypes(exclude = ['object'])
data_obj = data.select_dtypes(include = ['object']).copy()
for c in data_obj:
    # pd.factorize returns (codes, uniques); only the integer codes are kept
    data_obj[c] = pd.factorize(data_obj[c])[0]

data = pd.concat([data_num, data_obj], axis = 1)

# Cut the stacked frame back into the four splits
train = data[ : pos1]
valid = data[pos1 : pos2]
localtest = data[pos2 : pos3]
test = data[pos3 : ]

# Restrict every split to the first 300 entries of the feature list
features_restr = features_importance[:300]
train_restr = train[features_restr]
valid_restr = valid[features_restr]
localtest_restr = localtest[features_restr]
test_restr = test[features_restr]

del train, valid, localtest, test

# Fit LightGBM on the training split, score it on the validation split and
# convert the valid / local test predictions back to total prices.
train_lgb = lgb.Dataset(train_restr, train_y)

NUM = 170901      # run number, used in the names of the output files
ROUNDS = 1500     # number of boosting rounds
# NOTE(review): 'num_rounds' repeats the value that is also passed to lgb.train
# as num_boost_round -- confirm whether both are needed.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    'learning_rate': 0.01,
    'verbose': 1,
    'num_leaves': 32,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': DATE,
    'feature_fraction': 0.7,
    'feature_fraction_seed': DATE,
    'max_bin': 100,
    'max_depth': 5,
    'num_rounds': ROUNDS,
}

model = lgb.train(params, train_lgb, num_boost_round = ROUNDS)

print('Predicting validation and local test file...')
pred_valid = model.predict(valid_restr)
pred_localtest = model.predict(localtest_restr)
print(type(pred_valid))
print(pred_valid.shape)

# The score is computed on the log(price per full_sq) scale the model was fit on
validscore = rmsle(valid_y, pred_valid)
print('RMSLE on validation set:', validscore)

# Undo log(x + 1), then multiply by full_sq to get back to a total price;
# the product is a Series, re-indexed from 0 so it lines up with the ids.
pred_valid = ((np.exp(pred_valid) - 1) * valid_restr['full_sq']).reset_index(drop = True)
pred_localtest = ((np.exp(pred_localtest) - 1) * localtest_restr['full_sq']).reset_index(drop = True)
valid_id = valid_id.reset_index(drop = True)
localtest_id = localtest_id.reset_index(drop = True)

out_valid = pd.DataFrame({ID : valid_id, TARGET : pred_valid})
out_localtest = pd.DataFrame({ID : localtest_id, TARGET: pred_localtest})

# Append the validation score to the scores file and save the valid / local
# test predictions. Every file name carries the same suffix: characters 2-4 of
# the printed score, an underscore and the run number.
print('Recording scores...')
score_tag = str(validscore)[2:5] + '_' + str(NUM)
record_line = '\n' + RECORD_SUBMISSION + score_tag + '.csv, valid score: ' + str(validscore) + '\n'
with open(SCORES_FILE, 'a') as outfile:
    outfile.write(record_line)

# Predictions for valid (index column not written):
out_valid.to_csv(VALIDPRED_FILE + score_tag + '.csv', index = False)

# Predictions for localtest (index column not written):
out_localtest.to_csv(LOCALTESTPRED_FILE + score_tag + '.csv', index = False)

# Tabulate the importance of each feature (split count, and gain as a
# percentage of the total gain), print the 25 largest by gain and save a
# horizontal bar chart of them.
print("Features importance...")
gain = model.feature_importance('gain')
ft = pd.DataFrame({'feature' : model.feature_name(), 'split' : model.feature_importance('split'), 
                   'gain' : 100 * gain / gain.sum()}).sort_values('gain', ascending = False)
print(ft.head(25))

# DataFrame.plot creates its own figure when figsize is given without an ax, so
# no separate plt.figure() call is made (it only left an empty figure open).
# The figure is taken from the returned Axes instead of relying on plt.gcf().
importance_ax = ft[['feature', 'gain']].head(25).plot(kind = 'barh', x = 'feature', y = 'gain',
                                                      legend = False, figsize = (10, 20))
importance_ax.get_figure().savefig('features_importance' + str(validscore)[2:5] + '_' + str(NUM) + '.png')

# Refit on train + valid + localtest with the same parameters, predict the
# test set and write the submission file.
alltrain_restr = pd.concat([train_restr, valid_restr, localtest_restr], ignore_index = True)
alltrain_lgb = lgb.Dataset(alltrain_restr, alltrain_y)
print('Training model on alltrain dataset and creating predictions to submit...') 
allmodel = lgb.train(params, alltrain_lgb, num_boost_round = ROUNDS)

# Undo log(x + 1), multiply by full_sq to get a total price, and re-index from
# 0 so the predictions line up with test_id.
pred_test = np.exp(allmodel.predict(test_restr)) - 1
pred_test = (pred_test * test_restr['full_sq']).reset_index(drop = True)

# Predictions to submit (the index column is not written):
submission_path = SUBMISSION_FILE + str(validscore)[2:5] + '_' + str(NUM) + '.csv'
out_test = pd.DataFrame({ID : test_id, TARGET : pred_test})
out_test.to_csv(submission_path, index = False)

print("Done.")
del alltrain_restr, alltrain_lgb
##### public score: 0.31880, private score: 0.32223

Data: X_train: (21201, 1110), X_valid: (4710, 1110), X_localtest: (4531, 1110), X_test: (7662, 1110)
Predicting validation and local test file...
<class 'numpy.ndarray'>
(4710,)
RMSLE on validation set: 0.430532614533
Recording scores...
Features importance...
                                  feature      gain  split
0              cafe_count_3000_price_1500  8.288014     74
1                         cafe_count_2000  6.370490    115
2                                 full_sq  5.943501   1348
4                         cafe_count_3000  2.918350     73
3                         cafe_count_5000  1.832805    114
10                apartment_name_yr_month  1.573582    318
5                        sport_count_2000  1.515725     98
8               ratio_nonlifesq_to_lifesq  1.473284    446
7   raion_build_count_with_builddate_info  1.411538     90
9              cafe_count_5000_price_high  1.181969     57
6                          apartment_name  1.180948    354
14                             