In [15]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from xgboost import XGBClassifier,XGBRegressor
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import hashlib
import re
from sklearn.model_selection import GridSearchCV,StratifiedKFold

In [16]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the local project modules
# are picked up without restarting the kernel.
# Re-running this cell in a live kernel prints the "already loaded" notice
# captured in the output; `%reload_ext autoreload` is the form that notice suggests.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
from feature_extraction import *

from model import *

In [18]:
from utils import *

In [19]:
# Load the two frames every model cell in this notebook receives as its
# first two arguments. read_data comes from a star import; its source and
# file locations are not visible here.
train_df, test_df = read_data()

In [20]:
# Feature object that logs "Categorical feature generated by CV" when a model
# consumes it (building_id / manager_id / display_address and *_medium / *_high
# columns appear in the run logs). Configured with 5 folds.
# NOTE(review): exact meaning of bEncoding is defined in the project module - confirm.
cat_cv_fea = Categorical_cv(nfold=5, bEncoding=True)

In [21]:
# Alternative categorical feature object, also configured with 5 folds.
# It is not passed to any feature_list in the cells visible in this notebook.
cat_lit_fea = CategoricalLit(nfold=5, bEncoding=True)

In [22]:
# Text feature object; when a model consumes it the run log prints
# "Keyword features extract from the column features".
txt_fea = TextFeature()

In [23]:
# Address feature object; the run logs list its columns as
# address_num, west_east, street_num and ave_num.
addr_fea = AddressFeature()

In [24]:
# Price-quantile feature object. It is not passed to any feature_list in the
# cells visible in this notebook.
# NOTE(review): the meaning of the positional argument 0 is defined in the
# project module and is not visible here - confirm.
quant_fea = PriceQuantileFeature(0)

In [25]:
# Feature object logged as "Use GBM to predict quantile house price using GBM
# quantile regression"; it is given three input column names and the name of
# the column it produces ('gbm_quant_lat_long', as echoed in the run logs).
gbm_quant_fea = GbmQuantPrice(
    ['days', 'latitude', 'longitude'],
    'gbm_quant_lat_long'
)

In [26]:
# Miscellaneous feature object; the run logs list columns such as weekdays,
# manager_count, building_count, num_features, num_description_words, days,
# num_photos and created_month/day/hour.
mis_fea = Miscellous()

In [499]:
# Build the gradient-boosting model wrapper on the text, address,
# CV-categorical and miscellaneous feature sets, targeting 'interest_level'.
# Constructing it prints each feature set's description and column names.
gmb = GBM_model(
    train_df,
    test_df,
    feature_list=[txt_fea, addr_fea, cat_cv_fea, mis_fea],
    target_train='interest_level'
)

Keyword features extract from the column features
Stree feature: street number, address number, etc.
['address_num', 'west_east', 'street_num', 'ave_num']
Categorical feature generated by CV
['building_id', 'manager_id', 'display_address', 'building_id_medium', 'building_id_high', 'manager_id_medium', 'manager_id_high']
Miscellous features
['weekdays', 'manager_count', 'building_count', 'num_features', 'num_description_words', 'days', 'num_photos', 'created_month', 'created_day', 'created_hour']


In [502]:
# Hyper-parameter grid for the GBM wrapper. Only n_estimators and max_depth
# vary, giving 2 x 2 = 4 candidates (the log reports 3 folds, 12 fits).
# The recorded run took about 385 minutes in total, so re-run with care.
param_grid = dict(
    loss=['deviance'],
    learning_rate=[0.01],
    n_estimators=[1500, 2000],
    subsample=[0.8],
    min_samples_split=[10],
    min_samples_leaf=[2],
    max_depth=[6, 8]
)
gmb.gridsearch_cv(param_grid, verbose=1)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed: 355.7min remaining: 71.1min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 385.0min finished


{'std_train_score': array([ 0.00191856,  0.00191043,  0.0013407 ,  0.00220532]), 'rank_test_score': array([2, 1, 3, 4], dtype=int32), 'param_loss': masked_array(data = ['deviance' 'deviance' 'deviance' 'deviance'],
             mask = [False False False False],
       fill_value = ?)
, 'split1_train_score': array([-0.37216733, -0.33199127, -0.21044394, -0.15935371]), 'split2_train_score': array([-0.37367434, -0.33369592, -0.21106314, -0.16201764]), 'param_subsample': masked_array(data = [0.8 0.8 0.8 0.8],
             mask = [False False False False],
       fill_value = ?)
, 'split0_train_score': array([-0.36906589, -0.32906942, -0.20796049, -0.15661591]), 'split2_test_score': array([-0.53377725, -0.53156216, -0.53524813, -0.5404311 ]), 'mean_score_time': array([ 10.31839299,  13.23844926,   7.85778205,   5.61056749]), 'mean_fit_time': array([ 10407.86071897,  13623.52687844,  12930.08530903,  10592.81665881]), 'param_max_depth': masked_array(data = [6 6 8 8],
             mask = [Fal

{'learning_rate': 0.01,
 'loss': 'deviance',
 'max_depth': 6,
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 2000,
 'subsample': 0.8}

In [493]:
# Scratch value; no other visible cell in this notebook reads `a`.
a = {'a': 4, 'b': 5}

In [14]:
# XGBoost wrapper on the text and miscellaneous feature sets only, followed by
# a CV study that prints the 10 most important features per fold.
# The recorded run was stopped with a KeyboardInterrupt during the second fold.
# NOTE(review): execution count 14 precedes the import cell's count, so this
# output comes from an earlier kernel session - confirm it is still wanted.
xgb1 = XGB_model(
    train_df,
    test_df,
    feature_list=[txt_fea, mis_fea],
    target_train='interest_level'
)
xgb1.cv_study(nImportance=10, verbose=False, n_estimators=5000)

Keyword features extract from the column features
Miscellous features
['hour_size', 'weekdays', 'manager_count', 'building_count', 'num_features', 'num_description_words', 'days', 'num_photos', 'created_month', 'created_day', 'created_hour']
The most 10 important features are
price, importance=0.0896404981613
latitude, importance=0.0864164680243
longitude, importance=0.0795583277941
listing_id, importance=0.0726969465613
num_description_words, importance=0.068866699934
manager_count, importance=0.0681470036507
hour_size, importance=0.0594329014421
days, importance=0.0582739412785
building_count, importance=0.0552914328873
num_photos, importance=0.0366329476237
[0.53980579011229757]
best iterations:4473, best_score=0.53979, last_score=0.539805790112
The most 10 important features are
price, importance=0.0902360454202
latitude, importance=0.0877131223679
longitude, importance=0.0807876437902
listing_id, importance=0.0726312473416
num_description_words, importance=0.0681154951453
manager_

KeyboardInterrupt: 

In [27]:
# Rebinds xgb1 to an XGBoost wrapper on the address, miscellaneous and
# GBM-quantile-price feature sets, then runs the same CV study
# (top 10 importances per fold, n_estimators=5000).
# The recorded run was stopped with a KeyboardInterrupt during the second fold.
xgb1 = XGB_model(
    train_df,
    test_df,
    feature_list=[addr_fea, mis_fea, gbm_quant_fea],
    target_train='interest_level'
)
xgb1.cv_study(nImportance=10, verbose=False, n_estimators=5000)

Stree feature: street number, address number, etc.
['address_num', 'west_east', 'street_num', 'ave_num']
Miscellous features
['hour_size', 'weekdays', 'manager_count', 'building_count', 'num_features', 'num_description_words', 'days', 'num_photos', 'created_month', 'created_day', 'created_hour']
Use GBM to predict quantile house price using GBM quantile regression
['gbm_quant_lat_long']
The most 10 important features are
price, importance=0.0804840326309
latitude, importance=0.0792404040694
longitude, importance=0.0751406699419
listing_id, importance=0.0730630084872
manager_count, importance=0.0720640271902
num_description_words, importance=0.0715321004391
days, importance=0.0629360154271
address_num, importance=0.0623002946377
hour_size, importance=0.0590160638094
building_count, importance=0.0553092584014
[0.55485163186374142]
best iterations:3726, best_score=0.554837, last_score=0.554851631864
The most 10 important features are
price, importance=0.0810680910945
latitude, importance=

KeyboardInterrupt: 

In [233]:
# Configure xgb1's classifier with 4000 and produce predictions from it.
# NOTE(review): 4000 is presumably the number of boosting rounds, picked near
# the "best iterations" values printed by cv_study - confirm.
# NOTE(review): which feature set xgb1 holds here depends on cell execution
# order; the execution counts in this notebook are not sequential.
xgb1.set_classifier(4000)
pred1 = xgb1.fit_predict_proba()

In [234]:
# Write pred1 out under the 'txt_addr_catcv_mis_' prefix.
# NOTE(review): read top to bottom, the latest visible xgb1 is built from
# addr_fea, mis_fea and gbm_quant_fea, which does not match this prefix; the
# execution counts show the cells were run in a different order - confirm the
# prefix describes the model that actually produced pred1.
write_output(pred1, test_df, 'txt_addr_catcv_mis_')

In [235]:
# XGBoost wrapper on all five feature sets used in this notebook (text,
# address, CV-categorical, miscellaneous, GBM quantile price), followed by
# the CV study with top 10 importances and n_estimators=5000.
xgb2 = XGB_model(
    train_df,
    test_df,
    feature_list=[txt_fea, addr_fea, cat_cv_fea, mis_fea, gbm_quant_fea],
    target_train='interest_level'
)
xgb2.cv_study(nImportance=10, verbose=False, n_estimators=5000)

Keyword features extract from the column features
Stree feature: street number, address number, etc.
Categorical feature generated by CV
Miscellous features
Use GBM to predict quantile house price using GBM quantile regression


In [238]:
# Configure xgb2's classifier with 4000, produce predictions, and write them
# out under a prefix naming the five feature sets.
# NOTE(review): 'gbmQaunt' in the prefix looks like a typo for 'gbmQuant'; it
# is left unchanged because the string determines the output name - confirm
# nothing depends on it before renaming.
xgb2.set_classifier(4000)
pred2 = xgb2.fit_predict_proba()
write_output(pred2, test_df, 'txt_addr_catcv_mis_gbmQaunt_')

In [363]:
# Random-forest wrapper on the same five feature sets (text, address,
# CV-categorical, miscellaneous, GBM quantile price).
model_rf = RandomForest_model(
    train_df,
    test_df,
    feature_list=[txt_fea, addr_fea, cat_cv_fea, mis_fea, gbm_quant_fea],
    target_train='interest_level'
)

Keyword features extract from the column features
Stree feature: street number, address number, etc.
['address_num', 'west_east', 'street_num', 'ave_num']
Categorical feature generated by CV
['building_id', 'manager_id', 'display_address', 'building_id_medium', 'building_id_high', 'manager_id_medium', 'manager_id_high']
Miscellous features
['weekdays', 'manager_count', 'building_count', 'num_features', 'num_description_words', 'days', 'num_photos', 'created_month', 'created_day', 'created_hour']
Use GBM to predict quantile house price using GBM quantile regression
['gbm_quant_lat_long']


In [364]:
# CV study for the random forest with n_estimators=1500; prints the 10 most
# important features and the running list of fold scores after each fold.
model_rf.cv_study(nImportance=10, n_estimators=1500)

The most 10 important features are
manager_id_medium, importance=0.0665863069499
manager_id_high, importance=0.0622129739155
gbm_quant_lat_long, importance=0.0552751844989
price, importance=0.051376863147
building_id_medium, importance=0.0476516055195
building_id_high, importance=0.0452753255301
listing_id, importance=0.0343831113772
days, importance=0.0341003087583
num_description_words, importance=0.0338107106163
building_id, importance=0.0327003727601
[0.56828084670103385]
The most 10 important features are
manager_id_medium, importance=0.0667871900053
manager_id_high, importance=0.062113909366
gbm_quant_lat_long, importance=0.0547547305857
price, importance=0.0516672412976
building_id_medium, importance=0.0489324224662
building_id_high, importance=0.0454554521322
listing_id, importance=0.0344670387168
days, importance=0.0340783551556
num_description_words, importance=0.0338967508572
building_id, importance=0.0326302932589
[0.56828084670103385, 0.57031815518447682]
The most 10 impor

In [None]:
# Hand-recorded notes that were typed into a code cell as bare `500: 0.5682`
# lines. That form is a SyntaxError, so the cell could never execute and would
# abort a Restart & Run All. The same pairs are kept here as a dict so the
# cell runs and the numbers stay available.
# NOTE(review): the keys are presumably n_estimators values and the values the
# corresponding random-forest CV scores (lower looks better) - confirm.
rf_cv_notes = {
    500: 0.5682,
    800: 0.5669,
    1200: 0.5657,
}

In [470]:
# Logistic-regression wrapper on the same five feature sets (text, address,
# CV-categorical, miscellaneous, GBM quantile price).
model_lr = Logistic_model(
    train_df,
    test_df,
    feature_list=[txt_fea, addr_fea, cat_cv_fea, mis_fea, gbm_quant_fea],
    target_train='interest_level'
)

Keyword features extract from the column features
Stree feature: street number, address number, etc.
['address_num', 'west_east', 'street_num', 'ave_num']
Categorical feature generated by CV
['building_id', 'manager_id', 'display_address', 'building_id_medium', 'building_id_high', 'manager_id_medium', 'manager_id_high']
Miscellous features
['weekdays', 'manager_count', 'building_count', 'num_features', 'num_description_words', 'days', 'num_photos', 'created_month', 'created_day', 'created_hour']
Use GBM to predict quantile house price using GBM quantile regression
['gbm_quant_lat_long']


In [471]:
# CV study for the logistic model, with feature-importance reporting disabled
# (nImportance=0), 'neg_log_loss' scoring and n_jobs=-1.
# The recorded run reports a mean "proba" score of about 0.7356 over 5 folds.
model_lr.cv_study(nImportance=0, scoring='neg_log_loss', n_jobs=-1)

[0.73742073083311099]
[0.73742073083311099, 0.73293931020179703]
[0.73742073083311099, 0.73293931020179703, 0.73571465203896957]
[0.73742073083311099, 0.73293931020179703, 0.73571465203896957, 0.73452639184212698]
[0.73742073083311099, 0.73293931020179703, 0.73571465203896957, 0.73452639184212698, 0.73737914031443685]
proba mean score=0.735596045046, std=0.00171612194122
pred mean score=0.694662836854, std=9.25649009163e-05


In [475]:
# SVM wrapper on the same five feature sets (text, address, CV-categorical,
# miscellaneous, GBM quantile price).
model_svm = SVM_model(
    train_df,
    test_df,
    feature_list=[txt_fea, addr_fea, cat_cv_fea, mis_fea, gbm_quant_fea],
    target_train='interest_level'
)


Keyword features extract from the column features
Stree feature: street number, address number, etc.
['address_num', 'west_east', 'street_num', 'ave_num']
Categorical feature generated by CV
['building_id', 'manager_id', 'display_address', 'building_id_medium', 'building_id_high', 'manager_id_medium', 'manager_id_high']
Miscellous features
['weekdays', 'manager_count', 'building_count', 'num_features', 'num_description_words', 'days', 'num_photos', 'created_month', 'created_day', 'created_hour']
Use GBM to predict quantile house price using GBM quantile regression
['gbm_quant_lat_long']


In [476]:
# CV study for the SVM model with feature-importance reporting disabled.
# In the recorded run the "proba" mean score is nan (with numpy mean warnings)
# and the accuracy is about 0.6947 in every fold.
# NOTE(review): that accuracy equals the logistic model's "pred" score, which
# may mean a single class is being predicted - confirm in the model code.
model_svm.cv_study(nImportance=0)

Accuracy = 0.694661128558
Accuracy = 0.694661128558
Accuracy = 0.694661128558
Accuracy = 0.694661128558
Accuracy = 0.694770976895
proba mean score=nan, std=nan
all accuracies----
[0.69466112855840345, 0.69466112855840345, 0.69466112855840345, 0.69466112855840345, 0.69477097689501421]
accuracy mean score=0.694683098226, std=4.39393346443e-05


  ret = ret.dtype.type(ret / rcount)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
