In [477]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from xgboost import XGBClassifier,XGBRegressor
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import hashlib
import re
from sklearn.model_selection import GridSearchCV,StratifiedKFold

In [478]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [479]:
from feature_extraction import *

In [498]:
from model import *

In [481]:
from utils import *

In [482]:
# Load the data with the project's read_data helper (brought in by one of the
# star imports above — presumably utils; confirm). It returns a pair, bound
# here as train_df and test_df.
train_df,test_df = read_data()

In [483]:
# Extractor for the "Categorical feature generated by CV" group reported when a
# model is built. nfold=5 is presumably the number of folds used to build those
# features — confirm in feature_extraction.
cat_cv_fea = Categorical_cv(nfold = 5)

In [484]:
# NOTE(review): cat_lit_fea is not passed to any feature_list in the visible
# cells — confirm whether it is still needed.
cat_lit_fea = CategoricalLit(nfold=5)

In [485]:
# Text feature extractor; appears to be the one that reports itself as
# "Keyword features extract from the column features" when a model is built
# (it is first in every feature_list and that message is printed first).
txt_fea = TextFeature()

In [486]:
# Address feature extractor; judging by the recorded model output it appears to
# produce 'address_num', 'west_east', 'street_num' and 'ave_num' — confirm.
addr_fea = AddressFeature()

In [487]:
# NOTE(review): the meaning of the positional 0 is not visible here — TODO name
# or document it. quant_fea is also not passed to any feature_list in the
# visible cells.
quant_fea = PriceQuantileFeature(0)

In [488]:
# GBM quantile-price feature. The second argument shows up as the generated
# column name ('gbm_quant_lat_long') in the recorded model output; the first
# is presumably the list of input columns the regression is fitted on — confirm.
gbm_quant_fea = GbmQuantPrice(['latitude','longitude'],'gbm_quant_lat_long')

In [489]:
# Miscellaneous feature extractor; the recorded model output lists counts and
# date parts such as 'manager_count', 'num_photos' and 'created_hour' under its
# heading — confirm the full set in feature_extraction.
mis_fea = Miscellous()

In [499]:
# Build the project's GBM_model on the text, address, CV-categorical and
# miscellaneous extractors. Construction prints each extractor's description
# and generated column names (see the recorded output below).
gmb = GBM_model(
    train_df,
    test_df,
    feature_list=[txt_fea, addr_fea, cat_cv_fea, mis_fea],
    target_train='interest_level'
)

Keyword features extract from the column features
Stree feature: street number, address number, etc.
['address_num', 'west_east', 'street_num', 'ave_num']
Categorical feature generated by CV
['building_id', 'manager_id', 'display_address', 'building_id_medium', 'building_id_high', 'manager_id_medium', 'manager_id_high']
Miscellous features
['weekdays', 'manager_count', 'building_count', 'num_features', 'num_description_words', 'days', 'num_photos', 'created_month', 'created_day', 'created_hour']


In [None]:
# Grid passed to gmb.gridsearch_cv. Only n_estimators and max_depth vary
# (2 x 2 = 4 candidates); every other entry is pinned to a single value.
# NOTE(review): if these names are forwarded to scikit-learn's gradient
# boosting classifier, loss='deviance' was renamed 'log_loss' in newer
# releases — confirm before upgrading scikit-learn.
param_grid = dict(
    loss=['deviance'],
    learning_rate=[0.01],
    n_estimators=[2000, 3000],
    subsample=[0.8],
    min_samples_split=[10],
    min_samples_leaf=[2],
    max_depth=[6, 8]
)
gmb.gridsearch_cv(param_grid, verbose=1)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [493]:
# NOTE(review): scratch variable — `a` is not referenced by any visible cell;
# consider deleting this cell.
a={'a':4,'b':5}

In [232]:
# XGBoost model on the four base extractors (no GBM quantile-price feature),
# followed by a cross-validation study.
xgb1 = XGB_model(
    train_df,
    test_df,
    feature_list=[txt_fea, addr_fea, cat_cv_fea, mis_fea],
    target_train='interest_level'
)
xgb1.cv_study(nImportance=10, verbose=False, n_estimators=5000)

Keyword features extract from the column features
Stree feature: street number, address number, etc.
Categorical feature generated by CV
Miscellous features


In [233]:
# Configure the final classifier with 4000 (presumably the number of boosting
# rounds, picked after the cv_study run — confirm against XGB_model), then fit
# and keep the predicted probabilities in pred1.
xgb1.set_classifier(4000)
pred1 = xgb1.fit_predict_proba()

In [234]:
# Hand pred1 and test_df to the project's write_output helper. The string looks
# like an output-name prefix listing the feature groups used — confirm.
write_output(pred1,test_df,'txt_addr_catcv_mis_')

In [235]:
# Same XGBoost setup as xgb1, with the GBM quantile-price feature added to the
# feature list, followed by a cross-validation study.
xgb2 = XGB_model(
    train_df,
    test_df,
    feature_list=[txt_fea, addr_fea, cat_cv_fea, mis_fea, gbm_quant_fea],
    target_train='interest_level'
)
xgb2.cv_study(nImportance=10, verbose=False, n_estimators=5000)

Keyword features extract from the column features
Stree feature: street number, address number, etc.
Categorical feature generated by CV
Miscellous features
Use GBM to predict quantile house price using GBM quantile regression


In [238]:
# Configure xgb2's final classifier with 4000 (presumably boosting rounds —
# confirm against XGB_model), fit, and write the predicted probabilities.
# NOTE(review): 'gbmQaunt' in the output prefix looks like a typo for
# 'gbmQuant'; left as-is because it determines the written output's name.
xgb2.set_classifier(4000)
pred2 = xgb2.fit_predict_proba()
write_output(pred2,test_df,'txt_addr_catcv_mis_gbmQaunt_')

In [363]:
# Random-forest model on all five extractors, including the GBM
# quantile-price feature.
model_rf = RandomForest_model(
    train_df,
    test_df,
    feature_list=[txt_fea, addr_fea, cat_cv_fea, mis_fea, gbm_quant_fea],
    target_train='interest_level'
)

Keyword features extract from the column features
Stree feature: street number, address number, etc.
['address_num', 'west_east', 'street_num', 'ave_num']
Categorical feature generated by CV
['building_id', 'manager_id', 'display_address', 'building_id_medium', 'building_id_high', 'manager_id_medium', 'manager_id_high']
Miscellous features
['weekdays', 'manager_count', 'building_count', 'num_features', 'num_description_words', 'days', 'num_photos', 'created_month', 'created_day', 'created_hour']
Use GBM to predict quantile house price using GBM quantile regression
['gbm_quant_lat_long']


In [364]:
# Cross-validation study for the random forest; per the recorded output it
# prints the top nImportance features and the running list of fold scores.
# NOTE(review): the first fold score in the saved output (0.5683) matches the
# "500" entry in the notes cell that follows, so the saved output may predate
# the switch to n_estimators=1500 — re-run to confirm.
model_rf.cv_study(nImportance=10,n_estimators = 1500)

The most 10 important features are
manager_id_medium, importance=0.0665863069499
manager_id_high, importance=0.0622129739155
gbm_quant_lat_long, importance=0.0552751844989
price, importance=0.051376863147
building_id_medium, importance=0.0476516055195
building_id_high, importance=0.0452753255301
listing_id, importance=0.0343831113772
days, importance=0.0341003087583
num_description_words, importance=0.0338107106163
building_id, importance=0.0327003727601
[0.56828084670103385]
The most 10 important features are
manager_id_medium, importance=0.0667871900053
manager_id_high, importance=0.062113909366
gbm_quant_lat_long, importance=0.0547547305857
price, importance=0.0516672412976
building_id_medium, importance=0.0489324224662
building_id_high, importance=0.0454554521322
listing_id, importance=0.0344670387168
days, importance=0.0340783551556
num_description_words, importance=0.0338967508572
building_id, importance=0.0326302932589
[0.56828084670103385, 0.57031815518447682]
The most 10 impor

In [None]:
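# Manual notes — presumably random-forest n_estimators -> CV score from
# model_rf.cv_study runs (confirm):
# 500: 0.5682
# 800: 0.5669
# 1200: 0.5657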

In [470]:
# Logistic-regression model on the same five extractors as the random forest.
model_lr = Logistic_model(
    train_df,
    test_df,
    feature_list=[txt_fea, addr_fea, cat_cv_fea, mis_fea, gbm_quant_fea],
    target_train='interest_level'
)

Keyword features extract from the column features
Stree feature: street number, address number, etc.
['address_num', 'west_east', 'street_num', 'ave_num']
Categorical feature generated by CV
['building_id', 'manager_id', 'display_address', 'building_id_medium', 'building_id_high', 'manager_id_medium', 'manager_id_high']
Miscellous features
['weekdays', 'manager_count', 'building_count', 'num_features', 'num_description_words', 'days', 'num_photos', 'created_month', 'created_day', 'created_hour']
Use GBM to predict quantile house price using GBM quantile regression
['gbm_quant_lat_long']


In [471]:
# Cross-validation study for the logistic model with feature-importance
# reporting disabled (nImportance=0).
# NOTE(review): the recorded "pred mean score" (~0.6947, std ~1e-4) is almost
# constant across folds and equals the SVM accuracy further down, which
# suggests the hard predictions collapse to a single class — verify, and check
# whether the features need scaling.
model_lr.cv_study(nImportance=0,scoring='neg_log_loss',n_jobs=-1)

[0.73742073083311099]
[0.73742073083311099, 0.73293931020179703]
[0.73742073083311099, 0.73293931020179703, 0.73571465203896957]
[0.73742073083311099, 0.73293931020179703, 0.73571465203896957, 0.73452639184212698]
[0.73742073083311099, 0.73293931020179703, 0.73571465203896957, 0.73452639184212698, 0.73737914031443685]
proba mean score=0.735596045046, std=0.00171612194122
pred mean score=0.694662836854, std=9.25649009163e-05


In [475]:
# SVM model on the same five extractors as the random forest and logistic
# models.
model_svm = SVM_model(
    train_df,
    test_df,
    feature_list=[txt_fea, addr_fea, cat_cv_fea, mis_fea, gbm_quant_fea],
    target_train='interest_level'
)


Keyword features extract from the column features
Stree feature: street number, address number, etc.
['address_num', 'west_east', 'street_num', 'ave_num']
Categorical feature generated by CV
['building_id', 'manager_id', 'display_address', 'building_id_medium', 'building_id_high', 'manager_id_medium', 'manager_id_high']
Miscellous features
['weekdays', 'manager_count', 'building_count', 'num_features', 'num_description_words', 'days', 'num_photos', 'created_month', 'created_day', 'created_hour']
Use GBM to predict quantile house price using GBM quantile regression
['gbm_quant_lat_long']


In [476]:
# Cross-validation study for the SVM with feature-importance reporting
# disabled (nImportance=0).
# NOTE(review): the recorded run prints "proba mean score=nan" followed by
# numpy warnings, so the probability-score list is apparently empty for this
# model. The accuracy is also virtually identical on every fold (~0.6947),
# which suggests the SVM predicts a single class — verify, and check whether
# the features need scaling.
model_svm.cv_study(nImportance=0)

Accuracy = 0.694661128558
Accuracy = 0.694661128558
Accuracy = 0.694661128558
Accuracy = 0.694661128558
Accuracy = 0.694770976895
proba mean score=nan, std=nan
all accuracies----
[0.69466112855840345, 0.69466112855840345, 0.69466112855840345, 0.69466112855840345, 0.69477097689501421]
accuracy mean score=0.694683098226, std=4.39393346443e-05


  ret = ret.dtype.type(ret / rcount)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
