In [1]:
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from xgboost import XGBClassifier,XGBRegressor
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.linear_model import LinearRegression



In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from feature_extraction import *

from model import *

In [4]:
def read_data(data_path='../input/'):
    
    train_file = data_path + "train.json"
    test_file = data_path + "test.json"
    train_df = pd.read_json(train_file)
    test_df = pd.read_json(test_file)
    interest_map = {'low':0,'medium':1,'high':2}
    train_df['interest_level'] = train_df['interest_level'].map(interest_map)
    fmt = lambda s: s.replace("\u00a0", "").strip().lower()
    train_df["street_address"] = train_df['street_address'].apply(fmt)
    train_df["display_address"] = train_df["display_address"].apply(fmt)
    
    test_df["street_address"] = test_df['street_address'].apply(fmt)
    test_df["display_address"] = test_df["display_address"].apply(fmt)
            
    return train_df,test_df

def write_output(preds,test_df,prefix=''):
    out_df = pd.DataFrame(preds)
    out_df.columns = ["low", "medium", "high"]
    out_df["listing_id"] = test_df.listing_id.values
    
    import time
    filename = prefix + time.strftime("%m.%d.") + '.csv'
    out_df.to_csv(filename, index=False)

In [5]:
def basic_feature(train_df_,test_df_):    
    train_df = train_df_.copy()
    test_df = test_df_.copy()
    
    train_df["created"], test_df["created"] = pd.to_datetime(train_df["created"]), pd.to_datetime(test_df["created"])
    test_df['days'], train_df['days'] = test_df['created']-pd.to_datetime('2016-04-01'), train_df['created'] - pd.to_datetime('2016-04-01')
    test_df['hours'], train_df['hours'] = test_df['days']/np.timedelta64(1,'h'), train_df['days']/np.timedelta64(1,'h')
    test_df['days'], train_df['days'] = test_df['days']/np.timedelta64(1, 'D'), train_df['days']/np.timedelta64(1,'D')
    
    train_df['hours'], test_df['hours'] = train_df['hours'].map(int), test_df['hours'].map(int)
    gp = train_df.append(test_df).groupby('hours').size()
    gp.name = 'hour_size'
    gp = gp.reset_index()
    train_df = pd.merge(train_df,gp,on='hours',how='left')
    test_df = pd.merge(test_df,gp,on='hours',how='left')
    del test_df['hours']; del train_df['hours']
    
    train_df['weekdays'], test_df['weekdays'] = train_df['created'].map(lambda x:x.weekday()), test_df['created'].map(lambda x:x.weekday())

    # Features from date columns #
    train_df["created_month"], test_df["created_month"] = train_df["created"].dt.month, test_df["created"].dt.month
    train_df["created_day"], test_df["created_day"] = train_df["created"].dt.day, test_df["created"].dt.day
    train_df["created_hour"], test_df["created_hour"]= train_df["created"].dt.hour, test_df["created"].dt.hour
    # count of photos #
    train_df["num_photos"], test_df["num_photos"]  = train_df["photos"].apply(len), test_df["photos"].apply(len)

    # count of "features" #
    train_df["num_features"], test_df["num_features"] = train_df["features"].apply(len), test_df["features"].apply(len)

    # count of words present in description column #
    train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
    test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))
    
    tmp = train_df.append(test_df)
    gp = tmp.groupby('manager_id').size()
    gp.name = 'manager_count'
    gp = gp.reset_index()
    train_df, test_df = pd.merge(train_df,gp,how='left'), pd.merge(test_df,gp,how='left')
    
    gp = tmp.groupby('building_id').size()
    gp.name = 'building_count'
    gp = gp.reset_index()
    train_df, test_df = pd.merge(train_df,gp,how='left'), pd.merge(test_df,gp,how='left')
    
    train_df['zero_building_id'] = train_df['building_id'].map(lambda x:int(x=='0'))
    test_df['zero_building_id'] = test_df['building_id'].map(lambda x:int(x=='0'))
    fea_list = ['hour_size','zero_building_id','weekdays',"num_features","num_description_words",
                "days","num_photos", "created_month", "created_day", "created_hour",'manager_count','building_count']
    
    return train_df,test_df,fea_list

In [6]:
def residue_fea(train_df,test_df):
    if 'days' not in train_df.columns:
        train_df["created"], test_df["created"] = pd.to_datetime(train_df["created"]), pd.to_datetime(test_df["created"])
        test_df['days'], train_df['days'] = test_df['created']-pd.to_datetime('2016-04-01'), train_df['created'] - pd.to_datetime('2016-04-01')
        
    train_df['dayx20'] = train_df['days'].map(lambda x:int(x*20))
    gp = train_df.groupby('dayx20',as_index=False)['listing_id'].min()
    gp['dayx20'] = gp['dayx20']/20.
    
    lin = LinearRegression()
    X = gp['dayx20'].as_matrix().reshape(len(gp),1)
    y = np.array(gp['listing_id'])
    lin.fit(X,y)
    
    train_df['listing_id_residue'] = train_df['days'].map(lambda x:x*lin.coef_[0] + lin.intercept_)
    train_df['listing_id_residue'] = train_df['listing_id'] - train_df['listing_id_residue'] 
    test_df['listing_id_residue'] = test_df['days'].map(lambda x:x*lin.coef_[0] + lin.intercept_)
    test_df['listing_id_residue'] = test_df['listing_id'] - train_df['listing_id_residue'] 
    
    train_df['int_days'] = train_df['days'].map(lambda x:int(x))
    train_df['intra'] = train_df['days'] - train_df['int_days']
    train_df['intra'] = train_df['intra'].map(lambda x:int(x>0.3))
    
    test_df['int_days'] = test_df['days'].map(lambda x:int(x))
    test_df['intra'] = test_df['days'] - train_df['int_days']
    test_df['intra'] = test_df['intra'].map(lambda x:int(x>0.3))
    del train_df['dayx20']
    return train_df,test_df,['int_days','intra','listing_id_residue']

In [7]:
def careful_avg(train_df_,test_df_,agg_keys,mask,suffix,debug=False):
    """
    agg_keys: the list of column names groupby is based on, e.g.['manager_id','bedrooms','bathrooms']
    mask has same length of train_df, used for cross validation
    mask=1 training, masking=0 validation
    suffix is for the resulted feature names
    """
    train_df = train_df_.copy()
    test_df = test_df_.copy()
    
    print train_df.shape
    print test_df.shape
    
    train_df['target'] = train_df['interest_level']
    test_df['target'] = -1
    
    train_df.loc[train_df['mask']==0,'target'] = -1
    alldf = train_df.append(test_df)
    alldf['target'] = alldf['target'].fillna(-1)
    alldf['mask'] = alldf['mask'].fillna(0)
    
    if debug:
        debug_list =  set(['401 w 25th st.','30 west 63rd street'])
        alldf = alldf[alldf['street_address'].isin(debug_list)]
        import pdb;pdb.set_trace()
    
    global_avg = train_df[train_df['mask']==1].groupby('target').count()*1.0/np.sum(mask)
    columns = ['#records_'+suffix,'#train_record_'+suffix,'mean_price_apt_'+suffix,'low_avg_'+suffix,
               'med_avg_'+suffix,'high_avg_'+suffix]
    low_avg_name = 'low_avg_'+suffix
    med_avg_name = 'med_avg_'+suffix
    high_avg_name = 'high_avg_'+suffix
    gp = alldf.groupby(agg_keys)
    res = gp['mask'].agg({'#train_record_'+suffix:np.sum,
                         '#records_'+suffix:np.size})
    res['mean_price_apt_'+suffix] = gp['price'].mean()
    res2 = gp['target'].agg({low_avg_name:lambda x:np.sum(x==0)*1.,
                            med_avg_name:lambda x:np.sum(x==1)*1.,
                            high_avg_name:lambda x:np.sum(x==2)*1.,
                            'total':lambda x:np.sum(x!=-1)})
    res = res.join(res2)
    res = res.reset_index()
    res = pd.merge(alldf[['listing_id','mask','target'] + agg_keys],res,on=agg_keys,how='left')
    res.loc[res['target']==0,low_avg_name] = res.loc[res['target']==0,low_avg_name] - 1
    res.loc[res['target']==1,med_avg_name] = res.loc[res['target']==1,med_avg_name] - 1
    res.loc[res['target']==2,high_avg_name] = res.loc[res['target']==2,high_avg_name] - 1
    res.loc[res['target']>=0,'total'] = res.loc[res.target>=0,'total'] - 1
    res.loc[res['total']>0,low_avg_name] = res.loc[res['total']>0,low_avg_name]/res.loc[res['total']>0,'total']
    res.loc[res['total']>0,med_avg_name] = res.loc[res['total']>0,med_avg_name]/res.loc[res['total']>0,'total']
    res.loc[res['total']>0,high_avg_name] = res.loc[res['total']>0,high_avg_name]/res.loc[res['total']>0,'total']
    res.loc[res['total']==0,low_avg_name] = res.loc[res['total']>0,low_avg_name].mean()
    res.loc[res['total']==0,med_avg_name] = res.loc[res['total']>0,med_avg_name].mean()
    res.loc[res['total']==0,high_avg_name] = res.loc[res['total']>0,high_avg_name].mean()
    
    
    if debug:
        res.to_csv('debug_avg.csv',index=False,encoding='utf8')
        raise NameError('Debug exit')
    train_df = pd.merge(train_df,res[columns+['listing_id']],how='left',on=['listing_id'])
    test_df = pd.merge(test_df,res[columns+['listing_id']],how='left',on=['listing_id'])
    
    del train_df['target']
    
    del test_df['target']
    
    return train_df,test_df,columns

In [8]:
def past_future(train_df_,test_df_,agg_keys,mask,suffix,debug=False):
    
    train_df = train_df_.copy()
    test_df = test_df_.copy()
    
    test_df['target'] = -1
    
    train_df['target'] = train_df['interest_level']
    train_df.loc[train_df['mask']==0,'target'] = -1
    alldf = train_df.append(test_df)
    alldf['target'] = alldf['target'].fillna(-1)
    alldf['mask'] = alldf['mask'].fillna(0)
    if debug:
        debug_list =  set(['401 w 25th st.','30 west 63rd street'])
        alldf = alldf[alldf['street_address'].isin(debug_list)]
        import pdb;pdb.set_trace()
    if 'days' not in alldf.columns:
        raise NameError("This function should be called after basic_feature()")
    alldf = alldf.sort_values('days')
    alldf.index=range(len(alldf))
    #gp = alldf.groupby(['manager_id','building_id','bathrooms','bedrooms','street_address'])
    gp = alldf.groupby(agg_keys)
    alldf['last_listing_day_'+suffix] = gp['days'].transform(lambda x:x.diff(1)).fillna(0)    
    alldf['next_listing_days_'+suffix] = gp['days'].transform(lambda x:x.diff(-1)).fillna(0)    
    alldf['last_listing_price_'+suffix] = gp['price'].transform(lambda x:x.diff(1)).fillna(0)
    alldf['next_listing_price_'+suffix] = gp['price'].transform(lambda x:x.diff(-1)).fillna(0)
    
    alldf['last_listing_interest_'+suffix] = gp['target'].transform(lambda x:x.shift(1)).fillna(-1)
    alldf['next_listing_interest_'+suffix] = gp['target'].transform(lambda x:x.shift(-1)).fillna(-1)
    
    if debug:
        alldf.to_csv('debug.csv',index=False,encoding='utf8')
        raise NameError('Exit by Debug')
        
    columns = ['last_listing_day_'+suffix,'next_listing_days_'+suffix,'last_listing_price_'+suffix,
              'next_listing_price_'+suffix,'last_listing_interest_'+suffix,'next_listing_interest_'+suffix]
    
    train_df = pd.merge(train_df,alldf[columns+['listing_id']],how='left',on=['listing_id'])
    test_df = pd.merge(test_df,alldf[columns+['listing_id']],how='left',on=['listing_id'])
    
    del train_df['target']
    
    del test_df['target']
    return train_df,test_df,columns
    
        


In [30]:
def runXGB_sklearn(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=10, num_rounds=5000,verbose=False):

    clf = XGBClassifier(n_estimators=num_rounds,
                            objective='multi:softprob',
                            learning_rate=0.003,
                            max_depth=6,
                            min_child_weight=1,
                            subsample=.7,
                            colsample_bytree=.7,
                            colsample_bylevel=.5,
                            gamma=0.005,
                            scale_pos_weight=1,
                            base_score=.5,
                            #reg_lambda=0,
                            #reg_alpha=0,
                            #missing=0,
                            seed=seed_val)
    
    if test_y is not None:
        clf.fit(train_X, train_y,eval_set=[(train_X, train_y), (test_X, test_y)],verbose=verbose,eval_metric='mlogloss',
            early_stopping_rounds=50)
    else:        
        clf.fit(train_X, train_y,verbose=False)
    pred_test_y = clf.predict_proba(test_X)
    return pred_test_y, clf


In [10]:
def simple_cv(train_df,test_df,fealist):
    train_X = train_df[fealist].as_matrix()
    test_X = test_df[fealist].as_matrix()
    train_y = np.array(train_df['interest_level'])
    
    cv_scores = [] 
    #print fea_categorical
    #print fea_additional
    #print feature_params
    kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=22)
    for dev_index, val_index in kf.split(train_X,train_y):
        dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB_sklearn(dev_X, dev_y, val_X, val_y,verbose=False)
        print('best iterations:{}, best_score={}, last_score={}'.format(model.best_iteration,
                                                                   model.best_score,log_loss(val_y, preds)))
        importance_inx = np.argsort(model.feature_importances_*-1)
        print('Most important 40 features:')
        ff = [(fealist[x],model.feature_importances_[x]) for x in importance_inx[:40]]
        print(ff)
        print('-------------------------')
          
    
        cv_scores.append(log_loss(val_y, preds))
    print 'mean score={}'.format(np.mean(cv_scores))

In [11]:
class AddrBuildingAggregate(BaseFeatureExtraction):
    def __init__(self):
        super(AddrBuildingAggregate,self).__init__()
        self.method_info = "aggregation based on building or street address, take the price difference"
        
    

    def transform(self,train_df_,test_df_):    
        if 'west_east' not in train_df_.columns:
            raise NameError('This function should be called after the AddressFeature')
        df1 = train_df_.copy()
        df2 = test_df_.copy()
        df1['source'] = 1
        df2['source'] = 2
        df = df1.append(df2)
        df = df.reset_index(drop=True)
        
        def local_agg(df,agg_keys,fea_name):            
            p = df.groupby(agg_keys)['price'].mean()
            p.name = fea_name
            p = p.reset_index()
            df = pd.merge(df,p,how='left',on=agg_keys)
            df[fea_name] = df['price']-df[fea_name]
            return df
        
        local_agg(df,['building_id','bathrooms','bedrooms'],'building_price')
        local_agg(df,['building_id','bedrooms'],'building_bed_price')
        local_agg(df,['street_address','bathrooms','bedrooms'],'stradd_price')
        local_agg(df,['street_address','bedrooms'],'stradd_bed_price')
        
        df1 = df[df['source']==1].copy()
        df2 = df[df['source']==2].copy()
        del df1['source']
        del df2['source']
        del df
    
        return df1,df2,['building_price','building_bed_price','stradd_price','stradd_bed_price']
        

In [27]:
def BayesCategoricalDistribution(train_df_,test_df_,agg_keys,mask,suffix,multiple=5.):
    
    if len(mask)!=len(train_df_):
        raise ValueError("Length of mask should be equal to length of train_df")
    
    train_df = train_df_.copy()
    test_df = test_df_.copy()    
    train_df['target'] = train_df['interest_level']
    test_df['target'] = -1
    test_df['mask'] = 0
    #train_df['mask'] = mask    
    train_df.loc[train_df['mask']==0,'target'] = -1
    alldf = train_df.append(test_df)
    
    gp = alldf.groupby(agg_keys).size()
    gp.name = 'size'
    gp = gp.reset_index()
    alldf = pd.merge(alldf,gp,how='left')
    
    tmpdf = alldf.loc[alldf['mask']==1,['size','target']].copy()
    tmpdf['low_global'] = tmpdf['target'].map(lambda x:int(x==0))
    tmpdf['medium_global'] = tmpdf['target'].map(lambda x:int(x==1))
    tmpdf['high_global'] = tmpdf['target'].map(lambda x:int(x==2))
    tmpdf['total_global'] = 1.
    tmpdf = tmpdf.groupby('size').sum()
    tmpdf['low_global'] = tmpdf['low_global']/tmpdf['total_global']
    tmpdf['medium_global'] = tmpdf['medium_global']/tmpdf['total_global']
    tmpdf['high_global'] = tmpdf['high_global']/tmpdf['total_global']
    tmpdf = tmpdf.reset_index()
    del tmpdf['target']
    import pdb;pdb.set_trace()
    
    alldf = pd.merge(alldf,tmpdf,how='left',on='size')
    columns = ['#train_record_'+suffix,'low_avg_'+suffix,'med_avg_'+suffix,'high_avg_'+suffix]
    low_avg_name = 'low_avg_'+suffix
    med_avg_name = 'med_avg_'+suffix
    high_avg_name = 'high_avg_'+suffix
    
    gp = alldf.groupby(agg_keys)
    res = gp['target'].agg({low_avg_name:lambda x:np.sum(x==0)*1.,
                            med_avg_name:lambda x:np.sum(x==1)*1.,
                            high_avg_name:lambda x:np.sum(x==2)*1.,
                            'total':lambda x:np.sum(x!=-1)})
    res2 = gp['mask'].agg({'#train_record_'+suffix:np.sum,})
    res = res.join(res2)
    res = res.reset_index()
    
    res = pd.merge(alldf[['listing_id','mask','target','low_global','medium_global','high_global'] + agg_keys],
                   res,on=agg_keys,how='left')
    res.loc[res['target']==0,low_avg_name] = res.loc[res['target']==0,low_avg_name] - 1
    res.loc[res['target']==1,med_avg_name] = res.loc[res['target']==1,med_avg_name] - 1
    res.loc[res['target']==2,high_avg_name] = res.loc[res['target']==2,high_avg_name] - 1
    res.loc[res['target']>=0,'total'] = res.loc[res.target>=0,'total'] - 1
    res[low_avg_name] = (res[low_avg_name] + res['low_global']*multiple)/(res['total']+multiple)
    res[med_avg_name] = (res[med_avg_name] + res['medium_global']*multiple)/(res['total']+multiple)
    res[high_avg_name] = (res[high_avg_name] + res['high_global']*multiple)/(res['total']+multiple)
    
    train_df = pd.merge(train_df,res[columns+['listing_id']],how='left',on=['listing_id'])
    test_df = pd.merge(test_df,res[columns+['listing_id']],how='left',on=['listing_id'])
    del train_df['target']
    del test_df['target']
    del train_df['mask']
    del test_df['mask']
    
    return train_df,test_df,columns

In [12]:
print_col = ['created','listing_id','price']

In [32]:
txt_fea = TextFeature()
mis_fea = Miscellous()
addr_fea = AddressFeature()
gbm_quant_fea = GbmQuantPrice(['days','latitude','longitude'],'gbm_quant_lat_long')
addr_aggr_fea = AddrBuildingAggregate()
cat_encoding_fea = CategoricalEncoding()
cat_cv_fea = Categorical_cv(nfold = 5)

In [14]:
core_feature = ['bathrooms','bedrooms','latitude','longitude','price','listing_id']

In [20]:
train_df,test_df = read_data()
print train_df[print_col].head()

                    created  listing_id  price
10      2016-06-24 07:54:24     7211212   3000
10000   2016-06-12 12:19:27     7150865   5465
100004  2016-04-17 03:26:41     6887163   2850
100007  2016-04-18 02:22:02     6888711   3275
100013  2016-04-28 01:32:41     6934781   3350


In [21]:
train_df,test_df,addr_fealist = addr_fea.transform(train_df,test_df)
train_df,test_df,basic_fealist = mis_fea.transform(train_df,test_df)
train_df,test_df,fealist_txt = txt_fea.transform(train_df,test_df)
train_df,test_df,residue_fealist = residue_fea(train_df,test_df)
train_df,test_df,gbm_quant_fealist = gbm_quant_fea.transform(train_df,test_df)
train_df,test_df,addr_aggr_fealist = addr_aggr_fea.transform(train_df,test_df)
train_df,test_df,cate_encoding_fealist = cat_encoding_fea.transform(train_df,test_df)
print train_df[print_col].head()

              created  listing_id  price
0 2016-06-24 07:54:24     7211212   3000
1 2016-06-12 12:19:27     7150865   5465
2 2016-04-17 03:26:41     6887163   2850
3 2016-04-18 02:22:02     6888711   3275
4 2016-04-28 01:32:41     6934781   3350


In [31]:
train_df_original = train_df.copy()
test_df_original = test_df.copy()

kf = model_selection.StratifiedKFold(n_splits=5,shuffle=True,random_state=22)
train_df_original.index=range(len(train_df))
train_y = train_df_original['interest_level'].as_matrix().ravel()
ii = 1
cv_scores=[]
core_feature = ['bathrooms','bedrooms','latitude','longitude','price','listing_id']
print_col = ['created','listing_id','price']
for dev_index,val_index in kf.split(train_df[['listing_id','price']].as_matrix(),train_y):
    
    mask = np.ones(len(train_df))
    mask[val_index] = 0
    train_df_original['mask'] = mask
    test_df_original['mask'] = 0
    #fea_past_fut_manager_building = []
    #train_df,test_df,fea_past_fut_manager_building = past_future(train_df_original,test_df_original,
    #                                agg_keys=['manager_id','building_id','bathrooms','bedrooms','street_address'],
    #                                mask=mask,suffix='manager_building')
    #print train_df[print_col].head(5)
    
    #train_df,test_df,fea_avg_building_bed_bath = careful_avg(train_df,test_df,
    #                                agg_keys=['building_id','street_address','bedrooms','bathrooms'],
    #                                mask=mask,suffix='building_bed_bath')    
    #print train_df_[print_col].head(5)
    #train_df,test_df,fea_avg_building = careful_avg(train_df,test_df,agg_keys=['building_id'],
    #                                   mask=mask,suffix='building')
    #print train_df[print_col].head(5)
    #nmd = train_df.copy()
    #del nmd['description']
    #del nmd['features']
    #del nmd['photos']
    #nmd.to_csv('nmd.csv',index=False,encoding='utf8')
    
    #import pdb;pdb.set_trace()
    train_df,test_df,fea_avg_manager = BayesCategoricalDistribution(train_df,test_df,agg_keys=['manager_id'],
                                       mask=mask,suffix='manager')
    print train_df[print_col].head(5)
    
    #train_df,test_df,fea_past_fut_building = past_future(train_df,test_df,
    #                                agg_keys=['building_id','bathrooms','bedrooms','street_address'],
    #                                mask=mask,suffix='building')
    #print train_df[print_col].head(5)

    #fealist = core_feature + basic_fealist + residue_fealist + fea_avg_building_bed_bath + fea_avg_building + fea_avg_manager + fea_past_fut_manager_building + fea_past_fut_building
    #fealist = core_feature + basic_fealist + residue_fealist  + fea_avg_building + fea_avg_manager + fea_past_fut_manager_building 
    fealist0 = core_feature + basic_fealist + fealist_txt + residue_fealist + addr_fealist + gbm_quant_fealist + addr_aggr_fealist + cate_encoding_fealist
    fealist1 = fealist0 + fea_avg_manager
    def runone(train_df,train_y,mask,fealist):
        X_train = train_df.loc[mask==1,fealist]
        print X_train.shape
        y_train = train_y[mask==1]
        X_test = train_df.loc[mask==0,fealist]
        y_test = train_y[mask==0]
        preds,model = runXGB_sklearn(X_train,y_train,X_test,y_test)
        print('best iterations:{}, best_score={}, last_score={}'.format(model.best_iteration,
                                                                   model.best_score,log_loss(y_test, preds)))
        print log_loss(y_test,preds)
    
    #print('best iterations:{}, best_score={}, last_score={}'.format(model.best_iteration,
#                                                                   model.best_score,log_loss(y_test, preds)))
    #importance_inx = np.argsort(model.feature_importances_*-1)
    #print('Most important 40 features:')
    #ff = [(fealist[x],model.feature_importances_[x]) for x in importance_inx[:40]]
    #print(ff)    
    #cv_scores.append(log_loss(y_test,preds)); print(cv_scores)
    #print 'basic ---'
    #runone(train_df,train_y,mask,fealist0)
    
    print 'more -----'
    runone(train_df,train_y,mask,fealist1)
    
    
print 'mean score={}'.format(np.mean(cv_scores))
    

> <ipython-input-27-503abeab8235>(33)BayesCategoricalDistribution()
-> alldf = pd.merge(alldf,tmpdf,how='left',on='size')
(Pdb) c
              created  listing_id  price
0 2016-06-24 07:54:24     7211212   3000
1 2016-06-12 12:19:27     7150865   5465
2 2016-04-17 03:26:41     6887163   2850
3 2016-04-18 02:22:02     6888711   3275
4 2016-04-28 01:32:41     6934781   3350
more -----
(39481, 316)
best iterations:3454, best_score=0.553947, last_score=0.554042467929
0.554042467929
> <ipython-input-27-503abeab8235>(33)BayesCategoricalDistribution()
-> alldf = pd.merge(alldf,tmpdf,how='left',on='size')
(Pdb) q


BdbQuit: 

In [None]:
tmp = train_df.groupby('manager_id')

In [None]:
              created  listing_id  price
0 2016-06-24 07:54:24     7211212   3000
1 2016-06-12 12:19:27     7150865   5465
2 2016-04-17 03:26:41     6887163   2850
3 2016-04-18 02:22:02     6888711   3275
4 2016-04-28 01:32:41     6934781   3350
              created  listing_id  price
0 2016-06-24 07:54:24     7211212   3000
1 2016-06-12 12:19:27     7150865   5465
2 2016-04-17 03:26:41     6887163   2850
3 2016-04-18 02:22:02     6888711   3275
4 2016-04-28 01:32:41     6934781   3350
basic ---
(39481, 21)
0.555284040866
more -----
(39481, 27)
0.553684755985
              created  listing_id  price
0 2016-06-24 07:54:24     7211212   3000
1 2016-06-12 12:19:27     7150865   5465
2 2016-04-17 03:26:41     6887163   2850
3 2016-04-18 02:22:02     6888711   3275
4 2016-04-28 01:32:41     6934781   3350
              created  listing_id  price
0 2016-06-24 07:54:24     7211212   3000
1 2016-06-12 12:19:27     7150865   5465
2 2016-04-17 03:26:41     6887163   2850
3 2016-04-18 02:22:02     6888711   3275
4 2016-04-28 01:32:41     6934781   3350
basic ---
(39481, 21)
0.563221004316
more -----
(39481, 27)
0.561319460851
              created  listing_id  price
0 2016-06-24 07:54:24     7211212   3000
1 2016-06-12 12:19:27     7150865   5465
2 2016-04-17 03:26:41     6887163   2850
3 2016-04-18 02:22:02     6888711   3275
4 2016-04-28 01:32:41     6934781   3350
              created  listing_id  price
0 2016-06-24 07:54:24     7211212   3000
1 2016-06-12 12:19:27     7150865   5465
2 2016-04-17 03:26:41     6887163   2850
3 2016-04-18 02:22:02     6888711   3275
4 2016-04-28 01:32:41     6934781   3350
basic ---
(39481, 21)
0.554102758386
more -----
(39481, 27)
0.553038442705
              created  listing_id  price
0 2016-06-24 07:54:24     7211212   3000
1 2016-06-12 12:19:27     7150865   5465
2 2016-04-17 03:26:41     6887163   2850
3 2016-04-18 02:22:02     6888711   3275
4 2016-04-28 01:32:41     6934781   3350
              created  listing_id  price
0 2016-06-24 07:54:24     7211212   3000
1 2016-06-12 12:19:27     7150865   5465
2 2016-04-17 03:26:41     6887163   2850
3 2016-04-18 02:22:02     6888711   3275
4 2016-04-28 01:32:41     6934781   3350
basic ---
(39481, 21)
0.559244335347
more -----
(39481, 27)
0.557624528681
              created  listing_id  price
0 2016-06-24 07:54:24     7211212   3000
1 2016-06-12 12:19:27     7150865   5465
2 2016-04-17 03:26:41     6887163   2850
3 2016-04-18 02:22:02     6888711   3275
4 2016-04-28 01:32:41     6934781   3350
              created  listing_id  price
0 2016-06-24 07:54:24     7211212   3000
1 2016-06-12 12:19:27     7150865   5465
2 2016-04-17 03:26:41     6887163   2850
3 2016-04-18 02:22:02     6888711   3275
4 2016-04-28 01:32:41     6934781   3350
basic ---
(39484, 21)
0.561456910202
more -----
(39484, 27)
0.560915910552
mean score=nan