In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from xgboost import XGBClassifier,XGBRegressor
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import hashlib
import re
from sklearn.model_selection import GridSearchCV,StratifiedKFold

#from sklearn.preprocessing import LabelEncoder



# On Column 'features' -- cleaning, transforming

In [2]:
def text_fea(df1_,df2_,nTop=300,combine=True):
    ''' df1_ is training set
        df2_ is test set
    '''
    from collections import defaultdict
    df1 = df1_.copy() # Maybe ther eis better way to avoid SettingWithCopyWarning 
    df2 = df2_.copy()
    df1['source'] = 1
    df2['source'] = 2
    df = df1.append(df2)
    df = df.reset_index(drop=True)
    #df['features'] = df['features'].map(lambda x:[tt.lower() for tt in x])
    df['description'] = df['description'].map(lambda x:x.lower())
    #---------------------------
    #original length of the first feature, meant to capture those features typed in with wrong 
    # format -- all features are cramed into one phrase
    df['len_feature0'] = df['features'].map(lambda x:0 if len(x)==0 else len(x[0])) 
    #-----------------------------
    
    def fea_clean(x): 
        if len(x) == 1:
            tmp = x[0].strip('*').split('*')
            if len(tmp) ==1:
                tmp = tmp[0].split(u'\u2022')
            x = tmp
        ret = [tt.encode('utf-8').decode('unicode_escape').encode('ascii','ignore').lower().strip() for tt in x]
        return ret
    df['features'] = df['features'].map(fea_clean)
    
    
    all_fea = defaultdict(int)
    for _,row in df.iterrows():
        for xx in row['features']:
            all_fea[xx] += 1
    sorted_fea = sorted(all_fea.iteritems(),key=lambda (k,v): v,reverse=True)
    
    combined_fea = {'laundry in unit':['laundry in unit','in-unit washer/dryer','washer & dryer',
                                        'washer/dryer','washer/dryer in unit'],
                   'laundry in building':['laundry in building','laundry room',
                                           'washer/dryer in building','on-site laundry'],
                   'gym/fitness':['gym/fitness','fitness center','gym','gym in building'],
                   'pre-war':['pre-war','prewar'],
                    'live-in superintendent':['live-in superintendent','live-in super','live in super'],
                    'hardwood floors':['hardwood floors','hardwood','hardwood floor','hardwood flooring'],
                    'high ceiling':['high ceiling','high ceilings'],
                    'full-time doorman':['full-time doorman','ft doorman','24/7 doorman','24 hour doorman',
                                        '24-hour doorman','24hr doorman','full time doorman','24 hr doorman']
                   #'garage':['garage','parking']
                   }
    
    fea_list = set([v[0] for v in sorted_fea[:nTop]])
    for k,v in combined_fea.iteritems():
        fea_list = fea_list.union(set(v))
    for fea in fea_list:
        df[fea] = 0
    #import pdb;pdb.set_trace()
    for inx,row in df.iterrows():
        notlist = []
        if len(row['features']) == 0:
            continue
        for ff in row['features']:
            if ff in fea_list:
                df.set_value(inx,ff,1)
            else:
                notlist.append(ff)
        #df.set_value(inx,'description',row['description'] + ' '.join(notlist))
    print 'fea_list length is {}'.format(len(fea_list))
    fea_list = set(fea_list)
    if combine:        
        for k,v in combined_fea.iteritems():
            df[k] = df[v[0]]
            #print k,v
            for ii in range(1,len(v)):
                df[k] = df[k] + df[v[ii]]
                del df[v[ii]]
                fea_list.remove(v[ii])
    fea_list = list(fea_list) + ['len_feature0']
    #print 'fea_list length is {}'.format(len(fea_list))
    df1 = df[df['source']==1].copy()
    df2 = df[df['source']==2].copy()
    del df1['source']
    del df2['source']
    del df
    return df1,df2,fea_list

# Street_address and display address

In [3]:
def normalize_num(x):
    if x[-1]=='1':
        return x+'st'
    elif x[-1]=='2':
        return x+'nd'
    elif x[-1]=='3':
        return x+'rd'
    else:
        return x+'th'

def normalize_street(x):
    street_name_mapping = {'st.':'street','st':'street','st,':'street','st..':'street','street,':'street',
                       'ave':'avenue','ave.':'avenue','ave,':'avenue','avenue,':'avenue','pl':'place',
                       'blvd':'boulevard','pkwy':'parkway','dr':'drive','rd.':'road','rd,':'road','rd':'road',
                       'ln':'lane',
                       'e':'east','e.':'east','w.':'west','w':'west','west,':'west','s':'south','&':'and',
                       'second':'2nd','first':'1st','third':'3rd','fourth':'4th','fifth':'5th',
                       'sixth':'6th','seventh':'7th','eighth':'8th','ninth':'9th','tenth':'10th',                       
                       #'1':'1st','2':'2nd','43':'43rd','37':'37th','34':'34th',
                      }
    xlist = x.lower().strip(' .,').split()
    output = []
    for tt in xlist:
        tmp = tt.strip(',.*')
        if len(tmp)>0:
            if tmp in street_name_mapping:
                tmp = street_name_mapping[tmp]
            elif tmp.isdigit():
                tmp = normalize_num(tmp)
            else:
                pass
            output.append(tmp)
    return ' '.join(output).strip()      

def rem_streetname_xy(x,y):
    '''remove x from y
    '''
    pos = y.find(x)
    if pos>0:
        tmp = y.replace(x,'').strip(' ,.')
        tmp = tmp.split(' ')[0].strip(',. #')
        if '-' in tmp:        
            tmp = tmp.split('-')[0].strip(', .#')
        try:
            a = int(tmp)
            return a
        except ValueError:
            #print tmp
            return None
    else:
        return None   

def rem_streetname(row):    
    x = row['display_address']
    y = row['street_address']
    return rem_streetname_xy(x,y)


def get_address_num_simple(x):
    if len(x)==0:
        return -1
    
    x1 = x.strip().split()[0]
    if x1.isdigit():
        return float(x1)
    return -1

def manhattan_locale(df1_,df2_):
    '''
    extract street or avenue number from displayed address. 
    
    For those displayed address like 'w 3rd street and 5th avenue', all three fields will have values. 
    For a majority cases, one of the (west_east, street) or (avenue) will be null (0)
    '''
    
    df1 = df1_.copy()
    df2 = df2_.copy()
    df1['source'] = 1
    df2['source'] = 2
    df = df1.append(df2)
    df = df.reset_index(drop=True)
    
    ave_mapping = {'lexington avenue': '3.5 avenue',
                  'park avenue':'4 avenue',
                  'madison avenue':'4.5 avenue',
                  'central park west':'8 avenue',
                  'columbus avenue':'9 avenue',
                  'amsterdam avenue':'10 avenue',
                  'west end avenue':'11 avenue'
                  }
    addr_adjust = {'central park west':5000,
                  'columbus avenue':5000,
                  'amsterdam avenue':5000,
                  'west end avenue':5000        
                }       
    
    df['west_east'] = 0 #west = -1,east = 1, null=0
    df['street_num'] = 0 # street number e.g. 12nd street will be 12, if not on a street, empty
    df['ave_num'] = 0 #avenue number, if not on avenue, empty
    df['addr_num_adjust'] = 0 # for several avenues 
    
    #import pdb;pdb.set_trace()
    
    for inx,row in df.iterrows():
        addr_str = ''
        addr_ave = ''
        
        if ' and ' in row['display_address']:
            ss = row['display_address'].split(' and ')
            if len(ss)>2:
                continue
            if ' street' in ss[0] and ' avenue' in ss[1]:
                addr_str = ss[0].strip()
                addr_ave = ss[1].strip()
            elif ' street' in ss[1] and ' avenue' in ss[0]:
                addr_str = ss[1].strip()
                addr_str = ss[0].strip()
            else:
                continue
        else:            
            if ' street' in row['display_address']:
                addr_str = row['display_address'].strip()
            if ' avenue' in row['display_address']:
                addr_ave = row['display_address'].strip()
        
        if len(addr_str)>0:
            num = re.sub('\D+','',row['display_address'])
            if len(num)>0:
                fields = addr_str.split()
                west_east = 0
                ii=0
                while ii < len(fields):
                    if fields[ii].strip() == 'west':
                        west_east = -1
                        break
                    elif fields[ii].strip() == 'east':
                        west_east = 1
                        break
                    ii += 1
                if ii<len(fields)-2:                    
                    df.set_value(inx,'west_east',west_east)   
                    try:
                        df.set_value(inx,'street_num',float(re.sub('\D+','',fields[ii+1])))
                    except ValueError:
                        pass
                        #print addr_str
                        #return None,None
        if len(addr_ave)>0:
            adjust = 0
            if addr_ave in addr_adjust:
                adjust = addr_adjust[addr_ave]
            if addr_ave in ave_mapping:
                addr_ave = ave_mapping[addr_ave]
            try:
                df.set_value(inx,'ave_num',float(re.sub('\D+','',addr_ave.split()[0])))
            except ValueError:
                #print addr_ave
                pass
            df.set_value(inx,'addr_num_adjust',adjust)
    df['address_num'] = df['address_num'] + df['addr_num_adjust']    
    del df['addr_num_adjust']
    df1 = df[df['source']==1].copy()
    df2 = df[df['source']==2].copy()
    del df1['source']
    del df2['source']
    del df
    
    fea_list = ['west_east','street_num','ave_num']
    return df1,df2,fea_list
    

def address_proc(train_df_,test_df_):        
    train_df = train_df_.copy()
    test_df = test_df_.copy()
    
    train_df['address_num'] = train_df['street_address'].map(get_address_num_simple)
    test_df['address_num'] = test_df['street_address'].map(get_address_num_simple)
    train_df['display_address'] = train_df['display_address'].map(lambda x:normalize_street(x))
    test_df['display_address'] = test_df['display_address'].map(lambda x:normalize_street(x))
    train_df['street_address'] = train_df['street_address'].map(lambda x:normalize_street(x))
    test_df['street_address'] = test_df['street_address'].map(lambda x:normalize_street(x))
    
    flist = ['address_num']
    train_df,test_df,ff1 = manhattan_locale(train_df,test_df)
    #train_df,test_df,ff2 = multiple_hashing(train_df,test_df,'display_address')
    #train_df,test_df,ff2 = simple_hashing(train_df,test_df,'street_address')
    
    flist = flist + ff1
    return train_df,test_df,flist

# price quantile

In [4]:
def get_quantile_by_key(df,keys,nLevel,newcol_name):
    #keys is a list of keys to groupby on
    gp = df.groupby(keys)
    levels = np.arange(0,1,1./nLevel)
    res = pd.DataFrame()
    
    for inx,data in gp:
        quantiles = [data['price'].quantile(x) for x in levels]
        #import pdb;pdb.set_trace()
        tmp = data.copy()
        tmp[newcol_name] = data['price'].map(lambda x:np.searchsorted(quantiles,x))
        
        res = res.append(tmp)
    #import pdb;pdb.set_trace()
    df = res
    return df

def quantile_price(df1_,df2_,nLevel=10):
    df1 = df1_.copy()
    df2 = df2_.copy()
    df1['source'] = 1
    df2['source'] = 2
    df = df1.append(df2)
    df = df.reset_index(drop=True)
    
    df = get_quantile_by_key(df,['bedrooms','bathrooms'],nLevel,'price_quantile')
                
    df1 = df[df['source']==1].copy()
    df2 = df[df['source']==2].copy()
    del df1['source']
    del df2['source']
    del df
    
    return df1,df2,['price_quantile']

def quantile_price_lat_long(df1_,df2_,step_size=0.02,nLevel=10):
    df1 = df1_.copy()
    df2 = df2_.copy()
    df1['source'] = 1
    df2['source'] = 2
    df = df1.append(df2)
    df = df.reset_index(drop=True)
    
    west, south, east, north = -74.02, 40.64, -73.85, 40.86
    df['long_grid'] = df['longitude'].map(lambda x: int(round((x-west)/step_size)))
    df['lat_grid'] = df['latitude'].map(lambda x:int(round((x-south)/step_size)))
    
    df = get_quantile_by_key(df,['long_grid','lat_grid','bedrooms','bathrooms'],nLevel,'price_quantile_lat_long')
                
    df1 = df[df['source']==1].copy()
    df2 = df[df['source']==2].copy()
    del df1['source']
    del df2['source']
    del df
    
    return df1,df2,['price_quantile_lat_long','long_grid','lat_grid']


In [5]:
#Taken from Stanislav Ushakov's script, which is based on 'it is lit'by branden
lambda_val = None
k=5.0
f=1.0
r_k=0.01 
g = 1.0

def categorical_average(Xtrain_,Xtest_, variable, y, pred_0, feature_name):
    X_train = Xtrain_.copy()
    X_test = Xtest_.copy()
    
    def calculate_average(sub1, sub2):
        #import pdb;pdb.set_trace()
        s = pd.DataFrame(data = {
                                 variable: sub1.groupby(variable, as_index = False).count()[variable],                              
                                 'sumy': sub1.groupby(variable, as_index = False).sum()['y'],
                                 'avgY': sub1.groupby(variable, as_index = False).mean()['y'],
                                 'cnt': sub1.groupby(variable, as_index = False).count()['y']
                                 })
                                 
        tmp = sub2.merge(s.reset_index(), how='left', left_on=variable, right_on=variable) 
                              
        tmp.loc[pd.isnull(tmp['cnt']), 'cnt'] = 0.0
        tmp.loc[pd.isnull(tmp['cnt']), 'sumy'] = 0.0

        def compute_beta(row):
            cnt = row['cnt'] if row['cnt'] < 200 else float('inf')
            return 1.0 / (g + np.exp((cnt - k) / f))
            
        if lambda_val is not None:
            tmp['beta'] = lambda_val
        else:
            tmp['beta'] = tmp.apply(compute_beta, axis = 1)
            
        tmp['adj_avg'] = tmp.apply(lambda row: (1.0 - row['beta']) * row['avgY'] + row['beta'] * row['pred_0'],
                                   axis = 1)
                                   
        tmp.loc[pd.isnull(tmp['avgY']), 'avgY'] = tmp.loc[pd.isnull(tmp['avgY']), 'pred_0']
        tmp.loc[pd.isnull(tmp['adj_avg']), 'adj_avg'] = tmp.loc[pd.isnull(tmp['adj_avg']), 'pred_0']
        tmp['random'] = np.random.uniform(size = len(tmp))
        tmp['adj_avg'] = tmp.apply(lambda row: row['adj_avg'] *(1 + (row['random'] - 0.5) * r_k),
                                   axis = 1)
    
        return tmp['adj_avg'].ravel()
     
    #cv for training set 
    k_fold = StratifiedKFold(5)
    X_train[feature_name] = -999 
    for (train_index, cv_index) in k_fold.split(np.zeros(len(X_train)),
                                                X_train['interest_level'].ravel()):
        sub = pd.DataFrame(data = {variable: X_train[variable],
                                   'y': X_train[y],
                                   'pred_0': X_train[pred_0]})
            
        sub1 = sub.iloc[train_index]        
        sub2 = sub.iloc[cv_index]
        
        X_train.loc[cv_index, feature_name] = calculate_average(sub1, sub2)
    
    #for test set
    sub1 = pd.DataFrame(data = {variable: X_train[variable],
                                'y': X_train[y],
                                'pred_0': X_train[pred_0]})
    sub2 = pd.DataFrame(data = {variable: X_test[variable],
                                'y': np.zeros(X_test.shape[0]),
                                'pred_0': np.zeros(X_test.shape[0])})
    X_test.loc[:, feature_name] = calculate_average(sub1, sub2)   
    
    return X_train,X_test
    


In [6]:
np.zeros(4)

array([ 0.,  0.,  0.,  0.])

In [21]:
def prepare_categorical(df1_,df2_):
    df1 = df1_.copy()
    df2 = df2_.copy()
    df1['source'] = 1
    df2['source'] = 2
    df = df1.append(df2)
    df = df.reset_index(drop=True)
    
    #some of the building id is zero, whereas other rows with same street address have nonzero building_id
    nonzero = df[(df['building_id']!='0')&(df['street_address']!='')&(df['street_address'].notnull())]
    id_addr = nonzero.groupby('street_address')['building_id'].first().reset_index()
    del df['building_id']
    df = pd.merge(df,id_addr,on='street_address',how='left')
    df['building_id'] = df['building_id'].fillna('0')
    
    #assign those categorical data that only appear once to the same value
    def objects_with_only_one_record(df,feature_name):
        #import pdb;pdb.set_trace()
        temp = df.groupby(feature_name, as_index = False).count()
        return temp[temp['source'] == 1]

    #import pdb;pdb.set_trace()
    managers_with_one_lot = objects_with_only_one_record(df,'manager_id')
    buildings_with_one_lot = objects_with_only_one_record(df,'building_id')
    addresses_with_one_lot = objects_with_only_one_record(df,'display_address')

    df.loc[df['manager_id'].isin(managers_with_one_lot['manager_id'].ravel()), 
          'manager_id'] = "once"
    df.loc[df['building_id'].isin(buildings_with_one_lot['building_id'].ravel()), 
          'building_id'] = "once"
    df.loc[df['display_address'].isin(addresses_with_one_lot['display_address'].ravel()), 
          'display_address'] = "once"
                
    df1 = df[df['source']==1].copy()
    df2 = df[df['source']==2].copy()
    del df1['source']
    del df2['source']
    del df
    
    return df1,df2

def transform_categorical(xtrain_,xtest_):
    Xtrain,Xtest = prepare_categorical(xtrain_,xtest_)
    
    train_size = len(Xtrain)
    low_count = len(Xtrain[Xtrain['interest_level'] == 0])
    medium_count = len(Xtrain[Xtrain['interest_level'] == 1])
    high_count = len(Xtrain[Xtrain['interest_level'] == 2])

    Xtrain['low'] = 0
    Xtrain.loc[Xtrain['interest_level'] == 0, 'low'] = 1
    Xtrain['medium'] = 0
    Xtrain.loc[Xtrain['interest_level'] == 1, 'medium'] = 1
    Xtrain['high'] = 0
    Xtrain.loc[Xtrain['interest_level'] == 2, 'high'] = 1
    
    Xtrain['pred0_low'] = low_count * 1.0 / train_size
    Xtrain['pred0_medium'] = medium_count * 1.0 / train_size
    Xtrain['pred0_high'] = high_count * 1.0 / train_size
    
    fea_list = []
    for col in ["building_id", "manager_id"]:        
        Xtrain,Xtest = categorical_average(Xtrain,Xtest,col, "medium", "pred0_medium",col+'_mean_medium')
        Xtrain,Xtest = categorical_average(Xtrain,Xtest,col, "high", "pred0_high",col+'_mean_high')
        fea_list += [col+'_mean_medium',col+'_mean_high']
    
    categorical = ['building_id', 'manager_id', 'display_address']

    #import pdb;pdb.set_trace()
    for f in categorical:
        print f
        encoder = preprocessing.LabelEncoder()
        encoder.fit(list(Xtrain[f]) + list(Xtest[f])) 
        Xtrain[f] = encoder.transform(Xtrain[f].ravel())
        Xtest[f] = encoder.transform(Xtest[f].ravel())  
    fea_list += categorical
    return Xtrain,Xtest,fea_list    

In [8]:
def additional_feature(train_df_,test_df_):    
    train_df = train_df_.copy()
    test_df = test_df_.copy()
    
    train_df["created"] = pd.to_datetime(train_df["created"])
    test_df["created"] = pd.to_datetime(test_df["created"])

    test_df['days'] = test_df['created']-pd.to_datetime('2016-04-01')
    train_df['days'] = train_df['created'] - pd.to_datetime('2016-04-01')

    test_df['days'] = test_df['days']/np.timedelta64(1, 'D')
    train_df['days'] = train_df['days']/np.timedelta64(1,'D')

    # Let us extract some features like year, month, day, hour from date columns #
    train_df["created_month"] = train_df["created"].dt.month
    test_df["created_month"] = test_df["created"].dt.month
    train_df["created_day"] = train_df["created"].dt.day
    test_df["created_day"] = test_df["created"].dt.day
    train_df["created_hour"] = train_df["created"].dt.hour
    test_df["created_hour"] = test_df["created"].dt.hour
    # count of photos #
    train_df["num_photos"] = train_df["photos"].apply(len)
    test_df["num_photos"] = test_df["photos"].apply(len)

    # count of "features" #
    train_df["num_features"] = train_df["features"].apply(len)
    test_df["num_features"] = test_df["features"].apply(len)

    # count of words present in description column #
    train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
    test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

    fea_list = ["num_features","num_description_words","days","num_photos", "created_month", "created_day", "created_hour"]
    return train_df,test_df,fea_list

In [9]:
def read_data():
    data_path = "../input/"
    train_file = data_path + "train.json"
    test_file = data_path + "test.json"
    train_df = pd.read_json(train_file)
    test_df = pd.read_json(test_file)
    interest_map = {'low':0,'medium':1,'high':2}
    train_df['interest_level'] = train_df['interest_level'].map(interest_map)
    #original_col = train_df.columns
    return train_df,test_df

In [104]:
def runXGB_sklearn(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=5000):

    clf = XGBClassifier(n_estimators=num_rounds,
                            objective='multi:softprob',
                            learning_rate=0.01,
                            max_depth=6,
                            min_child_weight=1,
                            subsample=.7,
                            colsample_bytree=.7,
                            colsample_bylevel=.5,
                            gamma=0.005,
                            scale_pos_weight=1,
                            base_score=.5,
                            #reg_lambda=0,
                            #reg_alpha=0,
                            #missing=0,
                            seed=seed_val)
    
    if test_y is not None:
        clf.fit(train_X, train_y,eval_set=[(train_X, train_y), (test_X, test_y)],verbose=True,eval_metric='mlogloss',
            early_stopping_rounds=50)
    else:        
        clf.fit(train_X, train_y,verbose=False)
    pred_test_y = clf.predict_proba(test_X)
    return pred_test_y, clf

In [86]:
def runXGB(train_X, train_y, test_X=None, test_y=None, feature_names=None, seed_val=0, num_rounds=5000):
    param = {}
    param['objective'] = 'multi:softprob'
    #param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    
    def custom_rates(boosting_round, num_boost_round):
        total_round = max(boosting_round,num_boost_round)
        curr_round = min(boosting_round,num_boost_round)
        
        if curr_round<100:
            return 0.1
        elif curr_round<300:
            #print 'learning rate 0.01'
            return 0.05
        elif curr_round<1000:
            return 0.01
        elif curr_round<1500:
            return 0.005
        else:
            return 0.001

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=30,verbose_eval=500, 
                          callbacks = [xgb.callback.reset_learning_rate(custom_rates)])
    else:
        
        model = xgb.train(plst, xgtrain, num_rounds,verbose_eval=False,
                          callbacks = [xgb.callback.reset_learning_rate(custom_rates)])
    pred_test_y = None
    if test_X is not None:
        xgtest = xgb.DMatrix(test_X)
        pred_test_y = model.predict(xgtest)
    
    #plot_hist(test_y,pred_test_y)
    return pred_test_y, model

In [41]:
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


In [64]:
np.logspace(2.8,3.8,4,dtype=int)

array([ 630, 1359, 2928, 6309])

In [70]:
def runXGB_gridsearch(X, y,seed_val=1234):
    from sklearn.model_selection import GridSearchCV
    from time import time
    
    #import pdb;pdb.set_trace()
    params = {}
    params['n_estimators'] = np.logspace(3,3.8,3,dtype=int)
    params['learning_rate'] = np.logspace(-4,-1,4)
    params['max_depth'] = [6]
    params['subsample'] = [0.7]
    params['colsample_bytree'] = [0.7,0.4]
    params['colsample_bylevel'] = [0.7]
    params['gamma'] = np.logspace(-3,-1,3)

    clf = XGBClassifier(objective='multi:softprob',
                            min_child_weight=1,
                            scale_pos_weight=1,
                            base_score=.5,
                            #reg_lambda=0,
                            #reg_alpha=0,
                            #missing=0,
                            seed=seed_val)
    
    grid_search = GridSearchCV(clf, param_grid=params,verbose=1,n_jobs=-1,scoring= 'neg_log_loss')
    start = time()
    grid_search.fit(X, y)

    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
    report(grid_search.cv_results_)
    return grid_search



In [10]:
#Global parameter
feature_params = {}
feature_params['nTextFea'] = 300 # Number of text features used}
feature_params['nQuantLevel'] = 10 # number of levels for quantile computation
feature_params['step_size'] = 0.02 # the step size when discreting latitude and longitude


In [23]:
train_df,test_df = read_data()
train_df,test_df,fealist_txt = text_fea(train_df,test_df,nTop = feature_params['nTextFea'])
train_df,test_df,fealist_addr = address_proc(train_df,test_df)
train_df,test_df,fealist_quant1 = quantile_price_lat_long(train_df,test_df,
                                step_size = feature_params['step_size'],nLevel=feature_params['nQuantLevel'])
train_df,test_df,fealist_quant2 = quantile_price(train_df,test_df,nLevel=feature_params['nQuantLevel'])



fea_list length is 303


In [24]:
train_df,test_df,fea_categorical = transform_categorical(train_df,test_df)
train_df,test_df,fea_additional = additional_feature(train_df,test_df)

building_id
manager_id
display_address


In [102]:
fealist = ["bathrooms", "bedrooms", "latitude", "longitude", "price",'listing_id'] 
#fealist = fealist+ fealist_quant1 + fealist_quant2+fealist_txt+fealist_addr + fea_categorical+fea_additional
fealist = fealist+ fealist_quant1 + fealist_quant2+fealist_txt+fealist_addr + fea_additional

In [34]:
fea_categorical

['building_id_mean_medium',
 'building_id_mean_high',
 'manager_id_mean_medium',
 'manager_id_mean_high',
 'building_id',
 'manager_id',
 'display_address']

In [35]:
len(fealist)

308

In [103]:
train_X = train_df[fealist].as_matrix()
test_X = test_df[fealist].as_matrix()
train_y = np.array(train_df['interest_level'])

In [105]:
cv_scores = [] 
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(train_X,train_y):
    dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    preds, model = runXGB_sklearn(dev_X, dev_y, val_X, val_y)
    cv_scores.append(log_loss(val_y, preds))
    print(cv_scores)
    break
print 'mean score={}'.format(np.mean(cv_scores))

[0]	validation_0-mlogloss:1.09252	validation_1-mlogloss:1.0926
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 50 rounds.
[1]	validation_0-mlogloss:1.0865	validation_1-mlogloss:1.08662
[2]	validation_0-mlogloss:1.08062	validation_1-mlogloss:1.0808
[3]	validation_0-mlogloss:1.07484	validation_1-mlogloss:1.07507
[4]	validation_0-mlogloss:1.06918	validation_1-mlogloss:1.06947
[5]	validation_0-mlogloss:1.06359	validation_1-mlogloss:1.06394
[6]	validation_0-mlogloss:1.05808	validation_1-mlogloss:1.05848
[7]	validation_0-mlogloss:1.05258	validation_1-mlogloss:1.05307
[8]	validation_0-mlogloss:1.04716	validation_1-mlogloss:1.0477
[9]	validation_0-mlogloss:1.04179	validation_1-mlogloss:1.0424
[10]	validation_0-mlogloss:1.03668	validation_1-mlogloss:1.03731
[11]	validation_0-mlogloss:1.03149	validation_1-mlogloss:1.03218
[12]	validation_0-mlogloss:1.02644	validation_1-mlogloss:1.02718
[13

In [106]:
preds, model = runXGB_sklearn(train_X, train_y, test_X,num_rounds=4000)
out_df = pd.DataFrame(preds)
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_df.listing_id.values


In [107]:
import time
filename = time.asctime() +'_submission.csv'
out_df.to_csv(filename, index=False)