In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from xgboost import XGBClassifier,XGBRegressor
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import hashlib
import re
from sklearn.model_selection import GridSearchCV,StratifiedKFold

#from sklearn.preprocessing import LabelEncoder



# On Column 'features' -- cleaning, transforming

In [2]:
def text_fea(df1_,df2_,nTop=300,combine=True):
    ''' df1_ is training set
        df2_ is test set
    '''
    from collections import defaultdict
    df1 = df1_.copy() # Maybe ther eis better way to avoid SettingWithCopyWarning 
    df2 = df2_.copy()
    df1['source'] = 1
    df2['source'] = 2
    df = df1.append(df2)
    df = df.reset_index(drop=True)
    #df['features'] = df['features'].map(lambda x:[tt.lower() for tt in x])
    df['description'] = df['description'].map(lambda x:x.lower())
    #---------------------------
    #original length of the first feature, meant to capture those features typed in with wrong 
    # format -- all features are cramed into one phrase
    df['len_feature0'] = df['features'].map(lambda x:0 if len(x)==0 else len(x[0])) 
    #-----------------------------
    
    def fea_clean(x): 
        if len(x) == 1:
            tmp = x[0].strip('*').split('*')
            if len(tmp) ==1:
                tmp = tmp[0].split(u'\u2022')
            x = tmp
        ret = [tt.encode('utf-8').decode('unicode_escape').encode('ascii','ignore').lower().strip() for tt in x]
        return ret
    df['features'] = df['features'].map(fea_clean)
    
    
    all_fea = defaultdict(int)
    for _,row in df.iterrows():
        for xx in row['features']:
            all_fea[xx] += 1
    sorted_fea = sorted(all_fea.iteritems(),key=lambda (k,v): v,reverse=True)
    
    combined_fea = {'laundry in unit':['laundry in unit','in-unit washer/dryer','washer & dryer',
                                        'washer/dryer','washer/dryer in unit'],
                   'laundry in building':['laundry in building','laundry room',
                                           'washer/dryer in building','on-site laundry'],
                   'gym/fitness':['gym/fitness','fitness center','gym','gym in building'],
                   'pre-war':['pre-war','prewar'],
                    'live-in superintendent':['live-in superintendent','live-in super','live in super'],
                    'hardwood floors':['hardwood floors','hardwood','hardwood floor','hardwood flooring'],
                    'high ceiling':['high ceiling','high ceilings'],
                    'full-time doorman':['full-time doorman','ft doorman','24/7 doorman','24 hour doorman',
                                        '24-hour doorman','24hr doorman','full time doorman','24 hr doorman']
                   #'garage':['garage','parking']
                   }
    
    fea_list = set([v[0] for v in sorted_fea[:nTop]])
    for k,v in combined_fea.iteritems():
        fea_list = fea_list.union(set(v))
    for fea in fea_list:
        df[fea] = 0
    #import pdb;pdb.set_trace()
    for inx,row in df.iterrows():
        notlist = []
        if len(row['features']) == 0:
            continue
        for ff in row['features']:
            if ff in fea_list:
                df.set_value(inx,ff,1)
            else:
                notlist.append(ff)
        #df.set_value(inx,'description',row['description'] + ' '.join(notlist))
    print 'fea_list length is {}'.format(len(fea_list))
    fea_list = set(fea_list)
    if combine:        
        for k,v in combined_fea.iteritems():
            df[k] = df[v[0]]
            #print k,v
            for ii in range(1,len(v)):
                df[k] = df[k] + df[v[ii]]
                del df[v[ii]]
                fea_list.remove(v[ii])
    fea_list = list(fea_list) + ['len_feature0']
    #print 'fea_list length is {}'.format(len(fea_list))
    df1 = df[df['source']==1].copy()
    df2 = df[df['source']==2].copy()
    del df1['source']
    del df2['source']
    del df
    return df1,df2,fea_list

# Street_address and display address

In [3]:
def normalize_num(x):
    if x[-1]=='1':
        return x+'st'
    elif x[-1]=='2':
        return x+'nd'
    elif x[-1]=='3':
        return x+'rd'
    else:
        return x+'th'

def normalize_street(x):
    street_name_mapping = {'st.':'street','st':'street','st,':'street','st..':'street','street,':'street',
                       'ave':'avenue','ave.':'avenue','ave,':'avenue','avenue,':'avenue','pl':'place',
                       'blvd':'boulevard','pkwy':'parkway','dr':'drive','rd.':'road','rd,':'road','rd':'road',
                       'ln':'lane',
                       'e':'east','e.':'east','w.':'west','w':'west','west,':'west','s':'south','&':'and',
                       'second':'2nd','first':'1st','third':'3rd','fourth':'4th','fifth':'5th',
                       'sixth':'6th','seventh':'7th','eighth':'8th','ninth':'9th','tenth':'10th',                       
                       #'1':'1st','2':'2nd','43':'43rd','37':'37th','34':'34th',
                      }
    xlist = x.lower().strip(' .,').split()
    output = []
    for tt in xlist:
        tmp = tt.strip(',.*')
        if len(tmp)>0:
            if tmp in street_name_mapping:
                tmp = street_name_mapping[tmp]
            elif tmp.isdigit():
                tmp = normalize_num(tmp)
            else:
                pass
            output.append(tmp)
    return ' '.join(output).strip()      

def rem_streetname_xy(x,y):
    '''remove x from y
    '''
    pos = y.find(x)
    if pos>0:
        tmp = y.replace(x,'').strip(' ,.')
        tmp = tmp.split(' ')[0].strip(',. #')
        if '-' in tmp:        
            tmp = tmp.split('-')[0].strip(', .#')
        try:
            a = int(tmp)
            return a
        except ValueError:
            #print tmp
            return None
    else:
        return None   

def rem_streetname(row):    
    x = row['display_address']
    y = row['street_address']
    return rem_streetname_xy(x,y)


def get_address_num_simple(x):
    if len(x)==0:
        return -1
    
    x1 = x.strip().split()[0]
    if x1.isdigit():
        return float(x1)
    return -1

def manhattan_locale(df1_,df2_):
    '''
    extract street or avenue number from displayed address. 
    
    For those displayed address like 'w 3rd street and 5th avenue', all three fields will have values. 
    For a majority cases, one of the (west_east, street) or (avenue) will be null (0)
    '''
    
    df1 = df1_.copy()
    df2 = df2_.copy()
    df1['source'] = 1
    df2['source'] = 2
    df = df1.append(df2)
    df = df.reset_index(drop=True)
    
    ave_mapping = {'lexington avenue': '3.5 avenue',
                  'park avenue':'4 avenue',
                  'madison avenue':'4.5 avenue',
                  'central park west':'8 avenue',
                  'columbus avenue':'9 avenue',
                  'amsterdam avenue':'10 avenue',
                  'west end avenue':'11 avenue'
                  }
    addr_adjust = {'central park west':5000,
                  'columbus avenue':5000,
                  'amsterdam avenue':5000,
                  'west end avenue':5000        
                }       
    
    df['west_east'] = 0 #west = -1,east = 1, null=0
    df['street_num'] = 0 # street number e.g. 12nd street will be 12, if not on a street, empty
    df['ave_num'] = 0 #avenue number, if not on avenue, empty
    df['addr_num_adjust'] = 0 # for several avenues 
    
    #import pdb;pdb.set_trace()
    
    for inx,row in df.iterrows():
        addr_str = ''
        addr_ave = ''
        
        if ' and ' in row['display_address']:
            ss = row['display_address'].split(' and ')
            if len(ss)>2:
                continue
            if ' street' in ss[0] and ' avenue' in ss[1]:
                addr_str = ss[0].strip()
                addr_ave = ss[1].strip()
            elif ' street' in ss[1] and ' avenue' in ss[0]:
                addr_str = ss[1].strip()
                addr_str = ss[0].strip()
            else:
                continue
        else:            
            if ' street' in row['display_address']:
                addr_str = row['display_address'].strip()
            if ' avenue' in row['display_address']:
                addr_ave = row['display_address'].strip()
        
        if len(addr_str)>0:
            num = re.sub('\D+','',row['display_address'])
            if len(num)>0:
                fields = addr_str.split()
                west_east = 0
                ii=0
                while ii < len(fields):
                    if fields[ii].strip() == 'west':
                        west_east = -1
                        break
                    elif fields[ii].strip() == 'east':
                        west_east = 1
                        break
                    ii += 1
                if ii<len(fields)-2:                    
                    df.set_value(inx,'west_east',west_east)   
                    try:
                        df.set_value(inx,'street_num',float(re.sub('\D+','',fields[ii+1])))
                    except ValueError:
                        pass
                        #print addr_str
                        #return None,None
        if len(addr_ave)>0:
            adjust = 0
            if addr_ave in addr_adjust:
                adjust = addr_adjust[addr_ave]
            if addr_ave in ave_mapping:
                addr_ave = ave_mapping[addr_ave]
            try:
                df.set_value(inx,'ave_num',float(re.sub('\D+','',addr_ave.split()[0])))
            except ValueError:
                #print addr_ave
                pass
            df.set_value(inx,'addr_num_adjust',adjust)
    df['address_num'] = df['address_num'] + df['addr_num_adjust']    
    del df['addr_num_adjust']
    df1 = df[df['source']==1].copy()
    df2 = df[df['source']==2].copy()
    del df1['source']
    del df2['source']
    del df
    
    fea_list = ['west_east','street_num','ave_num']
    return df1,df2,fea_list
    

def address_proc(train_df_,test_df_):        
    train_df = train_df_.copy()
    test_df = test_df_.copy()
    
    train_df['address_num'] = train_df['street_address'].map(get_address_num_simple)
    test_df['address_num'] = test_df['street_address'].map(get_address_num_simple)
    train_df['display_address'] = train_df['display_address'].map(lambda x:normalize_street(x))
    test_df['display_address'] = test_df['display_address'].map(lambda x:normalize_street(x))
    train_df['street_address'] = train_df['street_address'].map(lambda x:normalize_street(x))
    test_df['street_address'] = test_df['street_address'].map(lambda x:normalize_street(x))
    
    flist = ['address_num']
    train_df,test_df,ff1 = manhattan_locale(train_df,test_df)
    #train_df,test_df,ff2 = multiple_hashing(train_df,test_df,'display_address')
    #train_df,test_df,ff2 = simple_hashing(train_df,test_df,'street_address')
    
    flist = flist + ff1
    return train_df,test_df,flist

# price quantile

In [4]:
def get_quantile_by_key(df,keys,nLevel,newcol_name):
    #keys is a list of keys to groupby on
    gp = df.groupby(keys)
    levels = np.arange(0,1,1./nLevel)
    res = pd.DataFrame()
    
    for inx,data in gp:
        quantiles = [data['price'].quantile(x) for x in levels]
        #import pdb;pdb.set_trace()
        tmp = data.copy()
        tmp[newcol_name] = data['price'].map(lambda x:np.searchsorted(quantiles,x))
        
        res = res.append(tmp)
    #import pdb;pdb.set_trace()
    df = res
    return df

def quantile_price(df1_,df2_,nLevel=10):
    df1 = df1_.copy()
    df2 = df2_.copy()
    df1['source'] = 1
    df2['source'] = 2
    df = df1.append(df2)
    df = df.reset_index(drop=True)
    
    df = get_quantile_by_key(df,['bedrooms','bathrooms'],nLevel,'price_quantile')
                
    df1 = df[df['source']==1].copy()
    df2 = df[df['source']==2].copy()
    del df1['source']
    del df2['source']
    del df
    
    return df1,df2,['price_quantile']

def quantile_price_lat_long(df1_,df2_,step_size=0.02,nLevel=10):
    df1 = df1_.copy()
    df2 = df2_.copy()
    df1['source'] = 1
    df2['source'] = 2
    df = df1.append(df2)
    df = df.reset_index(drop=True)
    
    west, south, east, north = -74.02, 40.64, -73.85, 40.86
    df['long_grid'] = df['longitude'].map(lambda x: int(round((x-west)/step_size)))
    df['lat_grid'] = df['latitude'].map(lambda x:int(round((x-south)/step_size)))
    
    df = get_quantile_by_key(df,['long_grid','lat_grid','bedrooms','bathrooms'],nLevel,'price_quantile_lat_long')
                
    df1 = df[df['source']==1].copy()
    df2 = df[df['source']==2].copy()
    del df1['source']
    del df2['source']
    del df
    
    return df1,df2,['price_quantile_lat_long','long_grid','lat_grid']


In [5]:
class CategoricalFeature:
    def __init__(self):
        pass
    
    def prepare_categorical(self,df1_,df2_):
        df1 = df1_.copy()
        df2 = df2_.copy()
        df1['source'] = 1
        df2['source'] = 2
        df = df1.append(df2)
        df = df.reset_index(drop=True)
    
    #some of the building id is zero, whereas other rows with same street address have nonzero building_id
    #nonzero = df[(df['building_id']!='0')&(df['street_address']!='')&(df['street_address'].notnull())]
    #id_addr = nonzero.groupby('street_address')['building_id'].first().reset_index()
    #del df['building_id']
    #df = pd.merge(df,id_addr,on='street_address',how='left')
    #df['building_id'] = df['building_id'].fillna('0')
    
    #assign those categorical data that only appear once to the same value
        def objects_with_only_one_record(df,feature_name):
            #import pdb;pdb.set_trace()
            temp = df.groupby(feature_name, as_index = False).count()
            return temp[temp['source'] == 1]

        #import pdb;pdb.set_trace()
        managers_with_one_lot = objects_with_only_one_record(df,'manager_id')
        buildings_with_one_lot = objects_with_only_one_record(df,'building_id')
        addresses_with_one_lot = objects_with_only_one_record(df,'display_address')

        df.loc[df['manager_id'].isin(managers_with_one_lot['manager_id'].ravel()), 
          'manager_id'] = "once"
        df.loc[df['building_id'].isin(buildings_with_one_lot['building_id'].ravel()), 
          'building_id'] = "once"
        df.loc[df['display_address'].isin(addresses_with_one_lot['display_address'].ravel()), 
          'display_address'] = "once"
                
        df1 = df[df['source']==1].copy()
        df2 = df[df['source']==2].copy()
        del df1['source']
        del df2['source']
        del df
    
        return df1,df2



In [6]:
class categorical_cv(CategoricalFeature):
    def __init__(self,nfold,k=5.0,f=1.0,r_k=0.01,g=1.0):
        self.k = k
        self.f = f
        self.r_k = r_k
        self.g = g
        self.nfold = nfold
        
    def cat2num(self,df_tr_,df_te_,cat_var,target):
        #example: cat_var = 'building_id', target='medium'
        dftrain = df_tr_.copy()
        dftest = df_te_.copy()
    
        global_avg = dftrain[target].sum()*1.0/len(dftrain)
        gp = dftrain.groupby(cat_var)
        cat_avg = gp[target].agg({'avg':np.mean,
                             'cnt':lambda x:len(x)})
        cat_avg = cat_avg.reset_index()
        cat_avg['beta'] = cat_avg['cnt'].map(lambda x:1./(self.g+np.exp((x-self.k)/self.f)) if x<200 else 0.)
    
        cat_avg['cat2num'] = cat_avg['avg']*(1-cat_avg['beta']) + global_avg*cat_avg['beta']
        dftest = pd.merge(dftest,cat_avg[[cat_var,'cat2num']],on=cat_var,how='left')
        dftest['cat2num'] = dftest['cat2num'].fillna(global_avg)
    
        return dftest['cat2num'].as_matrix()
    def categorical_average(self,Xtrain_,Xtest_,variable,target):       
        X_train = Xtrain_.copy()
        X_test = Xtest_.copy()
    
        k_fold = StratifiedKFold(self.nfold,shuffle=True,random_state=222)
        fea_name = variable + '_' + target
        fea_train = np.zeros(len(X_train))
        fea_test = np.zeros(len(X_test))
    
        for train_inx,cv_inx in k_fold.split(np.zeros((len(X_train),2)),X_train['interest_level'].ravel()):
            fea_train[cv_inx] = self.cat2num(X_train.iloc[train_inx,:],X_train.iloc[cv_inx,:],variable,target)
    
        X_train[fea_name] = fea_train
        X_test.loc[:,fea_name] = self.cat2num(X_train,X_test,variable,target)
        return X_train,X_test,fea_name
    
    def transform(self,xtrain_,xtest_):
        Xtrain,Xtest = self.prepare_categorical(xtrain_,xtest_)
    
        categorical = ['building_id', 'manager_id', 'display_address']
        fea_list = categorical
        for f in categorical:
            encoder = preprocessing.LabelEncoder()
            encoder.fit(list(Xtrain[f]) + list(Xtest[f])) 
            Xtrain[f] = encoder.transform(Xtrain[f].ravel())
            Xtest[f] = encoder.transform(Xtest[f].ravel())
    
        Xtrain['low'] = 0
        Xtrain.loc[Xtrain['interest_level'] == 0, 'low'] = 1
        Xtrain['medium'] = 0
        Xtrain.loc[Xtrain['interest_level'] == 1, 'medium'] = 1
        Xtrain['high'] = 0
        Xtrain.loc[Xtrain['interest_level'] == 2, 'high'] = 1
    
        for col in ["building_id", "manager_id"]:        
            Xtrain,Xtest,fea1 = self.categorical_average(Xtrain,Xtest,col, "medium")
            Xtrain,Xtest,fea2 = self.categorical_average(Xtrain,Xtest,col, "high")
            fea_list += [fea1,fea2]
    
    
        return Xtrain,Xtest,fea_list    

In [7]:
class categorical_lit(CategoricalFeature):
    def __init__(self,nfold,k=5.0,f=1.0,r_k=0.01,g=1.0):
        self.k = k
        self.f = f
        self.r_k = r_k
        self.g = g
        self.nfold = nfold
        self.global_avg = None
        
    def cat2num(self,df_tr_,df_te_,cat_var,target):
        #example: cat_var = 'building_id', target='medium'
        dftrain = df_tr_.copy()
        dftest = df_te_.copy()
    
        
        gp = dftrain.groupby(cat_var)
        cat_avg = gp[target].agg({'avg':np.mean,
                             'cnt':lambda x:len(x)})
        cat_avg = cat_avg.reset_index()
        cat_avg['beta'] = cat_avg['cnt'].map(lambda x:1./(self.g+np.exp((x-self.k)/self.f)) if x<200 else 0.)
    
        cat_avg['cat2num'] = cat_avg['avg']*(1-cat_avg['beta']) + self.global_avg*cat_avg['beta']
        
        dftest = pd.merge(dftest,cat_avg[[cat_var,'cat2num']],on=cat_var,how='left')
        dftest['cat2num'] = dftest['cat2num'].fillna(self.global_avg)
        if self.r_k: 
            ratio = np.random.uniform(1 - self.r_k, 1 + self.r_k, len(dftest))
        else:
            ratio = 1
        return dftest['cat2num'].as_matrix()*ratio
    
    def categorical_average(self,Xtrain_,Xtest_,variable,target):       
        X_train = Xtrain_.copy()
        X_test = Xtest_.copy()
    
        self.global_avg = X_train[target].sum()*1.0/len(X_train)
        
        k_fold = StratifiedKFold(self.nfold,shuffle=True,random_state=222)
        fea_name = variable + '_' + target +'_lit'
        fea_train = np.zeros(len(X_train))
        fea_test = np.zeros(len(X_test))
    
        for train_inx,cv_inx in k_fold.split(np.zeros((len(X_train),2)),X_train['interest_level'].ravel()):
            fea_train[cv_inx] = self.cat2num(X_train.iloc[train_inx,:],X_train.iloc[cv_inx,:],variable,target)
    
        X_train[fea_name] = fea_train
        X_test.loc[:,fea_name] = self.cat2num(X_train,X_test,variable,target)
        return X_train,X_test,fea_name
    
    def transform(self,xtrain_,xtest_):
        Xtrain,Xtest = self.prepare_categorical(xtrain_,xtest_)
    
        categorical = ['building_id', 'manager_id', 'display_address']
        fea_list = categorical
        for f in categorical:
            encoder = preprocessing.LabelEncoder()
            encoder.fit(list(Xtrain[f]) + list(Xtest[f])) 
            Xtrain[f] = encoder.transform(Xtrain[f].ravel())
            Xtest[f] = encoder.transform(Xtest[f].ravel())
    
        Xtrain['low'] = 0
        Xtrain.loc[Xtrain['interest_level'] == 0, 'low'] = 1
        Xtrain['medium'] = 0
        Xtrain.loc[Xtrain['interest_level'] == 1, 'medium'] = 1
        Xtrain['high'] = 0
        Xtrain.loc[Xtrain['interest_level'] == 2, 'high'] = 1
    
        for col in ["building_id", "manager_id"]:        
            Xtrain,Xtest,fea1 = self.categorical_average(Xtrain,Xtest,col, "medium")
            Xtrain,Xtest,fea2 = self.categorical_average(Xtrain,Xtest,col, "high")
            fea_list += [fea1,fea2]
    
    
        return Xtrain,Xtest,fea_list    

In [8]:
def getQuant(df1_,df2_,independent_var,output_name):
    #independent_var can be:
    # 1. ['longitude','latitude']
    # 2. ['longitude','latitude','bathrooms']
    # 3. ['polar_rho','polar_theta']
    # 4. ['polar_rho','polar_theta','bathrooms']
    
    from sklearn.ensemble import GradientBoostingRegressor
    
    df1 = df1_.copy()
    df2 = df2_.copy()
    df1['source'] = 1
    df2['source'] = 2
    df = df1.append(df2)
    df = df.reset_index(drop=True)
    
    df[output_name] = 4
    
    for room in range(0,6):
        if room>=5:
            mask = (df['bedrooms']>=5)
        else:
            mask = (df['bedrooms']==room)
        tmp = df[mask].copy()
        
        jj = 0
        all_quantile = np.zeros((len(tmp),9))
        for alpha in np.arange(0.1,1,0.1):
            rgr = GradientBoostingRegressor(loss='quantile',alpha=alpha,n_estimators=100,max_depth=2)
            rgr.fit(tmp[independent_var],tmp['price'].as_matrix().ravel())
            pred = rgr.predict(tmp[independent_var])
            all_quantile[:,jj] = pred
            jj += 1
        quant_res = [np.searchsorted(all_quantile[ii,:],tmp['price'].iloc[ii]) for ii in range(len(tmp))]
        df.loc[mask,output_name] = quant_res
        
    
    df1 = df[df['source']==1].copy()
    df2 = df[df['source']==2].copy()
    del df1['source']
    del df2['source']
    del df
    
    return df1,df2

In [9]:
def additional_feature(train_df_,test_df_):    
    train_df = train_df_.copy()
    test_df = test_df_.copy()
    
    train_df["created"] = pd.to_datetime(train_df["created"])
    test_df["created"] = pd.to_datetime(test_df["created"])

    test_df['days'] = test_df['created']-pd.to_datetime('2016-04-01')
    train_df['days'] = train_df['created'] - pd.to_datetime('2016-04-01')
    test_df['hours'] = test_df['days']/np.timedelta64(1,'h')
    train_df['hours'] = train_df['days']/np.timedelta64(1,'h')
    test_df['days'] = test_df['days']/np.timedelta64(1, 'D')
    train_df['days'] = train_df['days']/np.timedelta64(1,'D')
    
    train_df['hours'] = train_df['hours'].map(int)
    test_df['hours'] = test_df['hours'].map(int)
    gp = train_df.append(test_df).groupby('hours').size()
    gp.name = 'hour_size'
    gp = gp.reset_index()
    train_df = pd.merge(train_df,gp,on='hours')
    test_df = pd.merge(test_df,gp,on='hours')
    del test_df['hours']
    del train_df['hours']
    
    train_df['weekdays'] = train_df['created'].map(lambda x:x.weekday())
    test_df['weekdays'] = test_df['created'].map(lambda x:x.weekday())

    # Let us extract some features like year, month, day, hour from date columns #
    train_df["created_month"] = train_df["created"].dt.month
    test_df["created_month"] = test_df["created"].dt.month
    train_df["created_day"] = train_df["created"].dt.day
    test_df["created_day"] = test_df["created"].dt.day
    train_df["created_hour"] = train_df["created"].dt.hour
    test_df["created_hour"] = test_df["created"].dt.hour
    # count of photos #
    train_df["num_photos"] = train_df["photos"].apply(len)
    test_df["num_photos"] = test_df["photos"].apply(len)

    # count of "features" #
    train_df["num_features"] = train_df["features"].apply(len)
    test_df["num_features"] = test_df["features"].apply(len)

    # count of words present in description column #
    train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
    test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))
    
    tmp = train_df.append(test_df)
    gp = tmp.groupby('manager_id').size()
    gp.name = 'manager_count'
    gp = gp.reset_index()
    train_df = pd.merge(train_df,gp,how='left')
    test_df = pd.merge(test_df,gp,how='left')
    
    gp = tmp.groupby('building_id').size()
    gp.name = 'building_count'
    gp = gp.reset_index()
    train_df = pd.merge(train_df,gp,how='left')
    test_df = pd.merge(test_df,gp,how='left')
    
    #train_df['price_per_bath'] = train_df['price'] / train_df['bathrooms']
    #train_df['price_per_room'] = train_df['price'] / (train_df['bathrooms'] + train_df['bedrooms'] )

    #test_df['price_per_bath'] = test_df['price'] / test_df['bathrooms']
    #test_df['price_per_room'] = test_df['price'] / (0.5*test_df['bathrooms'] + test_df['bedrooms'] )
    
    fea_list = ['weekdays','manager_count','building_count',"num_features","num_description_words","days","num_photos", "created_month", "created_day", "created_hour"]
    return train_df,test_df,fea_list

In [10]:
def read_data():
    data_path = "../input/"
    train_file = data_path + "train.json"
    test_file = data_path + "test.json"
    train_df = pd.read_json(train_file)
    test_df = pd.read_json(test_file)
    interest_map = {'low':0,'medium':1,'high':2}
    train_df['interest_level'] = train_df['interest_level'].map(interest_map)
    fmt = lambda s: s.replace("\u00a0", "").strip().lower()
    train_df["street_address"] = train_df['street_address'].apply(fmt)
    train_df["display_address"] = train_df["display_address"].apply(fmt)
    #original_col = train_df.columns
    test_df["street_address"] = test_df['street_address'].apply(fmt)
    test_df["display_address"] = test_df["display_address"].apply(fmt)
    
    return train_df,test_df

In [11]:
def runXGB_sklearn_calibration(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=2800,verbose=True):

    clf = XGBClassifier(n_estimators=num_rounds,
                            objective='multi:softprob',
                            learning_rate=0.01,
                            max_depth=6,
                            min_child_weight=1,
                            subsample=.7,
                            colsample_bytree=.7,
                            colsample_bylevel=.5,
                            gamma=0.005,
                            scale_pos_weight=1,
                            base_score=.5,
                            #reg_lambda=0,
                            #reg_alpha=0,
                            #missing=0,
                            seed=seed_val)
    
    from sklearn.calibration import CalibratedClassifierCV
    clf_isotonic = CalibratedClassifierCV(clf, cv=3, method='isotonic')
    clf_isotonic.fit(train_X, train_y)
    prob_iso = clf_isotonic.predict_proba(test_X)

    # Gaussian Naive-Bayes with sigmoid calibration
    clf_sigmoid = CalibratedClassifierCV(clf, cv=3, method='sigmoid')
    clf_sigmoid.fit(train_X, train_y)
    prob_sig = clf_sigmoid.predict_proba(test_X)
    
    return prob_iso,prob_sig

In [12]:
def runXGB_sklearn(train_X, train_y,  test_X, test_y=None,sample_weight=None, feature_names=None, seed_val=0, num_rounds=5000,verbose=True):

    clf = XGBClassifier(n_estimators=num_rounds,
                            objective='multi:softprob',
                            learning_rate=0.01,
                            max_depth=6,
                            min_child_weight=1,
                            subsample=.7,
                            colsample_bytree=.7,
                            colsample_bylevel=.5,
                            gamma=0.005,
                            scale_pos_weight=1,
                            base_score=.5,
                            #reg_lambda=0,
                            #reg_alpha=0,
                            #missing=0,
                            seed=seed_val)
    
    if test_y is not None:
        clf.fit(train_X, train_y,sample_weight= sample_weight,eval_set=[(train_X, train_y), (test_X, test_y)],verbose=verbose,eval_metric='mlogloss',
            early_stopping_rounds=50)
    else:        
        clf.fit(train_X, train_y,sample_weight = sample_weight,verbose=False)
    pred_test_y = clf.predict_proba(test_X)
    return pred_test_y, clf

In [13]:
clf = XGBClassifier(n_estimators=3000,
                            objective='multi:softprob',
                            learning_rate=0.01,
                            max_depth=6,
                            min_child_weight=1,
                            subsample=.7,
                            colsample_bytree=.7,
                            colsample_bylevel=.5,
                            gamma=0.005,
                            scale_pos_weight=1,
                            base_score=.5,
                            #reg_lambda=0,
                            #reg_alpha=0,
                            #missing=0,
                            seed=22)

In [45]:
def simple_cv(train_df,test_df):
    train_X = train_df[fealist].as_matrix()
    test_X = test_df[fealist].as_matrix()
    train_y = np.array(train_df['interest_level'])
    
    import pdb;pdb.set_trace()
    
    cv_scores = [] 
    #print fea_categorical
    #print fea_additional
    #print feature_params
    kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
    for dev_index, val_index in kf.split(train_X,train_y):
        dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB_sklearn(dev_X, dev_y, val_X, val_y,verbose=False)
        print('best iterations:{}, best_score={}, last_score={}'.format(model.best_iteration,
                                                                   model.best_score,log_loss(val_y, preds)))
        importance_inx = np.argsort(model.feature_importances_*-1)
        print('Most important 40 features:')
        ff = [(fealist[x],model.feature_importances_[x]) for x in importance_inx[:40]]
        print(ff)
        print('-------------------------')
          
    
        cv_scores.append(log_loss(val_y, preds))
    print 'mean score={}'.format(np.mean(cv_scores))

In [14]:
#Global parameter
feature_params = {}
feature_params['nTextFea'] = 300 # Number of text features used}
feature_params['nQuantLevel'] = 10 # number of levels for quantile computation
feature_params['step_size'] = 0.02 # the step size when discreting latitude and longitude
feature_params['category_nfold'] = 5 # n-fold to transform the categorical variable


In [39]:
train_df,test_df = read_data()

In [40]:
train_df['interest_level'].head(10)

10        1
10000     0
100004    2
100007    0
100013    0
100014    1
100016    0
100020    0
100026    1
100027    0
Name: interest_level, dtype: int64

In [38]:

train_df,test_df,fealist_txt = text_fea(train_df,test_df,nTop = feature_params['nTextFea'])
#train_df,test_df,fealist_addr = address_proc(train_df,test_df)


#train_df,test_df,fealist_quant1 = quantile_price_lat_long(train_df,test_df,
#                                step_size = feature_params['step_size'],nLevel=feature_params['nQuantLevel'])
#train_df,test_df,fealist_quant2 = quantile_price(train_df,test_df,nLevel=feature_params['nQuantLevel'])



fea_list length is 303


In [21]:
#train_df,test_df = getQuant(train_df,test_df,['latitude','longitude'],'gbm_quant_lat_long')
#fealist_quant_gbm = ['gbm_quant_lat_long']

In [22]:
#ca = categorical_cv(feature_params['category_nfold'])
#train_df,test_df,fea_categorical = ca.transform(train_df,test_df)

In [41]:
train_df,test_df,fea_additional = additional_feature(train_df,test_df)

In [47]:
train_df[fealist+['interest_level']].head()

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,listing_id,weekdays,manager_count,building_count,num_features,num_description_words,days,num_photos,created_month,created_day,created_hour,interest_level
0,1.5,3,40.7145,-73.9425,3000,7211212,4,235,5,0,95,84.329444,5,6,24,7,1
1,1.0,0,40.7715,-73.993,2400,7210427,4,38,137,11,198,84.297384,7,6,24,7,1
2,2.0,3,40.6678,-73.9398,3050,7211226,4,55,20664,8,183,84.329988,8,6,24,7,0
3,2.0,4,40.7436,-73.9727,6100,7210946,4,38,457,13,210,84.318426,8,6,24,7,0
4,1.0,3,40.691,-73.9228,2895,7210714,4,7,1,4,122,84.309664,8,6,24,7,0


In [43]:
fealist = ["bathrooms", "bedrooms", "latitude", "longitude", "price",'listing_id'] 
fealist = fealist + fea_additional
#fealist = fealist+fealist_txt+fealist_addr +fea_additional
#fealist = fealist+ fealist_quant1 + fealist_quant2+fealist_txt+fealist_addr + fea_additional

In [28]:
len(fealist_addr)

4

In [50]:
def func(a,b,c):
    print a+b
    print a+c
    
class foo():
    def __init__(self):
        pass
    def ff(self,x,**arg):
        print x*2
        func(**arg)

In [51]:
tmp = foo()

In [52]:
tmp.ff(1,a=4,b=5,c=6)

2
9
10


In [35]:
train_df['interest_level'].head(10)

0    1.0
1    1.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
9    0.0
Name: interest_level, dtype: float64

In [53]:
from sklearn.svm import SVC

In [54]:
tmp = SVC()

In [56]:
hasattr(tmp, 'predict_proba')

False