In [14]:
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold



In [2]:
train_df = pd.read_json("data/train.json")
test_df = pd.read_json("data/test.json")
train_df['features'].head()

10                                                       []
100004    [Laundry In Building, Dishwasher, Hardwood Flo...
100007                            [Hardwood Floors, No Fee]
100014                                                   []
100026    [Cats Allowed, Dogs Allowed, Elevator, Laundry...
Name: features, dtype: object

In [3]:
def feature_engineering(df, flag, b_dict={}, m_dict={}):

    ulimit = np.percentile(df.price.values, 99)
    df['price'].loc[df.price > ulimit] = ulimit

    llimit = np.percentile(df.latitude.values, 1)
    ulimit = np.percentile(df.latitude.values, 99)
    df['latitude'].loc[df.latitude < llimit] = llimit
    df['latitude'].loc[df.latitude > ulimit] = ulimit

    llimit = np.percentile(df.longitude.values, 1)
    ulimit = np.percentile(df.longitude.values, 99)
    df['longitude'].loc[df.longitude < llimit] = llimit
    df['longitude'].loc[df.longitude > ulimit] = ulimit
    
    df["created"] = pd.to_datetime(df["created"])
    df["created_year"] = df["created"].dt.year
    df["created_month"] = df["created"].dt.month
    df["created_day"] = df["created"].dt.day
    df["created_hour"] = df["created"].dt.hour
    df["is_night"] = 1*(df["created_hour"] <= 7)
    df["created_weekday"] = df["created"].dt.weekday
    df["is_weekend"] = 1*(df["created_weekday"] > 5)
    
    df["total_rooms"] = df["bedrooms"] + df["bathrooms"]
    df['price_per_bed'] = df['price']/(df['bedrooms']+1)
    df['price_per_bath'] = df['price']/(df['bathrooms']+1)
    df["price_per_room"] = df['price']/(df['total_rooms']+1)
    
    df['description'] = df['description'].str.replace('<[^<>]+>', ' ')
    df['description'] = df['description'].str.replace('[0-9]+', 'num')
    df['description'] = df['description'].str.lower().replace('[^a-zA-Z0-9]', ' ')
    for i in df.index:
        df.loc[i, 'len_description'] = len(df.loc[i, 'description'])
        df.loc[i, 'num_features'] = len(df.loc[i, 'features'])

    

    if not flag:
        interest_dummies = pd.get_dummies(df.interest_level)
        df_dumm = pd.concat([df,interest_dummies[['low','medium','high']]], axis = 1).drop('interest_level', axis = 1)

        df_n = df_dumm[['building_id', 'manager_id', 'low', 'high', 'medium']]
        df_n_m = df_n.groupby(df_n.manager_id).mean()
        df_n_b = df_n.groupby(df_n.building_id).mean()

        df_n_m['manager_skill'] = df_n_m['low']*0+ df_n_m['medium']*1 + df_n_m['high']*2
        df_n_b['build_popularity'] = df_n_b['low']*0+ df_n_b['medium']*1 + df_n_b['high']*2

        df_n_m.drop(['low', 'high', 'medium'], axis=1)
        df_n_b.drop(['low', 'high', 'medium'], axis=1)
        b_dict = df_n_b.to_dict()
        m_dict = df_n_m.to_dict() 

    for i in range(df.shape[0]):
        if df.loc[df.index[i], 'building_id'] in b_dict['build_popularity']:
            df.loc[df.index[i], 'building_popularity'] = b_dict['build_popularity'][df.loc[df.index[i], 'building_id']]
        else:
            df.loc[df.index[i], 'building_popularity'] = 0.33453255354249495 #bp_mean
    
        if df.loc[df.index[i], 'manager_id'] in m_dict['manager_skill']:
            df.loc[df.index[i], 'manager_skill'] = m_dict['manager_skill'][df.loc[df.index[i], 'manager_id']]
        else:
            df.loc[df.index[i], 'manager_skill'] = 0.3666338439324288 #ms_mean
    df["logprice"] = np.log(df["price"])

    num_feats = ["bathrooms", "bedrooms", "latitude", "longitude", "price", "num_features", 
    "len_description","created_year", "created_month", "created_day", "created_hour", "created_weekday", 
    "is_night", "is_weekend", "price_per_bed", "price_per_bath", "total_rooms", "price_per_room",
    "manager_skill", "building_popularity", "description", "features"]
    X = df[num_feats]
    return [X, b_dict, m_dict]
train, b_dict, m_dict = feature_engineering(df=train_df,flag=0)
test, b_dict, m_dict = feature_engineering(df=test_df,flag=1, b_dict = b_dict,m_dict =  m_dict)
train["logprice"] = np.log(train_df["price"])
test["logprice"] = np.log(test_df["price"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [3]:
# train_df["price_t"] =train_df["price"]/train_df["bedrooms"]
# test_df["price_t"] = test_df["price"]/test_df["bedrooms"] 

# train_df["room_sum"] = train_df["bedrooms"]+train_df["bathrooms"] 
# test_df["room_sum"] = test_df["bedrooms"]+test_df["bathrooms"] 

# train_df['price_per_room'] = train_df['price']/train_df['room_sum']
# test_df['price_per_room'] = test_df['price']/test_df['room_sum']

# train_df["num_photos"] = train_df["photos"].apply(len)
# test_df["num_photos"] = test_df["photos"].apply(len)

# train_df["num_features"] = train_df["features"].apply(len)
# test_df["num_features"] = test_df["features"].apply(len)

# train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
# test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

In [4]:
# train_df["created"] = pd.to_datetime(train_df["created"])
# test_df["created"] = pd.to_datetime(test_df["created"])
# train_df["created_year"] = train_df["created"].dt.year
# test_df["created_year"] = test_df["created"].dt.year
# train_df["created_month"] = train_df["created"].dt.month
# test_df["created_month"] = test_df["created"].dt.month
# train_df["created_day"] = train_df["created"].dt.day
# test_df["created_day"] = test_df["created"].dt.day
# train_df["created_hour"] = train_df["created"].dt.hour
# test_df["created_hour"] = test_df["created"].dt.hour

In [4]:
train["pos"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)
test["pos"] = test_df.longitude.round(3).astype(str) + '_' + test_df.latitude.round(3).astype(str)

vals = train['pos'].value_counts()
dvals = vals.to_dict()
train["density"] = train['pos'].apply(lambda x: dvals.get(x, vals.min()))
test["density"] = test['pos'].apply(lambda x: dvals.get(x, vals.min()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/

In [5]:
features_to_use=["bathrooms", "bedrooms", "latitude", "longitude", "price", "num_features", 
    "len_description","created_year", "created_month", "created_day", "created_hour", "created_weekday", 
    "is_night", "is_weekend", "price_per_bed", "price_per_bath", "total_rooms", "price_per_room",
    "manager_skill", "building_popularity", "logprice", "density"]

In [7]:
# index=list(range(train_df.shape[0]))
# random.shuffle(index)
# a=[np.nan]*len(train_df)
# b=[np.nan]*len(train_df)
# c=[np.nan]*len(train_df)

In [9]:
# a=[]
# b=[]
# c=[]
# building_level={}
# for j in train_df['manager_id'].values:
#     building_level[j]=[0,0,0]

# for j in range(train_df.shape[0]):
#     temp=train_df.iloc[j]
#     if temp['interest_level']=='low':
#         building_level[temp['manager_id']][0]+=1
#     if temp['interest_level']=='medium':
#         building_level[temp['manager_id']][1]+=1
#     if temp['interest_level']=='high':
#         building_level[temp['manager_id']][2]+=1

# for i in test_df['manager_id'].values:
#     if i not in building_level.keys():
#         a.append(np.nan)
#         b.append(np.nan)
#         c.append(np.nan)
#     else:
#         a.append(building_level[i][0]*1.0/sum(building_level[i]))
#         b.append(building_level[i][1]*1.0/sum(building_level[i]))
#         c.append(building_level[i][2]*1.0/sum(building_level[i]))
# test_df['manager_level_low']=a
# test_df['manager_level_medium']=b
# test_df['manager_level_high']=c

In [21]:
# train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39481 entries, 10 to 99994
Data columns (total 31 columns):
bathrooms                39481 non-null float64
bedrooms                 39481 non-null int64
building_id              39481 non-null object
created                  39481 non-null datetime64[ns]
description              39481 non-null object
display_address          39481 non-null object
features                 39481 non-null object
interest_level           39481 non-null object
latitude                 39481 non-null float64
listing_id               39481 non-null int64
longitude                39481 non-null float64
manager_id               39481 non-null object
photos                   39481 non-null object
price                    39481 non-null int64
street_address           39481 non-null object
logprice                 39481 non-null float64
price_t                  39481 non-null float64
room_sum                 39481 non-null float64
price_per_room           39481 no

In [10]:
# index=list(range(train_df.shape[0]))
# random.shuffle(index)
# a=[np.nan]*len(train_df)
# b=[np.nan]*len(train_df)
# c=[np.nan]*len(train_df)
# for i in range(5):
#     building_level={}
#     for j in train_df['building_id'].values:
#         building_level[j]=[0,0,0]
    
#     test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
#     train_index=list(set(index).difference(test_index))
    
#     for j in train_index:
#         temp=train_df.iloc[j]
#         if temp['interest_level']=='low':
#             building_level[temp['building_id']][0]+=1
#         if temp['interest_level']=='medium':
#             building_level[temp['building_id']][1]+=1
#         if temp['interest_level']=='high':
#             building_level[temp['building_id']][2]+=1
            
#     for j in test_index:
#         temp=train_df.iloc[j]
#         if sum(building_level[temp['building_id']])!=0:
#             a[j]=building_level[temp['building_id']][0]*1.0/sum(building_level[temp['building_id']])
#             b[j]=building_level[temp['building_id']][1]*1.0/sum(building_level[temp['building_id']])
#             c[j]=building_level[temp['building_id']][2]*1.0/sum(building_level[temp['building_id']])
            
# train_df['building_level_low']=a
# train_df['building_level_medium']=b
# train_df['building_level_high']=c


# train_df.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,created_day,created_hour,pos,density,manager_level_low,manager_level_medium,manager_level_high,building_level_low,building_level_medium,building_level_high
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,34633,...,24,7,-73.942_40.714,4,0.745763,0.254237,0.0,1.0,0.0,0.0
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6727,...,17,3,-74.002_40.739,70,0.595238,0.357143,0.047619,0.470588,0.382353,0.147059
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,38446,...,18,2,-73.968_40.754,117,0.808511,0.141844,0.049645,0.901408,0.098592,0.0
100014,2.0,4,38a913e46c94a7f46ddf19b756a9640c,2016-04-19 04:24:47,,West 18th Street,[],medium,40.7429,2808,...,19,4,-74.003_40.743,23,0.890625,0.101562,0.007812,0.4,0.6,0.0
100026,1.0,1,a7efbeb58190aa267b4a9121cd0c88c0,2016-04-20 02:36:35,<p><a website_redacted,Hamilton Terrace,"[Cats Allowed, Dogs Allowed, Elevator, Laundry...",medium,40.8234,21731,...,20,2,-73.946_40.823,32,0.6,0.218182,0.181818,0.176471,0.529412,0.294118


In [11]:
# a=[]
# b=[]
# c=[]
# building_level={}
# for j in train_df['building_id'].values:
#     building_level[j]=[0,0,0]

# for j in range(train_df.shape[0]):
#     temp=train_df.iloc[j]
#     if temp['interest_level']=='low':
#         building_level[temp['building_id']][0]+=1
#     if temp['interest_level']=='medium':
#         building_level[temp['building_id']][1]+=1
#     if temp['interest_level']=='high':
#         building_level[temp['building_id']][2]+=1

# for i in test_df['building_id'].values:
#     if i not in building_level.keys():
#         a.append(np.nan)
#         b.append(np.nan)
#         c.append(np.nan)
#     else:
#         a.append(building_level[i][0]*1.0/sum(building_level[i]))
#         b.append(building_level[i][1]*1.0/sum(building_level[i]))
#         c.append(building_level[i][2]*1.0/sum(building_level[i]))
# test_df['building_level_low']=a
# test_df['building_level_medium']=b
# test_df['building_level_high']=c

In [12]:
# features_to_use.append('manager_level_low') 
# features_to_use.append('manager_level_medium') 
# features_to_use.append('manager_level_high')
# features_to_use.append('building_level_low') 
# features_to_use.append('building_level_medium') 
# features_to_use.append('building_level_high')

In [6]:
train['description'] = train_df['description'].str.replace('<[^<>]+>', ' ')
train['description'] = train_df['description'].str.replace('[0-9]+', 'num')
train['description'] = train_df['description'].str.lower().replace('[^a-zA-Z0-9]', ' ', regex=True)
test['description'] = test_df['description'].str.replace('<[^<>]+>', ' ')
test['description'] = test_df['description'].str.replace('[0-9]+', 'num')
test['description'] = test_df['description'].str.lower().replace('[^a-zA-Z0-9]', ' ', regex=True)
# from textblob import TextBlob
# train_df['sentiment_polarity'] = train_df['description'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
# train_df['sentiment_subjectivity'] = train_df['description'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)
# test_df['sentiment_polarity'] = test_df['description'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
# test_df['sentiment_subjectivity'] = test_df['description'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)
# features_to_use.append('sentiment_polarity') 
# features_to_use.append('sentiment_subjectivity')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [7]:
# categorical = ["display_address", "manager_id", "building_id"]
# for f in categorical:
#         if train_df[f].dtype=='object':
#             lbl = LabelEncoder()
#             lbl.fit(list(train_df[f].values) + list(test_df[f].values))
#             train_df[f] = lbl.transform(list(train_df[f].values))
#             test_df[f] = lbl.transform(list(test_df[f].values))
#             features_to_use.append(f)

train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))



In [8]:
train['features'] = train_df['features']
test['features'] = test_df['features']
# train.head()
# train.to_csv("train_all_feat.csv", index = False)
# test.to_csv("test_all_feat.csv", index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [9]:
features_to_use=["bathrooms", "bedrooms", "latitude", "longitude", "price", "num_features", 
    "len_description","created_year", "created_month", "created_day", "created_hour", "created_weekday", 
    "is_night", "is_weekend", "price_per_bed", "price_per_bath", "total_rooms", "price_per_room",
    "manager_skill", "building_popularity", "logprice", "density"]
count_features = CountVectorizer(stop_words='english', max_features=500,ngram_range=(1,4))
count_features_train_sparse = count_features.fit_transform(train["features"])
count_features_test_sparse = count_features.transform(test["features"])

train_X = sparse.hstack([train[features_to_use], count_features_train_sparse]).tocsr()
test_X = sparse.hstack([test[features_to_use], count_features_test_sparse]).tocsr()

tfidf_features = TfidfVectorizer(stop_words='english', max_features=500,ngram_range=(1,4))
tfidf_features_train_sparse = count_features.fit_transform(train_df["features"])
tfidf_features_test_sparse = count_features.transform(test_df["features"])

train_X = sparse.hstack([train_X, tfidf_features_train_sparse]).tocsr()
test_X = sparse.hstack([test_X, tfidf_features_test_sparse]).tocsr()

In [16]:
count_desc = CountVectorizer(stop_words='english', max_features=500,ngram_range=(1,4))
count_desc_train_sparse = count_desc.fit_transform(train_df["description"])
count_desc_test_sparse = count_desc.transform(test_df["description"])

train_X = sparse.hstack([train_X, count_desc_train_sparse]).tocsr()
test_X = sparse.hstack([test_X, count_desc_test_sparse]).tocsr()

tfidf_desc = CountVectorizer(stop_words='english', max_features=500,ngram_range=(1,4))
tfidf_desc_train_sparse = tfidf_desc.fit_transform(train_df["description"])
tfidf_desc_test_sparse = tfidf_desc.transform(test_df["description"])

train_X = sparse.hstack([train_X, tfidf_desc_train_sparse]).tocsr()
test_X = sparse.hstack([test_X, tfidf_desc_test_sparse]).tocsr()

AttributeError: head not found

In [17]:
print(train_X[0:5])

  (0, 0)	1.5
  (0, 1)	3.0
  (0, 2)	40.7145
  (0, 3)	-73.9425
  (0, 4)	3000.0
  (0, 6)	561.0
  (0, 7)	2016.0
  (0, 8)	6.0
  (0, 9)	24.0
  (0, 10)	7.0
  (0, 11)	4.0
  (0, 12)	1.0
  (0, 14)	750.0
  (0, 15)	1200.0
  (0, 16)	4.5
  (0, 17)	545.4545454545455
  (0, 18)	0.2647058823529412
  (0, 19)	0.5
  (0, 20)	8.006367567650246
  (0, 21)	4.0
  (0, 1027)	1.0
  (0, 1036)	1.0
  (0, 1037)	1.0
  (0, 1039)	1.0
  (0, 1040)	1.0
  :	:
  (4, 17)	575.0
  (4, 18)	0.6527777777777778
  (4, 19)	1.1
  (4, 20)	7.45298232946546
  (4, 21)	32.0
  (4, 27)	1.0
  (4, 28)	1.0
  (4, 31)	1.0
  (4, 87)	1.0
  (4, 95)	1.0
  (4, 154)	1.0
  (4, 191)	1.0
  (4, 309)	1.0
  (4, 527)	1.0
  (4, 528)	1.0
  (4, 531)	1.0
  (4, 587)	1.0
  (4, 595)	1.0
  (4, 654)	1.0
  (4, 691)	1.0
  (4, 809)	1.0
  (4, 1508)	1.0
  (4, 2008)	1.0
  (4, 2508)	1.0
  (4, 3008)	1.0


In [18]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=321, num_rounds=1300):
    param = {'booster': 'gbtree',
              #'objective': 'multi:softmax',
              'objective': 'multi:softprob',
              'eval_metric': 'mlogloss',
              'gamma': 1,
              'min_child_weight': 1.5,
              'max_depth': 5,
              'lambda': 10,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.03,
              'tree_method': 'exact',
              'seed': 2017,
              'nthread': 12,
              "num_class":3
              }
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [26]:
# preds, model = runXGB(train_X, train_y, test_X, num_rounds=1300)
# out_df = pd.DataFrame(preds)
# out_df.columns = ["high", "medium", "low"]
# out_df["listing_id"] = test_df.listing_id.values
# out_df.to_csv("sub51.csv", index=False)

In [22]:
# out_df.head()

Unnamed: 0,high,medium,low,listing_id
0,0.000679,0.007194,0.992127,5529
1,0.001685,0.008177,0.990138,14597
2,0.003122,0.042709,0.95417,43396
3,0.008244,0.061525,0.930231,25538
4,0.000808,0.123692,0.8755,26644


In [19]:
from sklearn.model_selection import KFold
cv_scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
        dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
#         print(cv_scores)
        break

[0]	train-mlogloss:1.07489	test-mlogloss:1.07511
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:1.05194	test-mlogloss:1.05246
[2]	train-mlogloss:1.03034	test-mlogloss:1.03123
[3]	train-mlogloss:1.00968	test-mlogloss:1.01089
[4]	train-mlogloss:0.989035	test-mlogloss:0.990525
[5]	train-mlogloss:0.969713	test-mlogloss:0.971506
[6]	train-mlogloss:0.952411	test-mlogloss:0.954472
[7]	train-mlogloss:0.934486	test-mlogloss:0.936744
[8]	train-mlogloss:0.916897	test-mlogloss:0.919403
[9]	train-mlogloss:0.903303	test-mlogloss:0.906016
[10]	train-mlogloss:0.887589	test-mlogloss:0.890473
[11]	train-mlogloss:0.87258	test-mlogloss:0.875668
[12]	train-mlogloss:0.860634	test-mlogloss:0.863916
[13]	train-mlogloss:0.847956	test-mlogloss:0.851458
[14]	train-mlogloss:0.83487	test-mlogloss:0.838607
[15]	train-mlogloss:0.822853	test-mlogloss:0.826759
[16]	train-mlogloss:0.810342	test-mlo

[155]	train-mlogloss:0.435906	test-mlogloss:0.455872
[156]	train-mlogloss:0.435491	test-mlogloss:0.455541
[157]	train-mlogloss:0.435124	test-mlogloss:0.455242
[158]	train-mlogloss:0.43475	test-mlogloss:0.454936
[159]	train-mlogloss:0.434367	test-mlogloss:0.454674
[160]	train-mlogloss:0.433983	test-mlogloss:0.454422
[161]	train-mlogloss:0.433596	test-mlogloss:0.454132
[162]	train-mlogloss:0.433266	test-mlogloss:0.453852
[163]	train-mlogloss:0.432902	test-mlogloss:0.453608
[164]	train-mlogloss:0.432528	test-mlogloss:0.453362
[165]	train-mlogloss:0.43218	test-mlogloss:0.453093
[166]	train-mlogloss:0.431809	test-mlogloss:0.452808
[167]	train-mlogloss:0.431473	test-mlogloss:0.452513
[168]	train-mlogloss:0.4311	test-mlogloss:0.45221
[169]	train-mlogloss:0.430776	test-mlogloss:0.45193
[170]	train-mlogloss:0.430485	test-mlogloss:0.451731
[171]	train-mlogloss:0.430172	test-mlogloss:0.451512
[172]	train-mlogloss:0.429848	test-mlogloss:0.45125
[173]	train-mlogloss:0.429534	test-mlogloss:0.451041


[311]	train-mlogloss:0.399104	test-mlogloss:0.437809
[312]	train-mlogloss:0.398907	test-mlogloss:0.437787
[313]	train-mlogloss:0.398786	test-mlogloss:0.437749
[314]	train-mlogloss:0.39864	test-mlogloss:0.437683
[315]	train-mlogloss:0.398425	test-mlogloss:0.437638
[316]	train-mlogloss:0.398214	test-mlogloss:0.437576
[317]	train-mlogloss:0.398045	test-mlogloss:0.437531
[318]	train-mlogloss:0.397885	test-mlogloss:0.43748
[319]	train-mlogloss:0.397727	test-mlogloss:0.437423
[320]	train-mlogloss:0.397535	test-mlogloss:0.437366
[321]	train-mlogloss:0.397353	test-mlogloss:0.437349
[322]	train-mlogloss:0.397141	test-mlogloss:0.437308
[323]	train-mlogloss:0.39689	test-mlogloss:0.43727
[324]	train-mlogloss:0.3967	test-mlogloss:0.437223
[325]	train-mlogloss:0.396462	test-mlogloss:0.43716
[326]	train-mlogloss:0.396338	test-mlogloss:0.437129
[327]	train-mlogloss:0.396154	test-mlogloss:0.437084
[328]	train-mlogloss:0.396005	test-mlogloss:0.437067
[329]	train-mlogloss:0.395797	test-mlogloss:0.437036


[467]	train-mlogloss:0.375069	test-mlogloss:0.434041
[468]	train-mlogloss:0.374926	test-mlogloss:0.433955
[469]	train-mlogloss:0.374773	test-mlogloss:0.433906
[470]	train-mlogloss:0.374589	test-mlogloss:0.433859
[471]	train-mlogloss:0.374456	test-mlogloss:0.433864
[472]	train-mlogloss:0.374304	test-mlogloss:0.433867
[473]	train-mlogloss:0.374163	test-mlogloss:0.433874
[474]	train-mlogloss:0.374054	test-mlogloss:0.433828
[475]	train-mlogloss:0.373897	test-mlogloss:0.433835
[476]	train-mlogloss:0.373757	test-mlogloss:0.433832
[477]	train-mlogloss:0.373605	test-mlogloss:0.433874
[478]	train-mlogloss:0.373424	test-mlogloss:0.433892
[479]	train-mlogloss:0.373326	test-mlogloss:0.433894
[480]	train-mlogloss:0.373207	test-mlogloss:0.433891
[481]	train-mlogloss:0.373046	test-mlogloss:0.433855
[482]	train-mlogloss:0.372878	test-mlogloss:0.433851
[483]	train-mlogloss:0.372706	test-mlogloss:0.433822
[484]	train-mlogloss:0.37255	test-mlogloss:0.433766
[485]	train-mlogloss:0.37243	test-mlogloss:0.43

In [3]:
preds, model = runXGB(train_X, train_y, test_X, num_rounds=1300)
print(preds[0:5])

NameError: name 'runXGB' is not defined

In [24]:
# print(preds[0:5])
# test_df['listing_id'].head()

10000      5529
100013    14597
100016    43396
100020    25538
100099    26644
Name: listing_id, dtype: int64

In [18]:
def pred_to_csv(model_name, y_test, time):
    y_pred = pd.DataFrame()
    y_pred['listing_id'] = test_df['listing_id']
    for i in range(len(list(y_test))):
        [y_pred.loc[y_pred.index[i], 'high'], y_pred.loc[y_pred.index[i], 'low'], y_pred.loc[y_pred.index[i], 'medium']] = list(y_test[i])

    print(y_pred.head())

    y_pred.to_csv("results/pred_"+ model_name + str(time) + ".csv", index=False)

    print("\n\nDone! CSV for "+model_name+"'s predictions created!\n")

In [26]:
pred_to_csv("xgboost", preds, 22)

        listing_id      high       low    medium
10000         5529  0.000134  0.003511  0.996356
100013       14597  0.000187  0.000305  0.999508
100016       43396  0.000014  0.000038  0.999948
100020       25538  0.006909  0.185572  0.807519
100099       26644  0.000559  0.059903  0.939538


Done! CSV for xgboost's predictions created!



In [15]:
model = RandomForestClassifier()
print(model.get_params())

grid = {'n_estimators' : [200, 350, 500, 750, 1000],
        'max_depth': [int(x) for x in np.linspace(10, 110, 11)],
        'min_samples_split': [2, 5, 10, 25, 50],
        'min_samples_leaf': [2, 5, 10, 25, 50],
        }
#'max_feaures': ['sqrt', 'log2']
#'bootstrap': [True, False]

kFold = StratifiedKFold(n_splits=3, shuffle=True, random_state=7)
randomSearch = RandomizedSearchCV(estimator=model, param_distributions=grid, n_iter=50, 
                                  verbose=2, scoring="neg_log_loss", n_jobs=-1, cv=kFold)

rs_result = randomSearch.fit(train_X, train_y)

print("Best: %f using %s" % (rs_result.best_score_, rs_result.best_params_))
means = rs_result.cv_results_['mean_test_score']
stds = rs_result.cv_results_['std_test_score']
params = rs_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

{'bootstrap': True, 'random_state': None, 'verbose': 0, 'n_estimators': 10, 'warm_start': False, 'max_features': 'auto', 'n_jobs': 1, 'min_impurity_split': None, 'min_samples_leaf': 1, 'max_depth': None, 'min_samples_split': 2, 'class_weight': None, 'min_weight_fraction_leaf': 0.0, 'max_leaf_nodes': None, 'criterion': 'gini', 'oob_score': False, 'min_impurity_decrease': 0.0}
Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] min_samples_leaf=50, n_estimators=500, min_samples_split=2, max_depth=110 
[CV] min_samples_leaf=50, n_estimators=500, min_samples_split=2, max_depth=110 
[CV] min_samples_leaf=50, n_estimators=500, min_samples_split=2, max_depth=110 
[CV] min_samples_leaf=5, n_estimators=1000, min_samples_split=5, max_depth=100 
[CV]  min_samples_leaf=50, n_estimators=500, min_samples_split=2, max_depth=110, total= 2.4min
[CV] min_samples_leaf=5, n_estimators=1000, min_samples_split=5, max_depth=100 
[CV]  min_samples_leaf=50, n_estimators=500, min_samples_split=2,

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 50.7min


[CV]  min_samples_leaf=25, n_estimators=200, min_samples_split=50, max_depth=60, total= 1.3min
[CV] min_samples_leaf=25, n_estimators=200, min_samples_split=50, max_depth=60 
[CV]  min_samples_leaf=25, n_estimators=200, min_samples_split=50, max_depth=60, total= 1.3min
[CV] min_samples_leaf=25, n_estimators=200, min_samples_split=50, max_depth=60 
[CV]  min_samples_leaf=25, n_estimators=200, min_samples_split=50, max_depth=60, total= 1.3min
[CV] min_samples_leaf=10, n_estimators=500, min_samples_split=10, max_depth=60 
[CV]  min_samples_leaf=10, n_estimators=500, min_samples_split=10, max_depth=60, total= 5.5min
[CV] min_samples_leaf=10, n_estimators=500, min_samples_split=10, max_depth=60 
[CV]  min_samples_leaf=2, n_estimators=750, min_samples_split=5, max_depth=50, total=15.9min
[CV] min_samples_leaf=10, n_estimators=500, min_samples_split=10, max_depth=60 
[CV]  min_samples_leaf=2, n_estimators=750, min_samples_split=5, max_depth=50, total=15.8min
[CV] min_samples_leaf=2, n_estimat

[CV] min_samples_leaf=10, n_estimators=200, min_samples_split=2, max_depth=100 
[CV]  min_samples_leaf=25, n_estimators=500, min_samples_split=5, max_depth=40, total= 2.9min
[CV] min_samples_leaf=10, n_estimators=200, min_samples_split=2, max_depth=100 
[CV]  min_samples_leaf=25, n_estimators=500, min_samples_split=5, max_depth=40, total= 2.9min
[CV] min_samples_leaf=10, n_estimators=200, min_samples_split=2, max_depth=100 
[CV]  min_samples_leaf=10, n_estimators=200, min_samples_split=2, max_depth=100, total= 1.9min
[CV] min_samples_leaf=50, n_estimators=1000, min_samples_split=25, max_depth=100 
[CV]  min_samples_leaf=25, n_estimators=500, min_samples_split=5, max_depth=40, total= 2.9min
[CV] min_samples_leaf=50, n_estimators=1000, min_samples_split=25, max_depth=100 
[CV]  min_samples_leaf=10, n_estimators=200, min_samples_split=2, max_depth=100, total= 2.1min
[CV] min_samples_leaf=50, n_estimators=1000, min_samples_split=25, max_depth=100 
[CV]  min_samples_leaf=10, n_estimators=20

[CV]  min_samples_leaf=2, n_estimators=200, min_samples_split=10, max_depth=70, total= 3.8min
[CV] min_samples_leaf=10, n_estimators=200, min_samples_split=25, max_depth=60 
[CV]  min_samples_leaf=10, n_estimators=200, min_samples_split=25, max_depth=60, total= 2.0min
[CV] min_samples_leaf=10, n_estimators=200, min_samples_split=25, max_depth=60 
[CV]  min_samples_leaf=10, n_estimators=750, min_samples_split=50, max_depth=70, total= 6.1min
[CV] min_samples_leaf=10, n_estimators=200, min_samples_split=25, max_depth=60 
[CV]  min_samples_leaf=10, n_estimators=200, min_samples_split=25, max_depth=60, total= 2.1min
[CV] min_samples_leaf=5, n_estimators=350, min_samples_split=2, max_depth=40 
[CV]  min_samples_leaf=10, n_estimators=750, min_samples_split=50, max_depth=70, total= 6.2min
[CV] min_samples_leaf=5, n_estimators=350, min_samples_split=2, max_depth=40 
[CV]  min_samples_leaf=10, n_estimators=750, min_samples_split=50, max_depth=70, total= 6.2min
[CV] min_samples_leaf=5, n_estimato

[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 385.4min finished


Best: -0.558242 using {'min_samples_leaf': 2, 'n_estimators': 1000, 'max_depth': 100, 'min_samples_split': 10}
-0.617095 (0.002107) with: {'min_samples_leaf': 50, 'n_estimators': 500, 'max_depth': 110, 'min_samples_split': 2}
-0.565710 (0.000931) with: {'min_samples_leaf': 5, 'n_estimators': 1000, 'max_depth': 100, 'min_samples_split': 5}
-0.569248 (0.000806) with: {'min_samples_leaf': 2, 'n_estimators': 350, 'max_depth': 100, 'min_samples_split': 50}
-0.615772 (0.000839) with: {'min_samples_leaf': 50, 'n_estimators': 1000, 'max_depth': 40, 'min_samples_split': 10}
-0.569106 (0.000601) with: {'min_samples_leaf': 2, 'n_estimators': 1000, 'max_depth': 90, 'min_samples_split': 50}
-0.615029 (0.000359) with: {'min_samples_leaf': 50, 'n_estimators': 1000, 'max_depth': 60, 'min_samples_split': 5}
-0.566079 (0.000746) with: {'min_samples_leaf': 5, 'n_estimators': 500, 'max_depth': 30, 'min_samples_split': 5}
-0.615025 (0.003249) with: {'min_samples_leaf': 50, 'n_estimators': 350, 'max_depth':

In [19]:
def train_random_forest():
    classifier = RandomForestClassifier(n_estimators=1000, max_depth=100, 
                                        min_samples_leaf=2, min_samples_split=10, n_jobs=-1)
    print("\nRandom Forest: Training...\n")
    classifier.fit(train_X, train_y)

    y_train_pred = classifier.predict_proba(train_X)
    print("Random Forest: Log loss on training set: %f" %log_loss(train_y, y_train_pred))

#     y_val_pred = classifier.predict_proba(X_val)
#     print("\nRandom Forest: Log loss on validation set: %f" %log_loss(y_val, y_val_pred))

    print("\nPredicting on test set...\n\n\n")
    y_test = classifier.predict_proba(test_X)

    pred_to_csv("rf", y_test, 1)
    
train_random_forest()


Random Forest: Training...

Random Forest: Log loss on training set: 0.327016

Predicting on test set...



        listing_id      high       low    medium
10000         5529  0.000580  0.008257  0.991163
100013       14597  0.038811  0.137065  0.824125
100016       43396  0.012042  0.054109  0.933849
100020       25538  0.038005  0.223061  0.738935
100099       26644  0.017179  0.146049  0.836772


Done! CSV for rf's predictions created!

