In [2]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
#input data
train_df=pd.read_json('data/train.json')
test_df=pd.read_json('data/test.json')
target_num_map = {'high':0, 'medium':2, 'low':1}

In [3]:
#basic features
train_df["price_t"] =train_df["price"]/train_df["bedrooms"]
test_df["price_t"] = test_df["price"]/test_df["bedrooms"] 
train_df["room_sum"] = train_df["bedrooms"]+train_df["bathrooms"] 
test_df["room_sum"] = test_df["bedrooms"]+test_df["bathrooms"] 

# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

train_df["pos"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)
test_df["pos"] = test_df.longitude.round(3).astype(str) + '_' + test_df.latitude.round(3).astype(str)

vals = train_df['pos'].value_counts()
dvals = vals.to_dict()
train_df["density"] = train_df['pos'].apply(lambda x: dvals.get(x, vals.min()))
test_df["density"] = test_df['pos'].apply(lambda x: dvals.get(x, vals.min()))

features_to_use=["bathrooms", "bedrooms", "latitude", "longitude", "price","price_t","num_photos", "num_features", "num_description_words","listing_id","density"]

In [4]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.03
    param['max_depth'] = 6
    param['silent'] = 1000
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [5]:
index=list(range(train_df.shape[0]))
random.shuffle(index)
# print(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)
d=[np.nan]*len(train_df)
print(len(train_df['manager_id'].values))

for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])
        d[j] = sum(building_level[temp['manager_id']])
train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c
train_df["manager_post_num"]=d


39481


In [6]:
a=[]
b=[]
c=[]
d=[]
building_level={}
for j in train_df['manager_id'].values:
    building_level[j]=[0,0,0]
for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['manager_id']][0]+=1
    if temp['interest_level']=='medium':
        building_level[temp['manager_id']][1]+=1
    if temp['interest_level']=='high':
        building_level[temp['manager_id']][2]+=1

for i in test_df['manager_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
        d.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
        d.append(sum(building_level[i]))
test_df['manager_level_low']=a
test_df['manager_level_medium']=b
test_df['manager_level_high']=c
test_df['manager_post_num'] = d
features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')
features_to_use.append('manager_post_num')

In [7]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [8]:
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
print(train_df["features"].head())
tfidf = TfidfVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

10                                                         
100004    Laundry_In_Building Dishwasher Hardwood_Floors...
100007                               Hardwood_Floors No_Fee
100014                                                     
100026    Cats_Allowed Dogs_Allowed Elevator Laundry_In_...
Name: features, dtype: object


In [9]:
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()


train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))


# print(train_X.shape, test_X.shape)

In [10]:
preds, model = runXGB(train_X, train_y, test_X, num_rounds=1000)

In [11]:
def pred_to_csv(model_name, y_test):
    y_pred = pd.DataFrame()
    y_pred['listing_id'] = test_df['listing_id']
    for i in range(len(list(y_test))):
        [y_pred.loc[y_pred.index[i], 'high'], y_pred.loc[y_pred.index[i], 'low'], y_pred.loc[y_pred.index[i], 'medium']] = list(y_test[i])

    print(y_pred.head())

    y_pred.to_csv("results/pred_"+ model_name +".csv", index=False)

    print("\n\nDone! CSV for "+model_name+"'s predictions created!\n")

In [12]:
pred_to_csv("xgboost_25th_submission1", preds)

        listing_id      high       low    medium
10000         5529  0.000292  0.994118  0.005590
100013       14597  0.001816  0.986276  0.011908
100016       43396  0.002123  0.971539  0.026337
100020       25538  0.003224  0.941691  0.055085
100099       26644  0.000867  0.876206  0.122927


Done! CSV for xgboost_25th_submission1's predictions created!



In [14]:
train_df['dist_jfk_airport_lat'] = train_df['latitude'].apply(lambda x: abs(x-40.641590))
train_df['dist_jfk_airport_lon'] = train_df['longitude'].apply(lambda x: abs(x+73.778515))
test_df['dist_jfk_airport_lat'] = test_df['latitude'].apply(lambda x: abs(x-40.641590))
test_df['dist_jfk_airport_lon'] = test_df['longitude'].apply(lambda x: abs(x+73.778515))
features_to_use.append('dist_jfk_airport_lat')
features_to_use.append('dist_jfk_airport_lon')
train_df['dist_wallst_lat'] = train_df['latitude'].apply(lambda x: abs(x-40.705638))
train_df['dist_wallst_lon'] = train_df['longitude'].apply(lambda x: abs(x+74.010278))
test_df['dist_wallst_lat'] = test_df['latitude'].apply(lambda x: abs(x-40.705638))
test_df['dist_wallst_lon'] = test_df['longitude'].apply(lambda x: abs(x+74.010278))
features_to_use.append('dist_wallst_lat')
features_to_use.append('dist_wallst_lon')
train_df['dist_centralpk_lat'] = train_df['latitude'].apply(lambda x: abs(x-40.783661))
train_df['dist_centralpk_lon'] = train_df['longitude'].apply(lambda x: abs(x+73.96536827))
test_df['dist_centralpk_lat'] = test_df['latitude'].apply(lambda x: abs(x-40.783661))
test_df['dist_centralpk_lon'] = test_df['longitude'].apply(lambda x: abs(x+73.96536827))
features_to_use.append('dist_centralpk_lat')
features_to_use.append('dist_centralpk_lon')
train_df['dist_washSqPark_lat'] = train_df['latitude'].apply(lambda x: abs(x-40.73083612))
train_df['dist_washSqPark_lon'] = train_df['longitude'].apply(lambda x: abs(x+73.99749041))
test_df['dist_washSqPark_lat'] = test_df['latitude'].apply(lambda x: abs(x-40.73083612))
test_df['dist_washSqPark_lon'] = test_df['longitude'].apply(lambda x: abs(x+73.99749041))
features_to_use.append('dist_washSqPark_lat')
features_to_use.append('dist_washSqPark_lon')

In [15]:
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()

In [16]:
preds, model = runXGB(train_X, train_y, test_X, num_rounds=1000)

In [17]:
pred_to_csv("xgboost_25th_submission1", preds)

        listing_id      high       low    medium
10000         5529  0.000477  0.991681  0.007842
100013       14597  0.001189  0.986393  0.012418
100016       43396  0.002069  0.972997  0.024933
100020       25538  0.003388  0.951905  0.044707
100099       26644  0.000970  0.879702  0.119328


Done! CSV for xgboost_25th_submission1's predictions created!



In [18]:
print(features_to_use)

['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'price_t', 'num_photos', 'num_features', 'num_description_words', 'listing_id', 'density', 'manager_level_low', 'manager_level_medium', 'manager_level_high', 'manager_post_num', 'display_address', 'manager_id', 'building_id', 'street_address', 'dist_jfk_airport_lat', 'dist_jfk_airport_lon', 'dist_wallst_lat', 'dist_wallst_lon', 'dist_centralpk_lat', 'dist_centralpk_lon', 'dist_washSqPark_lat', 'dist_washSqPark_lon']


In [19]:
train_df.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,manager_level_high,manager_post_num,dist_jfk_airport_lat,dist_jfk_airport_lon,dist_wallst_lat,dist_wallst_lon,dist_centralpk_lat,dist_centralpk_lon,dist_washSqPark_lat,dist_washSqPark_lon
10,1.5,3,2431,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,6544,,medium,40.7145,34633,...,0.0,57,0.07291,0.163985,0.008862,0.067778,0.069161,0.022868,0.016336,0.05499
100004,1.0,1,5806,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",7387,Laundry_In_Building Dishwasher Hardwood_Floors...,high,40.7388,6727,...,0.054795,73,0.09721,0.223285,0.033162,0.008478,0.044861,0.036432,0.007964,0.00431
100007,1.0,1,1201,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,5703,Hardwood_Floors No_Fee,low,40.7539,38446,...,0.055556,126,0.11231,0.189185,0.048262,0.042578,0.029761,0.002332,0.023064,0.02979
100014,2.0,4,1633,2016-04-19 04:24:47,,8351,,medium,40.7429,2808,...,0.0,121,0.10131,0.224285,0.037262,0.007478,0.040761,0.037432,0.012064,0.00531
100026,1.0,1,4991,2016-04-20 02:36:35,<p><a website_redacted,6153,Cats_Allowed Dogs_Allowed Elevator Laundry_In_...,medium,40.8234,21731,...,0.183333,60,0.18181,0.167185,0.117762,0.064578,0.039739,0.019668,0.092564,0.05179


In [20]:
count_train = pd.read_csv("X_count.csv")
count_test = pd.read_csv("X_test_count.csv")

In [22]:
count_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39481 entries, 0 to 39480
Data columns (total 5 columns):
index                       39481 non-null int64
manager_listings_count      39481 non-null int64
building_listings_count     39481 non-null int64
s_address_listings_count    39481 non-null int64
d_address_listings_count    39481 non-null int64
dtypes: int64(5)
memory usage: 1.5 MB


In [23]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39481 entries, 10 to 99994
Data columns (total 34 columns):
bathrooms                39481 non-null float64
bedrooms                 39481 non-null int64
building_id              39481 non-null int64
created                  39481 non-null object
description              39481 non-null object
display_address          39481 non-null int64
features                 39481 non-null object
interest_level           39481 non-null object
latitude                 39481 non-null float64
listing_id               39481 non-null int64
longitude                39481 non-null float64
manager_id               39481 non-null int64
photos                   39481 non-null object
price                    39481 non-null int64
street_address           39481 non-null int64
price_t                  39481 non-null float64
room_sum                 39481 non-null float64
num_photos               39481 non-null int64
num_features             39481 non-null int64
n

In [24]:
train_df['manager_listings_count'] = count_train['manager_listings_count']
train_df['building_listings_count'] = count_train['building_listings_count']
train_df['s_address_listings_count'] = count_train['s_address_listings_count']
train_df['d_address_listings_count'] = count_train['d_address_listings_count']
test_df['manager_listings_count'] = count_test['manager_listings_count']
test_df['building_listings_count'] = count_test['building_listings_count']
test_df['s_address_listings_count'] = count_test['s_address_listings_count']
test_df['d_address_listings_count'] = count_test['d_address_listings_count']

In [25]:
features_to_use.append('manager_listings_count')
features_to_use.append('building_listings_count')
features_to_use.append('s_address_listings_count')
features_to_use.append('d_address_listings_count')

In [26]:
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()

In [27]:
preds, model = runXGB(train_X, train_y, test_X, num_rounds=1000)

In [30]:
pred_to_csv("xgboost_25th_submission2", preds)

        listing_id      high       low    medium
10000         5529  0.000400  0.992036  0.007564
100013       14597  0.002050  0.984002  0.013947
100016       43396  0.001898  0.972809  0.025294
100020       25538  0.003920  0.959988  0.036092
100099       26644  0.000798  0.885936  0.113266


Done! CSV for xgboost_25th_submission2's predictions created!



In [31]:
print(features_to_use)

['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'price_t', 'num_photos', 'num_features', 'num_description_words', 'listing_id', 'density', 'manager_level_low', 'manager_level_medium', 'manager_level_high', 'manager_post_num', 'display_address', 'manager_id', 'building_id', 'street_address', 'dist_jfk_airport_lat', 'dist_jfk_airport_lon', 'dist_wallst_lat', 'dist_wallst_lon', 'dist_centralpk_lat', 'dist_centralpk_lon', 'dist_washSqPark_lat', 'dist_washSqPark_lon', 'manager_listings_count', 'building_listings_count', 's_address_listings_count', 'd_address_listings_count']
