In [1]:
import math
import numpy as np
import pandas as pd
import random
import xgboost as xgb
from sklearn import preprocessing, model_selection
from sklearn.metrics import log_loss
import string
from sklearn.feature_extraction.text import  CountVectorizer
from scipy.stats import boxcox
from scipy import stats
from scipy import sparse

y_map = {'low': 2, 'medium': 1, 'high': 0}

train = pd.read_json('data/backup/train.json')
test = pd.read_json('data/backup/test.json')

y_train = train['interest_level'].apply(lambda x: y_map[x]).reset_index(drop=True)
train = train.drop(['listing_id', 'interest_level'], axis=1)
listing_id = test.listing_id
test = test.drop('listing_id', axis=1)

y_train.to_json('data/train_interest.json', orient='records')
listing_id.to_json('data/test_ids.json', orient='records')



In [3]:
ntrain = train.shape[0]
train_test = pd.concat((train, test), axis=0).reset_index(drop=True)

In [5]:
categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']
categoricals = 'building_id', 'display_address', 'manager_id', 'street_address'

['building_id',
 'created',
 'description',
 'display_address',
 'features',
 'manager_id',
 'photos',
 'street_address']

In [33]:
section_with_one_lot

building_id
00005cb939f9986300d987652c933e15    1
0012f1955391bca600ec301035b97b65    1
003d8740e21484dcc2280639b25539a4    1
0055c8662ba19e95f78df97592d2b83e    1
0056dbdf2881b76f2a0171eb753ec9e0    1
0059ae562b9e338a59eaf962cb3eedd2    1
005e0f8d7fb7b92be351cbf1dd985149    1
0070bc94a3f80aa717bb15708e98ba54    1
0078281cd69f4bfec17e42e5cf5eecd9    1
007ae1cd90420f18bad7b6892a9a1411    1
008ff72d77a8fc85eccfc4ec33ec09a3    1
009c6ad006e8fd679991c5f8cffaef9f    1
00a61b88186b5115356374b0f5dd0d1e    1
00a7b4a6aec7ca1a1635c622918b68f0    1
00bb734cde488aa3e1f3e5f1376b9c13    1
00d1b109f921cd8bc69a203bf35a9bac    1
00dfd2bccb9127f2e7966ff29ae1e060    1
00e8bbc4c74980a06c187165d9a5869e    1
00ffdfd150acc0b097182bbf9dd1db28    1
010435ab3b0b415421d583937a55283e    1
010f3d0141cd76667ca8e3d86e221cf2    1
0114c80bf2a9027612083e354d7fbdbc    1
013a96b772f0e46731faee50ad25d727    1
01401bc9a8908b2d6ffa84ebf9e1b984    1
0145e758b990b8d2648ee57c30762d76    1
0152c6255a4e29051b817ce6f3f6dd6f    1


In [29]:
sum(train_test[feat].isin(section_with_one_lot))

1

In [22]:
for feat in ['building_id']:
    # clump all things with one entry per section
    counts = train_test.groupby(feat, as_index = False).size()
    section_with_one_lot = counts[counts == 1]
    train_test.loc[train_test[feat].isin(section_with_one_lot.ravel()), feat] = "-1"

In [23]:
counts = train_test.groupby('building_id', as_index = False).size()
counts[counts == 1]

building_id
00005cb939f9986300d987652c933e15    1
0012f1955391bca600ec301035b97b65    1
003d8740e21484dcc2280639b25539a4    1
0055c8662ba19e95f78df97592d2b83e    1
0056dbdf2881b76f2a0171eb753ec9e0    1
0059ae562b9e338a59eaf962cb3eedd2    1
005e0f8d7fb7b92be351cbf1dd985149    1
0070bc94a3f80aa717bb15708e98ba54    1
0078281cd69f4bfec17e42e5cf5eecd9    1
007ae1cd90420f18bad7b6892a9a1411    1
008ff72d77a8fc85eccfc4ec33ec09a3    1
009c6ad006e8fd679991c5f8cffaef9f    1
00a61b88186b5115356374b0f5dd0d1e    1
00a7b4a6aec7ca1a1635c622918b68f0    1
00bb734cde488aa3e1f3e5f1376b9c13    1
00d1b109f921cd8bc69a203bf35a9bac    1
00dfd2bccb9127f2e7966ff29ae1e060    1
00e8bbc4c74980a06c187165d9a5869e    1
00ffdfd150acc0b097182bbf9dd1db28    1
010435ab3b0b415421d583937a55283e    1
010f3d0141cd76667ca8e3d86e221cf2    1
0114c80bf2a9027612083e354d7fbdbc    1
013a96b772f0e46731faee50ad25d727    1
01401bc9a8908b2d6ffa84ebf9e1b984    1
0145e758b990b8d2648ee57c30762d76    1
0152c6255a4e29051b817ce6f3f6dd6f    1


In [2]:
ntrain = train.shape[0]
train_test = pd.concat((train, test), axis=0).reset_index(drop=True)

# convert date to be more useable
train_test['Date'] = pd.to_datetime(train_test['created'])
# year is all the same
#train_test['Year'] = train_test['Date'].dt.year
train_test['Month'] = train_test['Date'].dt.month
train_test['Day'] = train_test['Date'].dt.day
train_test['Wday'] = train_test['Date'].dt.dayofweek
train_test['Yday'] = train_test['Date'].dt.dayofyear
train_test['hour'] = train_test['Date'].dt.hour
train_test = train_test.drop(['Date', 'created'], axis=1)


# check if this building id/manager id and add worths is 0
train_test['Zero_building_id'] = train_test['building_id'].apply(lambda x: 1 if x == '0' else 0)
train_test['Zero_manager_id'] = train_test['manager_id'].apply(lambda x: 1 if x == '0' else 0)


# mess with description meta data
train_test['desc'] = train_test['description']
train_test['desc'] = train_test['desc'].apply(lambda x: x.replace('<p><a  website_redacted ', ''))
train_test['desc'] = train_test['desc'].apply(lambda x: x.replace('!<br /><br />', ''))

string.punctuation.__add__('!!')
string.punctuation.__add__('(')
string.punctuation.__add__(')')
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

train_test['desc'] = train_test['desc'].apply(lambda x: x.translate(remove_punct_map))
train_test['desc_letters_count'] = train_test['description'].apply(lambda x: len(x.strip()))
train_test['desc_words_count'] = train_test['desc'].apply(lambda x: 0 if len(x.strip()) == 0 else len(x.split(' ')))
train_test['desc_words_length'] = (train_test['desc_letters_count'] / train_test['desc_words_count']).apply(lambda x: 0 if math.isnan(x) or math.isinf(x) else x)

train_test.drop(['description', 'desc'], axis=1, inplace=True)


# address
train_test['address1'] = train_test['display_address']
train_test['address1'] = train_test['address1'].apply(lambda x: x.lower())

address_map = {
    'w': 'west',
    'st.': 'street',
    'ave': 'avenue',
    'st': 'street',
    'e': 'east',
    'n': 'north',
    's': 'south'
}

def address_map_func(s):
    s = s.split(' ')
    out = []
    for x in s:
        if x in address_map:
            out.append(address_map[x])
        else:
            out.append(x)
    return ' '.join(out)


train_test['address1'] = train_test['address1'].apply(lambda x: x.translate(remove_punct_map))
train_test['address1'] = train_test['address1'].apply(lambda x: address_map_func(x))

new_cols = ['street', 'avenue', 'east', 'west', 'north', 'south']
for col in new_cols:
    train_test[col] = train_test['address1'].apply(lambda x: 1 if col in x else 0)

train_test['other_address'] = train_test[new_cols].apply(lambda x: 1 if x.sum() == 0 else 0, axis=1)
train_test.drop(['display_address', 'street_address'], axis=1, inplace=True)


# features (better not to lower)
train_test['features_count'] = train_test['features'].apply(lambda x: len(x))
train_test['features2'] = train_test['features']
train_test['features2'] = train_test['features2'].apply(lambda x: ' '.join([''.join(i.replace('_',' ').replace('-',' ').split(' ')) for i in x]))

c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1))
c_vect.fit(train_test['features2'])

c_vect_sparse_1 = c_vect.transform(train_test['features2'])
c_vect_sparse1_cols = c_vect.get_feature_names()
train_test.drop(['features', 'features2'], axis=1, inplace=True)


# photos
train_test['photos_count'] = train_test['photos'].apply(lambda x: len(x))
train_test.drop(['photos'], axis=1, inplace=True)

# clump all things with one entry per section
def find_objects_with_only_one_record(feature_name):
    temp = pd.concat([X_train[feature_name].reset_index(), 
                      X_test[feature_name].reset_index()])
    temp = temp.groupby(feature_name, as_index = False).count()
    return temp[temp['index'] == 1]

managers_with_one_lot = find_objects_with_only_one_record('manager_id')

# convert ['building_id', 'manager_id', 'address1'] to enumerated labels
categoricals = ['building_id', 'display_address', 'manager_id']
for feat in categoricals:
    # clump all things with one entry per section
    counts = train_test.groupby(feat, as_index = False).size()
    section_with_one_lot = counts[counts == 1]
    train_test.loc[train_test[feat].isin(section_with_one_lot.ravel()), feat] = "-1"
    
    # convert ['building_id', 'manager_id', 'address1'] to enumerated labels
    encoder = preprocessing.LabelEncoder()
    encoder.fit(list(train_test[feat].values))
    train_test[feat] = encoder.transform(train_test[feat].ravel())

# transform managers and building ids
#train_test['manager_id'] = train_test['manager_id'].apply(lambda x: str(x))
#train_test['manager_id'], labels = pd.factorize(train_test['manager_id'].values, sort=True)
#train_test['building_id'] = train_test['building_id'].apply(lambda x: str(x))
#train_test['building_id'], labels = pd.factorize(train_test['building_id'].values, sort=True)


# transform price
bc_price, tmp = boxcox(train_test.price)
train_test['bc_price'] = bc_price


# add price per bed/bath
train_test['price_per_bedroom'] = train_test.apply(lambda x: x.price / max(1, x.bedrooms), axis=1)
train_test['price_per_bathroom'] = train_test.apply(lambda x: x.price / max(1, x.bathrooms), axis=1)
train_test['price_per_bed_and_bath'] = train_test.apply(lambda x: x.price / (max(1, x.bedrooms) + max(1, x.bathrooms)*0.1), axis=1)


train_test.loc[train_test['manager_id'].isin(managers_with_one_lot['manager_id'].ravel()), 
      'manager_id'] = "-1"
train_test.loc[train_test['building_id'].isin(buildings_with_one_lot['building_id'].ravel()), 
      'building_id'] = "-1"
train_test.loc[train_test['display_address'].isin(addresses_with_one_lot['display_address'].ravel()), 
      'display_address'] = "-1"


# add sparse
train_test_cv1_sparse = sparse.hstack((train_test, c_vect_sparse_1)).tocsr()
x_train = train_test_cv1_sparse[:ntrain, :]
x_test = train_test_cv1_sparse[ntrain:, :]

# add feature names
features = list(train_test.columns) + ['sparse_' + vect_name for vect_name in c_vect_sparse1_cols]
dtrain_data = pd.DataFrame(np.array(x_train.todense()), columns=features)
dtest_data = pd.DataFrame(np.array(x_test.todense()), columns=features)

In [3]:
dtrain_data.to_json('data/train.json', orient='records')
dtest_data.to_json('data/test.json', orient='records')