# Classification model: Two Sigma

In [1]:
import os
import re
import datetime
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import log_loss, accuracy_score

In [2]:
data_src = "./"

In [3]:
train_df = pd.read_json(os.path.join(data_src, "train.json"))
test_df = pd.read_json(os.path.join(data_src, "test.json"))

In [4]:
def clean_dataset(df):
    print "started.."
    # convert to datetime. derive new things.. for convenience
    df.created = pd.to_datetime(df.created)
    df["created_month"] = df.created.dt.month
    df["created_year"] = df.created.dt.year
    df["created_quarter"] = df.created.dt.quarter
    
    # days count
    today = datetime.datetime.now()
    df["days_count"] = df.created.apply(lambda d: (today - d).days)
    df.days_count
    
    # photos count
    df["photos_count"] = df.photos.apply(lambda x: len(x))
    
    # clean listing-features
    print "cleaning features.."
    ignore_words = stopwords.words("english")
    escape_chars = r"[-!:*;/.&<>^()#@$_+=,]"
    def clean_features(v):
        new_v = []
        for each in v:
            cleaned_each = re.sub(escape_chars, " ", each.lower()).split()
            cleaned_each = " ".join([i for i in cleaned_each if i not in ignore_words])
            new_v.append(cleaned_each)
        return new_v
    df["features_cleaned"] = df.features.apply(clean_features)
    
    # clean description
    print "cleaning description.."
    escape_chars = r"[-!:*;/.&<>^()#@$_+=,]"
    def clean_description(v):
        soup = BeautifulSoup(v, "html.parser")
        words = re.sub(escape_chars, " ", soup.get_text().lower()).split()
        new_v = " ".join([i for i in words if i not in ignore_words])
        return new_v
    df["description_cleaned"] = df.description.apply(clean_description)
    
    # price. Ignore prices that are too large; find price per room.
    df = df.drop(df[df.price > 10**5].index)
    df["price_per_room"] = df.price / (df.bedrooms + df.bathrooms + 1)
    
    # group by lat-lng..
    grouped = train_df.groupby(["lat_lng_group"]).mean()
    def price_group_by(row):
        grp = row.lat_lng_group
        return grouped.loc[grp].price
    df["price_per_lat_lng"] = df.apply(price_group_by, axis=1)

In [5]:
def set_tailored_features(df):
    keywords = [
        ("dog", "allowed"), ("cat", "allowed"), ("fitness",), ("laundry",), ("dishwasher",),
        ("hardwood",), ("elevator",), ("doorman",), ("outdoor", "space"), ("new", "construction"),
        ("internet",), ("high", "ceiling"), ("swimming", "pool"), ("terrace",), ("balcony",),
        ("pet", "allowed"), ("lowrise",),
    ]
    
    def activate(row, kw):
        exists = 0
        for each in row.features_cleaned:
            if min([each.find(w) for w in kw]) >= 0:
                exists = 1
                break
        return exists        
                    
    for kw in keywords:
        col = "_".join(kw)
        df[col] = 0
        df[col] = df.apply(activate, args=(kw,), axis=1)
    
    return df

In [6]:
def integerize_dataset(train, test, ignore_cols=None):
    columns = ("manager_id", "interest_level", "display_address")
    for col in columns:
        if ignore_cols and col not in ignore_cols:
            continue
        labels, uniques = pd.factorize(train[col])
        train["{0}_label".format(col)] = labels
        if col != "interest_level":
            test["{0}_label".format(col)] = uniques.get_indexer(test[col])
        setattr(integerize_dataset, "{0}_uniques".format(col), uniques)

In [7]:
def group_lat_lng(df, cnt=8, max_iter=1000):
    lat_lng = df[["latitude", "longitude"]].values
    kmeans = KMeans(n_clusters=cnt, random_state=0, max_iter=max_iter).fit(lat_lng)
    df["lat_lng_group"] = kmeans.labels_
    return kmeans

group_lat_lng(train_df, 10)
group_lat_lng(test_df, 10)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=1000,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)

In [9]:
clean_dataset(train_df)

started..
cleaning features..
cleaning description..


In [8]:
clean_dataset(test_df)

started..
cleaning features..
cleaning description..


  'Beautiful Soup.' % markup)


In [10]:
set_tailored_features(train_df)
set_tailored_features(test_df)
print None

None


In [11]:
integerize_dataset(train_df, test_df)

In [14]:
c = ["bathrooms",
"bedrooms",
"building_id",
"latitude",
"listing_id",
"longitude",
"manager_id",
"price",
"created_month",
"created_year",
"created_quarter",
"days_count",
"photos_count",
"lat_lng_group",
"dog_allowed",
"cat_allowed",
"fitness",
"laundry",
"dishwasher",
"hardwood",
"elevator",
"doorman",
"outdoor_space",
"new_construction",
"internet",
"high_ceiling",
"swimming_pool",
"terrace",
"balcony",
"pet_allowed",
"lowrise",
"manager_id_label",
"display_address_label",]

In [18]:
test_df[c].to_csv("./test-cleaned.csv")

In [17]:
train_df[c].to_csv("./train-cleaned.csv")

In [76]:
selected_features = [
    "bathrooms", "bedrooms", "created_month", "photos_count", "price",
    "display_address_label", "manager_id_label", "lat_lng_group",
    
    "price_per_room", "price_per_lat_lng", "latitude", "longitude", "days_count",
    
    # tailored
#     "dog_allowed", "cat_allowed", "fitness", "laundry", "dishwasher", "hardwood", "elevator",
#     "doorman", "outdoor_space", "new_construction", "internet", "high_ceiling", "swimming_pool",
#     "terrace", "balcony", "pet_allowed",
    
    "dog_allowed", "cat_allowed", "laundry", "dishwasher", "elevator", "doorman",
]
X = train_df[selected_features].values
Y = train_df["interest_level_label"].values
X.shape, len(selected_features)

((49346, 19), 19)

In [None]:
import xgboost as xgb

def run_xgb(x_train, y_train, x_test, y_test=None):
    # training params
    xg_train = xgb.DMatrix(x_train, label=y_train)
    params = dict(
        objective="multi:softprob",
        eval_metric="mlogloss",
        sub_sample=0.6,
        num_class=3,
        max_depth=8
    )
    
    # train the model
    bt = xgb.train(params, xg_train)  
    
    # test (predict) values
    xg_test = xgb.DMatrix(x_test, label=y_test)
    predicted = bt.predict(xg_test)
    
    # check log-loss if y-test is known
    if y_test is not None:
        print "Logloss: ", log_loss(y_test, predicted)
    
    # return predicted values
    return predicted

# bt = run_xgb(x_train, y_train, x_test, y_test)

In [89]:
from sklearn.ensemble import RandomForestClassifier

def run_random_forest(x_train, y_train, x_test, y_test=None):
    clf = RandomForestClassifier(n_estimators=300, random_state=0)
    clf.fit(x_train, y_train)
    predicted = clf.predict_proba(x_test)
    if y_test is not None:
        print "Logloss: ", log_loss(y_test, predicted)
    return predicted
    

In [98]:
from sklearn.ensemble import AdaBoostClassifier

def run_adaboost(x_train, y_train, x_test, y_test=None):
    clf = AdaBoostClassifier(n_estimators=60, random_state=0)
    clf.fit(x_train, y_train)
    predicted = clf.predict_proba(x_test)
    if y_test is not None:
        print "Logloss: ", log_loss(y_test, predicted)
    return predicted

In [106]:
from sklearn.neural_network import MLPClassifier

def run_mlp(x_train, y_train, x_test, y_test=None):
    layers = [100, 100]
    clf = MLPClassifier(hidden_layer_sizes=layers)
    clf.fit(x_train, y_train)
    predicted = clf.predict_proba(x_test)
    if y_test is not None:
        print "Logloss: ", log_loss(y_test, predicted)
    return predicted

In [108]:
def run_model(model_funcs):
    for func in model_funcs:
        print func.func_name
        kf = KFold(n_splits=5, shuffle=True, random_state=10)
        for train_index, test_index in kf.split(X):
            x_tr, y_tr = X[train_index, :], Y[train_index]
            x_te, y_te = X[test_index, :], Y[test_index]
            func(x_tr, y_tr, x_te, y_te)
            break

run_model([run_random_forest])

run_random_forest
Logloss:  0.636797761202
