In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# print(check_output(["ls", "data"]).decode("utf8"))

# Any results you write to the current directory are saved as output.


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
%matplotlib inline
import statistics
# from fuzzywuzzy import fuzz
import nltk.tokenize as nt
import nltk
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from joblib import dump, load
import json
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [None]:
def remove_outliers(df):
    """Drop listings with an out-of-range price or distance from the city centre.

    Keeps only the rows where ``1000 <= price <= 15000`` and
    ``dist_from_ctr <= 20``. ``dist_from_ctr`` is the column written by
    ``distance_from_centre`` (in miles), so this must run after feature
    engineering.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``price`` and ``dist_from_ctr`` columns.

    Returns
    -------
    pandas.DataFrame
        The filtered rows; the original index labels are kept.
    """
    # Accepted price range (inclusive on both ends)
    MIN_PRICE = 1000
    MAX_PRICE = 15000
    # Approximate radius from the city centre
    NYC_RADIUS = 20

    price_in_range = df.price.between(MIN_PRICE, MAX_PRICE)
    within_radius = df.dist_from_ctr <= NYC_RADIUS
    return df[price_in_range & within_radius]

In [None]:
manager_scores = {}
def create_manager_scores(df):
    """Rebuild the global ``manager_scores`` lookup from a labelled frame.

    Every row contributes one score to its manager's list: 3 for ``'high'``,
    2 for ``'medium'``, 1 for ``'low'`` and 0 for any other interest value.
    The previous contents of ``manager_scores`` are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``manager_id`` and ``interest_level`` columns.

    Returns
    -------
    None
        The result is stored in the module-level ``manager_scores`` dict
        (manager id -> list of per-listing scores).
    """
    global manager_scores
    interest_points = {'high': 3, 'medium': 2, 'low': 1}

    # Iterate the two columns directly instead of DataFrame.apply(axis=1):
    # apply is meant for transformations, not for side effects, and building a
    # Series per row is needlessly slow.
    scores = {}
    for manager_id, interest in zip(df['manager_id'], df['interest_level']):
        scores.setdefault(manager_id, []).append(interest_points.get(interest, 0))
    manager_scores = scores

def apply_manager_scores(row):
    """Attach ``manager_score`` to a row: the mean of the scores recorded for
    the row's ``manager_id`` in ``manager_scores``, or 0 for an unknown manager.

    The row is modified in place and returned, so it can be used with
    ``DataFrame.apply(..., axis=1)``.
    """
    recorded = manager_scores.get(row['manager_id'])
    row['manager_score'] = 0 if recorded is None else sum(recorded) / len(recorded)
    return row

In [None]:
building_scores = {}
def create_building_scores(df):
    """Rebuild the global ``building_scores`` lookup from a labelled frame.

    Every row contributes one score to its building's list: 3 for ``'high'``,
    2 for ``'medium'``, 1 for ``'low'`` and 0 for any other interest value.
    The previous contents of ``building_scores`` are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``building_id`` and ``interest_level`` columns.

    Returns
    -------
    None
        The result is stored in the module-level ``building_scores`` dict
        (building id -> list of per-listing scores).
    """
    global building_scores
    interest_points = {'high': 3, 'medium': 2, 'low': 1}

    # Iterate the two columns directly instead of DataFrame.apply(axis=1):
    # apply is meant for transformations, not for side effects, and building a
    # Series per row is needlessly slow.
    scores = {}
    for building_id, interest in zip(df['building_id'], df['interest_level']):
        scores.setdefault(building_id, []).append(interest_points.get(interest, 0))

    # NOTE: building id '0' seems to be a missing-data placeholder, so it is
    # forced to a single 0 score regardless of the listings recorded under it.
    scores['0'] = [0]
    building_scores = scores
    
def apply_building_scores(row):
    """Attach ``building_score`` to a row: the mean of the scores recorded for
    the row's ``building_id`` in ``building_scores``, or 0 for an unknown
    building.

    The row is modified in place and returned, so it can be used with
    ``DataFrame.apply(..., axis=1)``.
    """
    recorded = building_scores.get(row['building_id'])
    row['building_score'] = 0 if recorded is None else sum(recorded) / len(recorded)
    return row

In [None]:
def price_per_bedroom(row):
    """Set ``row['price_per_bedroom']`` to price divided by the bedroom count.

    Rows with zero bedrooms get 0 instead of a division by zero. The row is
    modified in place and returned.
    """
    n_bedrooms = row['bedrooms']
    row['price_per_bedroom'] = 0 if n_bedrooms == 0 else row['price'] * 1.00 / n_bedrooms
    return row

In [None]:
def price_per_bathroom(row):
    """Set ``row['price_per_bathroom']`` to price divided by the bathroom count.

    Rows with zero bathrooms get 0 instead of a division by zero. The row is
    modified in place and returned.
    """
    n_bathrooms = row['bathrooms']
    row['price_per_bathroom'] = 0 if n_bathrooms == 0 else row['price'] * 1.00 / n_bathrooms
    return row

In [None]:
def price_per_total_room(row):
    """Set ``row['price_per_total_rooms']`` to price divided by ``total_rooms``.

    Rows with zero total rooms get 0 instead of a division by zero. The row is
    modified in place and returned.
    """
    n_rooms = row['total_rooms']
    row['price_per_total_rooms'] = 0 if n_rooms == 0 else row['price'] * 1.00 / n_rooms
    return row

In [None]:
def bath_bed_ratio(row):
    """Set ``row['bath_bed_ratio']`` to bathrooms divided by bedrooms.

    When there are no bedrooms the bathroom count itself is used. The row is
    modified in place and returned.
    """
    n_bed, n_bath = row['bedrooms'], row['bathrooms']
    row['bath_bed_ratio'] = n_bath / n_bed if n_bed != 0 else n_bath
    return row

In [None]:
def add_missing_values(test_df):
    """Add every feature column that ``test_df`` lacks, filled with 0.

    The expected columns come from ``get_feature_list()``. Columns are added
    in feature-list order so the resulting layout is the same on every run
    (iterating a set difference is not). Existing columns are left untouched.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame to complete; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``test_df`` object, for call chaining.
    """
    present = set(test_df.columns)
    for feature in get_feature_list():
        if feature not in present:
            # Default value for a feature the test set never produced
            test_df[feature] = 0
            present.add(feature)
    return test_df

In [None]:
# Key : feature label as it appears in the data | Value : indicator column created for it.
# Every column name is 'has_' + the lower-cased label with spaces replaced by
# underscores, so the two spellings of "Laundry in Building" share one column.
home_features_dict = {
    label: 'has_' + label.lower().replace(' ', '_')
    for label in (
        'Elevator',
        'Cats Allowed',
        'Hardwood Floors',
        'Dogs Allowed',
        'Doorman',
        'Dishwasher',
        'No Fee',
        'Laundry in Building',
        'Fitness Center',
        'Pre-War',
        'Laundry in Unit',
        'Roof Deck',
        'Outdoor Space',
        'Dining Room',
        'High Speed Internet',
        'Balcony',
        'Swimming Pool',
        'Laundry In Building',
        'New Construction',
        'Terrace',
        'Exclusive',
        'Loft',
        'Garden/Patio',
        'Wheelchair Access',
        'Common Outdoor Space',
    )
}

def process_home_features(df):
    """Add one 0/1 indicator column per entry of ``home_features_dict``.

    A column is 1 for a listing when its ``features`` list contains at least
    one of the labels mapped to that column, otherwise 0. Labels that share a
    target column (the two spellings of "Laundry in Building") are combined.

    The flags are computed column by column from the ``features`` column
    instead of rebuilding the whole frame with a per-row ``apply``, which is
    much slower and re-infers the dtype of every other column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``features`` column holding a list of labels per row.
        Cells that are not a list/tuple/set (e.g. missing values) are treated
        as having no features. The indicator columns are added in place.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the indicator columns added.
    """
    # Group the raw labels by target column, keeping first-seen column order
    labels_by_column = {}
    for label, column in home_features_dict.items():
        labels_by_column.setdefault(column, set()).add(label)

    listing_features = [
        feats if isinstance(feats, (list, tuple, set, frozenset)) else ()
        for feats in df['features']
    ]

    for column, labels in labels_by_column.items():
        df[column] = [int(any(feat in labels for feat in feats)) for feats in listing_features]
    return df

In [None]:
def create_has_photos_has_description(row):
    """Add 0/1 flags ``has_photos`` and ``has_description`` to a row.

    Each flag is 1 when the corresponding count (``num_photos``,
    ``num_description_words``) is greater than zero. The row is modified in
    place and returned.
    """
    for flag, count_column in (('has_photos', 'num_photos'),
                               ('has_description', 'num_description_words')):
        row[flag] = int(row[count_column] > 0)
    return row

In [None]:
import geopy.distance
def distance_from_centre(row):
    """Set ``row['dist_from_ctr']`` to the geodesic distance, in miles, between
    the listing's (latitude, longitude) and the fixed reference point
    (40.718, -74.008).

    The row is modified in place and returned.
    """
    reference_point = (40.718, -74.008)
    listing_point = (row['latitude'], row['longitude'])
    row['dist_from_ctr'] = geopy.distance.geodesic(reference_point, listing_point).miles
    return row

In [None]:
def street_and_display_address_similarity(row):
    """Set ``row['address_similarity']`` to 1 when the street and display
    addresses are more than 50% similar (case-insensitive), else 0.

    The fuzzywuzzy import is commented out in this notebook, so the original
    ``fuzz.ratio`` call raised NameError. The similarity is computed with the
    standard library's ``difflib.SequenceMatcher`` instead, rounded to a whole
    percentage first (presumably matching the integer score ``fuzz.ratio``
    returns — confirm if exact parity matters).

    The row is modified in place and returned.
    """
    from difflib import SequenceMatcher

    street_ad = row['street_address'].lower()
    display_ad = row['display_address'].lower()
    similarity_pct = int(round(100 * SequenceMatcher(None, street_ad, display_ad).ratio()))
    row['address_similarity'] = 0 if similarity_pct / 100 <= 0.5 else 1
    return row

In [None]:
def get_num_adjectives(text):
    """Count the adjectives in ``text``.

    The text is split into sentences and words with ``nltk.tokenize``, tagged
    with ``nltk.pos_tag``, and every token tagged ``JJ``, ``JJR`` or ``JJS``
    is counted. Empty or missing text yields 0.
    """
    if not text:
        return 0

    adjective_tags = ('JJ', 'JJR', 'JJS')
    token_lists = list(map(nt.word_tokenize, nt.sent_tokenize(text)))
    tagged_sentences = list(map(nltk.pos_tag, token_lists))
    return sum(
        1
        for tagged in tagged_sentences
        for pair in tagged
        if pair[1] in adjective_tags
    )

In [None]:
def add_adjectives_column(row):
    """Set ``row['num_adjectives_description']`` to the adjective count of the
    row's ``description`` (see ``get_num_adjectives``) and return the row."""
    row['num_adjectives_description'] = get_num_adjectives(row["description"])
    return row

In [None]:
def bucket_hour(hour):
    """Map an hour of the day to a coarse part-of-day label.

    5-10 -> 'morning', 11-16 -> 'noon', 17-22 -> 'evening', anything else
    (23 and 0-4 for valid hours) -> 'night'. Upper bounds are inclusive.
    """
    if 4 < hour <= 10:
        return 'morning'
    if 10 < hour <= 16:
        return 'noon'
    if 16 < hour <= 22:
        return 'evening'
    return 'night'

### One hot encoder

In [None]:
cat_dummies = set()
def one_hot_encode(df, cols, is_test_set):
    """One-hot encode ``cols`` of ``df`` with ``pd.get_dummies``.

    Dummy columns are named ``<column>__<value>``. For the training set
    (``is_test_set`` false) the names of the dummy columns produced from
    ``cols`` are stored in the global ``cat_dummies`` so they can be added to
    the feature list; for the test set ``cat_dummies`` is left unchanged.

    Returns the encoded frame.
    """
    global cat_dummies
    encoded = pd.get_dummies(df, columns=cols, prefix_sep="__")
    if is_test_set:
        return encoded

    # Remember which dummy columns the training data produced
    cat_dummies = {
        name
        for name in encoded
        if "__" in name and name.split("__")[0] in cols
    }
    return encoded

## Feature Engineering

In [None]:
def feature_engineering(df, is_test_set=False):
    """Derive the model features from a raw listings frame.

    Adds count, date, price-ratio, address, amenity, distance and manager-score
    columns, then one-hot encodes ``created_hour`` and ``created_month``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings. The code reads the columns ``photos``, ``features``,
        ``description``, ``created``, ``display_address``, ``street_address``,
        ``bedrooms`` and ``bathrooms`` directly; the helper functions read
        further columns (e.g. ``manager_id``, ``price``, ``latitude``,
        ``longitude``). The first column assignments modify the caller's frame
        in place before ``df`` is rebound by the row-wise ``apply`` calls.
    is_test_set : bool, default False
        Passed to ``one_hot_encode``; when true, feature columns missing from
        the frame are also added with value 0 via ``add_missing_values``.

    Returns
    -------
    pandas.DataFrame
        The frame with the engineered columns.

    Notes
    -----
    ``apply_manager_scores`` reads the global ``manager_scores``, so
    ``create_manager_scores`` must have been run first; otherwise every
    ``manager_score`` is 0.
    """
    df["num_photos"] = df["photos"].apply(len)
    df["num_features"] = df["features"].apply(len)
    # Counts space-separated pieces, so an empty description still yields 1
    df["num_description_words"] = df["description"].apply(lambda x: len(x.split(" ")))

    # Create date month year
    df["created"] = pd.to_datetime(df["created"])
    df["created_month"] = df["created"].dt.month
    df["created_day"] = df["created"].dt.day
    df["created_hour"] = df["created"].dt.hour
    df["created_day_of_week"] = df["created"].dt.dayofweek
    # Same value as created_day above; only created_day_of_month is in the feature list
    df["created_day_of_month"] = df["created"].dt.day
    # NOTE(review): `dayofweek // 5 == 1` is true for day-of-week values 5 and 6, which under
    # pandas' Monday=0 convention is Saturday/Sunday — i.e. this looks like a weekend flag
    # despite the column name. Confirm intent before renaming (the name is used in get_feature_list).
    df['is_weekday'] = ((df.created_day_of_week) // 5 == 1).astype(float)
    # Row-wise helpers: each returns the row with one extra field set
    df = df.apply(apply_manager_scores, axis=1)
    df = df.apply(price_per_bedroom, axis=1)
    df = process_home_features(df)
    df = df.apply(distance_from_centre, axis=1)
    
    # if length is more than 2 - has address
    df['has_display_address'] = df['display_address'].apply(lambda x : 1 if len(x) > 2 else 0)
    df['has_street_address'] = df['street_address'].apply(lambda x : 1 if len(x) > 2 else 0)
    # total_rooms must exist before price_per_total_room reads it
    df['total_rooms'] = df['bedrooms'] + df['bathrooms']
    df = df.apply(price_per_total_room, axis=1)
#     df = df.apply(apply_building_scores, axis=1)
    
    # One hot encodings
    df = one_hot_encode(df, ['created_hour', 'created_month'], is_test_set)
    
    if is_test_set:
        # Return value is ignored on purpose-or-accident: add_missing_values adds the
        # columns to `df` in place, so the frame returned below still contains them.
        add_missing_values(df)
    
    # Didn't work
    # df = df.apply(create_has_photos_has_description, axis=1)
    # df = df.apply(price_per_bathroom, axis=1) 
    # df = df.apply(bath_bed_ratio, axis=1)
    # df["bed_bath_differnce"] = df['bedrooms'] - df['bathrooms']
    # df["bed_bath_sum"] = df["bedrooms"] + df['bathrooms']
    return df

In [None]:
def get_feature_list():
    """Return the ordered list of column names fed to the models.

    The list is made of three parts, in this order:

    1. the fixed numeric features below;
    2. the one-hot columns recorded in the global ``cat_dummies`` by
       ``one_hot_encode`` on the training set;
    3. the amenity indicator columns from ``home_features_dict``.

    Parts 2 and 3 are sorted. They come from sets, whose iteration order for
    strings can differ from one Python process to the next; without sorting,
    a scaler or model saved in one session would see its columns in a
    different order when reloaded in another.

    Returns
    -------
    list of str
        A new list on every call.
    """
    num_feats = ["bathrooms",
                 "bedrooms",
                 "latitude",
                 "longitude",
                 "price",
                 "num_photos",
                 "num_features",
                 "num_description_words",
                 "created_day_of_month",
                 "is_weekday",
                 "manager_score",
                 "dist_from_ctr",
                 "has_display_address",
                 "has_street_address",
                 "total_rooms",
                 "price_per_total_rooms"
                 # "num_adjectives_description"
                 ]

    # Add one hot encoded variables
    num_feats.extend(sorted(cat_dummies))

    # add names of house features (deduplicated: two labels share one column)
    num_feats.extend(sorted(set(home_features_dict.values())))
    return num_feats

### Read Data

In [None]:
# Load the labelled training listings. The file is opened in a `with` block so
# the handle is closed as soon as pandas has parsed it (the bare open() call
# left it open for the lifetime of the kernel).
with open("data/train.json", "r") as train_file:
    df = pd.read_json(train_file)
print(df.shape)
df.describe()

In [None]:
# Build the global manager_scores lookup from the labelled training rows; it is
# read by apply_manager_scores during feature engineering, so this must run first.
# NOTE(review): the scores are derived from interest_level over the whole training
# frame, before the train/validation split, so validation rows contribute to their
# own manager_score feature — confirm this leakage is acceptable.
create_manager_scores(df)
# create_building_scores(df)

In [None]:
import datetime

# Wall-clock timestamp printed before the feature-engineering step
currentDT = datetime.datetime.now()
print(currentDT)

In [None]:
# Derive the model features for the training frame. is_test_set defaults to False,
# so one_hot_encode records the training dummy-column names in cat_dummies.
df = feature_engineering(df)

### Remove Outliers

In [None]:
# Filter rows on price and dist_from_ctr; must run after feature_engineering,
# which is what creates the dist_from_ctr column.
df = remove_outliers(df)

In [None]:
# Wall-clock timestamp after feature engineering and outlier removal;
# compare with the timestamp printed before them to see how long they took.
currentDT = datetime.datetime.now()
print(currentDT)

In [None]:
# Unfitted scaler; it is fitted in the training cell and reused to transform the test features.
scaler = StandardScaler()
# Rows/columns left after outlier removal
df.shape

### Train Model

In [None]:
# NOTE: get_optimised_clf_grid_search is defined in the "GridSearchCV" section
# further down; that cell has to be executed before this one.

# Fixed seed so the shuffle and the train/validation split are reproducible
SEED = 42

# shuffle dataframe
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

X = df[get_feature_list()]
y = df["interest_level"]

# Split first, then fit the scaler on the training part only, so the validation
# rows do not influence the scaling statistics they are later evaluated with.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=SEED)
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# The returned estimator is GridSearchCV's best_estimator_, which has already
# been refit on X_train, so no additional fit is needed here.
clf = get_optimised_clf_grid_search(X_train, y_train)
y_val_pred = clf.predict_proba(X_val)

# calculate validation loss; labels are passed explicitly so the probability
# columns are matched to clf.classes_ even if a class is absent from y_val
loss = log_loss(y_val, y_val_pred, labels=clf.classes_)
print(f'Loss : {loss}')

## Random SearchCV
The idea is to narrow down the hyperparameter search by first evaluating a wide range of values for each hyperparameter at random.
(Later we can use GridSearchCV to pick the best combination within the narrowed ranges, as we will have a better idea of the ballpark values.)


*NOTE* : Result from RandomSearch CV - {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False}

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' used to be an alias of 'sqrt' for classifiers (so it only duplicated
# that candidate) and is rejected by recent scikit-learn releases; 'log2' gives
# the search a genuinely different second option.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [5] + [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)


# Use the random grid to search for best hyperparameters
# First create the base model to tune (seeded so repeated runs are comparable)
rf = RandomForestClassifier(random_state=42)
# Random search of parameters, using 5 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Best hyper-parameter combination found by the randomized search above
print(rf_random.best_params_)


## GridSearchCV

Result Params after running grid_search - {'bootstrap': False, 'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}

In [None]:
def get_optimised_clf_grid_search(X_train, y_train):
    """Grid-search a random forest and return the best fitted estimator.

    Runs a 5-fold ``GridSearchCV`` over a grid centred on the values found by
    the random search, prints the winning parameter combination and returns
    ``best_estimator_``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed straight to ``GridSearchCV.fit``.

    Returns
    -------
    The search's ``best_estimator_``.
    """
    # NOTE : the parameter grid is based on the results of random search
    search_space = dict(
        bootstrap=[False],
        max_depth=[40, 50, 60, 70, 80],
        max_features=['sqrt'],
        min_samples_leaf=[1, 2, 3, 4],
        min_samples_split=[8, 10, 12],
        n_estimators=[100, 200, 300, 600],
    )
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid=search_space,
        cv=5,
        n_jobs=-1,
        verbose=2,
    )
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
    return grid_search.best_estimator_

## Save model

In [None]:
# Persist the artefacts needed to score new data later: the classifier,
# the scaler, and the manager score lookup read by apply_manager_scores.
# NOTE(review): cat_dummies (the one-hot column names used by get_feature_list)
# is not saved here, so a fresh session cannot rebuild the same feature list
# from these files alone — confirm whether it should be persisted too.
dump(clf, 'model.joblib')
dump(scaler, "scaler.save")
# NOTE(review): assumes the manager ids used as keys are JSON-serialisable (presumably strings) — confirm.
with open('manager_scores.json', 'w') as fp:
    json.dump(manager_scores, fp)

In [None]:
# Bar chart of the forest's feature importances, smallest to largest
(
    pd.Series(clf.feature_importances_, index=get_feature_list())
    .sort_values()
    .plot(kind='bar')
)

## Making predictions on test data

### Read and feature engineer

In [None]:
# Load the unlabelled test listings; the `with` block closes the file handle
# once pandas has parsed it (the bare open() call never closed it).
with open("data/test.json", "r") as test_file:
    test_df = pd.read_json(test_file)
# is_test_set=True: reuse the training dummy columns and zero-fill missing features
test_df = feature_engineering(test_df, is_test_set=True)

In [None]:
# Summary statistics of the engineered test features, as a sanity check before
# predicting. (The leftover %%timeit cell magic was removed: it re-ran the cell
# many times and suppressed the table this cell is meant to show.)
test_df[get_feature_list()].describe()

### Predict

In [None]:
X_test = test_df[get_feature_list()]
X_test = scaler.transform(X_test)
# Stored under its own name: assigning to `y` here would overwrite the training
# labels with test-set probabilities and silently break any later cell that
# still expects `y` to be the labels.
test_proba = clf.predict_proba(X_test)

# predict_proba columns follow clf.classes_, so look each label's column up by name
labels2idx = {label: i for i, label in enumerate(clf.classes_)}
sub = pd.DataFrame()
sub["listing_id"] = test_df["listing_id"]
for label in ["high", "medium", "low"]:
    sub[label] = test_proba[:, labels2idx[label]]

In [None]:
# Write the random-forest submission (listing_id, high, medium, low) without the index column
sub.to_csv("submission_rf.csv", index=False)

In [None]:
# Bar chart of the forest's feature importances, smallest to largest
(
    pd.Series(clf.feature_importances_, index=get_feature_list())
    .sort_values()
    .plot(kind='bar')
)

## XGB

In [None]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    """Train a 3-class XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and integer class labels.
    test_X
        Features to predict on.
    test_y : optional
        When given, ``test_X``/``test_y`` are also used as an evaluation set
        and training stops early after 20 rounds without improvement.
    feature_names : optional
        Accepted for API compatibility; not used.
    seed_val : int
        Value of the booster's ``seed`` parameter.
    num_rounds : int
        Maximum number of boosting rounds.

    Returns
    -------
    tuple
        ``(pred_test_y, model)`` — the model's predictions for ``test_X`` and
        the trained booster.
    """
    # NOTE(review): 'silent' is presumably only honoured by older xgboost
    # releases (newer ones use 'verbosity') — confirm against the installed version.
    booster_params = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    param_pairs = list(booster_params.items())
    dtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is None:
        # No labels for the test data: train for the full number of rounds
        dtest = xgb.DMatrix(test_X)
        model = xgb.train(param_pairs, dtrain, num_rounds)
    else:
        dtest = xgb.DMatrix(test_X, label=test_y)
        model = xgb.train(param_pairs, dtrain, num_rounds,
                          [(dtrain, 'train'), (dtest, 'test')],
                          early_stopping_rounds=20)

    return model.predict(dtest), model

In [None]:
# Training features for XGBoost. They must come from the training frame `df`:
# the labels paired with them in the next cell are built from
# df['interest_level'], so taking the rows from test_df (as before) fed the
# model test-set features with unrelated training labels.
X = df[get_feature_list()]
X = scaler.transform(X)

In [None]:
# Integer class ids for the labels; the output columns below are named in the
# same id order (presumably the order of the softprob columns — confirm).
target_num_map = {'high':0, 'medium':1, 'low':2}
y = np.array(df['interest_level'].apply(target_num_map.__getitem__))

preds, model = runXGB(X, y, X_test, num_rounds=400)
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("xgb_results.csv", index=False)