In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# print(check_output(["ls", "data"]).decode("utf8"))

# Any results you write to the current directory are saved as output.


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
%matplotlib inline

In [None]:
def remove_outliers(df, min_price=1000, max_price=15000, max_dist_from_ctr=20):
    """Keep only listings whose price and distance from the centre are in range.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``price`` and ``dist_from_ctr`` columns.
    min_price, max_price : number
        Inclusive price bounds; rows outside them are dropped.
    max_dist_from_ctr : number
        Inclusive upper bound on ``dist_from_ctr`` (same unit as that column;
        the default is the approximate city radius used by this notebook).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that satisfy both conditions. ``df`` itself is not
        modified.
    """
    price_in_range = df['price'].between(min_price, max_price)
    near_centre = df['dist_from_ctr'] <= max_dist_from_ctr
    return df[price_in_range & near_centre]

In [None]:
manager_scores = {}
def create_manager_scores(df):
    """Rebuild the global ``manager_scores`` lookup from labelled listings.

    Every listing contributes 3 points for ``'high'`` interest, 2 for
    ``'medium'``, 1 for ``'low'`` and 0 for any other value; points are summed
    per ``manager_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``manager_id`` and ``interest_level`` columns.

    Returns
    -------
    None
        The result is stored in the module-level ``manager_scores`` dict,
        replacing whatever it held before.
    """
    global manager_scores

    interest_points = {'high': 3, 'medium': 2, 'low': 1}

    # Vectorised on purpose: accumulating into a dict from inside
    # DataFrame.apply relies on side effects, which pandas does not support,
    # and is far slower than a groupby.
    points = df['interest_level'].map(interest_points).fillna(0).astype(int)
    manager_scores = points.groupby(df['manager_id'], sort=False).sum().to_dict()
    
def apply_manager_scores(row):
    """Attach the manager's score to a listing row.

    Looks up ``row['manager_id']`` in the global ``manager_scores`` dict and
    stores the value under ``row['manager_score']``; managers that are not in
    the lookup get 0. The same (mutated) row is returned so the function can be
    used with ``DataFrame.apply(axis=1)``.
    """
    row['manager_score'] = manager_scores.get(row['manager_id'], 0)
    return row

In [None]:
def price_per_bedroom(row):
    """Store ``price / bedrooms`` under ``row['price_per_bedroom']``.

    When ``bedrooms`` is 0 the value is set to 0 instead of dividing.
    Returns the same row object it was given.
    """
    n_bedrooms = row['bedrooms']
    # "* 1.00" forces a float result regardless of the price's integer type.
    row['price_per_bedroom'] = row['price'] * 1.00 / n_bedrooms if n_bedrooms != 0 else 0
    return row

In [None]:
def price_per_bathroom(row):
    """Store ``price / bathrooms`` under ``row['price_per_bathroom']``.

    When ``bathrooms`` is 0 the value is set to 0 instead of dividing.
    Returns the same row object it was given.
    """
    n_bathrooms = row['bathrooms']
    # "* 1.00" forces a float result regardless of the price's integer type.
    row['price_per_bathroom'] = row['price'] * 1.00 / n_bathrooms if n_bathrooms != 0 else 0
    return row

In [None]:
def bath_bed_ratio(row):
    """Store the bathrooms-to-bedrooms ratio under ``row['bath_bed_ratio']``.

    When ``bedrooms`` is 0 the bathroom count itself is used as the ratio.
    Returns the same row object it was given.
    """
    n_bed, n_bath = row['bedrooms'], row['bathrooms']
    row['bath_bed_ratio'] = n_bath if n_bed == 0 else n_bath / n_bed
    return row

In [None]:
# Key : Feature in data | Value : column name to be created for category variable

home_features_dict = {
    'Elevator': 'has_elevator',
    'Cats Allowed': 'has_cats_allowed',
    'Hardwood Floors': 'has_hardwood_floors',
    'Dogs Allowed': 'has_dogs_allowed',
    'Doorman': 'has_doorman',
    'Dishwasher': 'has_dishwasher',
    'No Fee': 'has_no_fee',
    # Both capitalisations of this label share one indicator column.
    'Laundry in Building': 'has_laundry_in_building',
    'Fitness Center': 'has_fitness_center',
    'Pre-War': 'has_pre-war',
    'Laundry in Unit': 'has_laundry_in_unit',
    'Roof Deck': 'has_roof_deck',
    'Outdoor Space': 'has_outdoor_space',
    'Dining Room': 'has_dining_room',
    'High Speed Internet': 'has_high_speed_internet',
    'Balcony': 'has_balcony',
    'Swimming Pool': 'has_swimming_pool',
    'Laundry In Building': 'has_laundry_in_building',
    'New Construction': 'has_new_construction',
    'Terrace': 'has_terrace',
    'Exclusive': 'has_exclusive',
    'Loft': 'has_loft',
    'Garden/Patio': 'has_garden/patio',
    'Wheelchair Access': 'has_wheelchair_access',
    'Common Outdoor Space': 'has_common_outdoor_space',
}

def process_home_features(df, features_map=None):
    """Add one 0/1 indicator column per popular home feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``features`` column whose cells are iterables of
        feature labels.
    features_map : dict, optional
        Maps a raw feature label to the indicator column it switches on.
        Defaults to the notebook-level ``home_features_dict``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the indicator columns added (or overwritten if they
        already existed). ``df`` itself is left untouched.
    """
    if features_map is None:
        features_map = home_features_dict

    # Several raw labels may share a column; keep first-seen column order.
    flag_columns = list(dict.fromkeys(features_map.values()))

    # For every listing, the set of indicator columns its features switch on.
    active_columns = [
        {features_map[feature] for feature in features if feature in features_map}
        for features in df['features']
    ]

    # Build whole columns at once instead of rewriting each row through apply.
    flags = {
        column: [int(column in active) for active in active_columns]
        for column in flag_columns
    }
    return df.assign(**flags)

In [None]:
def create_has_photos_has_description(row):
    """Add 0/1 flags ``has_photos`` and ``has_description`` to a listing row.

    A flag is 1 when the corresponding count (``num_photos`` /
    ``num_description_words``) is greater than zero. Returns the same row.
    """
    row['has_photos'] = int(row['num_photos'] > 0)
    row['has_description'] = int(row['num_description_words'] > 0)
    return row

In [None]:
import geopy.distance
def distance_from_centre(row):
    """Store the distance in miles from the reference centre point.

    Reads ``row['latitude']`` and ``row['longitude']`` and writes the result to
    ``row['dist_from_ctr']``. Returns the same row so it can be used with
    ``DataFrame.apply(axis=1)``.
    """
    # geopy 2.0 removed ``vincenty``; ``geodesic`` is its replacement. Fall
    # back to ``vincenty`` only on old geopy releases that lack ``geodesic``.
    distance_fn = getattr(geopy.distance, 'geodesic', None) or geopy.distance.vincenty

    centre = (40.730610, -73.935242)
    lat_long = (row['latitude'], row['longitude'])
    row['dist_from_ctr'] = distance_fn(centre, lat_long).miles
    return row

## Feature Engineering

In [None]:
def feature_engineering(df):
    """Derive the model's engineered columns from a raw listings frame.

    Adds count features (photos, features, description words), date parts of
    ``created``, an ``is_weekday`` flag, ``manager_score``,
    ``price_per_bedroom``, the home-feature indicator columns and
    ``dist_from_ctr``.

    ``manager_score`` is read from the global ``manager_scores`` lookup, so
    ``create_manager_scores`` must have been run first; otherwise every score
    is 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings with at least ``photos``, ``features``, ``description``,
        ``created``, ``manager_id``, ``bedrooms``, ``price``, ``latitude`` and
        ``longitude`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns; the input is not modified.
    """
    # Work on a copy so re-running the calling cell does not see a
    # half-engineered frame.
    df = df.copy()

    df["num_photos"] = df["photos"].apply(len)
    df["num_features"] = df["features"].apply(len)
    df["num_description_words"] = df["description"].apply(lambda x: len(x.split(" ")))

    # Date parts of the listing's creation timestamp
    df["created"] = pd.to_datetime(df["created"])
    df["created_month"] = df["created"].dt.month
    df["created_day"] = df["created"].dt.day
    df["created_hour"] = df["created"].dt.hour

    # dayofweek is 0 (Monday) .. 6 (Sunday). The previous expression tested the
    # day of the month (5th-9th) instead of the weekday.
    df['is_weekday'] = (df["created"].dt.dayofweek < 5).astype(float)

    df = df.apply(apply_manager_scores, axis=1)
    df = df.apply(price_per_bedroom, axis=1)
    df = process_home_features(df)

    df = df.apply(distance_from_centre, axis=1)

    # Tried earlier and recorded as "didn't work": has_photos/has_description
    # flags, price_per_bathroom, bath_bed_ratio, and the bedroom/bathroom
    # difference and sum.

    return df

In [None]:
# num_feats = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
    #              "num_photos", "num_features", "num_description_words",
    #              "created_year", "created_month", "created_day"]

num_feats = [
    "bathrooms",
    "bedrooms",
    "latitude",
    "longitude",
    "price",
    "num_photos",
    "num_features",
    "num_description_words",
    "created_month",
    "created_day",
    "created_hour",
    "manager_score",
    "price_per_bedroom",
    "dist_from_ctr",
]

# Add the home-feature indicator columns. Two labels in home_features_dict
# share one column, so de-duplicate (keeping order) to avoid feeding the same
# column to the model twice.
num_feats.extend(dict.fromkeys(home_features_dict.values()))

### Read Data

In [None]:
# Pass the path so pandas opens and closes the file itself; the previous
# open(...) handle was never closed.
df = pd.read_json("data/train.json")
print(df.shape)
df.describe()

### Build Manager Scores, Engineer Features and Remove Outliers

In [None]:
# Populate the global manager_scores lookup that feature_engineering reads.
create_manager_scores(df)
# NOTE(review): this cell also called create_listing_scores(df), but no such
# function is defined earlier in the notebook, so it raised NameError on a
# fresh kernel. Nothing downstream uses listing scores; restore the call only
# together with its definition.

In [None]:
# Replace the raw training frame with its feature-engineered version.
# manager_score comes from the global manager_scores lookup, so the
# create_manager_scores cell has to run before this one.
df = feature_engineering(df)

In [None]:
# Drop price / distance outliers from the training frame (needs the
# dist_from_ctr column created by feature_engineering) and show the new size.
df = remove_outliers(df)
print(df.shape)

### Train Model

In [None]:
# Fixed seed so the split and the forest (and therefore the reported loss)
# are reproducible across kernel restarts.
RANDOM_STATE = 42

X = df[num_feats]
y = df["interest_level"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.33, random_state=RANDOM_STATE
)

clf = RandomForestClassifier(n_estimators=1000, random_state=RANDOM_STATE, n_jobs=-1)
clf.fit(X_train, y_train)
y_val_pred = clf.predict_proba(X_val)

# Validation (hold-out) log loss. labels= ties the probability columns to
# clf.classes_ even if a class happens to be missing from y_val.
# NOTE(review): manager_score is built from interest_level over the whole
# training frame before this split, so validation rows contributed their own
# labels to that feature; the reported loss is likely optimistic.
loss = log_loss(y_val, y_val_pred, labels=clf.classes_)
print(f'Loss : {loss}')

In [None]:
# Bar chart of the fitted forest's feature importances, smallest first.
pd.Series(clf.feature_importances_, index=num_feats).sort_values().plot.bar()

In [None]:
# (rows, columns) of the feature matrix currently bound to X.
X.shape

## Making predictions on test data

### Read and feature engineer

In [None]:
# Pass the path so pandas opens and closes the file itself; the previous
# open(...) handle was never closed.
test_df = pd.read_json("data/test.json")
# Same feature pipeline as for training; manager_score reuses the lookup built
# from the training data.
test_df = feature_engineering(test_df)

In [None]:
# Summary statistics of the test-set prices (no outlier filter is applied to
# the test frame).
test_df['price'].describe()

### Predict

In [None]:
# Class probabilities for every test listing. Note that X and y are rebound to
# the test features / predictions here.
X = test_df[num_feats]
y = clf.predict_proba(X)

# Column position of each class label in predict_proba's output.
labels2idx = dict(zip(clf.classes_, range(len(clf.classes_))))

# Submission frame: listing id followed by one probability column per class.
sub = pd.DataFrame({"listing_id": test_df["listing_id"]})
for label in ("high", "medium", "low"):
    sub[label] = y[:, labels2idx[label]]

In [None]:
# Write the submission file to the working directory without the index column.
sub.to_csv("submission_rf.csv", index=False)

In [None]:
# Read the file back to check what was actually written.
submission = pd.read_csv('submission_rf.csv')

In [None]:
# Preview the first rows of the written submission.
submission.head()

In [None]:
# Same feature-importance bar chart as the one plotted right after training.
pd.Series(index = num_feats, data = clf.feature_importances_).sort_values().plot(kind = 'bar')

# Playground

## Popular Home Features - Analysis

In [None]:
from itertools import chain

# Flatten the per-listing feature lists of the training set into one list of
# labels, and keep the set of distinct labels.
data = pd.read_json('data/train.json')
train_features = list(chain.from_iterable(data['features']))
s_train = set(train_features)

In [None]:
# Test Features

In [None]:
from itertools import chain

# Same flattening for the test set. `data` is rebound to the test frame here.
data = pd.read_json('data/test.json')
test_features = list(chain.from_iterable(data['features']))
s_test = set(test_features)

In [None]:
# Occurrence count per training feature label, keeping labels seen more than
# 1000 times; the cell displays how many labels pass.
series_train = pd.Series(train_features)
vc = series_train.value_counts().loc[lambda counts: counts > 1000]
len(vc)

In [None]:
# Occurrence count per test feature label, keeping labels seen more than
# 1500 times; the cell displays how many labels pass.
series_test = pd.Series(test_features)
test_vc = series_test.value_counts().loc[lambda counts: counts > 1500]
len(test_vc)

In [None]:
# Labels that are frequent in train (> 1000) but not frequent in test (> 1500).
set(vc.index.tolist()) - set(test_vc.index.tolist()) 

In [None]:
# Use the frequent test-set labels as the list of popular home features.
home_features = test_vc.index.tolist()

In [None]:
# Column name per label: "has_" + label with spaces replaced by underscores,
# lower-cased (e.g. 'Roof Deck' -> 'has_roof_deck').
home_features_col_names = ['has_' + (feature.replace(' ', '_')).lower() for feature in home_features]

In [None]:
# Label -> column-name mapping generated from the data. This rebinds the
# home_features_dict name defined earlier in the notebook.
home_features_dict = dict(zip(home_features, home_features_col_names))

In [None]:
# Display the generated mapping.
home_features_dict

In [None]:
# Key : Feature in data | Value : column name to be created for category variable

home_features_dict = {
    'Elevator': 'has_elevator',
    'Cats Allowed': 'has_cats_allowed',
    'Hardwood Floors': 'has_hardwood_floors',
    'Dogs Allowed': 'has_dogs_allowed',
    'Doorman': 'has_doorman',
    'Dishwasher': 'has_dishwasher',
    'No Fee': 'has_no_fee',
    # Both capitalisations of this label share one indicator column.
    'Laundry in Building': 'has_laundry_in_building',
    'Fitness Center': 'has_fitness_center',
    'Pre-War': 'has_pre-war',
    'Laundry in Unit': 'has_laundry_in_unit',
    'Roof Deck': 'has_roof_deck',
    'Outdoor Space': 'has_outdoor_space',
    'Dining Room': 'has_dining_room',
    'High Speed Internet': 'has_high_speed_internet',
    'Balcony': 'has_balcony',
    'Swimming Pool': 'has_swimming_pool',
    'Laundry In Building': 'has_laundry_in_building',
    'New Construction': 'has_new_construction',
    'Terrace': 'has_terrace',
    'Exclusive': 'has_exclusive',
    'Loft': 'has_loft',
    'Garden/Patio': 'has_garden/patio',
    'Wheelchair Access': 'has_wheelchair_access',
    'Common Outdoor Space': 'has_common_outdoor_space',
}

In [None]:
# Keep only the first five rows for a quick experiment. This rebinds `data`,
# so re-running the cell keeps shrinking from whatever `data` currently is.
data = data.head(5)

In [None]:
# Show just the raw feature lists of the sample rows.
data[['features']]

In [None]:
# Add one indicator column per popular feature, initialised to 0.
# NOTE(review): `data` comes from data.head(5), so this assignment may emit a
# SettingWithCopyWarning depending on the pandas version; confirm.
for key, val in home_features_dict.items():
    data[val] = 0

def update_popular_feature_cols(row):
    """Set the indicator column to 1 for every popular feature in the row.

    Iterates ``row['features']``; labels that are keys of the global
    ``home_features_dict`` switch on the mapped column, others are ignored.
    Returns the same (mutated) row.
    """
    for label in row['features']:
        column = home_features_dict.get(label)
        if column is not None:
            row[column] = 1
    return row

In [None]:
# Size of the sample frame after the indicator columns were added.
data.shape

In [None]:
# Fill in the indicator columns row by row.
data = data.apply(update_popular_feature_cols, axis=1)

In [None]:
# Inspect the first sample row to check the indicator values.
data.iloc[0]