In [154]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# print(check_output(["ls", "data"]).decode("utf8"))

# Any results you write to the current directory are saved as output.


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
%matplotlib inline

In [155]:
def remove_outliers(df, min_price=1000, max_price=15000):
    """Drop listings whose price lies outside ``[min_price, max_price]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``price`` column.
    min_price, max_price : numeric, optional
        Inclusive bounds. The defaults (1000 and 15000) are the cut-offs
        that were previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the in-range rows; ``df`` is left untouched.
    """
    in_range = (df["price"] >= min_price) & (df["price"] <= max_price)
    # Return an independent copy so later column assignments on the result
    # do not write into a filtered slice of the caller's frame.
    return df[in_range].copy()

In [156]:
manager_scores = {}
def create_manager_scores(df):
    """Rebuild the module-level ``manager_scores`` dict from ``df``.

    Every row contributes points to its ``manager_id`` according to its
    ``interest_level``: 3 for 'high', 2 for 'medium', 1 for 'low' and 0 for
    anything else. The global dict is replaced (not merged) on every call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``manager_id`` and ``interest_level`` columns.

    Returns
    -------
    dict
        The freshly built ``manager_id -> total score`` mapping (the same
        object that is stored in the global ``manager_scores``).
    """
    global manager_scores

    interest_points = {'high': 3, 'medium': 2, 'low': 1}
    totals = {}
    # Walk the two columns directly rather than running a side-effecting
    # function through DataFrame.apply: older pandas versions may call the
    # applied function twice on the first row, which would double-count it,
    # and building a Series per row is needlessly slow.
    for manager_id, interest in zip(df['manager_id'], df['interest_level']):
        totals[manager_id] = totals.get(manager_id, 0) + interest_points.get(interest, 0)

    manager_scores = totals
    return manager_scores
    
def apply_manager_scores(row):
    """Attach the manager's accumulated score to a listing row.

    Looks up ``row['manager_id']`` in the module-level ``manager_scores``
    dict and stores the value under ``row['manager_score']``; a manager with
    no entry gets 0. The row is modified in place and returned, which is the
    shape ``DataFrame.apply(..., axis=1)`` needs.
    """
    row['manager_score'] = manager_scores.get(row['manager_id'], 0)
    return row

In [163]:
def price_per_bedroom(row):
    """Add a ``price_per_bedroom`` entry to ``row`` and return the row.

    The value is ``price / bedrooms`` as a float; rows with zero bedrooms
    get 0 instead of a division by zero. ``row`` is modified in place.
    """
    n_bedrooms = row['bedrooms']
    row['price_per_bedroom'] = 0 if n_bedrooms == 0 else row['price'] * 1.00 / n_bedrooms
    return row

### Read Data

In [164]:
# Hand pandas the path instead of an open() handle: the original handle was
# never closed, whereas read_json opens and closes the file itself.
df = pd.read_json("data/train.json")
print(df.shape)
df.describe()

(49352, 15)


Unnamed: 0,bathrooms,bedrooms,latitude,listing_id,longitude,price
count,49352.0,49352.0,49352.0,49352.0,49352.0,49352.0
mean,1.21218,1.54164,40.741545,7024055.0,-73.955716,3830.174
std,0.50142,1.115018,0.638535,126274.6,1.177912,22066.87
min,0.0,0.0,0.0,6811957.0,-118.271,43.0
25%,1.0,1.0,40.7283,6915888.0,-73.9917,2500.0
50%,1.0,1.0,40.7518,7021070.0,-73.9779,3150.0
75%,1.0,2.0,40.7743,7128733.0,-73.9548,4100.0
max,10.0,8.0,44.8835,7753784.0,0.0,4490000.0


### Remove Outliers

In [165]:
# Drop listings outside the price range kept by remove_outliers, then
# print the new shape to show how many rows were removed.
df = remove_outliers(df)
print(df.shape)
# df.head()
# df.to_csv('t.csv')

(49077, 15)


## Feature Engineering

In [166]:
# Fills the module-level manager_scores dict from this frame's interest_level column.
# NOTE(review): this runs on every training row before the train/validation
# split, and manager_score is later used as a model feature, so the reported
# validation loss includes label-derived information. Consider building the
# scores from the training fold only.
create_manager_scores(df)

In [167]:
# Count features derived from the list / text columns.
df["num_photos"] = df["photos"].apply(len)
df["num_features"] = df["features"].apply(len)
df["num_description_words"] = df["description"].apply(lambda x: len(x.split(" ")))

# Calendar parts of the listing's creation timestamp.
df["created"] = pd.to_datetime(df["created"])
df["created_month"] = df["created"].dt.month
df["created_day"] = df["created"].dt.day
df["created_hour"] = df["created"].dt.hour

# dayofweek is 0 (Monday) .. 6 (Sunday). The previous expression,
# created_day // 5 == 1, flagged days 5-9 of the month rather than weekdays.
df['is_weekday'] = (df["created"].dt.dayofweek < 5).astype(float)

# Row-wise helpers: add manager_score (from the global manager_scores dict)
# and price_per_bedroom columns.
df = df.apply(apply_manager_scores, axis=1)
df = df.apply(price_per_bedroom, axis=1)

# Columns fed to the model.
num_feats = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
             "num_photos", "num_features", "num_description_words",
             "created_month", "created_day", "created_hour", "manager_score", "price_per_bedroom"]

In [168]:
### Train Model

# Fixed seed so the split, the forest and the reported loss are reproducible.
RANDOM_STATE = 42

X = df[num_feats]
y = df["interest_level"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.33, random_state=RANDOM_STATE
)

clf = RandomForestClassifier(n_estimators=1000, random_state=RANDOM_STATE)
clf.fit(X_train, y_train)
y_val_pred = clf.predict_proba(X_val)

# Validation (held-out) loss, not training loss. predict_proba columns follow
# clf.classes_, so pass that ordering explicitly instead of letting log_loss
# infer the label set from y_val.
loss = log_loss(y_val, y_val_pred, labels=clf.classes_)
print(f'Loss : {loss}')

Loss : 0.6028461268004806


## Making predictions on test data

In [148]:
# Test-specific names keep the training frame and labels (df, X, y) intact,
# so the training cell can still be re-run after this one.
# Passing the path lets pandas open and close the file itself.
test_df = pd.read_json("data/test.json")

# Same feature construction as for the training frame.
test_df["num_photos"] = test_df["photos"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))
test_df["created"] = pd.to_datetime(test_df["created"])
test_df["created_month"] = test_df["created"].dt.month
test_df["created_day"] = test_df["created"].dt.day
test_df["created_hour"] = test_df["created"].dt.hour
# Managers absent from manager_scores get a score of 0.
test_df = test_df.apply(apply_manager_scores, axis=1)
test_df = test_df.apply(price_per_bedroom, axis=1)
X_test = test_df[num_feats]

test_proba = clf.predict_proba(X_test)

# predict_proba columns are ordered like clf.classes_; map each label to its column.
labels2idx = {label: i for i, label in enumerate(clf.classes_)}

sub = pd.DataFrame({"listing_id": test_df["listing_id"]})
for label in ["high", "medium", "low"]:
    sub[label] = test_proba[:, labels2idx[label]]


In [151]:

# Write the submission frame to CSV; index=False leaves the DataFrame index out of the file.
sub.to_csv("submission_rf.csv", index=False)

In [152]:
# Reload the file that was just written; the next cell previews it.
submission = pd.read_csv('submission_rf.csv')

In [153]:
# Show the first rows of the reloaded submission.
submission.head()

Unnamed: 0,listing_id,high,medium,low
0,7142618,0.086,0.397,0.517
1,7210040,0.34,0.286,0.374
2,7103890,0.008,0.105,0.887
3,7143442,0.129,0.372,0.499
4,6860601,0.011,0.18,0.809


In [None]:
# Bar chart of the fitted forest's feature importances, smallest first.
importances = pd.Series(clf.feature_importances_, index=num_feats)
importances.sort_values().plot(kind='bar')