In [1]:
import json
import ujson
import pandas as pd
import numpy as np
import datetime
import seaborn as sns

import matplotlib.pyplot as plt
% matplotlib inline



In [2]:
# Load the training listings into the notebook-wide frame `df`; the path is
# relative to the notebook's working directory.
df = pd.read_json('rent_hop_train.json')

In [3]:
df.head(3)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street


#### Training matrix shape and feature vectors

In [4]:
df.shape

(49352, 15)

In [5]:
print (df.columns.values)

['bathrooms' 'bedrooms' 'building_id' 'created' 'description'
 'display_address' 'features' 'interest_level' 'latitude' 'listing_id'
 'longitude' 'manager_id' 'photos' 'price' 'street_address']


### Function to print unique numerical feature values and their frequencies given feature list

In [6]:
def printfeaturevaluedist(features, data=None):
    """Print the total count and the value frequencies of each listed column.

    Parameters
    ----------
    features : str or iterable of str
        Column name(s) to summarise. A single name is accepted as well as a
        list of names.
    data : pandas.DataFrame, optional
        Frame to summarise. Defaults to the notebook-level ``df`` so existing
        calls keep working unchanged.

    Notes
    -----
    ``value_counts`` drops missing values by default, so "Total values" is the
    number of non-null entries in the column.
    """
    frame = df if data is None else data
    # A bare string would otherwise be iterated character by character.
    if isinstance(features, str):
        features = [features]
    for feature in features:
        # Compute the distribution once and reuse it for both printouts.
        counts = frame[feature].value_counts()
        print ("Total values = ", counts.sum())
        print ("Freq. disbn")
        print (counts)
        

In [7]:
printfeaturevaluedist(['bathrooms', 'bedrooms','interest_level'])

Total values =  49352
Freq. disbn
1.0     39422
2.0      7660
3.0       745
1.5       645
0.0       313
2.5       277
4.0       159
3.5        70
4.5        29
5.0        20
5.5         5
6.0         4
6.5         1
10.0        1
7.0         1
Name: bathrooms, dtype: int64
Total values =  49352
Freq. disbn
1    15752
2    14623
0     9475
3     7276
4     1929
5      247
6       46
8        2
7        2
Name: bedrooms, dtype: int64
Total values =  49352
Freq. disbn
low       34284
medium    11229
high       3839
Name: interest_level, dtype: int64


#### ^^^ Class balance: High ~ 8%, Medium ~ 23%, Low ~ 69% (3,839 / 11,229 / 34,284 of 49,352 listings)

In [8]:
def intcheck(data):
    """Print the interest-level breakdown for a (sub)set of listings.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an ``interest_level`` column with the labels
        ``'low'``, ``'medium'`` and ``'high'``.

    Notes
    -----
    Shares are looked up by label rather than by position in the
    frequency-sorted ``value_counts`` result, so each printed line always
    refers to the class it names, even when ``'low'`` is not the most common
    class in ``data``. A class that does not occur counts as 0, and an empty
    frame prints ``nan`` shares instead of raising.
    """
    target = 'interest_level'
    counts = data[target].value_counts()
    total = counts.sum()

    def share(level):
        # Fraction of rows with this label, rounded to two decimals.
        if total == 0:
            return float('nan')
        return round(float(counts.get(level, 0)) / float(total), 2)

    print ('# Interest values = ', total)
    print ('')
    print ('% Low interest    = ', share('low'))
    print ('% Med interest    = ', share('medium'))
    print ('% High interest   = ', share('high'))
    print ('')
    print (counts)


In [9]:
intcheck(df[df['bathrooms'] > 3])

# Interest values =  290

% Low interest    =  0.97
% Med interest    =  0.02
% High interest   =  0.01

low       281
medium      5
high        4
Name: interest_level, dtype: int64


#### ^^^ Interestingly, listings with more than 3 bathrooms draw much less interest than the overall data set (97% low interest here)

In [10]:
intcheck(df[df['bedrooms'] > 4])

# Interest values =  297

% Low interest    =  0.98
% Med interest    =  0.02
% High interest   =  0.01

low       290
medium      5
high        2
Name: interest_level, dtype: int64


### Model evaluation function (F1 score)

In [11]:
# Build model accuracy calculator
from sklearn.metrics import f1_score

def f1(model, X, y):
    """Print macro-, micro- and weighted-average F1 scores for `model`.

    The predictions from `model.predict(X)` are scored against the true
    labels `y`; each score is rounded to two decimals before printing.
    Nothing is returned.
    """
    predictions = model.predict(X)

    # (printed label, `average` argument) for each line of the report.
    report_rows = (
        ('F1 score with macro averaging    = ', 'macro'),
        ('F1 score with micro averaging    = ', 'micro'),
        ('F1 score with weighted averaging = ', 'weighted'),
    )
    for label, averaging in report_rows:
        score = f1_score(y, predictions, average=averaging)
        print (label, round(score, 2))
    
    

#### ^^^ (Refers to the `bedrooms > 4` breakdown printed before the F1 helper.) Listings with more than 4 bedrooms also skew towards low interest relative to the overall distribution

### Text Analyzer

In [11]:
# from sklearn.pipeline import Pipeline
# text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), 'clf', MultinomialNB()])

#### Build a count vectorizer (Bag of words sparse matrix)

In [68]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts: one row per description, one column per vocabulary token.
count_vect = CountVectorizer()
# NOTE(review): `dft` (the training split) is only created in the
# "Build a Logistic Classifier" section further down, so this cell raises
# NameError on a fresh Restart & Run All. The later shape printout reads
# `df_counts`, not `dft_counts` - confirm which frame and name are intended.
dft_counts = count_vect.fit_transform(dft['description'])

In [13]:
# del df['count']

In [14]:
df.head(3)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street


In [15]:
print (df.head(1)['description'])

10    A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...
Name: description, dtype: object


In [16]:
df.shape

(49352, 15)

#### Build a TF-IDF (Term frequency - Inverse Document Frequency) vectorizer - Important words sparse matrix

In [69]:
from sklearn.feature_extraction.text import TfidfTransformer
# use_idf=False switches off the inverse-document-frequency reweighting, so
# this transformer yields normalised term frequencies rather than full TF-IDF.
tfidf_creator = TfidfTransformer(use_idf= False)

# NOTE(review): the fit_transform call below is commented out, so no matrix is
# produced here; later cells read `df_tfidf`, which no visible cell assigns.
# TfidfTransformer expects a term-count matrix - presumably the CountVectorizer
# output rather than the raw `features` column; confirm before re-enabling.
# dft_tfidf = tfidf_creator.fit_transform(dft['features'])

In [58]:
# Logistic-regression SGD classifier with an elastic-net penalty, presumably
# meant for the text-derived features (its fit call is commented out below).
# NOTE(review): SGDClassifier is imported in a later cell (In [22]), so on a
# fresh kernel this cell raises NameError unless that import has run first.
sgd_tfidf = SGDClassifier(loss='log', penalty = 'elasticnet', l1_ratio = 0.5,
                    learning_rate = 'optimal', shuffle = True, verbose =0,
                    warm_start = True, fit_intercept = True)


In [62]:
target = ['interest_level']

In [63]:
# NOTE(review): `df_tfidf` is not assigned in any visible cell (the only
# fit_transform that would create a TF-IDF matrix is commented out), so this
# relies on leftover kernel state.
X = df_tfidf
# `target` is a one-element list, so this selects a single-column DataFrame.
# NOTE(review): `dft` is the training split while the shape printout below
# shows 49,352 rows for `df_tfidf`; confirm X and y have matching rows.
y = dft[target]

In [65]:
# sgd_tfidf.fit(X, y)

In [18]:
# NOTE(review): neither `df_counts` nor `df_tfidf` is assigned in a visible
# cell (the vectorizer cell assigns `dft_counts`); this output comes from
# earlier kernel state and will not reproduce on Restart & Run All.
print (df_counts.shape, df_tfidf.shape)

(49352, 37823) (49352, 37823)


##### ^^^ TFIDF is a sparse matrix with "Interesting" words and TF_IDF values. Not sure yet how to incorporate this into SGD Logistic Classifier

### Build a Logistic Classifier

In [19]:
from sklearn.model_selection import train_test_split as split

In [20]:
# Hold out 20% of the listings as a validation frame (`dfv`); the rest is the
# training frame (`dft`). random_state pins the shuffle so the split is
# reproducible. `split` is the alias for train_test_split imported above.
dft, dfv = split(df, test_size=0.20, random_state=100)

In [21]:
# Sanity check of the split sizes. The last value is validation/train
# (about 0.25 for a 20/80 split), not validation/total.
print (len(df), len(dft), len(dfv), len(dfv)/len(dft))

49352 39481 9871 0.2500189964793192


In [22]:
from sklearn.linear_model import SGDClassifier

### Parameters - 
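- loss = 'log' - logistic regression - a probabilistic classifier
- penalty  = elasticnet - combines L1 (controls sparsity) and L2 (shrinks the weights) regularization
- alpha    = regularization strength (default 0.0001) - also used to compute the step size when learning_rate = 'optimal'
- learning_rate = optimal - the step size decays automatically as eta = 1 / (alpha * (t + t0)), with t0 chosen by a heuristic
- l1_ratio = 0.5 -  0 means L2, 1 means L1. default is 0.15
- fit intercept = True - also estimates an intercept (bias) term; if False the data is assumed to be already centered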

In [47]:
# Logistic-regression classifier trained by SGD with an elastic-net penalty,
# used for the numeric-only baseline.
# NOTE(review): no random_state is set while shuffle=True, so scores can vary
# between runs. warm_start=True makes a repeated fit() start from the previous
# coefficients, so re-running the fit cell is not idempotent.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
# 'log' matches the older version whose repr is shown in the output below.
sgd_simple = SGDClassifier(loss='log', penalty = 'elasticnet', l1_ratio = 0.5,
                    learning_rate = 'optimal', shuffle = True, verbose =0,
                    warm_start = True, fit_intercept = True)


In [50]:
# Numeric columns only - description text, feature tags and photos are left out.
# NOTE(review): these columns are on very different scales (price vs.
# latitude/longitude) and are passed to SGD unscaled; consider standardising.
simple_features = ['bathrooms','bedrooms','latitude', 'longitude', 'price']
# Kept as a one-element list, so `dft[target]` yields a single-column
# DataFrame rather than a Series.
target = ['interest_level']

In [49]:
dft[simple_features].head(3)

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price
6622,2.0,4,40.6931,-73.9722,3975
20647,1.0,1,40.7916,-73.9402,1825
99923,2.0,2,40.7829,-73.9731,11950


In [51]:
# Feature matrix: the numeric columns of the training split.
X = dft[simple_features]
# Flatten the label column to a 1-D array. Passing the single-column
# DataFrame made scikit-learn emit a DataConversionWarning (the
# `column_or_1d` message) on fit(). `.values.ravel()` works whether `target`
# is a column name or a one-element list of names.
y = dft[target].values.ravel()

In [52]:
sgd_simple.fit(X, y)

  y = column_or_1d(y, warn=True)


SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.5, learning_rate='optimal',
       loss='log', n_iter=5, n_jobs=1, penalty='elasticnet', power_t=0.5,
       random_state=None, shuffle=True, verbose=0, warm_start=True)

In [53]:
# NOTE(review): X and y were built from the training split (`dft`), so these
# are training-set scores; score on the held-out `dfv` frame to estimate how
# the model generalises.
f1(sgd_simple, X, y)

  'precision', 'predicted', average, warn_for)


F1 score with macro averaging    =  0.33
F1 score with micro averaging    =  0.7
F1 score with weighted averaging =  0.58


In [55]:
#dft['predicted_level'] = sgd.predict(X)

In [56]:
# Columns for a side-by-side look at inputs, true label and predicted label.
# NOTE(review): 'predicted_level' is only created by the commented-out
# assignment in the previous cell (which references `sgd`, not `sgd_simple`),
# so on a fresh kernel selecting these columns from `dft` raises KeyError.
simple_features_plus = simple_features + ['interest_level', 'predicted_level','description', 'features']

In [57]:
dft[simple_features_plus].head(3)

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,interest_level,predicted_level,description,features
6622,2.0,4,40.6931,-73.9722,3975,low,high,Stainless steel appliances- Hardwood floors- G...,[]
20647,1.0,1,40.7916,-73.9402,1825,low,high,New to market! One bedroom apartment in East H...,"[Dogs Allowed, Cats Allowed]"
99923,2.0,2,40.7829,-73.9731,11950,low,medium,Tremendous Classic Six in a Luxurious Prewar D...,"[Doorman, Pre-War, Dogs Allowed, Cats Allowed]"


#### ^^^^ Model excluding description, features and photos is not a good model