In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [2]:
train = pd.read_json('train.json')

In [3]:
train.columns

Index([u'bathrooms', u'bedrooms', u'building_id', u'created', u'description',
       u'display_address', u'features', u'interest_level', u'latitude',
       u'listing_id', u'longitude', u'manager_id', u'photos', u'price',
       u'street_address'],
      dtype='object')

In [4]:
low = train[train.interest_level=='low']
med = train[train.interest_level=='medium']
hi = train[train.interest_level=='high']

In [5]:
len(low),len(med),len(hi)

(34284, 11229, 3839)

### Undersample 'low' and 'medium' interest_level classes. Shuffle data.

In [6]:
low = low.sample(5000)
med = med.sample(5000)
train = low.append(med).append(hi)
train = train.sample(frac=1)

### What is the majority class?

In [7]:
train.interest_level.value_counts(normalize=True)

medium    0.361298
low       0.361298
high      0.277404
Name: interest_level, dtype: float64

### How predictive is the property description alone?
Note: Going forward, it will make sense to first apply some text cleaning.

In [8]:
pipe = Pipeline([('tfidf', TfidfVectorizer()), ('clf', SGDClassifier())])

In [9]:
pipe.fit(train.description, train.interest_level)

Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [10]:
pipe.score(train.description, train.interest_level)

0.67432617963725705

In [11]:
predictions = pd.Series(pipe.predict(train.description))
predictions.value_counts(sort=False)

high      2454
low       5034
medium    6351
dtype: int64

In [12]:
train.interest_level.value_counts(sort=False)

high      3839
low       5000
medium    5000
Name: interest_level, dtype: int64

Performance roughly the same as when training on unbalanced dataset, although model bias now toward 'medium' interest_level class.

### How predictive are the features 'bedrooms', 'bathrooms', 'price' on their own?

In [13]:
num_feats = ['bathrooms','bedrooms','price']
train_subset = train[num_feats]
train_target = train.interest_level

In [14]:
model = MultinomialNB()
model.fit(train_subset, train_target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
model.score(train_subset, train_target)

0.43688127754895584

In [16]:
predictions = pd.Series(model.predict(train_subset))
predictions.value_counts(sort=False)

high       103
low       5179
medium    8557
dtype: int64

### Build list of unique features included in property descriptions
Note: Other competition participants have interesting takes on building this feature list, such as deduplicating features or excluding ones that occur less than 5 times in the dataset: https://www.kaggle.com/jxnlco/two-sigma-connect-rental-listing-inquiries/deduplicating-features

In [17]:
features = []
for i in train.features:
    for j in i:
        if j not in features:
            features.append(j)

### Initialize feature ndarray, iterate over properties dataframe, updating feature ndarray appropriately.

In [18]:
feat_array = np.ndarray((len(train),len(features)))

In [19]:
for i in range(len(train)):
    for word in train.features.iloc[i]:
        if word in features:
            feat_array[i,features.index(word)] = 1
            #print features.index(word)

In [20]:
target_array = np.array(train_target)

### Train linear regression model on feature array and test performance.

In [21]:
svm_model = SGDClassifier()
svm_model.fit(feat_array, train_target)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [22]:
svm_model.score(feat_array, train_target)

0.43110051304284991

In [23]:
predictions = svm_model.predict(feat_array)
predictions = pd.Series(predictions)
predictions.value_counts(sort=False)

high       573
low       9381
medium    3885
dtype: int64

In [24]:
train_target.value_counts(sort=False)

high      3839
low       5000
medium    5000
Name: interest_level, dtype: int64