In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn import metrics



In [2]:
train = pd.read_json('train.json')

### What is the majority class?

In [3]:
train.interest_level.value_counts(normalize=True)

low       0.694683
medium    0.227529
high      0.077788
Name: interest_level, dtype: float64

Okay so 0.695 prediction accuracy is the baseline

### How predictive is the property description alone?
Note: Going forward, it will make sense to first apply some text cleaning.

In [4]:
pipe = Pipeline([('tfidf', TfidfVectorizer()), ('clf', SGDClassifier())])

In [5]:
pipe.fit(train.description, train.interest_level)

Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [6]:
pipe.score(train.description, train.interest_level)

0.70299076025287732

### How predictive are the numeric features alone?

In [7]:
train_numer_df = train.select_dtypes(include=['float64', 'int64'])
train_target = train.interest_level

In [8]:
model = SGDClassifier()
model.fit(train_numer_df, train_target)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [9]:
model.score(train_numer_df, train_target)

0.69468309288377372

### Build list of unique features included in property descriptions
Note: Other competition participants have interesting takes on building this feature list, such as deduplicating features or excluding ones that occur less than 5 times in the dataset: https://www.kaggle.com/jxnlco/two-sigma-connect-rental-listing-inquiries/deduplicating-features

In [10]:
features = []
for i in train.features:
    for j in i:
        if j not in features:
            features.append(j)

### Initialize feature ndarray, iterate over properties dataframe, updating feature ndarray appropriately.

In [11]:
feat_array = np.ndarray((len(train),len(features)))

In [12]:
for i in range(len(train)):
    for word in train.features.iloc[i]:
        if word in features:
            feat_array[i,features.index(word)] = 1
            #print features.index(word)
    

In [13]:
target_array = np.array(train_target)

### Train linear regression model on feature array and test performance.

In [14]:
svm_model = SGDClassifier()
svm_model.fit(feat_array, train_target)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [15]:
svm_model.score(feat_array, train_target)

0.69972848111525365

In [16]:
predictions = svm_model.predict(feat_array)
predictions = pd.Series(predictions)
predictions.value_counts()

low       48700
medium      534
high        118
dtype: int64

In [17]:
train_target.value_counts()

low       34284
medium    11229
high       3839
Name: interest_level, dtype: int64

### Thoughts going forward:
The class imbalance of the training data is obvious. It may be worthwhile to undersample the 'low' and 'medium' interest groups and see if better performance is achieved. In the end, however, I suspect training a deep learning model on the provided image data will be necessary to be competitive.