In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import time
import random
from itertools import product
from collections import Counter
from sklearn.svm import SVC, LinearSVC 
from sklearn import preprocessing
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import log_loss, make_scorer

In [2]:
#Load the training and test data into pandas df
trainDF = pd.read_json('./data/train.json')
testDF = pd.read_json('./data/test.json')

In [3]:
#Create copies to mess around with columns to use in feature vectors
train = trainDF.copy()
test = testDF.copy()

In [4]:
print train.shape
train.head()

(49352, 15)


Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street


In [5]:
#let's do some basic feature engineering
train['price_per_bedroom'] = train['price'] / train['bedrooms']
train['price_per_bathroom'] = train['price'] / train['bathrooms']
train['num_photos'] = train['photos'].apply(len)
train['num_features'] = train['features'].apply(len)
train['num_description_words'] = train['description'].apply(lambda x: len(x.split(" ")))
train['created'] = pd.to_datetime(train["created"])
train['created_month'] = train['created'].dt.month
train['created_day'] = train['created'].dt.day


test['price_per_bedroom'] = test['price'] / test['bedrooms']
test['price_per_bathroom'] = test['price'] / test['bathrooms']
test['num_photos'] = test['photos'].apply(len)
test['num_features'] = test['features'].apply(len)
test['num_description_words'] = test['description'].apply(lambda x: len(x.split(" ")))
test['created'] = pd.to_datetime(test["created"])
test['created_month'] = test['created'].dt.month
test['created_day'] = test['created'].dt.day


features_to_use = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
                   "price_per_bedroom", "price_per_bathroom", "num_photos", 
                   "num_features", "num_description_words",  "created_month", 
                   "created_day"]

train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,photos,price,street_address,price_per_bedroom,price_per_bathroom,num_photos,num_features,num_description_words,created_month,created_day
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,...,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,1000.0,2000.0,5,0,95,6,24
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,...,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,2732.5,5465.0,11,5,9,6,12
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,...,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,2850.0,2850.0,8,4,94,4,17
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,...,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,3275.0,3275.0,3,2,80,4,18
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,...,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,837.5,3350.0,3,1,68,4,28


In [6]:
#let's add some binary features where we check whether a building has a particular feature or not

#function that will be used later
def newfeat(name, df, series):
    """Create a Series for my feature building loop to fill"""
    feature = pd.Series(0, df.index, name=name)
    """Now populate the new Series with numeric values"""
    for row, word in enumerate(series):
        if name in word:
            feature.iloc[row] = 1
    df[name] = feature
    return(df)

#extract feature text
feature_text = ''
for ind, row in train.iterrows():
    for feature in row['features']:
        feature_text = " ".join([feature_text, "_".join(feature.strip().split(" "))])
feature_text = feature_text.strip()
feature_text.encode('ascii', 'ignore')
feature_text = feature_text.split()

In [7]:
#get top 10 most common features
words_to_count = (word for word in feature_text if word[:1].isupper())
c = Counter(words_to_count)
topTenFeats = []
for i in range(0,10):
    topTenFeats.append(c.most_common()[i][0])
    features_to_use.append(c.most_common()[i][0])

#add binary variables on whether building has a particular top 10 feature or not
for feat in topTenFeats:
    train = newfeat(feat, train, train.features)
    test = newfeat(feat, test, test.features)

train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,Elevator,Cats_Allowed,Hardwood_Floors,Dogs_Allowed,Doorman,Dishwasher,No_Fee,Laundry_in_Building,Fitness_Center,Pre-War
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,...,0,0,0,0,0,0,0,0,0,0
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,...,1,0,0,0,1,0,0,0,0,0
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,...,0,0,0,0,0,1,0,0,0,0
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,...,0,0,0,0,0,0,0,0,0,0
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,...,0,0,0,0,0,0,0,0,0,1


In [8]:
#lastly, let's add a 'manager skill feature' that was done on kaggle by den3b 
lbl = preprocessing.LabelEncoder()
lbl.fit(list(train['manager_id'].values))
train['manager_id'] = lbl.transform(list(train['manager_id'].values))


lbl2 = preprocessing.LabelEncoder()
lbl2.fit(list(test['manager_id'].values))
test['manager_id'] = lbl2.transform(list(test['manager_id'].values))


# let's add this feature
features_to_use.append('manager_id')

In [9]:
y = train["interest_level"]

# compute fractions and count for each manager
temp_train = pd.concat([train.manager_id,pd.get_dummies(y)], axis = 1).groupby('manager_id').mean()
temp_train.columns = ['high_frac','low_frac', 'medium_frac']
temp_train['count'] = train.groupby('manager_id').count().iloc[:,1]

# remember the manager_ids look different because we encoded them in the previous step 
print(temp_train.tail(10))

            high_frac  low_frac  medium_frac  count
manager_id                                         
3471         0.000000  0.666667     0.333333      3
3472         0.625000  0.250000     0.125000      8
3473         0.000000  1.000000     0.000000      1
3474         0.000000  1.000000     0.000000      1
3475         0.142857  0.714286     0.142857      7
3476         0.000000  1.000000     0.000000      1
3477         0.000000  1.000000     0.000000      7
3478         0.000000  1.000000     0.000000      2
3479         0.142857  0.714286     0.142857      7
3480         0.034783  0.669565     0.295652    115


In [10]:
# compute skill
temp_train['manager_skill'] = temp_train['high_frac']*2 + temp_train['medium_frac']

# get ixes for unranked managers...
unranked_managers_ixes_train = temp_train['count']<20
# ... and ranked ones
ranked_managers_ixes_train = ~unranked_managers_ixes_train

# compute mean values from ranked managers and assign them to unranked ones
mean_values = temp_train.loc[ranked_managers_ixes_train, ['high_frac','low_frac', 'medium_frac','manager_skill']].mean()
print(mean_values)
temp_train.loc[unranked_managers_ixes_train,['high_frac','low_frac', 'medium_frac','manager_skill']] = mean_values.values
print(temp_train.tail(10))

high_frac        0.081314
low_frac         0.673194
medium_frac      0.245492
manager_skill    0.408120
dtype: float64
            high_frac  low_frac  medium_frac  count  manager_skill
manager_id                                                        
3471         0.081314  0.673194     0.245492      3       0.408120
3472         0.081314  0.673194     0.245492      8       0.408120
3473         0.081314  0.673194     0.245492      1       0.408120
3474         0.081314  0.673194     0.245492      1       0.408120
3475         0.081314  0.673194     0.245492      7       0.408120
3476         0.081314  0.673194     0.245492      1       0.408120
3477         0.081314  0.673194     0.245492      7       0.408120
3478         0.081314  0.673194     0.245492      2       0.408120
3479         0.081314  0.673194     0.245492      7       0.408120
3480         0.034783  0.669565     0.295652    115       0.365217


In [11]:
# inner join to assign manager features to the managers in the training dataframe
train = train.merge(temp_train.reset_index(),how='left', left_on='manager_id', right_on='manager_id')

# inner join to assign manager features to the managers in the testing dataframe
test = test.merge(temp_train.reset_index(),how='left', left_on='manager_id', right_on='manager_id')

# add manager fractions and skills to the features to use
features_to_use.extend(['high_frac','low_frac', 'medium_frac','manager_skill'])

train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,Dishwasher,No_Fee,Laundry_in_Building,Fitness_Center,Pre-War,high_frac,low_frac,medium_frac,count,manager_skill
0,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,...,0,0,0,0,0,0.0,0.744444,0.255556,90,0.255556
1,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,...,0,0,0,0,0,0.0,0.988372,0.011628,86,0.011628
2,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,...,1,0,0,0,0,0.059701,0.574627,0.365672,134,0.485075
3,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,...,0,0,0,0,0,0.068063,0.806283,0.125654,191,0.26178
4,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,...,0,0,0,0,1,0.081314,0.673194,0.245492,15,0.40812


In [12]:
#get feature vector in X and labels in y
X = train[features_to_use]
trainDF['interest_level'] = trainDF['interest_level'].map({'low': 0, 'medium': 1, 'high': 2})
y = trainDF['interest_level']

In [13]:
y = y.tolist()
Xlist = []
for i in range(0, X.shape[0]):
    Xlist.append(list(X.loc[i]))

from numpy import inf    
    
for i in range(len(Xlist)):
    for j in range(len(Xlist[i])):
        if Xlist[i][j] == inf:
            Xlist[i][j] = 0.0

In [14]:
#Now let's do some SVM
svmLinear = LinearSVC()
n_estimators = 10
svmRBF = OneVsRestClassifier(BaggingClassifier(SVC(kernel='rbf'), max_samples=1.0 / n_estimators, n_estimators=n_estimators, bootstrap=False), n_jobs=-1)

k_fold = KFold(n_splits=10, shuffle=True)

scaler = preprocessing.StandardScaler()
scaler.fit(Xlist)

svmRBFCross = cross_val_score(svmRBF, scaler.transform(Xlist), y, cv=k_fold, n_jobs=-1)
avgSVMRBF = np.mean(svmRBFCross)
svmRBFSD = np.std(svmRBFCross)
print "RBF Kernel Mean Cross Validated Accuracy : ", avgSVMRBF
print "RBF Kernel Standard Deviation of Cross Validated Accuracy: ", svmRBFSD

svmLinearCross = cross_val_score(svmLinear, Xlist, y, cv=k_fold)
avgLinearSVM = np.mean(svmLinearCross)
svmLinearSD = np.std(svmLinearCross)
print "Linear Kernel Mean Cross Validated Accuracy : ", avgLinearSVM
print "Linear Kernel Standard Deviation of Cross Validated Accuracy: ", svmLinearSD

RBF Kernel Mean Cross Validated Accuracy :  0.715208894724
RBF Kernel Standard Deviation of Cross Validated Accuracy:  0.00752143658909
Linear Kernel Mean Cross Validated Accuracy :  0.522890083238
Linear Kernel Standard Deviation of Cross Validated Accuracy:  0.187550427881


In [15]:
C_range = np.logspace(-5, 15, 10, base=2.0)
gamma_range = np.logspace(-15, 3, 9, base=2.0)


combos = list(product(C_range, gamma_range))
maxAccuracyParams = []
maxAccuracy = 0
i = int(len(Xlist) * 0.1)
Xlist_short = Xlist[:i]
y_short = y[:i]
j = 0

#log_loss_scorer = make_scorer(log_loss, labels=[0,1,2])

scaler.fit(Xlist_short)
transformedXShort = scaler.transform(Xlist_short)

print 'Testing ' + str((len(combos)))  + ' different combinations'
start = time.time()
for combo in combos:
    if(j % 10 == 0 and j != 0):
        print "Done testing " + str(j) + " combinations"
    model = SVC(kernel='rbf', C=combo[0], gamma=combo[1])
    scores = cross_val_score(model, transformedXShort, y_short, cv=k_fold, n_jobs=-1)   
    acc = np.mean(scores)
    if(maxAccuracy < acc):
        maxAccuracy = acc
        maxAccuracyParams = []
        maxAccuracyParams.append(combo[0])
        maxAccuracyParams.append(combo[1])
    else:
        pass
    j += 1

print("Parameter space search took {:.2f} seconds".format(time.time() - start))
print "Parameter search max precision: " + str(maxAccuracy)
print "Parameter search best parameters: " + str(maxAccuracyParams)

Testing 90 different combinations
Done testing 10 combinations
Done testing 20 combinations
Done testing 30 combinations
Done testing 40 combinations
Done testing 50 combinations
Done testing 60 combinations
Done testing 70 combinations
Done testing 80 combinations
Parameter space search took 3052.79 seconds
Parameter search max precision: 0.710851926978
Parameter search best parameters: [7022.5427075323769, 0.00069053396600248786]


In [16]:
n_estimators = 10
svmOptimized = OneVsRestClassifier(BaggingClassifier(SVC(kernel='rbf', C=maxAccuracyParams[0], 
                                                         gamma=maxAccuracyParams[1], probability=True), 
                                                     max_samples=1.0 / n_estimators, n_estimators=n_estimators, 
                                                     bootstrap=False), n_jobs=-1)

k_fold = KFold(n_splits=10, shuffle=True)

scaler.fit(Xlist)

svmCrossOptimized = cross_val_score(svmOptimized, scaler.transform(Xlist), y, cv=k_fold, n_jobs=-1)
avgSVM = np.mean(svmCrossOptimized)
svmSD = np.std(svmCrossOptimized)
print "Optimized RBF Kernel Mean Cross Validated Accuracy : ", avgSVM
print "Optimized RBF Kernel Standard Deviation of Cross Validated Accuracy: ", svmSD

Optimized RBF Kernel Mean Cross Validated Accuracy :  0.717802900428
Optimized RBF Kernel Standard Deviation of Cross Validated Accuracy:  0.00689347749209


In [17]:
#create submission for Kaggle
X_test = test[features_to_use]
X_test.head()

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,price_per_bedroom,price_per_bathroom,num_photos,num_features,num_description_words,...,Dishwasher,No_Fee,Laundry_in_Building,Fitness_Center,Pre-War,manager_id,high_frac,low_frac,medium_frac,manager_skill
0,1.0,1,40.7185,-73.9865,2950,2950.0,2950.0,8,6,78,...,1,0,0,0,0,2694,0.081314,0.673194,0.245492,0.40812
1,1.0,2,40.7278,-74.0,2850,1425.0,2850.0,3,3,35,...,0,0,0,0,1,3145,0.081314,0.673194,0.245492,0.40812
2,1.0,1,40.7306,-73.989,3758,3758.0,3758.0,6,3,333,...,0,0,0,0,0,2346,0.081314,0.673194,0.245492,0.40812
3,1.0,2,40.7109,-73.9571,3300,1650.0,3300.0,6,10,204,...,1,0,0,0,0,179,0.02381,0.571429,0.404762,0.452381
4,2.0,2,40.765,-73.9845,4900,2450.0,2450.0,7,14,174,...,1,0,0,0,1,2764,0.0,0.916388,0.083612,0.083612


In [18]:
X_test_list = []
for i in range(0, X_test.shape[0]):
    X_test_list.append(list(X_test.loc[i]))

for i in range(len(X_test_list)):
    for j in range(len(X_test_list[i])):
        if X_test_list[i][j] == inf or str(X_test_list[i][j]) == 'nan':
            X_test_list[i][j] = 0.0

In [19]:
submission = {'listing_id': [],
             'high': [], 
             'medium': [],
             'low': []}

svmOptimized.fit(scaler.transform(Xlist), y)
scaler.fit(X_test_list)
scaledTestingSet = scaler.transform(X_test_list)
probs = svmOptimized.predict_proba(scaledTestingSet)
preds = svmOptimized.predict(scaledTestingSet)

for i in range(len(X_test_list)):
    submission['listing_id'].append(test['listing_id'][i])
    submission['high'].append(probs[i][2])
    submission['medium'].append(probs[i][1])
    submission['low'].append(probs[i][0])

In [20]:
df = pd.DataFrame(submission, 
                  columns=['listing_id', 'high', 'medium', 'low'])
df.to_csv('svmSubmission.csv', mode='w', index= False, header = True)

In [21]:
c2 = Counter(preds)
print c2.most_common(3)

[(0, 71906), (1, 2203), (2, 550)]
