In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold



In [2]:
def get_feature_vector(df):
    """Build the numeric feature matrix for a frame of rental listings.

    ``df`` is modified in place: ``price``, ``latitude`` and ``longitude``
    are clipped to percentile bounds computed from ``df`` itself, zero
    ``bathrooms``/``bedrooms`` become 0.5, ``created`` is parsed to
    datetimes, ``description`` is lower-cased with non-alphanumeric
    characters replaced by spaces, and the derived columns are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``price``, ``latitude``, ``longitude``,
        ``bathrooms``, ``bedrooms``, ``created``, ``description`` and
        ``features`` (every ``features`` entry must support ``len``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the 13 numeric feature columns, indexed like ``df``.
        Its first rows are also printed as a preview.
    """
    # Whole-column assignments are used throughout instead of
    # ``df[col].loc[mask] = value``: that chained form raises
    # SettingWithCopyWarning and does not update ``df`` at all when pandas
    # copy-on-write is active.

    # Cap price at its 99th percentile (cast first so the float cap fits).
    price = df['price'].astype(float)
    df['price'] = price.clip(upper=np.percentile(price.values, 99))

    # Clamp coordinates to their 1st..99th percentile range.
    for col in ('latitude', 'longitude'):
        llimit, ulimit = np.percentile(df[col].values, [1, 99])
        df[col] = df[col].clip(lower=llimit, upper=ulimit)

    # Zero room counts become 0.5 so the price-per-room ratios stay finite.
    for col in ('bathrooms', 'bedrooms'):
        rooms = df[col].astype(float)
        df[col] = rooms.mask(rooms == 0, 0.5)

    df["created"] = pd.to_datetime(df["created"])
    created = df["created"].dt
    df["created_year"] = created.year
    df["created_month"] = created.month
    df["created_day"] = created.day
    df["created_hour"] = created.hour

    df['price_per_bed'] = df['price'] / df['bedrooms']
    df['price_per_bath'] = df['price'] / df['bathrooms']

    df['description'] = df['description'].str.lower().replace('[^a-zA-Z0-9]', ' ', regex=True)
    # Vectorised lengths: far faster than one ``.loc`` write per row and
    # correct even when index labels repeat. Kept as float to match the
    # dtype the previous row-by-row fill produced.
    df['len_description'] = df['description'].str.len().astype(float)
    df['num_features'] = df['features'].map(len).astype(float)

    num_feats = ["bathrooms", "bedrooms", "latitude", "longitude", "price", "num_features", "len_description",
                 "created_year", "created_month", "created_day", "created_hour", "price_per_bed", "price_per_bath"]
    X = df[num_feats]

    print(X.head())

    return X

In [3]:
# Read the raw train and test listings from JSON.
file_X = 'train.json'
file_X_test = 'test.json'
df = pd.read_json(file_X)
df_test = pd.read_json(file_X_test)

# Training features and target; the target column is read from the same frame.
X = get_feature_vector(df)
y = df["interest_level"]

# Test features are derived independently from the test frame.
X_test = get_feature_vector(df_test)

print(y.head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


        bathrooms  bedrooms  latitude  longitude   price  num_features  \
10            1.5       3.0   40.7145   -73.9425  3000.0           0.0   
100004        1.0       1.0   40.7388   -74.0018  2850.0           4.0   
100007        1.0       1.0   40.7539   -73.9677  3275.0           2.0   
100014        2.0       4.0   40.7429   -74.0028  7995.0           0.0   
100026        1.0       1.0   40.8234   -73.9457  1725.0           4.0   

        len_description  created_year  created_month  created_day  \
10                588.0          2016              6           24   
100004            691.0          2016              4           17   
100007            492.0          2016              4           18   
100014              8.0          2016              4           19   
100026             24.0          2016              4           20   

        created_hour  price_per_bed  price_per_bath  
10                 7        1000.00          2000.0  
100004             3        2850

In [4]:
# Hold out 25% of the rows for validation. A fixed seed makes the split (and
# every validation score computed from it) reproducible across kernel
# restarts; stratifying on y keeps the class proportions equal in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=7, stratify=y
)

In [7]:
# Baseline: logistic regression, scored by log loss on the validation split.
model = LogisticRegression()
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
print(log_loss(y_val, y_val_pred))

y_test = model.predict_proba(X_test)

# Build the submission frame in one step. Probability columns are labelled
# from the fitted model's own class order (model.classes_) instead of a
# hard-coded high/low/medium assumption, and no per-row .loc writes are needed.
y_pred = pd.DataFrame(y_test, columns=model.classes_, index=df_test.index)
y_pred.insert(0, 'listing_id', df_test['listing_id'])

print(y_pred.head())
y_pred.to_csv("pred_lr.csv", index=False)

0.7133868068047021
        listing_id      high       low    medium
10000         5529  0.009373  0.885950  0.104677
100013       14597  0.049852  0.718807  0.231341
100016       43396  0.047581  0.698299  0.254120
100020       25538  0.006465  0.899898  0.093637
100099       26644  0.010196  0.866909  0.122895


In [7]:
# Grid-search XGBoost over tree count and learning rate, scored by negative
# log loss with 7-fold stratified cross-validation on the training split.
model = XGBClassifier()
n_estimators = [500, 700, 1000]
learning_rate = [0.1, 0.3, 0.5]
param_grid = {"learning_rate": learning_rate, "n_estimators": n_estimators}
kfold = StratifiedKFold(n_splits=7, shuffle=True, random_state=7)
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring="neg_log_loss",
    n_jobs=-1,
    cv=kfold,
)
grid_result = grid_search.fit(X_train, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for idx in range(len(params)):
    print("%f (%f) with: %r" % (means[idx], stds[idx], params[idx]))


Best: -0.604912 using {'n_estimators': 700, 'learning_rate': 0.1}
-0.605352 (0.012756) with: {'n_estimators': 500, 'learning_rate': 0.1}
-0.604912 (0.013798) with: {'n_estimators': 700, 'learning_rate': 0.1}
-0.606470 (0.014769) with: {'n_estimators': 1000, 'learning_rate': 0.1}
-0.613520 (0.014455) with: {'n_estimators': 500, 'learning_rate': 0.3}
-0.620953 (0.016319) with: {'n_estimators': 700, 'learning_rate': 0.3}
-0.633596 (0.017988) with: {'n_estimators': 1000, 'learning_rate': 0.3}
-0.631531 (0.015554) with: {'n_estimators': 500, 'learning_rate': 0.5}
-0.647192 (0.017996) with: {'n_estimators': 700, 'learning_rate': 0.5}
-0.669653 (0.019068) with: {'n_estimators': 1000, 'learning_rate': 0.5}


In [8]:
# Log loss of the grid search's predicted probabilities on the validation split.
y_val_pred = grid_search.predict_proba(X_val)
val_log_loss = log_loss(y_val, y_val_pred)
print(val_log_loss)

0.602655225769174


In [9]:
# Class probabilities for the test listings from the grid search.
test_proba = grid_search.predict_proba(X_test)
y_test = list(test_proba)  # kept as a list of per-row arrays, as before

# Build the submission frame in one step. Probability columns are labelled
# from the fitted search's class order (grid_search.classes_) instead of a
# hard-coded high/low/medium assumption, and no per-row .loc writes are needed.
y_pred = pd.DataFrame(test_proba, columns=grid_search.classes_, index=df_test.index)
y_pred.insert(0, 'listing_id', df_test['listing_id'])

print(y_pred.head())
y_pred.to_csv("pred_xg.csv", index=False)


        listing_id      high       low    medium
10000         5529  0.002490  0.979726  0.017783
100013       14597  0.005346  0.952540  0.042114
100016       43396  0.005745  0.901037  0.093218
100020       25538  0.006124  0.947461  0.046415
100099       26644  0.013510  0.775547  0.210943


In [6]:
# Random forest scored by log loss on the validation split. random_state is
# fixed so the fitted forest, and therefore the printed score and the test
# predictions derived from it, are reproducible across kernel restarts.
clf = RandomForestClassifier(n_estimators=800, max_depth=20, n_jobs=-1, random_state=7)
clf.fit(X_train, y_train)
y_val_pred = clf.predict_proba(X_val)
print(log_loss(y_val, y_val_pred))

0.6168611118603844


In [7]:
# Random-forest class probabilities for the test set, as a list of per-row arrays.
y_test = list(clf.predict_proba(X_test))
# Show the probabilities of the second test row as the cell output.
list(y_test[1])

[0.094, 0.79, 0.116]

In [8]:
# Build the submission frame in one step from the per-row probability arrays
# in y_test. Columns are labelled from the fitted forest's class order
# (clf.classes_) instead of a hard-coded high/low/medium assumption, and no
# per-row .loc writes are needed.
y_pred = pd.DataFrame(np.asarray(y_test), columns=clf.classes_, index=df_test.index)
y_pred.insert(0, 'listing_id', df_test['listing_id'])

print(y_pred.head())

        listing_id   high    low  medium
10000         5529  0.004  0.991   0.005
100013       14597  0.094  0.790   0.116
100016       43396  0.001  0.975   0.024
100020       25538  0.014  0.925   0.061
100099       26644  0.020  0.783   0.197


In [9]:
# Left disabled: uncomment to write the random-forest predictions to pred_rf.csv.
#y_pred.to_csv("pred_rf.csv", index=False)

NameError: name 'df' is not defined