In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import glob
from datetime import datetime as dt
import seaborn as sns

np.set_printoptions(suppress=True)

%matplotlib inline

pd.set_option("display.max_colwidth",999)
pd.set_option("display.max_rows",999)
pd.set_option("display.max_columns",999)

In [2]:
# df_sf_2017 = pickle.load(open('../data_sf_2017.p', 'rb'))

## load 2017 data plus additional manipulations

In [2]:
df_sf_2017 = pd.read_json('df_sf_2017')
df_sf_2017.loc[df_sf_2017.cancellation_policy == 'flexible_new', 'cancellation_policy'] = "flexible"
prop_type = set(df_sf_2017.property_type.values)
keep = ['Apartment', 'House', 'Condomonium']
drop = list(set(prop_type).difference(keep))
df_sf_2017['property_type_new'] = df_sf_2017['property_type']
df_sf_2017.loc[df_sf_2017.property_type.isin(drop), 'property_type_new'] = "Other"

In [54]:
# df_sf['price'].apply(np.log)[df_sf_2017['price_per_guest'].isnull()][[ 'price', 'guests_included','price_per_guest']]

Unnamed: 0,price,guests_included,price_per_guest


In [81]:
df_sf_2017.price_per_bedroom.isnull().sum()

0

## Random Forest

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer, recall_score, precision_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
import pickle

## Predict 2017

In [4]:
def split_data(start_month, end_month):
    df_sf_temp = df_sf_2017.copy()
    X_train = df_sf_temp[(df_sf_temp['month'] >= start_month) & (df_sf_temp['month'] < end_month)][columns_to_keep]
    y_train = df_sf_temp[(df_sf_temp['month'] >= start_month) & (df_sf_temp['month'] < end_month)]['popular']

    X_test = df_sf_temp[df_sf_temp['month'] == end_month][columns_to_keep]
    y_test = df_sf_temp[df_sf_temp['month'] == end_month]['popular']
    
    return X_train, y_train, X_test, y_test
    

In [5]:
def transform_data(X):
    X['extra_people'].replace('[\=$,]', '', regex=True, inplace=True)
#     X['price'].replace('[\=$,]', '', regex=True, inplace=True)
    X['property_type_new'] = le.fit_transform(X['property_type_new'])
    X['bed_type'] = le.fit_transform(X['bed_type'])
    X['cancellation_policy'] = le.fit_transform(X['cancellation_policy'])
    X['room_type'] = le.fit_transform(X['room_type'])
 
    return X

In [25]:
def predict_data(X_train, y_train, X_test, y_test):
#     rf = RandomForestClassifier(n_estimators = 1000, n_jobs=-1, random_state=0, class_weight = {0:.95, 1:.05})
#     rf.fit(X_train, y_train)
    
    param_grid = {'n_estimators': [100, 500, 750, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [2,4]}
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, n_jobs=-1, scoring=make_scorer(f1_score))
    fit = grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)

    predicted = fit.predict(X_test)
    scores[0][model_num] = accuracy_score(y_test, predicted)
    scores[1][model_num] = recall_score(y_test, predicted)
    scores[2][model_num] = precision_score(y_test, predicted)
    scores[3][model_num] = f1_score(y_test, predicted)
    return scores, grid_search

In [23]:
scores = np.zeros(shape=(4,9))
model_num = 0
start_month = 1
end_month = 4
# columns_to_keep = ['accommodates','bed_type', 'extra_people', 'price','guests_included', 'host_about_filled','host_picture_url_filled', 'cancellation_policy', 'room_type']
# columns_to_keep = ['accommodates','bed_type', 'extra_people', 'price_per_guest', 'price_per_bedroom','guests_included', 'host_about_filled','host_picture_url_filled', 'cancellation_policy', 'room_type']


##USED THIS ONE EARLIER
# columns_to_keep = ['access_filled', 'house_rules_filled','space_filled','accommodates','bed_type', 'extra_people', 'price_per_guest', 'price_per_bedroom','guests_included', 'host_about_filled','host_picture_url_filled', 'cancellation_policy', 'room_type', 'property_type_new']

#THIS IS A TEST ONE
columns_to_keep = ['accommodates','bed_type', 'extra_people', 'price_per_guest','guests_included', 'host_about_filled','host_picture_url_filled', 'cancellation_policy', 'room_type', 'property_type_new']

In [None]:
##DATA BELOW IS WITH TEST COLUMN TO KEEP

In [10]:
%%time
while end_month <13:
    X_train, y_train, X_test, y_test = split_data(start_month, end_month)
    le = LabelEncoder()
    X_train = transform_data(X_train)
    X_test = transform_data(X_test)
    scores, rf = predict_data(X_train, y_train, X_test, y_test)
    print(start_month, end_month, model_num)
    start_month += 1
    end_month += 1
    model_num += 1
    print(scores)

1 4 0
[[0.91218782 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.6209068  0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.85963383 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.72102377 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
2 5 1
[[0.91218782 0.91204128 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.6209068  0.60176434 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.85963383 0.87614679 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.72102377 0.71348524 0.         0.         0.         0.
  0.         0.         0.        ]]
3 6 2
[[0.91218782 0.91204128 0.91266226 0.         0.         0.
  0.         0.         0.        ]
 [0.6209068  0.60176434 0.62394705 0.         0.         0.
  0.         0.         0.        ]
 [0.85963383 0.87614

In [None]:
%%time
while end_month <13:
    X_train, y_train, X_test, y_test = split_data(start_month, end_month)
    le = LabelEncoder()
    X_train = transform_data(X_train)
    X_test = transform_data(X_test)
    scores, rf = predict_data(X_train, y_train, X_test, y_test)
    print(start_month, end_month, model_num)
    start_month += 1
    end_month += 1
    model_num += 1
    print(scores)
    break

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

In [268]:
np.array(X_train.columns)

array(['access_filled', 'house_rules_filled', 'space_filled',
       'accommodates', 'bed_type', 'extra_people', 'price_per_guest',
       'price_per_bedroom', 'guests_included', 'host_about_filled',
       'host_picture_url_filled', 'cancellation_policy', 'room_type',
       'property_type_new'], dtype=object)

In [266]:
rf.feature_importances_

array([0.01208263, 0.01212339, 0.0138335 , 0.07966735, 0.01365546,
       0.12504533, 0.29528097, 0.2933239 , 0.01923429, 0.02446119,
       0.00006632, 0.04731615, 0.02139722, 0.04251229])

In [51]:
from tempfile import TemporaryFile
scores_rf_baseline = TemporaryFile()
np.save(scores_rf_baseline,scores)x

In [53]:
scores_rf_baseline.seek(0) # Only needed here to simulate closing & reopening file
np.load(scores_rf_baseline)

array([[0.81988506, 0.89212093, 0.88964655, 0.88999322, 0.88094975,
        0.88061452, 0.88234636, 0.88732079, 0.85575529],
       [0.2336272 , 0.47006931, 0.48648649, 0.4973822 , 0.47533875,
        0.49067661, 0.49973698, 0.53308824, 0.5445224 ],
       [0.51456311, 0.88075561, 0.87473002, 0.88601036, 0.88855117,
        0.93692777, 0.9047619 , 0.89664311, 0.86982759],
       [0.32135123, 0.61298274, 0.62524122, 0.63710879, 0.61935028,
        0.64405594, 0.64384954, 0.66864295, 0.66976435]])

In [13]:
accuracy_2017 = np.mean(scores[0])
recall_2017 = np.mean(scores[1])
precision_2017 = np.mean(scores[2])
f1_score_2017 = np.mean(scores[3])
print(accuracy_2017)
print(recall_2017)
print(precision_2017)
print(f1_score_2017)

0.9033162298063634
0.6186129222152004
0.8858609049923262
0.7281356450812987


## GRID SEARCHING

# Logistic Regression (RF was much better)

In [6]:
from sklearn.linear_model import LogisticRegression

In [13]:
columns_to_keep = ['access_filled', 'house_rules_filled','space_filled','accommodates','bed_type', 'extra_people', 'price_per_guest', 'price_per_bedroom','guests_included', 'host_about_filled','host_picture_url_filled', 'cancellation_policy', 'room_type', 'property_type_new']

In [9]:
def split_data(start_month, end_month):
    df_sf_temp = df_sf_2017.copy()
    X_train = df_sf_temp[(df_sf_temp['month'] >= start_month) & (df_sf_temp['month'] < end_month)][columns_to_keep]
    y_train = df_sf_temp[(df_sf_temp['month'] >= start_month) & (df_sf_temp['month'] < end_month)]['popular']

    X_test = df_sf_temp[df_sf_temp['month'] == end_month][columns_to_keep]
    y_test = df_sf_temp[df_sf_temp['month'] == end_month]['popular']
    
    return X_train, y_train, X_test, y_test
    

In [17]:
def transform_logistic(X, columns_to_keep):

    df_temp = X[columns_to_keep]
    data_dummies = pd.DataFrame([])
    drop = []
    
    #dummify columns
    for column in columns_to_keep:
        if (df_temp[column].dtypes == 'O'):
            drop.append(column)
            dummies = pd.get_dummies(df_temp[column], prefix= column)
            data_dummies = pd.concat([data_dummies, dummies], axis=1)

    #join with original df
    df_temp = df_temp.join(data_dummies)

#     print(list(df_temp.columns.values))
    #drop columns that were dummified
    df_temp.drop(drop, axis=1, inplace = True)

    return df_temp

In [15]:
def predict_logistic(X_train, y_train, X_test, y_test):
    model = LogisticRegression(n_jobs=-1)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    beta_coef.append(model.coef_)
    scores_logit[0][model_num] = accuracy_score(y_test, prediction)
    scores_logit[1][model_num] = recall_score(y_test, prediction)
    scores_logit[2][model_num] = precision_score(y_test, prediction)
    scores_logit[3][model_num] = f1_score(y_test, prediction)
    return scores_logit, beta_coef

In [21]:
beta_coef = []
scores_logit = np.zeros(shape=(4,9))
model_num = 0
start_month = 1
end_month = 4

In [22]:
%%time
while end_month <13:
    X_train, y_train, X_test, y_test = split_data(start_month, end_month)
    X_train = transform_logistic(X_train, columns_to_keep)
    X_test = transform_logistic(X_test, columns_to_keep)
    scores, beta_coef = predict_logistic(X_train, y_train, X_test, y_test)
    print(start_month, end_month, model_num)
    start_month += 1
    end_month += 1
    model_num += 1
    print(scores_logit)

  " = {}.".format(self.n_jobs))


1 4 0
[[0.81779515 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.04219144 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.51145038 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.0779523  0.         0.         0.         0.         0.
  0.         0.         0.        ]]
2 5 1
[[0.81779515 0.81768209 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.04219144 0.05229994 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.51145038 0.48538012 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.0779523  0.09442548 0.         0.         0.         0.
  0.         0.         0.        ]]
3 6 2
[[0.81779515 0.81768209 0.81311811 0.         0.         0.
  0.         0.         0.        ]
 [0.04219144 0.05229994 0.08533654 0.         0.         0.
  0.         0.         0.        ]
 [0.51145038 0.48538

In [23]:
accuracy_2017 = np.mean(scores_logit[0])
recall_2017 = np.mean(scores_logit[1])
precision_2017 = np.mean(scores_logit[2])
f1_score_2017 = np.mean(scores_logit[3])
print(accuracy_2017)
print(recall_2017)
print(precision_2017)
print(f1_score_2017)

0.7946514661754123
0.09126554689101034
0.5577129954488265
0.15490514368378314


In [24]:
beta_coef

[array([[ 0.53007962,  0.67561263,  0.15006667,  0.10604996, -0.00633706,
         -1.19853999,  0.18120679, -0.40059964,  0.20662239,  0.55194041,
         -0.62838955, -0.38660583, -0.17662322,  0.09120517,  0.24184857,
         -0.08783601,  0.81886815,  0.49857129, -1.57654837, -0.51161991,
         -0.57295504, -0.21805468, -0.06755512, -0.38932237, -0.08382664,
         -0.38541583]]),
 array([[ 0.4976066 ,  0.61940049,  0.23575569,  0.13464937, -0.00675129,
         -1.5968739 ,  0.19256912, -0.56326032,  0.21077625,  0.62926188,
         -0.63483379, -0.26512533, -0.10449351,  0.24839223,  0.33377135,
         -0.03210413,  0.84458651,  0.566181  , -1.45039404, -0.35055838,
         -0.3713931 , -0.0344765 , -0.01641945, -0.24013341,  0.05059158,
         -0.23274721]]),
 array([[ 0.46440042,  0.62811662,  0.30463579,  0.16218218, -0.00760389,
         -2.00973591,  0.22392864, -0.73386785,  0.24822121,  0.33242678,
         -0.6659505 , -0.10145976, -0.04422087,  0.4338944 ,  

In [14]:
# Compute the change in odds ratio from a one unit change in each feature.

In [143]:
from math import exp
for i, coef in enumerate(model.coef_[0]):
    print("beta%d: %.5f" % (i + 1, exp(coef)))

beta1: 1.69907
beta2: 1.96524
beta3: 1.16191
beta4: 1.11188
beta5: 0.99368
beta6: 0.30163
beta7: 1.19866
beta8: 0.66992
beta9: 1.22952
beta10: 1.73662
beta11: 0.53345
beta12: 0.67936
beta13: 0.83810
beta14: 1.09549
beta15: 1.27360
beta16: 0.91591
beta17: 2.26793
beta18: 1.64637
beta19: 0.20669
beta20: 0.59952
beta21: 0.56386
beta22: 0.80408
beta23: 0.93468
beta24: 0.67752
beta25: 0.91959
beta26: 0.68017


In [None]:
# Increasing the ____ feature by 1 point increases the chance of being popular by a factor of __ (1.00189).

In [144]:
for i, col in enumerate(list(X_train.columns.values)):
    print(i, col)

0 access_filled
1 house_rules_filled
2 space_filled
3 accommodates
4 extra_people
5 price_per_guest
6 price_per_bedroom
7 guests_included
8 host_about_filled
9 host_picture_url_filled
10 bed_type_Airbed
11 bed_type_Couch
12 bed_type_Futon
13 bed_type_Pull-out Sofa
14 bed_type_Real Bed
15 cancellation_policy_flexible
16 cancellation_policy_moderate
17 cancellation_policy_strict
18 cancellation_policy_super_strict_30
19 cancellation_policy_super_strict_60
20 room_type_Entire home/apt
21 room_type_Private room
22 room_type_Shared room
23 property_type_new_Apartment
24 property_type_new_House
25 property_type_new_Other


## XGBoost with one quarter

In [269]:
from xgboost import XGBClassifier

In [270]:
#test with first quarter
df_sf_temp = df_sf_2017.copy()
start_month = 1
end_month = 4
X_train = df_sf_temp[(df_sf_temp['month'] >= start_month) & (df_sf_temp['month'] < end_month)][columns_to_keep]
y_train = df_sf_temp[(df_sf_temp['month'] >= start_month) & (df_sf_temp['month'] < end_month)]['popular']

X_test = df_sf_temp[df_sf_temp['month'] == end_month][columns_to_keep]
y_test = df_sf_temp[df_sf_temp['month'] == end_month]['popular']


In [271]:
X_train = transform_boost(X_train, columns_to_keep)

In [273]:
X_test = transform_boost(X_test, columns_to_keep)

In [272]:
def transform_boost(X, columns_to_keep):
    df_temp = X[columns_to_keep]
    data_dummies = pd.DataFrame([])
    drop = []
    
    #dummify columns
    for column in columns_to_keep:
        if (df_temp[column].dtypes == 'O'):
            drop.append(column)
            dummies = pd.get_dummies(df_temp[column], prefix= column)
            data_dummies = pd.concat([data_dummies, dummies], axis=1)

    #join with original df
    df_temp = df_temp.join(data_dummies)

#     print(list(df_temp.columns.values))
    #drop columns that were dummified
    df_temp.drop(drop, axis=1, inplace = True)
    return df_temp

In [274]:
%%time
my_model = XGBClassifier(n_jobs=-1, booster='gbtree', n_estimators=1000, learning_rate=1, objective='binary:logistic')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 29.8 µs


In [275]:
%%time
my_model.fit(X_train, y_train, verbose=False)

CPU times: user 16.3 s, sys: 76 ms, total: 16.3 s
Wall time: 5.46 s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [276]:
# Add silent=True to avoid printing out updates with each cycle
predictions = my_model.predict(X_test)

  if diff:


In [279]:
np.array(X_train.columns)

array(['access_filled', 'house_rules_filled', 'space_filled',
       'accommodates', 'extra_people', 'price_per_guest',
       'price_per_bedroom', 'guests_included', 'host_about_filled',
       'host_picture_url_filled', 'bed_type_Airbed', 'bed_type_Couch',
       'bed_type_Futon', 'bed_type_Pull-out Sofa', 'bed_type_Real Bed',
       'cancellation_policy_flexible', 'cancellation_policy_moderate',
       'cancellation_policy_strict',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60', 'room_type_Entire home/apt',
       'room_type_Private room', 'room_type_Shared room',
       'property_type_new_Apartment', 'property_type_new_House',
       'property_type_new_Other'], dtype=object)

In [277]:
my_model.feature_importances_

array([0.02267267, 0.01831832, 0.01966967, 0.07507508, 0.10870871,
       0.31636637, 0.2671171 , 0.00495496, 0.02162162, 0.        ,
       0.00225225, 0.0009009 , 0.00540541, 0.0036036 , 0.0045045 ,
       0.01636637, 0.02282282, 0.01711712, 0.0003003 , 0.        ,
       0.00885886, 0.00735736, 0.00465465, 0.01891892, 0.01891892,
       0.01351351], dtype=float32)

In [236]:
accuracy_score(y_test, predictions)

0.9005632831359927

In [237]:
recall_score(y_test, predictions)

0.6202770780856424

In [238]:
precision_score(y_test, predictions)

0.7898957497995188

In [239]:
f1_score(y_test, predictions)

0.6948853615520283

## ADA Boosting

In [160]:
from sklearn.ensemble import AdaBoostClassifier #For Classification

In [228]:
dt = DecisionTreeClassifier() 
clf = AdaBoostClassifier(n_estimators=1000, base_estimator=dt,learning_rate=1)
#Above I have used decision tree as a base estimator, you can use any ML learner as base estimator if it ac# cepts sample weight 
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)

In [229]:
accuracy_score(y_test, prediction)

0.9267731923209565

In [230]:
recall_score(y_test, prediction)

0.7550377833753149

In [231]:
precision_score(y_test, prediction)

0.8286109191430546

In [232]:
f1_score(y_test, prediction)

0.7901153212520594

In [240]:
clf.feature_importances_ 

  return self.tree_.compute_feature_importances()


array([           nan,            nan,            nan,            nan,
                  nan,            nan,            nan,            nan,
                  nan,            nan,            nan, 2.95272271e-05,
                  nan,            nan,            nan,            nan,
                  nan,            nan, 3.21736089e-05, 9.45490779e-06,
                  nan,            nan,            nan,            nan,
                  nan,            nan])

## Gradient Boosting with one quarter

In [184]:
from sklearn.ensemble import GradientBoostingClassifier #For Classification

In [222]:
clf_gradient = GradientBoostingClassifier(n_estimators=1000, learning_rate=1.0)
clf_gradient.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [223]:
pred = clf_gradient.predict(X_test)

In [224]:
accuracy_score(y_test, pred)

0.8807908955052305

In [225]:
recall_score(y_test, pred)

0.5516372795969773

In [226]:
precision_score(y_test, pred)

0.7293921731890092

In [227]:
f1_score(y_test, pred)

0.6281821441376837