## Baseline score -  randomly guessing given that we know that 20% of the data is popular

In [1]:
import random
import pandas as pd
import numpy as np
import pickle
from datetime import datetime as dt
from sklearn.ensemble import RandomForestClassifier

pd.set_option("display.max_colwidth",999)
pd.set_option("display.max_rows",999)
pd.set_option("display.max_columns",999)
np.set_printoptions(suppress=True)

# Load data

In [3]:
# df_sf_2017 = pickle.load(open('../data_sf_2017.p', 'rb'))

In [2]:
df_sf_2017 = pd.read_json('df_sf_2017')

In [11]:
len(df_sf_2017)

117262

# NLP

In [3]:
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import time
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score

In [4]:
df_sf_2017.description.fillna(value='None', inplace=True)

In [5]:
df_sf_2017["description_new"] = df_sf_2017['description'].str.replace('[^\w\s]','')

## DESCRIPTION COLUMN MODEL

In [6]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

In [82]:
# def split_data(start_month, end_month):
start_month = 1
end_month = 4

df_X_train_nlp = df_sf_2017[(df_sf_2017['month'] >= start_month) & (df_sf_2017['month'] < end_month)]['description_new']
y_train_nlp = df_sf_2017[(df_sf_2017['month'] >= start_month) & (df_sf_2017['month'] < end_month)]['popular']

df_X_test_nlp = df_sf_2017[df_sf_2017['month'] == end_month]['description_new']
y_test_nlp = df_sf_2017[df_sf_2017['month'] == end_month]['popular']

# return df_X_train, y_train, df_X_test, y_test

## Run CountVectorizer

In [83]:
def run_tf_vec(df_X_train, df_X_test):
    tf_vectorizer_train = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english').fit(df_X_train)
    X_train = tf_vectorizer_train.transform(df_X_train)
    tf_vectorizer_test = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', vocabulary = tf_vectorizer_train.vocabulary_).fit(df_X_test)
    X_test = tf_vectorizer_test.transform(df_X_test)
    return X_train, X_test, tf_vectorizer_train

In [84]:
X_train_nlp, X_test_nlp, tf_vectorizer_train = run_tf_vec(df_X_train_nlp, df_X_test_nlp)

In [85]:
model_probs_tf_nlp = np.zeros(shape=(9,X_test_nb.shape[0])) 

model_num = 0

from sklearn.ensemble import RandomForestClassifier
nlp = RandomForestClassifier(n_estimators = 50, n_jobs=-1, random_state=0, class_weight = {0:.95, 1:.05})
nlp.fit(X_train_nb, y_train_nb)
pred_prob_nb = nlp.predict_proba(X_test_nb)


In [86]:
nlp_output = pd.DataFrame(pred_prob_nb, columns=["NLP - 0", "NLP - 1"])
nlp_output.drop("NLP - 0", axis=1, inplace=True)

In [92]:
# b_model = pd.DataFrame(np.array([[5,6],[7,8]]))
# pd.concat([a,b], axis=1)

In [87]:
y = pd.DataFrame(np.array(y_test_nlp), columns=['y'])

In [88]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer, recall_score, precision_score, f1_score
from sklearn.preprocessing import LabelEncoder

In [89]:
columns_to_keep = ['accommodates','bed_type', 'extra_people', 'price_per_guest','guests_included', 'host_about_filled','host_picture_url_filled', 'cancellation_policy', 'room_type', 'property_type_new']

df_sf_2017.loc[df_sf_2017.cancellation_policy == 'flexible_new', 'cancellation_policy'] = "flexible"
prop_type = set(df_sf_2017.property_type.values)
keep = ['Apartment', 'House', 'Condomonium']
drop = list(set(prop_type).difference(keep))
df_sf_2017['property_type_new'] = df_sf_2017['property_type']
df_sf_2017.loc[df_sf_2017.property_type.isin(drop), 'property_type_new'] = "Other"


X_train_rf = df_sf_2017[(df_sf_2017['month'] >= start_month) & (df_sf_2017['month'] < end_month)][columns_to_keep]
y_train_rf = df_sf_2017[(df_sf_2017['month'] >= start_month) & (df_sf_2017['month'] < end_month)]['popular']

X_test_rf = df_sf_2017[df_sf_2017['month'] == end_month][columns_to_keep]
y_test_rf = df_sf_2017[df_sf_2017['month'] == end_month]['popular']



In [90]:
def transform_data(X):
    le = LabelEncoder()
    X['extra_people'].replace('[\=$,]', '', regex=True, inplace=True)
#     X['price'].replace('[\=$,]', '', regex=True, inplace=True)
    X['property_type_new'] = le.fit_transform(X['property_type_new'])
    X['bed_type'] = le.fit_transform(X['bed_type'])
    X['cancellation_policy'] = le.fit_transform(X['cancellation_policy'])
    X['room_type'] = le.fit_transform(X['room_type'])
 
    return X

In [91]:
X_train_rf = transform_data(X_train_rf)
X_test_rf = transform_data(X_test_rf)

In [102]:
from sklearn.ensemble import AdaBoostClassifier #For Classification

In [103]:
dt = DecisionTreeClassifier() 
clf = AdaBoostClassifier(n_estimators=500, base_estimator=dt,learning_rate=1)
#Above I have used decision tree as a base estimator, you can use any ML learner as base estimator if it ac# cepts sample weight 
clf.fit(X_train_rf,y_train_rf)
adaboost_pred_prob = clf.predict_proba(X_test_rf)

In [104]:
adaboost_output = pd.DataFrame(adaboost_pred_prob, columns=["AB - 0", "AB - 1"])
adaboost_output.drop("AB - 0", axis=1, inplace=True)

In [92]:
# rf = RandomForestClassifier(n_estimators = 100, n_jobs=-1, random_state=0, max_depth= 2, max_features= 'auto', class_weight = {0:.95, 1:.05})
# rf.fit(X_train_rf, y_train_rf)

# pred_prob_rf = rf.predict_proba(X_test_rf)

In [93]:
# rf_output = pd.DataFrame(pred_prob_rf, columns=["RF - 0", "RF - 1"])
# rf_output.drop("RF - 0", axis=1, inplace=True)

In [105]:
df_final = pd.concat([nlp_output, adaboost_output], axis=1) #concatenate models and the actual result so that we can do a random split for the decision tree test
df_final['average'] = (df_final['NLP - 1'] + df_final["AB - 1"])/2
df_final_combo = pd.concat([df_final, y], axis =1)
df_final_combo.loc[df_final_combo.average > 0.5,'combined_proba'] = True
df_final_combo.loc[df_final_combo.average <= 0.5,'combined_proba'] = False
df_final_combo

Unnamed: 0,NLP - 1,AB - 1,average,y,combined_proba
0,0.020000,0.118351,0.069176,False,False
1,0.000000,0.177789,0.088894,False,False
2,0.000000,0.226973,0.113486,False,False
3,0.000000,0.184869,0.092435,False,False
4,0.000000,0.168645,0.084322,False,False
5,0.020000,0.130909,0.075454,False,False
6,0.000000,0.820379,0.410190,False,False
7,0.000000,0.075530,0.037765,False,False
8,0.000000,0.235925,0.117963,False,False
9,0.720000,0.821669,0.770834,True,True


## adaboost

In [106]:
print(accuracy_score(df_final_combo.combined_proba, df_final_combo.y))
print(recall_score(df_final_combo.combined_proba, df_final_combo.y))
print(precision_score(df_final_combo.combined_proba, df_final_combo.y))
print(f1_score(df_final_combo.combined_proba, df_final_combo.y))

0.9577626884566693
0.9485672299779574
0.8129722921914357
0.8755510342488979


## random forest

In [96]:
print(accuracy_score(df_final_combo.combined_proba, df_final_combo.y))
print(recall_score(df_final_combo.combined_proba, df_final_combo.y))
print(precision_score(df_final_combo.combined_proba, df_final_combo.y))
print(f1_score(df_final_combo.combined_proba, df_final_combo.y))

0.845321671078375
0.9728682170542635
0.1580604534005038
0.27193932827735645


In [108]:
df_final_combo.drop('average', axis=1, inplace=True)

In [109]:
from sklearn.tree import DecisionTreeClassifier

In [110]:
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(df_final, y, test_size=0.3, random_state=423)

dt = DecisionTreeClassifier(random_state=11, class_weight = {0:.95, 1:.05}, max_features = 'auto', min_samples_split = 2)

## adaboost

In [111]:
dt.fit(X_train_final, y_train_final)
preds_final = dt.predict(X_test_final)
print(accuracy_score(y_test_final, preds_final))
print(recall_score(y_test_final, preds_final))
print(precision_score(y_test_final, preds_final))
print(f1_score(y_test_final, preds_final))


0.9359416954353663
0.8004338394793926
0.831081081081081
0.8154696132596684


## random forest

In [100]:
dt.fit(X_train_final, y_train_final)
preds_final = dt.predict(X_test_final)
print(accuracy_score(y_test_final, preds_final))
print(recall_score(y_test_final, preds_final))
print(precision_score(y_test_final, preds_final))
print(f1_score(y_test_final, preds_final))


0.9439969313387034
0.8199566160520607
0.8571428571428571
0.8381374722838137


## COUNT VECTORIZER NAIVE BAYES ORIGINAL

In [15]:
accuracy_2017_tf_nb = np.mean(scores_tf_nb[0])
recall_2017_tf_nb = np.mean(scores_tf_nb[1])
precision_2017_tf_nb = np.mean(scores_tf_nb[2])
f1_score_2017_tf_nb = np.mean(scores_tf_nb[3])
print(accuracy_2017_tf_nb)
print(recall_2017_tf_nb)
print(precision_2017_tf_nb)
print(f1_score_2017_tf_nb)

0.8742970560045102
0.7169107075713074
0.6929862495075171
0.7043799660889926


## COUNT VECTORIZER RANDOM FOREST ORIGINAL

In [14]:
accuracy_2017_tf_rf = np.mean(scores_tf_rf[0])
recall_2017_tf_rf = np.mean(scores_tf_rf[1])
precision_2017_tf_rf = np.mean(scores_tf_rf[2])
f1_score_2017_tf_rf = np.mean(scores_tf_rf[3])
print(accuracy_2017_tf_rf)
print(recall_2017_tf_rf)
print(precision_2017_tf_rf)
print(f1_score_2017_tf_rf)

0.9474790730435897
0.790758536328864
0.9494687554573628
0.8623895527424829


## Try running with TF-IDF

In [237]:
def run_tf_idf_vec(df_X_train, df_X_test):
    tf_idf_vectorizer_train = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words='english')
    X_train2 = tf_idf_vectorizer_train.fit_transform(df_X_train).toarray()
    tf_idf_vectorizer_test = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', vocabulary = tf_vectorizer_train.vocabulary_)
    X_test2 = tf_idf_vectorizer_test.fit_transform(df_X_test).toarray()
    return X_train2, X_test2, tf_idf_vectorizer_train

In [76]:
def predict_tf_idf_nb(X_train2, y_train, X_test2, y_test):
    from sklearn.naive_bayes import GaussianNB
    nb = GaussianNB()
    nb.fit(X_train2, y_train)
    # pickle.dump(nb, open('nb_model_guassian.p', 'wb'))
    preds = nb.predict(X_test2)
    scores_tf_idf_nb[0][model_num] = accuracy_score(y_test, preds)
    scores_tf_idf_nb[1][model_num] = recall_score(y_test, preds)
    scores_tf_idf_nb[2][model_num] = precision_score(y_test, preds)
    scores_tf_idf_nb[3][model_num] = f1_score(y_test, preds)
    return scores_tf_idf_nb

## With the TF-IDF, run with RandomForest 

In [77]:
def predict_tf_idf_rf(X_train2, y_train, X_test2, y_test):
    rf = RandomForestClassifier(n_estimators = 10, n_jobs=-1, random_state=0, class_weight = {0:.95, 1:.05})
    rf.fit(X_train2, y_train)
    predicted = rf.predict(X_test2)
#     pickle.dump(rf, open('rf_nlp_50.p', 'wb'))
    scores_tf_idf_rf[0][model_num] = accuracy_score(y_test, predicted)
    scores_tf_idf_rf[1][model_num] = recall_score(y_test, predicted)
    scores_tf_idf_rf[2][model_num] = precision_score(y_test, predicted)
    scores_tf_idf_rf[3][model_num] = f1_score(y_test, predicted)
    return scores_tf_idf_rf

In [78]:
scores_tf_idf_nb = np.zeros(shape=(4,9))
scores_tf_idf_rf = np.zeros(shape=(4,9))

model_num = 0
start_month = 1
end_month = 4

In [79]:
%%time
while end_month <13:
    df_X_train, y_train, df_X_test, y_test = split_data(start_month, end_month) 
    X_train2, X_test2, tf_idf_vectorizer_train = run_tf_idf_vec(df_X_train, df_X_test)
    scores_tf_idf_nb = predict_tf_idf_nb(X_train2, y_train, X_test2, y_test)
    scores_tf_idf_rf = predict_tf_idf_rf(X_train2, y_train, X_test2, y_test)
    print(start_month, end_month, model_num)
    model_num += 1
    start_month += 1
    end_month += 1
    print('tf_idf_nb')
    print(scores_tf_idf_nb)
    print('tf_idf_rf')
    print(scores_tf_idf_rf)

1 4 0
tf_idf_nb
[[0.61328888 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.93702771 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.31313131 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.46940063 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
tf_idf_rf
[[0.94643062 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.76448363 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.9295559  0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.83897719 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
CPU times: user 6min 52s, sys: 3.94 s, total: 6min 56s
Wall time: 2min 24s


## TF IDF NAIVE BAYES ORIGINAL

In [30]:
accuracy_2017_tf_idf_nb = np.mean(scores_tf_idf_nb[0])
recall_2017_tf_idf_nb = np.mean(scores_tf_idf_nb[1])
precision_2017_tf_idf_nb = np.mean(scores_tf_idf_nb[2])
f1_score_2017_tf_idf_nb = np.mean(scores_tf_idf_nb[3])
print(accuracy_2017_tf_idf_nb)
print(recall_2017_tf_idf_nb)
print(precision_2017_tf_idf_nb)
print(f1_score_2017_tf_idf_nb)

0.6223984688066541
0.9360869500755025
0.3467272958014258
0.5052056786500755
