## Baseline score - randomly guessing, given that we know 20% of the data is popular

In [225]:
import random
import pandas as pd
import numpy as np
import pickle
from datetime import datetime as dt
from sklearn.ensemble import RandomForestClassifier

# Raise the pandas display limits so long text columns and wide/tall frames are not truncated.
pd.set_option("display.max_colwidth",999)
pd.set_option("display.max_rows",999)
pd.set_option("display.max_columns",999)
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [233]:
def randomly_guessing(n_simulations = 10):
    """Score a random-guess baseline against every monthly test set.

    Each prediction is drawn uniformly from a pool of 2 ``True`` and 8 ``False``
    values, i.e. "popular" is guessed 20% of the time. The guesses are scored
    against the ``popular`` column of ``df_sf_2017`` for test months 4 through 12.

    Parameters
    ----------
    n_simulations : int
        How many times the full sweep over the test months is repeated.

    Returns
    -------
    tuple of four numpy arrays
        ``(accuracy, recall, precision, f1)``; each has shape
        ``(1, n_test_months * n_simulations)`` with one entry per
        (simulation, test month) pair, in simulation-major order.
    """
    # 2 True + 8 False => a uniform draw says "popular" with probability 0.2.
    guess_pool = [True] * 2 + [False] * 8

    # The targets do not depend on the simulation, so filter the frame once per
    # test month instead of once per (simulation, month).
    test_months = range(4, 13)
    monthly_targets = [
        df_sf_2017.loc[df_sf_2017['month'] == month, 'popular']
        for month in test_months
    ]

    n_scores = len(monthly_targets) * n_simulations
    accuracy_baseline = np.zeros(shape=(1, n_scores))
    recall_baseline = np.zeros(shape=(1, n_scores))
    precision_baseline = np.zeros(shape=(1, n_scores))
    f1_baseline = np.zeros(shape=(1, n_scores))

    count = 0  # next free column in the score arrays
    for _ in range(n_simulations):
        for y_test in monthly_targets:
            y_pred = pd.Series([random.choice(guess_pool) for _ in range(y_test.size)])
            accuracy_baseline[0][count] = accuracy_score(y_test, y_pred)
            recall_baseline[0][count] = recall_score(y_test, y_pred)
            precision_baseline[0][count] = precision_score(y_test, y_pred)
            f1_baseline[0][count] = f1_score(y_test, y_pred)
            count += 1

    return accuracy_baseline, recall_baseline, precision_baseline, f1_baseline


In [None]:
# Run 1000 random-guess sweeps and print the mean accuracy, recall, precision and F1, in that order.
accuracy_baseline, recall_baseline, precision_baseline, f1_baseline = randomly_guessing(n_simulations = 1000)
for baseline_metric in (accuracy_baseline, recall_baseline, precision_baseline, f1_baseline):
    print(baseline_metric.mean())

In [4]:
import nltk
# Fetch the 'punkt' tokenizer data into the local nltk_data directory (no-op when already present).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [62]:
# Fetch the stop-word corpus read later through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [86]:
# Fetch the WordNet corpus required by WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Load data

In [9]:
# df_sf_2017 = pickle.load(open('../data_sf_2017.p', 'rb'))

In [3]:
# Load the data set from the JSON file 'df_sf_2017' (path is relative to the working directory).
df_sf_2017 = pd.read_json('df_sf_2017')

In [11]:
# Number of rows in the loaded frame.
len(df_sf_2017)

117262

# NLP

In [4]:
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import time
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score

In [5]:
# Replace missing descriptions with the literal string 'None' so the text pipeline never sees NaN.
# The result is assigned back to the column: calling fillna(inplace=True) on an attribute-accessed
# column is a chained in-place update that may modify a temporary instead of the DataFrame.
df_sf_2017['description'] = df_sf_2017['description'].fillna(value='None')

In [6]:
# Strip punctuation: remove every character that is neither a word character nor whitespace.
# regex=True must be explicit -- pandas >= 2.0 otherwise treats the pattern as a literal string
# and nothing would be removed. The raw string avoids invalid-escape warnings for \w and \s.
df_sf_2017["description_new"] = df_sf_2017['description'].str.replace(r'[^\w\s]', '', regex=True)

## Test

In [181]:
# Scratch sample: the first two raw descriptions, for experimenting with the tokenizers.
test1 = df_sf_2017['description'].iloc[0:2]

In [200]:
# import string 
# def remove_punctuations(text):
#     return text.translate(None,string.punctuation)


In [177]:
# punc = RegexpTokenizer(r'\w+')

In [225]:
# token_pattern = r'\w+'

In [213]:
# Scratch check: fit a CountVectorizer on two cleaned descriptions (rows 1 and 2) to inspect the vocabulary.
# NOTE: LemmaTokenizer must already be defined in the kernel; its definition cell appears further down.
# token_pattern is omitted because scikit-learn ignores it whenever a custom tokenizer is supplied.
testing = df_sf_2017['description_new'].iloc[1:3]
tf_vectorizer_train = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english').fit(testing)

In [215]:
# Show the learned term -> column-index mapping.
tf_vectorizer_train.vocabulary_

{'floor': 30,
 'breath': 15,
 'taking': 86,
 'view': 96,
 'easy': 25,
 'transportation': 91,
 'downtown': 24,
 'great': 35,
 'neighborhood': 55,
 'san': 76,
 'francisco': 31,
 'safe': 75,
 'free': 32,
 'parking': 59,
 'located': 46,
 'geographically': 34,
 'center': 17,
 'city': 18,
 'public': 68,
 'available': 10,
 '15': 1,
 'minute': 51,
 'private': 67,
 'bathroom': 11,
 'room': 74,
 'street': 83,
 'lovely': 48,
 'noe': 57,
 'valley': 94,
 'access': 5,
 'muni': 54,
 'market': 49,
 'perfectly': 63,
 'enjoy': 26,
 'peaceful': 61,
 'environment': 28,
 'want': 97,
 'close': 20,
 'shopping': 81,
 'restaurant': 72,
 'nightlife': 56,
 'block': 14,
 'j': 41,
 'train': 90,
 '20m': 3,
 'ride': 73,
 '3': 4,
 'mission': 52,
 'apartment': 8,
 '2': 2,
 'bedroom': 12,
 '1': 0,
 'recently': 70,
 'remodeled': 71,
 'modern': 53,
 'kitchen': 42,
 'dishwasher': 23,
 'washerdryer': 98,
 'building': 16,
 'use': 93,
 'best': 13,
 'hill': 38,
 'climb': 19,
 'sunny': 85,
 'airy': 6,
 'minimalist': 50,
 'terr

In [160]:
# Scratch check of stop-word filtering followed by Snowball stemming on a toy sentence.
text = 'walk you i san are to .be hippo geese. geese goose francisco walked'

# NLTK's English stop words plus the two city-name tokens.
stop_words = stopwords.words("english")
stop_words.extend(['san', 'francisco'])

# Split on runs of word characters, which also discards the punctuation.
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)

snowball = SnowballStemmer("english")
stemmed = [snowball.stem(item) for item in tokens if item not in stop_words]
print(stemmed)

['walk', 'hippo', 'gees', 'gees', 'goos', 'walk']


In [138]:
# Display the stop-word list built in the stemming scratch cell.
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [161]:
%%time

# def process_text(text):
#     stop_words = stopwords.words("english")
#     stop_words.append('san')
#     stop_words.append('francisco')
#     tokenizer = RegexpTokenizer(r'\w+')
#     tokens = tokenizer.tokenize(text)
    
#     stemmed = []
#     snowball = SnowballStemmer("english")
#     for item in tokens:
#         if item not in stop_words:
#             stemmed.append(snowball.stem(item))
        
#     lemmatized = []
#     wordnet = WordNetLemmatizer()
#     for item in stemmed:
#         lemmatized.append(wordnet.lemmatize(item))
    
#     return lemmatized
    

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.15 µs


## LemmaTokenizer

In [7]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    """Callable tokenizer: split text with ``word_tokenize`` and lemmatize each token with WordNet.

    Instances are passed as the ``tokenizer=`` argument of the vectorizers in this notebook.
    """

    def __init__(self):
        # One lemmatizer is created up front and reused for every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmatized tokens for the text ``articles``."""
        lemmas = []
        for token in word_tokenize(articles):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

## Split data - running NLP on the description column

In [8]:
def split_data(start_month, end_month):
    """Split ``df_sf_2017`` into a training window and a single test month.

    Training rows satisfy ``start_month <= month < end_month``; test rows have
    ``month == end_month``.

    Returns
    -------
    (df_X_train, y_train, df_X_test, y_test)
        The ``description_new`` and ``popular`` columns for the training rows,
        followed by the same two columns for the test rows.
    """
    months = df_sf_2017['month']
    in_train_window = (months >= start_month) & (months < end_month)
    in_test_month = months == end_month

    train_rows = df_sf_2017[in_train_window]
    test_rows = df_sf_2017[in_test_month]

    return (
        train_rows['description_new'],
        train_rows['popular'],
        test_rows['description_new'],
        test_rows['popular'],
    )

## Run CountVectorizer

In [9]:
%%time
def run_tf_vec(df_X_train, df_X_test):
    """Build bag-of-words count matrices for the training and test descriptions.

    A single CountVectorizer (lemmatizing tokenizer, English stop words) learns
    its vocabulary from the training text only; the same fitted vectorizer then
    encodes the test text, so both matrices share identical columns.

    Returns
    -------
    (X_train, X_test, tf_vectorizer_train)
        The two sparse count matrices and the fitted vectorizer.
    """
    tf_vectorizer_train = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english')
    # fit_transform tokenizes the training text once instead of once for fit and again for transform.
    X_train = tf_vectorizer_train.fit_transform(df_X_train)
    # Reusing the fitted vectorizer gives the same columns as a second vectorizer pinned to
    # the training vocabulary, without constructing and "fitting" one on the test data.
    X_test = tf_vectorizer_train.transform(df_X_test)
    return X_train, X_test, tf_vectorizer_train

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.91 µs


In [95]:
# Inspect the training matrix. NOTE(review): X_train is only assigned by the rolling loop in a later cell,
# so this display relies on that loop having been run already.
X_train

<26526x12784 sparse matrix of type '<class 'numpy.int64'>'
	with 1848230 stored elements in Compressed Sparse Row format>

In [10]:
def predict_tf_nb(X_train, y_train, X_test, y_test):
    """Fit a multinomial naive Bayes model and record its test metrics.

    The fitted model is pickled to ``nb_model<model_num>.p``. Accuracy, recall,
    precision and F1 are written to rows 0-3 of the module-level ``scores_tf_nb``
    array, in the column selected by the module-level ``model_num``.

    Returns
    -------
    numpy.ndarray
        The updated ``scores_tf_nb`` array (the same object, modified in place).
    """
    from sklearn.naive_bayes import MultinomialNB
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    # Use a context manager so the file handle is flushed and closed deterministically.
    with open('nb_model'+ str(model_num) + '.p', 'wb') as model_file:
        pickle.dump(nb, model_file)
    # Predict once and reuse the result for all four metrics.
    preds = nb.predict(X_test)
    scores_tf_nb[0][model_num] = accuracy_score(y_test, preds)
    scores_tf_nb[1][model_num] = recall_score(y_test, preds)
    scores_tf_nb[2][model_num] = precision_score(y_test, preds)
    scores_tf_nb[3][model_num] = f1_score(y_test, preds)
    return scores_tf_nb

## With the CountVectorizer, run with RandomForest 

In [11]:
def predict_tf_rf(X_train, y_train, X_test, y_test):
    """Fit a 10-tree random forest on the count features and record its test metrics.

    The fitted model is pickled to ``rf_nlp_countvec_50<model_num>.p``. Accuracy,
    recall, precision and F1 are written to rows 0-3 of the module-level
    ``scores_tf_rf`` array, in the column selected by the module-level ``model_num``.

    Returns
    -------
    numpy.ndarray
        The updated ``scores_tf_rf`` array (the same object, modified in place).
    """
    # NOTE(review): class 0 is weighted 0.95 and class 1 only 0.05 -- confirm this direction is intended.
    rf = RandomForestClassifier(n_estimators = 10, n_jobs = -1, random_state=0, class_weight = {0:.95, 1:.05})
    rf.fit(X_train, y_train)
    predicted = rf.predict(X_test)
    # Use a context manager so the file handle is flushed and closed deterministically.
    with open('rf_nlp_countvec_50' + str(model_num) + '.p', 'wb') as model_file:
        pickle.dump(rf, model_file)
    scores_tf_rf[0][model_num] = accuracy_score(y_test, predicted)
    scores_tf_rf[1][model_num] = recall_score(y_test, predicted)
    scores_tf_rf[2][model_num] = precision_score(y_test, predicted)
    scores_tf_rf[3][model_num] = f1_score(y_test, predicted)
    return scores_tf_rf

In [12]:
# Score tables for the count-vectorizer models: 4 rows (accuracy, recall, precision, F1) x 9 columns,
# one column per model_num.
scores_tf_nb = np.zeros(shape=(4,9))
scores_tf_rf = np.zeros(shape=(4,9))

# Module-level counters consumed by the rolling-window loop and, for model_num, by the predict_* functions.
model_num = 0
start_month = 1
end_month = 4

In [13]:
%%time
# Rolling evaluation: each pass trains on months [start_month, end_month) and tests on month end_month,
# then slides both bounds forward by one month. Starting from 1 and 4, the test months run 4 through 12.
# NOTE: the loop consumes the module-level counters; re-run the initialisation cell before re-running
# this one, otherwise end_month is already 13 and the body never executes.
while end_month <13:
    df_X_train, y_train, df_X_test, y_test = split_data(start_month, end_month)
    X_train, X_test, tf_vectorizer_train = run_tf_vec(df_X_train, df_X_test)
    # Both predict_* functions read the global model_num to choose the score column and the pickle file name.
    scores_tf_nb = predict_tf_nb(X_train, y_train, X_test, y_test)
    scores_tf_rf = predict_tf_rf(X_train, y_train, X_test, y_test)
    print(start_month, end_month, model_num)
    model_num += 1
    start_month += 1
    end_month += 1
    # Print the full score tables after every window; columns not yet filled are still zero.
    print('tf_nb')
    print(scores_tf_nb)
    print('tf_rf')
    print(scores_tf_rf)

1 4 0
tf_nb
[[0.88228532 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.70465995 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.66845878 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.68608216 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
tf_rf
[[0.95574204 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.81171285 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.93745455 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.87006412 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
2 5 1
tf_nb
[[0.88228532 0.88582226 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.70465995 0.71770636 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.6684

In [15]:
# Average each metric row of the count-vectorizer naive Bayes scores (rows: accuracy, recall, precision, F1).
accuracy_2017_tf_nb = scores_tf_nb[0].mean()
recall_2017_tf_nb = scores_tf_nb[1].mean()
precision_2017_tf_nb = scores_tf_nb[2].mean()
f1_score_2017_tf_nb = scores_tf_nb[3].mean()
print(accuracy_2017_tf_nb, recall_2017_tf_nb, precision_2017_tf_nb, f1_score_2017_tf_nb, sep='\n')

0.8742970560045102
0.7169107075713074
0.6929862495075171
0.7043799660889926


In [14]:
# Average each metric row of the count-vectorizer random forest scores (rows: accuracy, recall, precision, F1).
accuracy_2017_tf_rf = scores_tf_rf[0].mean()
recall_2017_tf_rf = scores_tf_rf[1].mean()
precision_2017_tf_rf = scores_tf_rf[2].mean()
f1_score_2017_tf_rf = scores_tf_rf[3].mean()
print(accuracy_2017_tf_rf, recall_2017_tf_rf, precision_2017_tf_rf, f1_score_2017_tf_rf, sep='\n')

0.9474790730435897
0.790758536328864
0.9494687554573628
0.8623895527424829


## Try running with TF-IDF

In [16]:
%%time
def run_tf_idf_vec(df_X_train, df_X_test):
    tf_idf_vectorizer_train = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words='english')
    X_train2 = tf_idf_vectorizer_train.fit_transform(df_X_train).toarray()
    tf_idf_vectorizer_test = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', vocabulary = tf_vectorizer_train.vocabulary_)
    X_test2 = tf_idf_vectorizer_test.fit_transform(df_X_test).toarray()
    return X_train2, X_test2, tf_idf_vectorizer_train

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.91 µs


In [76]:
def predict_tf_idf_nb(X_train2, y_train, X_test2, y_test):
    """Fit a Gaussian naive Bayes model on the TF-IDF features and record its test metrics.

    Accuracy, recall, precision and F1 go to rows 0-3 of the module-level
    ``scores_tf_idf_nb`` array, in the column selected by the module-level
    ``model_num``. The updated array is returned.
    """
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(X_train2, y_train)
    # pickle.dump(nb, open('nb_model_guassian.p', 'wb'))
    predictions = classifier.predict(X_test2)
    # The row index in the score table follows the order of this tuple.
    metric_functions = (accuracy_score, recall_score, precision_score, f1_score)
    for row, metric in enumerate(metric_functions):
        scores_tf_idf_nb[row][model_num] = metric(y_test, predictions)
    return scores_tf_idf_nb

## With the TF-IDF, run with RandomForest 

In [77]:
def predict_tf_idf_rf(X_train2, y_train, X_test2, y_test):
    """Fit a 10-tree random forest on the TF-IDF features and record its test metrics.

    Accuracy, recall, precision and F1 go to rows 0-3 of the module-level
    ``scores_tf_idf_rf`` array, in the column selected by the module-level
    ``model_num``. The updated array is returned.
    """
    forest = RandomForestClassifier(
        n_estimators = 10,
        n_jobs=-1,
        random_state=0,
        class_weight = {0:.95, 1:.05},
    )
    forest.fit(X_train2, y_train)
    predictions = forest.predict(X_test2)
#     pickle.dump(rf, open('rf_nlp_50.p', 'wb'))
    # The row index in the score table follows the order of this tuple.
    metric_functions = (accuracy_score, recall_score, precision_score, f1_score)
    for row, metric in enumerate(metric_functions):
        scores_tf_idf_rf[row][model_num] = metric(y_test, predictions)
    return scores_tf_idf_rf

In [78]:
# Score tables for the TF-IDF models: 4 rows (accuracy, recall, precision, F1) x 9 columns,
# one column per model_num.
scores_tf_idf_nb = np.zeros(shape=(4,9))
scores_tf_idf_rf = np.zeros(shape=(4,9))

# Reset the module-level counters before the TF-IDF rolling-window loop.
model_num = 0
start_month = 1
end_month = 4

In [79]:
%%time
# Rolling evaluation for the TF-IDF features: each pass trains on months [start_month, end_month)
# and tests on month end_month, then slides both bounds forward by one month.
# NOTE: the loop consumes the module-level counters; re-run the initialisation cell before re-running
# this one, otherwise end_month is already 13 and the body never executes.
while end_month <13:
    df_X_train, y_train, df_X_test, y_test = split_data(start_month, end_month) 
    X_train2, X_test2, tf_idf_vectorizer_train = run_tf_idf_vec(df_X_train, df_X_test)
    # Both predict_* functions read the global model_num to choose the score column.
    scores_tf_idf_nb = predict_tf_idf_nb(X_train2, y_train, X_test2, y_test)
    scores_tf_idf_rf = predict_tf_idf_rf(X_train2, y_train, X_test2, y_test)
    print(start_month, end_month, model_num)
    model_num += 1
    start_month += 1
    end_month += 1
    # Print the full score tables after every window; columns not yet filled are still zero.
    print('tf_idf_nb')
    print(scores_tf_idf_nb)
    print('tf_idf_rf')
    print(scores_tf_idf_rf)

1 4 0
tf_idf_nb
[[0.61328888 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.93702771 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.31313131 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.46940063 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
tf_idf_rf
[[0.94643062 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.76448363 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.9295559  0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.83897719 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
CPU times: user 6min 52s, sys: 3.94 s, total: 6min 56s
Wall time: 2min 24s


In [30]:
# Average each metric row of the TF-IDF naive Bayes scores (rows: accuracy, recall, precision, F1).
accuracy_2017_tf_idf_nb = scores_tf_idf_nb[0].mean()
recall_2017_tf_idf_nb = scores_tf_idf_nb[1].mean()
precision_2017_tf_idf_nb = scores_tf_idf_nb[2].mean()
f1_score_2017_tf_idf_nb = scores_tf_idf_nb[3].mean()
print(accuracy_2017_tf_idf_nb, recall_2017_tf_idf_nb, precision_2017_tf_idf_nb, f1_score_2017_tf_idf_nb, sep='\n')

0.6223984688066541
0.9360869500755025
0.3467272958014258
0.5052056786500755


In [31]:
# Average each metric row of the TF-IDF random forest scores (rows: accuracy, recall, precision, F1).
accuracy_2017_tf_idf_rf = scores_tf_idf_rf[0].mean()
recall_2017_tf_idf_rf = scores_tf_idf_rf[1].mean()
precision_2017_tf_idf_rf = scores_tf_idf_rf[2].mean()
f1_score_2017_tf_idf_rf = scores_tf_idf_rf[3].mean()
print(accuracy_2017_tf_idf_rf, recall_2017_tf_idf_rf, precision_2017_tf_idf_rf, f1_score_2017_tf_idf_rf, sep='\n')

0.9383264864764923
0.7493591861607887
0.9398408080684738
0.8334356589996533


In [None]:
# TODO: Do I need k-means, or just the top twenty words for popular and the top twenty words for not popular?

## Kmeans

In [172]:
from sklearn.cluster import KMeans
from collections import Counter
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram

In [222]:
# Vocabulary terms in column order, used to turn centroid column indices back into words.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# confirm the scikit-learn version this notebook is pinned to.
features =  tf_vectorizer_train.get_feature_names()

In [27]:
# Peek at fifty vocabulary terms (positions 150-199).
features[150:200]

['1300',
 '1300sqft',
 '1304',
 '1315',
 '132',
 '135',
 '1350',
 '135cm',
 '136',
 '137',
 '1370123',
 '13724120',
 '139',
 '1394',
 '1396',
 '13ft',
 '13in',
 '13lb',
 '13mn',
 '13th',
 '13x12',
 '14',
 '140',
 '1400',
 '1400ft²',
 '1400sf',
 '1408',
 '140sq',
 '140sqm',
 '14154147',
 '143',
 '144',
 '145',
 '1450',
 '14611926',
 '1474',
 '149',
 '14ft',
 '14l',
 '14min',
 '14r',
 '14sqm',
 '14th',
 '14x',
 '14x12',
 '15',
 '150',
 '1500',
 '1500ft',
 '1500sf']

In [223]:
%%time
# Cluster the rows of X_train into two groups.
# NOTE(review): X_train is whatever the most recently executed vectorizer loop left in the kernel --
# confirm which training window it corresponds to.
# NOTE(review): no random_state is passed, so the clustering can differ between runs.
# NOTE(review): the n_jobs argument was removed from KMeans in scikit-learn 1.0 -- confirm the pinned version.
kmeans = KMeans(n_clusters=2, n_jobs=-1)
kmeans.fit(X_train)

CPU times: user 988 ms, sys: 0 ns, total: 988 ms
Wall time: 2min 30s


In [224]:
%%time
# TF vectorizer
# argsort is ascending, so the reversed slice keeps, per centroid, the column indices of the
# 59 largest centre values, ordered from largest to smallest.
top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-60:-1]
print("top features (words) for each cluster:")
for num, centroid in enumerate(top_centroids):
    top_words = [features[i] for i in centroid]
    print("%d: %s" % (num, ", ".join(top_words)))


top features (words) for each cluster:
0: san, apartment, block, francisco, street, place, restaurant, neighborhood, park, kitchen, room, city, bedroom, walk, located, access, home, close, great, bed, ha, away, private, view, location, mission, space, sf, minute, 2, walking, bathroom, downtown, area, parking, house, bar, floor, distance, bus, bart, quiet, gate, easy, guest, available, district, public, golden, building, stay, 1, square, shop, just, unit, heart, youll, beautiful
1: room, bedroom, bed, kitchen, ha, bathroom, living, private, home, san, apartment, large, 2, access, francisco, house, floor, guest, street, queen, block, space, park, area, neighborhood, located, view, 1, city, restaurant, great, tv, parking, dining, quiet, walk, spacious, available, bath, shared, away, flat, beautiful, comfortable, sf, mission, garden, place, stay, size, 3, downtown, open, deck, easy, just, unit, close, new
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 2.05 ms


In [89]:
%%time
# TF-IDF vectorizer
# NOTE(review): this reuses the global `kmeans` and `features`; both must come from a TF-IDF run
# for the label above to hold -- confirm before relying on the output.
# The reversed slice keeps, per centroid, the column indices of the 49 largest centre values.
top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-50:-1]
print("top features (words) for each cluster:")
for num, centroid in enumerate(top_centroids):
    top_words = [features[i] for i in centroid]
    print("%d: %s" % (num, ", ".join(top_words)))


top features (words) for each cluster:
0: place, walk, locat, park, san, room, francisco, the, restaur, street, block, close, apart, kitchen, my, bedroom, citi, neighborhood, you, bed, we, i, great, access, sf, one, this, 2, view, travel, shop, privat, love, home, away, mission, downtown, 1, live, bart, bathroom, busi, space, minut, bar, hous, full, squar, good
1: room, the, bedroom, park, kitchen, bed, live, locat, san, apart, walk, francisco, privat, bathroom, home, one, 2, access, block, street, neighborhood, i, floor, this, hous, restaur, full, we, larg, 1, guest, space, two, citi, area, queen, view, great, there, you, away, size, comfort, it, quiet, shop, mission, avail, share
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.83 ms
