# NLP on neighborhood overview

In [4]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize          
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import time
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score

# Load data

In [2]:
df_sf_2017 = pd.read_json('df_sf_2017')

In [17]:
df_sf_2017.neighborhood_overview.fillna(value='None', inplace=True)

In [18]:
df_sf_2017["neighborhood_overview_new"] = df_sf_2017['neighborhood_overview'].str.replace('[^\w\s]','')

## LemmaTokenizer

In [20]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

## split data - running NLP on neighborhood overview column

In [21]:
def split_data(start_month, end_month):
    df_X_train = df_sf_2017[(df_sf_2017['month'] >= start_month) & (df_sf_2017['month'] < end_month)]['neighborhood_overview_new']
    y_train = df_sf_2017[(df_sf_2017['month'] >= start_month) & (df_sf_2017['month'] < end_month)]['popular']

    df_X_test = df_sf_2017[df_sf_2017['month'] == end_month]['neighborhood_overview_new']
    y_test = df_sf_2017[df_sf_2017['month'] == end_month]['popular']
    
    return df_X_train, y_train, df_X_test, y_test

## Run CountVectorizer

In [22]:
def run_tf_vec(df_X_train, df_X_test):
    tf_vectorizer_train = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english').fit(df_X_train)
    X_train = tf_vectorizer_train.transform(df_X_train)
    tf_vectorizer_test = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', vocabulary = tf_vectorizer_train.vocabulary_).fit(df_X_test)
    X_test = tf_vectorizer_test.transform(df_X_test)
    return X_train, X_test, tf_vectorizer_train

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.63 µs


In [24]:
def predict_tf_nb(X_train, y_train, X_test, y_test):
    from sklearn.naive_bayes import MultinomialNB
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    pickle.dump(nb, open('nb_model'+ str(model_num) + '.p', 'wb'))
    preds = nb.predict(X_test)
    scores_tf_nb[0][model_num] = accuracy_score(y_test, preds)
    scores_tf_nb[1][model_num] = recall_score(y_test, nb.predict(X_test))
    scores_tf_nb[2][model_num] = precision_score(y_test, nb.predict(X_test))
    scores_tf_nb[3][model_num] = f1_score(y_test, nb.predict(X_test))
    return scores_tf_nb

In [25]:
def grid_searching (param_grid, model):

    grid_search = GridSearchCV(model, 
                               param_grid=param_grid, cv=5, 
                               n_jobs=-1, scoring=make_scorer(f1_score))
    fit = grid_search.fit(X_train, y_train)
    predicted = fit.predict(X_test)
    return grid_search.best_params_


## With the CountVectorizer, run with RandomForest 

In [33]:
def predict_tf_rf(X_train, y_train, X_test, y_test):
#     param_grid = {'n_estimators': [500, 1000, 1500], 
#                   'max_features': ['auto'], 
#                   'max_depth': [None, 10, 5],
#                   'class_weight': [None, 'balanced']}

#     model = RandomForestClassifier()
    
#     best_parameters = grid_searching(param_grid, model)
#     rf = RandomForestClassifier(n_estimators = best_parameters['n_estimators'], 
#                                 n_jobs = -1, 
#                                 random_state = 0, 
#                                 max_features = ['auto'], 
#                                 max_depth = best_parameters['max_depth'], 
#                                 class_weight = best_parameters['class_weight'])
    
    rf = RandomForestClassifier(n_estimators = 500, 
                                n_jobs = -1, 
                                random_state = 0, 
                                max_depth = 10)
    rf.fit(X_train, y_train)
    predicted = rf.predict(X_test)
#     pickle.dump(rf, open('rf_nlp_countvec_50' + str(model_num) + '.p', 'wb'))
    scores_tf_rf[0][model_num] = accuracy_score(y_test, predicted)
    scores_tf_rf[1][model_num] = recall_score(y_test, predicted)
    scores_tf_rf[2][model_num] = precision_score(y_test, predicted)
    scores_tf_rf[3][model_num] = f1_score(y_test, predicted)
    return scores_tf_rf

In [27]:
scores_tf_nb = np.zeros(shape=(4,9))
scores_tf_rf = np.zeros(shape=(4,9))

model_num = 0
start_month = 1
end_month = 4

In [34]:
%%time
while end_month <13:
    df_X_train, y_train, df_X_test, y_test = split_data(start_month, end_month)
    X_train, X_test, tf_vectorizer_train = run_tf_vec(df_X_train, df_X_test)
    scores_tf_nb = predict_tf_nb(X_train, y_train, X_test, y_test)
    scores_tf_rf = predict_tf_rf(X_train, y_train, X_test, y_test)
    print(start_month, end_month, model_num)
    model_num += 1
    start_month += 1
    end_month += 1
    print('tf_nb')
    print(scores_tf_nb)
    print('tf_rf')
    print(scores_tf_rf)

1 4 0
tf_nb
[[0.86396593 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.40428212 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.73120729 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.52068127 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
tf_rf
[[0.8190816  0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.01070529 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.94444444 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.02117061 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
2 5 1
tf_nb
[[0.86396593 0.86616972 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.40428212 0.4057971  0.         0.         0.         0.
  0.         0.         0.        ]
 [0.7312

## COUNT VECTORIZER NAIVE BAYES SCORE

In [35]:
accuracy_2017_tf_nb = np.mean(scores_tf_nb[0])
recall_2017_tf_nb = np.mean(scores_tf_nb[1])
precision_2017_tf_nb = np.mean(scores_tf_nb[2])
f1_score_2017_tf_nb = np.mean(scores_tf_nb[3])
print(accuracy_2017_tf_nb)
print(recall_2017_tf_nb)
print(precision_2017_tf_nb)
print(f1_score_2017_tf_nb)

0.844590665167787
0.4053018089526731
0.7367201578300496
0.5222166918129059


## COUNT VECTORIZER RANDOM FOREST SCORE

In [36]:
accuracy_2017_tf_rf = np.mean(scores_tf_rf[0])
recall_2017_tf_rf = np.mean(scores_tf_rf[1])
precision_2017_tf_rf = np.mean(scores_tf_rf[2])
f1_score_2017_tf_rf = np.mean(scores_tf_rf[3])
print(accuracy_2017_tf_rf)
print(recall_2017_tf_rf)
print(precision_2017_tf_rf)
print(f1_score_2017_tf_rf)

0.7926353907420132
0.014053951831842569
0.9540243706910374
0.02764635476005918


## Try running with TF-IDF

In [37]:
def run_tf_idf_vec(df_X_train, df_X_test):
    tf_idf_vectorizer_train = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words='english')
    X_train2 = tf_idf_vectorizer_train.fit_transform(df_X_train).toarray()
    tf_idf_vectorizer_test = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', vocabulary = tf_idf_vectorizer_train.vocabulary_)
    X_test2 = tf_idf_vectorizer_test.fit_transform(df_X_test).toarray()
    return X_train2, X_test2, tf_idf_vectorizer_train

In [38]:
def predict_tf_idf_nb(X_train2, y_train, X_test2, y_test):
    from sklearn.naive_bayes import GaussianNB
    nb = GaussianNB()
    nb.fit(X_train2, y_train)
    # pickle.dump(nb, open('nb_model_guassian.p', 'wb'))
    preds = nb.predict(X_test2)
    scores_tf_idf_nb[0][model_num] = accuracy_score(y_test, preds)
    scores_tf_idf_nb[1][model_num] = recall_score(y_test, preds)
    scores_tf_idf_nb[2][model_num] = precision_score(y_test, preds)
    scores_tf_idf_nb[3][model_num] = f1_score(y_test, preds)
    return scores_tf_idf_nb

## With the TF-IDF, run with RandomForest 

In [39]:
def predict_tf_idf_rf(X_train2, y_train, X_test2, y_test):
    rf = RandomForestClassifier(n_estimators = 10, n_jobs=-1, random_state=0, class_weight = {0:.95, 1:.05})
    rf.fit(X_train2, y_train)
    predicted = rf.predict(X_test2)
#     pickle.dump(rf, open('rf_nlp_50.p', 'wb'))
    scores_tf_idf_rf[0][model_num] = accuracy_score(y_test, predicted)
    scores_tf_idf_rf[1][model_num] = recall_score(y_test, predicted)
    scores_tf_idf_rf[2][model_num] = precision_score(y_test, predicted)
    scores_tf_idf_rf[3][model_num] = f1_score(y_test, predicted)
    return scores_tf_idf_rf

In [40]:
scores_tf_idf_nb = np.zeros(shape=(4,9))
scores_tf_idf_rf = np.zeros(shape=(4,9))

model_num = 0
start_month = 1
end_month = 4

In [41]:
%%time
while end_month <13:
    df_X_train, y_train, df_X_test, y_test = split_data(start_month, end_month) 
    X_train2, X_test2, tf_idf_vectorizer_train = run_tf_idf_vec(df_X_train, df_X_test)
    scores_tf_idf_nb = predict_tf_idf_nb(X_train2, y_train, X_test2, y_test)
    scores_tf_idf_rf = predict_tf_idf_rf(X_train2, y_train, X_test2, y_test)
    print(start_month, end_month, model_num)
    model_num += 1
    start_month += 1
    end_month += 1
    print('tf_idf_nb')
    print(scores_tf_idf_nb)
    print('tf_idf_rf')
    print(scores_tf_idf_rf)

1 4 0
tf_idf_nb
[[0.41213028 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.96284635 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.23244147 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.37447955 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
tf_idf_rf
[[0.9143745  0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.57241814 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.93326489 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.70960187 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
2 5 1
tf_idf_nb
[[0.41213028 0.40630734 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.96284635 0.96408318 0.         0.         0.         0.
  0.         0.         0.      

## TF IDF NAIVE BAYES SCORE

In [42]:
accuracy_2017_tf_idf_nb = np.mean(scores_tf_idf_nb[0])
recall_2017_tf_idf_nb = np.mean(scores_tf_idf_nb[1])
precision_2017_tf_idf_nb = np.mean(scores_tf_idf_nb[2])
f1_score_2017_tf_idf_nb = np.mean(scores_tf_idf_nb[3])
print(accuracy_2017_tf_idf_nb)
print(recall_2017_tf_idf_nb)
print(precision_2017_tf_idf_nb)
print(f1_score_2017_tf_idf_nb)

0.4311131750822347
0.9656682252696371
0.26555770016660807
0.41544440641100033


## TF IDF RANDOM FOREST SCORE

In [43]:
accuracy_2017_tf_idf_rf = np.mean(scores_tf_idf_rf[0])
recall_2017_tf_idf_rf = np.mean(scores_tf_idf_rf[1])
precision_2017_tf_idf_rf = np.mean(scores_tf_idf_rf[2])
f1_score_2017_tf_idf_rf = np.mean(scores_tf_idf_rf[3])
print(accuracy_2017_tf_idf_rf)
print(recall_2017_tf_idf_rf)
print(precision_2017_tf_idf_rf)
print(f1_score_2017_tf_idf_rf)

0.8999882714822403
0.5622184994071395
0.9341822070758552
0.7013918982999527
