# NLP on description column, Random Forest was the best model

In [19]:
import random
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from datetime import datetime as dt
from sklearn.model_selection import GridSearchCV
from nltk.tokenize import RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import time
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, make_scorer, precision_score, f1_score

pd.set_option("display.max_colwidth",999)
pd.set_option("display.max_rows",999)
pd.set_option("display.max_columns",999)
np.set_printoptions(suppress=True)

In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [62]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [86]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Load data

In [3]:
df_sf_2017 = pd.read_json('df_sf_2017')

In [5]:
df_sf_2017.description.fillna(value='None', inplace=True)

In [6]:
df_sf_2017["description_new"] = df_sf_2017['description'].str.replace('[^\w\s]','')

## LemmaTokenizer

In [7]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

## split data - running NLP on description column

In [8]:
def split_data(start_month, end_month):
    df_X_train = df_sf_2017[(df_sf_2017['month'] >= start_month) & (df_sf_2017['month'] < end_month)]['description_new']
    y_train = df_sf_2017[(df_sf_2017['month'] >= start_month) & (df_sf_2017['month'] < end_month)]['popular']

    df_X_test = df_sf_2017[df_sf_2017['month'] == end_month]['description_new']
    y_test = df_sf_2017[df_sf_2017['month'] == end_month]['popular']
    
    return df_X_train, y_train, df_X_test, y_test

## Run CountVectorizer

In [9]:
def run_tf_vec(df_X_train, df_X_test, model_num):
    tf_vectorizer_train = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english').fit(df_X_train)
    X_train = tf_vectorizer_train.transform(df_X_train)
    
    tf_vectorizer_test = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', vocabulary = tf_vectorizer_train.vocabulary_).fit(df_X_test)
    X_test = tf_vectorizer_test.transform(df_X_test)
    
    pickle.dump(tf_vectorizer_test, open('tf_vec_descrip'+ str(model_num) + '.p', 'wb'))
    
    return X_train, X_test, tf_vectorizer_train

In [53]:
X_train

<26498x20903 sparse matrix of type '<class 'numpy.int64'>'
	with 1649921 stored elements in Compressed Sparse Row format>

In [10]:
def predict_tf_nb(X_train, y_train, X_test, y_test):
    from sklearn.naive_bayes import MultinomialNB
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    pickle.dump(nb, open('tf_nb_descrip'+ str(model_num) + '.p', 'wb'))
    preds = nb.predict(X_test)
    scores_tf_nb[0][model_num] = accuracy_score(y_test, preds)
    scores_tf_nb[1][model_num] = recall_score(y_test, nb.predict(X_test))
    scores_tf_nb[2][model_num] = precision_score(y_test, nb.predict(X_test))
    scores_tf_nb[3][model_num] = f1_score(y_test, nb.predict(X_test))
    return scores_tf_nb

In [11]:
def grid_searching (param_grid, model):

    grid_search = GridSearchCV(model, 
                               param_grid=param_grid, cv=5, 
                               n_jobs=-1, scoring=make_scorer(f1_score))
    fit = grid_search.fit(X_train, y_train)
    predicted = fit.predict(X_test)
    return grid_search.best_params_


## With the CountVectorizer, run with RandomForest 

In [21]:
def predict_tf_rf(X_train, y_train, X_test, y_test):
    param_grid = {'n_estimators': [500, 1000], 
                  'class_weight': [None, {0: .8, 1: .2}, {0: .9, 1: .1}]}

    model = RandomForestClassifier()
    
    best_parameters = grid_searching(param_grid, model)
    print(best_parameters)
    rf = RandomForestClassifier(n_estimators = best_parameters['n_estimators'], 
                                n_jobs = -1, 
                                random_state = 0, 
                                class_weight = best_parameters['class_weight'])
    
#     rf = RandomForestClassifier(n_estimators = 500, 
#                                 n_jobs = -1, 
#                                 random_state = 0, 
#                                 max_depth = None, 
#                                 class_weight = {0: .9, 1: .1})
    
    
    rf.fit(X_train, y_train)
    predicted = rf.predict(X_test)
#     pickle.dump(rf, open('tf_rf_descrip' + str(model_num) + '.p', 'wb'))
    scores_tf_rf[0][model_num] = accuracy_score(y_test, predicted)
    scores_tf_rf[1][model_num] = recall_score(y_test, predicted)
    scores_tf_rf[2][model_num] = precision_score(y_test, predicted)
    scores_tf_rf[3][model_num] = f1_score(y_test, predicted)
    return scores_tf_rf

In [22]:
scores_tf_nb = np.zeros(shape=(4,9))
scores_tf_rf = np.zeros(shape=(4,9))

model_num = 0
start_month = 1
end_month = 4

In [None]:
%%time
while end_month <13:
    df_X_train, y_train, df_X_test, y_test = split_data(start_month, end_month)
    X_train, X_test, tf_vectorizer_train = run_tf_vec(df_X_train, df_X_test, model_num)
#     scores_tf_nb = predict_tf_nb(X_train, y_train, X_test, y_test)
    scores_tf_rf = predict_tf_rf(X_train, y_train, X_test, y_test)
    print(start_month, end_month, model_num)
    model_num += 1
    start_month += 1
    end_month += 1
    print('tf_nb')
    print(scores_tf_nb)
    print('tf_rf')
    print(scores_tf_rf)

{'class_weight': None, 'n_estimators': 500}
1 4 0
tf_nb
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]
tf_rf
[[0.96179077 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.85292186 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.92324094 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.88668942 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
{'class_weight': None, 'n_estimators': 500}
2 5 1
tf_nb
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]
tf_rf
[[0.96179077 0.96020642 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.85292186 0.82776686 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.92324094 0.93768546 0.         0.         0.         

In [56]:
%%time
while end_month <13:
    df_X_train, y_train, df_X_test, y_test = split_data(start_month, end_month)
    X_train, X_test, tf_vectorizer_train = run_tf_vec(df_X_train, df_X_test, model_num)
#     scores_tf_nb = predict_tf_nb(X_train, y_train, X_test, y_test)
    scores_tf_rf = predict_tf_rf(X_train, y_train, X_test, y_test)
    print(start_month, end_month, model_num)
    model_num += 1
    start_month += 1
    end_month += 1
    print('tf_nb')
    print(scores_tf_nb)
    print('tf_rf')
    print(scores_tf_rf)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'class_weight': None, 'max_depth': None, 'n_estimators': 1000}
1 4 0
tf_nb
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]
tf_rf
[[0.96121533 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.84701248 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.92539455 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.88447035 0.         0.         0.         0.         0.
  0.         0.         0.        ]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'class_weight': None, 'max_depth': None, 'n_estimators': 500}
2 5 1
tf_nb
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]
tf_rf
[[0.96121533 0.96020642 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.84701248 0.82776686 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.92539455 0.93768546 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.88447035 0.87930435 0.         0.         0.         0.
  0.         0.         0.        ]]
{'class_weight': None, 'max_depth': None, 'n_estimators': 500}
3 6 2
tf_nb
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]
tf_rf
[[0.96121533 0.96020642 0.95980414 0.         0.         0.
  0.         0.         0.        ]
 [0.84701248 0.82776686 0.83676012 0.         0.         0.
  0.         0.         0.        ]
 [0.92539455 0.9376

KeyboardInterrupt: 

## COUNT VECTORIZER NAIVE BAYES SCORE

In [15]:
accuracy_2017_tf_nb = np.mean(scores_tf_nb[0])
recall_2017_tf_nb = np.mean(scores_tf_nb[1])
precision_2017_tf_nb = np.mean(scores_tf_nb[2])
f1_score_2017_tf_nb = np.mean(scores_tf_nb[3])
print(accuracy_2017_tf_nb)
print(recall_2017_tf_nb)
print(precision_2017_tf_nb)
print(f1_score_2017_tf_nb)

0.8742970560045102
0.7169107075713074
0.6929862495075171
0.7043799660889926


## COUNT VECTORIZER RANDOM FOREST SCORE

In [14]:
accuracy_2017_tf_rf = np.mean(scores_tf_rf[0])
recall_2017_tf_rf = np.mean(scores_tf_rf[1])
precision_2017_tf_rf = np.mean(scores_tf_rf[2])
f1_score_2017_tf_rf = np.mean(scores_tf_rf[3])
print(accuracy_2017_tf_rf)
print(recall_2017_tf_rf)
print(precision_2017_tf_rf)
print(f1_score_2017_tf_rf)

0.9474790730435897
0.790758536328864
0.9494687554573628
0.8623895527424829


## Try running with TF-IDF

In [74]:
def run_tf_idf_vec(df_X_train, df_X_test):
    tf_idf_vectorizer_train = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words='english')
    X_train2 = tf_idf_vectorizer_train.fit_transform(df_X_train).toarray()
    tf_idf_vectorizer_test = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', vocabulary = tf_idf_vectorizer_train.vocabulary_)
    X_test2 = tf_idf_vectorizer_test.fit_transform(df_X_test).toarray()
    return X_train2, X_test2, tf_idf_vectorizer_train

In [75]:
def predict_tf_idf_nb(X_train2, y_train, X_test2, y_test):
    from sklearn.naive_bayes import GaussianNB
    nb = GaussianNB()
    nb.fit(X_train2, y_train)
    pickle.dump(nb, open('tf_idf_nb_descrip' + str(model_num) + '.p', 'wb'))
    preds = nb.predict(X_test2)
    scores_tf_idf_nb[0][model_num] = accuracy_score(y_test, preds)
    scores_tf_idf_nb[1][model_num] = recall_score(y_test, preds)
    scores_tf_idf_nb[2][model_num] = precision_score(y_test, preds)
    scores_tf_idf_nb[3][model_num] = f1_score(y_test, preds)
    return scores_tf_idf_nb, nb

In [76]:
def predict_tf_idf_rf(X_train2, y_train, X_test2, y_test):
    rf = RandomForestClassifier(n_estimators = 10, n_jobs=-1, random_state=0, class_weight = {0:.95, 1:.05})
    rf.fit(X_train2, y_train)
    predicted = rf.predict(X_test2)
    pickle.dump(rf, open('tf_idf_rf_descrip'+ str(model_num) + '.p', 'wb'))
    scores_tf_idf_rf[0][model_num] = accuracy_score(y_test, predicted)
    scores_tf_idf_rf[1][model_num] = recall_score(y_test, predicted)
    scores_tf_idf_rf[2][model_num] = precision_score(y_test, predicted)
    scores_tf_idf_rf[3][model_num] = f1_score(y_test, predicted)
    return scores_tf_idf_rf, rf

In [77]:
scores_tf_idf_nb = np.zeros(shape=(4,9))
scores_tf_idf_rf = np.zeros(shape=(4,9))

model_num = 0
start_month = 1
end_month = 4

In [78]:
%%time
while end_month <13:
    df_X_train, y_train, df_X_test, y_test = split_data(start_month, end_month) 
    X_train2, X_test2, tf_idf_vectorizer_train = run_tf_idf_vec(df_X_train, df_X_test)
    scores_tf_idf_nb, nb = predict_tf_idf_nb(X_train2, y_train, X_test2, y_test)
    scores_tf_idf_rf, rf = predict_tf_idf_rf(X_train2, y_train, X_test2, y_test)
    print(start_month, end_month, model_num)
    model_num += 1
    start_month += 1
    end_month += 1
    print('tf_idf_nb')
    print(scores_tf_idf_nb)
    print('tf_idf_rf')
    print(scores_tf_idf_rf)

1 4 0
tf_idf_nb
[[0.73403153 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.93237032 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.39140022 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.55134925 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
tf_idf_rf
[[0.94936126 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.76493762 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.93424218 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.84115523 0.         0.         0.         0.         0.
  0.         0.         0.        ]]
2 5 1
tf_idf_nb
[[0.73403153 0.73084862 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.93237032 0.9194499  0.         0.         0.         0.
  0.         0.         0.      

## TF IDF NAIVE BAYES SCORE

In [80]:
accuracy_2017_tf_idf_nb = np.mean(scores_tf_idf_nb[0])
recall_2017_tf_idf_nb = np.mean(scores_tf_idf_nb[1])
precision_2017_tf_idf_nb = np.mean(scores_tf_idf_nb[2])
f1_score_2017_tf_idf_nb = np.mean(scores_tf_idf_nb[3])
print(accuracy_2017_tf_idf_nb)
print(recall_2017_tf_idf_nb)
print(precision_2017_tf_idf_nb)
print(f1_score_2017_tf_idf_nb)

0.7404176382546709
0.9231443684365492
0.4319956286015536
0.5876026924060398


## TF IDF RANDOM FOREST SCORE

In [81]:
accuracy_2017_tf_idf_rf = np.mean(scores_tf_idf_rf[0])
recall_2017_tf_idf_rf = np.mean(scores_tf_idf_rf[1])
precision_2017_tf_idf_rf = np.mean(scores_tf_idf_rf[2])
f1_score_2017_tf_idf_rf = np.mean(scores_tf_idf_rf[3])
print(accuracy_2017_tf_idf_rf)
print(recall_2017_tf_idf_rf)
print(precision_2017_tf_idf_rf)
print(f1_score_2017_tf_idf_rf)

0.9402834453899148
0.7532084507262276
0.9364673471514087
0.8344959114674015
