In [26]:
import pandas as pd
import numpy as np
import pickle
from datetime import datetime as dt
from sklearn.ensemble import RandomForestClassifier

pd.set_option("display.max_colwidth",999)
pd.set_option("display.max_rows",999)
pd.set_option("display.max_columns",999)

In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [62]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [86]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Load pickle data

In [7]:
df_sf_2017 = pickle.load(open('data_sf_2017.p', 'rb'))

In [22]:
len(df_sf_2017)

103956

# NLP

In [20]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import time
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score

In [9]:
df_sf_2017.description.fillna(value='None', inplace=True)

In [10]:
%%time

def process_text(text):
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    
    stemmed = []
    snowball = SnowballStemmer("english")
    for item in tokens:
        stemmed.append(snowball.stem(item))
        
    lemmatized = []
    wordnet = WordNetLemmatizer()
    for item in stemmed:
        lemmatized.append(wordnet.lemmatize(item))
    
    return lemmatized
    

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.68 µs


## split data - running NLP on description column

In [11]:
df_X_train = df_sf_2017[(df_sf_2017['month'] >= 1) & (df_sf_2017['month'] < 4)]['description']
y_train = df_sf_2017[(df_sf_2017['month'] >= 1) & (df_sf_2017['month'] < 4)]['popular']

df_X_test = df_sf_2017[df_sf_2017['month'] == 4]['description']
y_test = df_sf_2017[df_sf_2017['month'] == 4]['popular']

## Run CountVectorizer

In [15]:
%%time

tf_vectorizer_train = CountVectorizer(analyzer=process_text).fit(df_X_train)
X_train = tf_vectorizer_train.transform(df_X_train)


CPU times: user 1min 56s, sys: 68 ms, total: 1min 56s
Wall time: 1min 56s


In [16]:
%%time
tf_vectorizer_test = CountVectorizer(analyzer=process_text, vocabulary=tf_vectorizer_train.vocabulary_).fit(df_X_test)
X_test = tf_vectorizer_test.transform(df_X_test)

CPU times: user 38.3 s, sys: 0 ns, total: 38.3 s
Wall time: 38.3 s


In [None]:
# vect = CountVectorizer(stop_words='english',lowercase=True, ngram_range=(1,2)) 

In [181]:
tf_vectorizer

CountVectorizer(analyzer=<function process_text at 0x7f57a9f9a510>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [17]:
X_test.shape

(8700, 12804)

In [18]:
X_train.shape

(26530, 12804)

In [21]:
# def predict_data(X_train, y_train, X_test, y_test):
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)
# pickle.dump(nb, open('nb_model.p', 'wb'))
preds = nb.predict(X_test)
print(accuracy_score(y_test, preds))
print(recall_score(y_test, nb.predict(X_test)))
print(precision_score(y_test, nb.predict(X_test)))
print(f1_score(y_test, nb.predict(X_test)))

0.8544827586206897
0.663727959697733
0.5901455767077267
0.6247777119146414


## read pickle file on multinomialNB if needed

In [37]:
# nb = pickle.load(open('nb_model.p', 'rb'))

## With the CountVectorizer, run with RandomForest 

In [28]:
rf = RandomForestClassifier(n_estimators = 50, random_state=0, class_weight = {0:.95, 1:.05})
rf.fit(X_train, y_train)
predicted = rf.predict(X_test)
pickle.dump(rf, open('rf_nlp_countvec_50.p', 'wb'))
print(accuracy_score(y_test, predicted))
print(recall_score(y_test, rf.predict(X_test)))
print(precision_score(y_test, rf.predict(X_test)))
print(f1_score(y_test, rf.predict(X_test)))

0.9570114942528736
0.8035264483627204
0.953662182361734
0.8721804511278195


## Try running with TF-IDF

In [29]:
%%time
tf_idf_vectorizer_train = TfidfVectorizer(analyzer=process_text)
X_train2 = tf_idf_vectorizer_train.fit_transform(df_X_train).toarray()

CPU times: user 58.4 s, sys: 528 ms, total: 59 s
Wall time: 59 s


In [30]:
%%time
tf_idf_vectorizer_test = TfidfVectorizer(analyzer=process_text, vocabulary=tf_idf_vectorizer_train.vocabulary_)
X_test2 = tf_idf_vectorizer_test.fit_transform(df_X_test).toarray()


CPU times: user 19.3 s, sys: 192 ms, total: 19.5 s
Wall time: 19.5 s


In [31]:
X_test2.shape

(8700, 12804)

In [32]:
X_train2.shape

(26530, 12804)

In [34]:
# def predict_data(X_train, y_train, X_test, y_test):
from sklearn.naive_bayes import MultinomialNB, GaussianNB
nb = GaussianNB()
nb.fit(X_train2, y_train)
# pickle.dump(nb, open('nb_model_guassian.p', 'wb'))
preds = nb.predict(X_test2)
print(accuracy_score(y_test, preds))
print(recall_score(y_test, nb.predict(X_test2)))
print(precision_score(y_test, nb.predict(X_test2)))
print(f1_score(y_test, nb.predict(X_test2)))

0.6135632183908046
0.9370277078085643
0.31326315789473685
0.4695487535500158


In [36]:
# print('Shape of Sparse Matrix: ', X.shape)
# print('Amount of Non-Zero occurrences: ', X.nnz)
# # Percentage of non-zero values
# density = (100.0 * X.nnz / (X.shape[0] * X.shape[1]))
# print('Density: {}'.format((density)))

## With the TF-IDF, run with RandomForest 

In [None]:
rf = RandomForestClassifier(n_estimators = 200, random_state=0, class_weight = {0:.95, 1:.05})
rf.fit(X_train2, y_train)
predicted = rf.predict(X_test2)
pickle.dump(rf, open('rf_nlp_200.p', 'wb'))
print(accuracy_score(y_test, predicted))
print(recall_score(y_test, rf.predict(X_test2)))
print(precision_score(y_test, rf.predict(X_test2)))
print(f1_score(y_test, rf.predict(X_test2)))