## Natural Language Processing
### Using the Bag of Words algorithm

In [118]:
# Load in the data THIS DATASET IS TAB SEPERATED
import os
import pandas as pd

DATASET_PATH = "datasets/"
DATASET_NAME = "Restaurant_Reviews.tsv"
DATASET_URL = DATASET_PATH + DATASET_NAME

def fetch_data(dataset_url=DATASET_URL, dataset_path=DATASET_PATH):
    if not os.path.isdir(dataset_path):
        os.makedirs(dataset_path)

dataset = fetch_data()

def load_data(dataset_path=DATASET_PATH, dataset_name=DATASET_NAME):
    csv_path = os.path.join(dataset_path, dataset_name)
    return pd.read_csv(csv_path, delimiter='\t', quoting = 3)

dataset = load_data()

In [3]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [32]:
index = dataset['Review'].shape
index = index[0]
print(index)

1000


In [119]:
# Any natural language processing algorithm needs to clean the dataset
import re
import nltk

nltk.download('stopwords') #stop words helps remove words that aren't considered 'relevant'
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

clean_dataset = []

for i in range(0, index):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = " ".join(review)
    clean_dataset.append(review)
print(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/werlingk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
wast enough life pour salt wound draw time took bring check


In [120]:
# Build out a big ol' bag of words model
#this tool can be used as a shortcut but I broke everything out above should I need to customize it later
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(clean_dataset).toarray()

In [37]:
X.shape

(1000, 1500)

In [121]:
y = dataset['Liked'].values
y.shape

(1000,)

In [122]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

In [123]:
# Create Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
nb_clf = MultinomialNB()
nb_clf.fit(X_train, y_train)

y_pred = nb_clf.predict(X_test)

In [124]:
# Check out the performance
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

cf_mat= confusion_matrix(y_test, y_pred)
tp = cf_mat[0,0]
tn = cf_mat[1,1]
fp = cf_mat[1,0]
fn = cf_mat[0,1]
total = tp+tn+fp+fn

print("Precision: %.2f" % precision_score(y_test, y_pred))
print('Recall: %.2f' % recall_score(y_test, y_pred))
print('F1 Score: %.2f' % f1_score(y_test, y_pred))
print("Percentage True Positive: %.2f" % ((tp/total)*100))
print("Percentage True Negative: %.2f" % ((tn/total)*100))
confusion_matrix(y_test, y_pred)

Precision: 0.76
Recall: 0.79
F1 Score: 0.78
Percentage True Positive: 36.00
Percentage True Negative: 40.50


array([[72, 25],
       [22, 81]])

In [125]:
acc_clf = round(nb_clf.score(X_train, y_train) * 100, 2)
print("Training set performance: %.2f" % acc_clf)

Training set performance: 93.88


In [126]:
acc_clf = round(nb_clf.score(X_test, y_test) * 100, 2)
print("Test set performance: %.2f" % acc_clf)

Test set performance: 76.50


In [96]:
# Try SVM
from sklearn.linear_model import SGDClassifier
sv_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
sv_clf.fit(X_train, y_train)

y_pred = sv_clf.predict(X_test)

In [97]:
# Check out the performance
cf_mat= confusion_matrix(y_test, y_pred)
tp = cf_mat[0,0]
tn = cf_mat[1,1]
fp = cf_mat[1,0]
fn = cf_mat[0,1]
total = tp+tn+fp+fn

print("Precision: %.2f" % precision_score(y_test, y_pred))
print('Recall: %.2f' % recall_score(y_test, y_pred))
print('F1 Score: %.2f' % f1_score(y_test, y_pred))
print("Percentage True Positive: %.2f" % ((tp/total)*100))
print("Percentage True Negative: %.2f" % ((tn/total)*100))
confusion_matrix(y_test, y_pred)

Precision: 0.70
Recall: 0.81
F1 Score: 0.75
Percentage True Positive: 30.50
Percentage True Negative: 41.50


array([[61, 36],
       [20, 83]])

In [98]:
acc_clf = round(sv_clf.score(X_train, y_train) * 100, 2)
print("Training set performance: %.2f" % acc_clf)

Training set performance: 94.25


In [99]:
acc_clf = round(sv_clf.score(X_test, y_test) * 100, 2)
print("Test set performance: %.2f" % acc_clf)

Test set performance: 72.00


In [87]:
# Create Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(criterion='entropy', random_state = 0, n_estimators=12)
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)

In [88]:
# Check out the performance
cf_mat= confusion_matrix(y_test, y_pred)
tp = cf_mat[0,0]
tn = cf_mat[1,1]
fp = cf_mat[1,0]
fn = cf_mat[0,1]
total = tp+tn+fp+fn

print("Precision: %.2f" % precision_score(y_test, y_pred))
print('Recall: %.2f' % recall_score(y_test, y_pred))
print('F1 Score: %.2f' % f1_score(y_test, y_pred))
print("Percentage True Positive: %.2f" % ((tp/total)*100))
print("Percentage True Negative: %.2f" % ((tn/total)*100))
confusion_matrix(y_test, y_pred)

Precision: 0.83
Recall: 0.57
F1 Score: 0.68
Percentage True Positive: 42.50
Percentage True Negative: 29.50


array([[85, 12],
       [44, 59]])

In [89]:
acc_clf = round(rf_clf.score(X_train, y_train) * 100, 2)
print("Training set performance: %.2f" % acc_clf)

Training set performance: 98.25


In [90]:
acc_clf = round(rf_clf.score(X_test, y_test) * 100, 2)
print("Test set performance: %.2f" % acc_clf)

Test set performance: 72.00


In [130]:
# Lets try a grid search for Naive Bayes since it was the most promising
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB()) ])

parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3) }

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)
y_pred = gs_clf.predict(X_test)

JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/usr/local/anaconda3/lib/python3.6/runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/usr/local/anaconda3/lib/python3.6/runpy.py in _run_code(code=<code object <module> at 0x10751fc90, file "/usr...3.6/site-packages/ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/usr/local/anaconda3/lib/python3.6/site-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/usr/local/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/usr/local/a.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x10751fc90, file "/usr...3.6/site-packages/ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/usr/local/anaconda3/lib/python3.6/site-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/usr/local/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/usr/local/a.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()
     17 
     18 
     19 
     20 

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    472             return self.subapp.start()
    473         if self.poller is not None:
    474             self.poller.start()
    475         self.kernel.start()
    476         try:
--> 477             ioloop.IOLoop.instance().start()
    478         except KeyboardInterrupt:
    479             pass
    480 
    481 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    172             )
    173         return loop
    174     
    175     def start(self):
    176         try:
--> 177             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    178         except ZMQError as e:
    179             if e.errno == ETERM:
    180                 # quietly return on ETERM
    181                 pass

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    883                 self._events.update(event_pairs)
    884                 while self._events:
    885                     fd, events = self._events.popitem()
    886                     try:
    887                         fd_obj, handler_func = self._handlers[fd]
--> 888                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    889                     except (OSError, IOError) as e:
    890                         if errno_from_exception(e) == errno.EPIPE:
    891                             # Happens when the client closes the connection
    892                             pass

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': '# Lets try a grid search for Naive Bayes since i...X_train, y_train)\ny_pred = gs_clf.predict(X_test)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 11, 2, 2, 49, 17, 603206, tzinfo=datetime.timezone.utc), 'msg_id': 'C8F6387707C3475F8B9A20A40EDBC0E7', 'msg_type': 'execute_request', 'session': 'B3E4845EBF624D6287377CEC056948A4', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': 'C8F6387707C3475F8B9A20A40EDBC0E7', 'msg_type': 'execute_request', 'parent_header': {}})
    230             self.log.warn("Unknown message type: %r", msg_type)
    231         else:
    232             self.log.debug("%s: %s", msg_type, msg)
    233             self.pre_handler_hook()
    234             try:
--> 235                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'B3E4845EBF624D6287377CEC056948A4']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': '# Lets try a grid search for Naive Bayes since i...X_train, y_train)\ny_pred = gs_clf.predict(X_test)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 11, 2, 2, 49, 17, 603206, tzinfo=datetime.timezone.utc), 'msg_id': 'C8F6387707C3475F8B9A20A40EDBC0E7', 'msg_type': 'execute_request', 'session': 'B3E4845EBF624D6287377CEC056948A4', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': 'C8F6387707C3475F8B9A20A40EDBC0E7', 'msg_type': 'execute_request', 'parent_header': {}}
    236             except Exception:
    237                 self.log.error("Exception in message handler:", exc_info=True)
    238             finally:
    239                 self.post_handler_hook()

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'B3E4845EBF624D6287377CEC056948A4'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': '# Lets try a grid search for Naive Bayes since i...X_train, y_train)\ny_pred = gs_clf.predict(X_test)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 11, 2, 2, 49, 17, 603206, tzinfo=datetime.timezone.utc), 'msg_id': 'C8F6387707C3475F8B9A20A40EDBC0E7', 'msg_type': 'execute_request', 'session': 'B3E4845EBF624D6287377CEC056948A4', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': 'C8F6387707C3475F8B9A20A40EDBC0E7', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='# Lets try a grid search for Naive Bayes since i...X_train, y_train)\ny_pred = gs_clf.predict(X_test)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    191 
    192         self._forward_input(allow_stdin)
    193 
    194         reply_content = {}
    195         try:
--> 196             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = '# Lets try a grid search for Naive Bayes since i...X_train, y_train)\ny_pred = gs_clf.predict(X_test)'
        store_history = True
        silent = False
    197         finally:
    198             self._restore_input()
    199 
    200         if res.error_before_exec is not None:

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=('# Lets try a grid search for Naive Bayes since i...X_train, y_train)\ny_pred = gs_clf.predict(X_test)',), **kwargs={'silent': False, 'store_history': True})
    528             )
    529         self.payload_manager.write_payload(payload)
    530 
    531     def run_cell(self, *args, **kwargs):
    532         self._last_traceback = None
--> 533         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ('# Lets try a grid search for Naive Bayes since i...X_train, y_train)\ny_pred = gs_clf.predict(X_test)',)
        kwargs = {'silent': False, 'store_history': True}
    534 
    535     def _showtraceback(self, etype, evalue, stb):
    536         # try to preserve ordering of tracebacks and print statements
    537         sys.stdout.flush()

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='# Lets try a grid search for Naive Bayes since i...X_train, y_train)\ny_pred = gs_clf.predict(X_test)', store_history=True, silent=False, shell_futures=True)
   2712                 self.displayhook.exec_result = result
   2713 
   2714                 # Execute the user code
   2715                 interactivity = "none" if silent else self.ast_node_interactivity
   2716                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2717                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2718                 
   2719                 self.last_execution_succeeded = not has_raised
   2720 
   2721                 # Reset this so later displayed values do not modify the

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>], cell_name='<ipython-input-130-8559eefa13d3>', interactivity='none', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 1152f10f0, execution_..._before_exec=None error_in_exec=None result=None>)
   2816 
   2817         try:
   2818             for i, node in enumerate(to_run_exec):
   2819                 mod = ast.Module([node])
   2820                 code = compiler(mod, cell_name, "exec")
-> 2821                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x115385420, file "<ipython-input-130-8559eefa13d3>", line 10>
        result = <ExecutionResult object at 1152f10f0, execution_..._before_exec=None error_in_exec=None result=None>
   2822                     return True
   2823 
   2824             for i, node in enumerate(to_run_interactive):
   2825                 mod = ast.Interactive([node])

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x115385420, file "<ipython-input-130-8559eefa13d3>", line 10>, result=<ExecutionResult object at 1152f10f0, execution_..._before_exec=None error_in_exec=None result=None>)
   2876         outflag = 1  # happens in more places, so it's easier as default
   2877         try:
   2878             try:
   2879                 self.hooks.pre_run_code_hook()
   2880                 #rprint('Running code', repr(code_obj)) # dbg
-> 2881                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x115385420, file "<ipython-input-130-8559eefa13d3>", line 10>
        self.user_global_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'DATASET_NAME': 'Restaurant_Reviews.tsv', 'DATASET_PATH': 'datasets/', 'DATASET_URL': 'datasets/Restaurant_Reviews.tsv', 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'GaussianNB': <class 'sklearn.naive_bayes.GaussianNB'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', "# Load in the data THIS DATASET IS TAB SEPERATED...(csv_path, delimiter='\\t')\n\ndataset = load_data()", "# Load in the data THIS DATASET IS TAB SEPERATED...limiter='\\t', quoting = 3)\n\ndataset = load_data()", 'dataset.head()', 'dataset.shape()', 'dataset.shape', "# Any natural language processing algorithm need...-zA-Z]', ' ', dataset['Review'][0])\nprint(review)", "# Any natural language processing algorithm need...eview'][0])\nreview = revier.lower()\nprint(review)", "# Any natural language processing algorithm need...eview'][0])\nreview = review.lower()\nprint(review)", "# Any natural language processing algorithm need...word in stopwords.words['english']]\nprint(review)", "# Any natural language processing algorithm need...word in stopwords.words['english']]\nprint(review)", "# Any natural language processing algorithm need...in set(stopwords.words['english'])]\nprint(review)", "# Any natural language processing algorithm need...in set(stopwords.words('english'))]\nprint(review)", "# Load in the data THIS DATASET IS TAB SEPERATED...limiter='\\t', quoting = 3)\n\ndataset = load_data()", "# Any natural language processing algorithm need...in set(stopwords.words('english'))]\nprint(review)", "# Any natural language processing algorithm need...in set(stopwords.words('english'))]\nprint(review)", "# Any natural language processing algorithm need...lish'))]\nreview = join(review, ' ')\nprint(review)", "# Any natural language processing algorithm need...h'))]\nreview = pd.join(review, ' ')\nprint(review)", '# Any natural language processing algorithm need...glish\'))]\nreview = " ".join(review)\nprint(review)', 'dataset.shape(axis=0)', ...], 'MultinomialNB': <class 'sklearn.naive_bayes.MultinomialNB'>, 'NamespaceMagics': <class 'IPython.core.magics.namespace.NamespaceMagics'>, ...}
        self.user_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'DATASET_NAME': 'Restaurant_Reviews.tsv', 'DATASET_PATH': 'datasets/', 'DATASET_URL': 'datasets/Restaurant_Reviews.tsv', 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'GaussianNB': <class 'sklearn.naive_bayes.GaussianNB'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', "# Load in the data THIS DATASET IS TAB SEPERATED...(csv_path, delimiter='\\t')\n\ndataset = load_data()", "# Load in the data THIS DATASET IS TAB SEPERATED...limiter='\\t', quoting = 3)\n\ndataset = load_data()", 'dataset.head()', 'dataset.shape()', 'dataset.shape', "# Any natural language processing algorithm need...-zA-Z]', ' ', dataset['Review'][0])\nprint(review)", "# Any natural language processing algorithm need...eview'][0])\nreview = revier.lower()\nprint(review)", "# Any natural language processing algorithm need...eview'][0])\nreview = review.lower()\nprint(review)", "# Any natural language processing algorithm need...word in stopwords.words['english']]\nprint(review)", "# Any natural language processing algorithm need...word in stopwords.words['english']]\nprint(review)", "# Any natural language processing algorithm need...in set(stopwords.words['english'])]\nprint(review)", "# Any natural language processing algorithm need...in set(stopwords.words('english'))]\nprint(review)", "# Load in the data THIS DATASET IS TAB SEPERATED...limiter='\\t', quoting = 3)\n\ndataset = load_data()", "# Any natural language processing algorithm need...in set(stopwords.words('english'))]\nprint(review)", "# Any natural language processing algorithm need...in set(stopwords.words('english'))]\nprint(review)", "# Any natural language processing algorithm need...lish'))]\nreview = join(review, ' ')\nprint(review)", "# Any natural language processing algorithm need...h'))]\nreview = pd.join(review, ' ')\nprint(review)", '# Any natural language processing algorithm need...glish\'))]\nreview = " ".join(review)\nprint(review)', 'dataset.shape(axis=0)', ...], 'MultinomialNB': <class 'sklearn.naive_bayes.MultinomialNB'>, 'NamespaceMagics': <class 'IPython.core.magics.namespace.NamespaceMagics'>, ...}
   2882             finally:
   2883                 # Reset our crash handler in place
   2884                 sys.excepthook = old_excepthook
   2885         except SystemExit as e:

...........................................................................
/Users/werlingk/AnacondaProjects/Language Processing/<ipython-input-130-8559eefa13d3> in <module>()
      5 text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB()) ])
      6 
      7 parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3) }
      8 
      9 gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
---> 10 gs_clf = gs_clf.fit(X_train, y_train)
     11 y_pred = gs_clf.predict(X_test)
     12 
     13 
     14 

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self=GridSearchCV(cv=None, error_score='raise',
     ...train_score=True,
       scoring=None, verbose=0), X=array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0,...0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64), y=array([1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,... 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1]), groups=None)
    940 
    941         groups : array-like, with shape (n_samples,), optional
    942             Group labels for the samples used while splitting the dataset into
    943             train/test set.
    944         """
--> 945         return self._fit(X, y, groups, ParameterGrid(self.param_grid))
        self._fit = <bound method BaseSearchCV._fit of GridSearchCV(...rain_score=True,
       scoring=None, verbose=0)>
        X = array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0,...0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
        y = array([1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,... 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1])
        groups = None
        self.param_grid = {'clf__alpha': (0.01, 0.001), 'tfidf__use_idf': (True, False), 'vect__ngram_range': [(1, 1), (1, 2)]}
    946 
    947 
    948 class RandomizedSearchCV(BaseSearchCV):
    949     """Randomized search on hyper parameters.

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _fit(self=GridSearchCV(cv=None, error_score='raise',
     ...train_score=True,
       scoring=None, verbose=0), X=array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0,...0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64), y=array([1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,... 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1]), groups=None, parameter_iterable=<sklearn.model_selection._search.ParameterGrid object>)
    559                                   fit_params=self.fit_params,
    560                                   return_train_score=self.return_train_score,
    561                                   return_n_test_samples=True,
    562                                   return_times=True, return_parameters=True,
    563                                   error_score=self.error_score)
--> 564           for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.model_selection._search.ParameterGrid object>
    565           for train, test in cv_iter)
    566 
    567         # if one choose to see train score, "out" will contain train score info
    568         if self.return_train_score:

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseSearchCV._fit.<locals>.<genexpr>>)
    763             if pre_dispatch == "all" or n_jobs == 1:
    764                 # The iterable was consumed all at once by the above for loop.
    765                 # No need to wait for async callbacks to trigger to
    766                 # consumption.
    767                 self._iterating = False
--> 768             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    769             # Make sure that we get a last message telling us we are done
    770             elapsed_time = time.time() - self._start_time
    771             self._print('Done %3i out of %3i | elapsed: %s finished',
    772                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Wed Nov  1 22:49:17 2017
PID: 39414                    Python 3.6.1: /usr/local/anaconda3/bin/python
...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (Pipeline(steps=[('vect', CountVectorizer(analyze...(alpha=0.01, class_prior=None, fit_prior=True))]), memmap([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0..., ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), array([1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,... 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1]), <function _passthrough_scorer>, array([265, 266, 270, 271, 272, 273, 274, 275, 2...90, 791, 792, 793, 794, 795, 796, 797, 798, 799]), array([  0,   1,   2,   3,   4,   5,   6,   7,  ...,
       260, 261, 262, 263, 264, 267, 268, 269]), 0, {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (Pipeline(steps=[('vect', CountVectorizer(analyze...(alpha=0.01, class_prior=None, fit_prior=True))]), memmap([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0..., ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), array([1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,... 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1]), <function _passthrough_scorer>, array([265, 266, 270, 271, 272, 273, 274, 275, 2...90, 791, 792, 793, 794, 795, 796, 797, 798, 799]), array([  0,   1,   2,   3,   4,   5,   6,   7,  ...,
       260, 261, 262, 263, 264, 267, 268, 269]), 0, {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=Pipeline(steps=[('vect', CountVectorizer(analyze...(alpha=0.01, class_prior=None, fit_prior=True))]), X=memmap([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0..., ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), y=array([1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,... 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1]), scorer=<function _passthrough_scorer>, train=array([265, 266, 270, 271, 272, 273, 274, 275, 2...90, 791, 792, 793, 794, 795, 796, 797, 798, 799]), test=array([  0,   1,   2,   3,   4,   5,   6,   7,  ...,
       260, 261, 262, 263, 264, 267, 268, 269]), verbose=0, parameters={'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}, fit_params={}, return_train_score=True, return_parameters=True, return_n_test_samples=True, return_times=True, error_score='raise')
    222     fit_params = fit_params if fit_params is not None else {}
    223     fit_params = dict([(k, _index_param_value(X, v, train))
    224                       for k, v in fit_params.items()])
    225 
    226     if parameters is not None:
--> 227         estimator.set_params(**parameters)
        estimator.set_params = <bound method Pipeline.set_params of Pipeline(st...alpha=0.01, class_prior=None, fit_prior=True))])>
        parameters = {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
    228 
    229     start_time = time.time()
    230 
    231     X_train, y_train = _safe_split(estimator, X, y, train)

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in set_params(self=Pipeline(steps=[('vect', CountVectorizer(analyze...(alpha=0.01, class_prior=None, fit_prior=True))]), **kwargs={'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)})
    175 
    176         Returns
    177         -------
    178         self
    179         """
--> 180         self._set_params('steps', **kwargs)
        self._set_params = <bound method _BasePipeline._set_params of Pipel...alpha=0.01, class_prior=None, fit_prior=True))])>
        kwargs = {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
    181         return self
    182 
    183     def _validate_steps(self):
    184         names, estimators = zip(*self.steps)

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _set_params(self=Pipeline(steps=[('vect', CountVectorizer(analyze...(alpha=0.01, class_prior=None, fit_prior=True))]), steps_attr='steps', **params={'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)})
     64         step_names, _ = zip(*getattr(self, steps_attr))
     65         for name in list(six.iterkeys(params)):
     66             if '__' not in name and name in step_names:
     67                 self._replace_step(steps_attr, name, params.pop(name))
     68         # 3. Step parameters and other initilisation arguments
---> 69         super(_BasePipeline, self).set_params(**params)
        self.set_params = <bound method Pipeline.set_params of Pipeline(st...alpha=0.01, class_prior=None, fit_prior=True))])>
        params = {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
     70         return self
     71 
     72     def _validate_names(self, names):
     73         if len(set(names)) != len(names):

...........................................................................
/usr/local/anaconda3/lib/python3.6/site-packages/sklearn/base.py in set_params(self=Pipeline(steps=[('vect', CountVectorizer(analyze...(alpha=0.01, class_prior=None, fit_prior=True))]), **params={'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)})
    277                 name, sub_name = split
    278                 if name not in valid_params:
    279                     raise ValueError('Invalid parameter %s for estimator %s. '
    280                                      'Check the list of available parameters '
    281                                      'with `estimator.get_params().keys()`.' %
--> 282                                      (name, self))
        name = 'tfidf'
        self = Pipeline(steps=[('vect', CountVectorizer(analyze...(alpha=0.01, class_prior=None, fit_prior=True))])
    283                 sub_object = valid_params[name]
    284                 sub_object.set_params(**{sub_name: value})
    285             else:
    286                 # simple objects case

ValueError: Invalid parameter tfidf for estimator Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))]). Check the list of available parameters with `estimator.get_params().keys()`.
___________________________________________________________________________