# Fake News Detection with NLP

## Initial data reading and preparing

In [21]:
import numpy as np
import pandas as pd

Loading Dataset

In [22]:
train = pd.read_csv('fake_or_real_news_training.csv', index_col="ID", encoding='utf-8')
test = pd.read_csv('fake_or_real_news_test.csv', index_col="ID" , encoding='utf-8')

Taking a look to the data

In [23]:
train.head()

Unnamed: 0_level_0,title,text,label,X1,X2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,,
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,,
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,,
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,,
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,,


In [24]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3999 entries, 8476 to 9673
Data columns (total 5 columns):
title    3999 non-null object
text     3999 non-null object
label    3999 non-null object
X1       33 non-null object
X2       2 non-null object
dtypes: object(5)
memory usage: 187.5+ KB


### Joining available data
Let's join title and text fields

In [35]:
df = train.title + str(' ') + train.text
df.head()

ID
8476     You Can Smell Hillary’s Fear Daniel Greenfield...
10294    Watch The Exact Moment Paul Ryan Committed Pol...
3608     Kerry to go to Paris in gesture of sympathy U....
10142    Bernie supporters on Twitter erupt in anger ag...
875      The Battle of New York: Why This Primary Matte...
dtype: object

## Preprocessing and building the term document matrix

### Tokenization and Lemmatization

In [36]:
import nltk
nltk.download()
nltk.download('wordnet')
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/javiergranda/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Term document matrix

Building the term document matrix using sklearn CountVectorizer. Also removing stop words and combining bi-grams to capture word relationships. Also using previously defined tokenizer.

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', ngram_range=(1, 2)) # remove english stop words and applying bi-grams

X_train_counts = count_vect.fit_transform(df) # using the text and title from training set

X_train_counts.shape # take a look at how many featuress where created

(3999, 1002421)

## Removing sparcity with TF-IDF

Using the tf–idf transform to re-weight the count features into floating point values suitable to use with the classifier

In [38]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

## Naive Bayes

Building the classifier

In [40]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train_tfidf, train.label)

### Cross-validation of the Naive Bayes classifier

In [41]:
import warnings; warnings.simplefilter('ignore')
from sklearn.model_selection import cross_val_score

cv_score_nb = cross_val_score(
    clf,
    X_train_tfidf,
    train.label,
    cv = 5,
    n_jobs = -1)

print('Train - Naive Bayes cross validated score is: '+ str(np.mean(cv_score_nb)))

Train - Naive Bayes cross validated score is: 0.7975685067801546


# ****** THEO'S MAX ENTROPY CLASSIFIER ********** ######

In [45]:
### CODE HERE......

## SVM on regularized linear models with stochastic gradient descent (SGD) learning

In [46]:
from sklearn.linear_model import SGDClassifier

svm_clf = SGDClassifier(
    loss='perceptron', 
    penalty='l2',
    alpha=1e-3,
    max_iter=5,
    random_state=42).fit(X_train_tfidf, train.label)

In [47]:
import warnings; warnings.simplefilter('ignore')

cv_score_svm = cross_val_score(
    svm_clf,
    X_train_tfidf,
    train.label,
    cv = 5,
    n_jobs = -1)

print('Train - SVM cross validated score is: '+ str(np.mean(cv_score_svm)))

Train - SVM cross validated score is: 0.930557126977132


In [276]:
# FIX GRADIENT SEARCH
from sklearn.model_selection import GridSearchCV
params_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),}

gs_clf_svm = GridSearchCV(svm_clf, params_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train_tfidf, train.label)
gs_clf_svm.best_score_
gs_clf_svm.best_params_

JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/runpy.py in _run_code(code=<code object <module> at 0x10c03d5d0, file "/Use...3.6/site-packages/ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/Users/javiergranda/anaconda3/lib/python3.6/site...ges/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/Users/javiergranda/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/Users/javie.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x10c03d5d0, file "/Use...3.6/site-packages/ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/Users/javiergranda/anaconda3/lib/python3.6/site...ges/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/Users/javiergranda/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/Users/javie.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    481         if self.poller is not None:
    482             self.poller.start()
    483         self.kernel.start()
    484         self.io_loop = ioloop.IOLoop.current()
    485         try:
--> 486             self.io_loop.start()
        self.io_loop.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    487         except KeyboardInterrupt:
    488             pass
    489 
    490 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    172             )
    173         return loop
    174     
    175     def start(self):
    176         try:
--> 177             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    178         except ZMQError as e:
    179             if e.errno == ETERM:
    180                 # quietly return on ETERM
    181                 pass

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    883                 self._events.update(event_pairs)
    884                 while self._events:
    885                     fd, events = self._events.popitem()
    886                     try:
    887                         fd_obj, handler_func = self._handlers[fd]
--> 888                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    889                     except (OSError, IOError) as e:
    890                         if errno_from_exception(e) == errno.EPIPE:
    891                             # Happens when the client closes the connection
    892                             pass

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn.model_selection import GridSearchCV...l)\ngs_clf_svm.best_score_\ngs_clf_svm.best_params_', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 5, 20, 15, 47, 28, 487587, tzinfo=tzutc()), 'msg_id': '4cbe275b52dc1b9565c4d10689f545ce', 'msg_type': 'execute_request', 'session': '1994415af49bb9e6d9bf2537966d1ae9', 'username': '', 'version': '5.2'}, 'metadata': {}, 'msg_id': '4cbe275b52dc1b9565c4d10689f545ce', 'msg_type': 'execute_request', 'parent_header': {}})
    228             self.log.warn("Unknown message type: %r", msg_type)
    229         else:
    230             self.log.debug("%s: %s", msg_type, msg)
    231             self.pre_handler_hook()
    232             try:
--> 233                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'1994415af49bb9e6d9bf2537966d1ae9']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn.model_selection import GridSearchCV...l)\ngs_clf_svm.best_score_\ngs_clf_svm.best_params_', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 5, 20, 15, 47, 28, 487587, tzinfo=tzutc()), 'msg_id': '4cbe275b52dc1b9565c4d10689f545ce', 'msg_type': 'execute_request', 'session': '1994415af49bb9e6d9bf2537966d1ae9', 'username': '', 'version': '5.2'}, 'metadata': {}, 'msg_id': '4cbe275b52dc1b9565c4d10689f545ce', 'msg_type': 'execute_request', 'parent_header': {}}
    234             except Exception:
    235                 self.log.error("Exception in message handler:", exc_info=True)
    236             finally:
    237                 self.post_handler_hook()

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'1994415af49bb9e6d9bf2537966d1ae9'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn.model_selection import GridSearchCV...l)\ngs_clf_svm.best_score_\ngs_clf_svm.best_params_', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 5, 20, 15, 47, 28, 487587, tzinfo=tzutc()), 'msg_id': '4cbe275b52dc1b9565c4d10689f545ce', 'msg_type': 'execute_request', 'session': '1994415af49bb9e6d9bf2537966d1ae9', 'username': '', 'version': '5.2'}, 'metadata': {}, 'msg_id': '4cbe275b52dc1b9565c4d10689f545ce', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='from sklearn.model_selection import GridSearchCV...l)\ngs_clf_svm.best_score_\ngs_clf_svm.best_params_', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    203 
    204         self._forward_input(allow_stdin)
    205 
    206         reply_content = {}
    207         try:
--> 208             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = 'from sklearn.model_selection import GridSearchCV...l)\ngs_clf_svm.best_score_\ngs_clf_svm.best_params_'
        store_history = True
        silent = False
    209         finally:
    210             self._restore_input()
    211 
    212         if res.error_before_exec is not None:

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=('from sklearn.model_selection import GridSearchCV...l)\ngs_clf_svm.best_score_\ngs_clf_svm.best_params_',), **kwargs={'silent': False, 'store_history': True})
    532             )
    533         self.payload_manager.write_payload(payload)
    534 
    535     def run_cell(self, *args, **kwargs):
    536         self._last_traceback = None
--> 537         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ('from sklearn.model_selection import GridSearchCV...l)\ngs_clf_svm.best_score_\ngs_clf_svm.best_params_',)
        kwargs = {'silent': False, 'store_history': True}
    538 
    539     def _showtraceback(self, etype, evalue, stb):
    540         # try to preserve ordering of tracebacks and print statements
    541         sys.stdout.flush()

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='from sklearn.model_selection import GridSearchCV...l)\ngs_clf_svm.best_score_\ngs_clf_svm.best_params_', store_history=True, silent=False, shell_futures=True)
   2693                 self.displayhook.exec_result = result
   2694 
   2695                 # Execute the user code
   2696                 interactivity = "none" if silent else self.ast_node_interactivity
   2697                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2698                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2699                 
   2700                 self.last_execution_succeeded = not has_raised
   2701 
   2702                 # Reset this so later displayed values do not modify the

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.ImportFrom object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>, <_ast.Expr object>], cell_name='<ipython-input-276-2e5faaccddd0>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 115b8bcf8, execution_..._before_exec=None error_in_exec=None result=None>)
   2797 
   2798         try:
   2799             for i, node in enumerate(to_run_exec):
   2800                 mod = ast.Module([node])
   2801                 code = compiler(mod, cell_name, "exec")
-> 2802                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x113eae390, file "<ipython-input-276-2e5faaccddd0>", line 8>
        result = <ExecutionResult object at 115b8bcf8, execution_..._before_exec=None error_in_exec=None result=None>
   2803                     return True
   2804 
   2805             for i, node in enumerate(to_run_interactive):
   2806                 mod = ast.Interactive([node])

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x113eae390, file "<ipython-input-276-2e5faaccddd0>", line 8>, result=<ExecutionResult object at 115b8bcf8, execution_..._before_exec=None error_in_exec=None result=None>)
   2857         outflag = True  # happens in more places, so it's easier as default
   2858         try:
   2859             try:
   2860                 self.hooks.pre_run_code_hook()
   2861                 #rprint('Running code', repr(code_obj)) # dbg
-> 2862                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x113eae390, file "<ipython-input-276-2e5faaccddd0>", line 8>
        self.user_global_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', 'import numpy as np\nimport pandas as pd', 'train = pd.read_csv(\'fake_or_real_news_training....ews_test.csv\', index_col="ID" , encoding=\'utf-8\')', 'df = pd.concat([train.text, test.text], axis=0)', 'train.info()', 'train.head()', 'from sklearn.feature_extraction.text import Coun...count_vect.fit_transform(df)\nX_train_counts.shape', 'from sklearn.feature_extraction.text import Tfid...fit_transform(X_train_counts)\nX_train_tfidf.shape', 'from sklearn.naive_bayes import MultinomialNB\nclf = MultinomialNB().fit(X_train_tfidf, train.label)', 'from sklearn.feature_extraction.text import Coun...ct.fit_transform(train.text)\nX_train_counts.shape', 'from sklearn.feature_extraction.text import Tfid...fit_transform(X_train_counts)\nX_train_tfidf.shape', 'from sklearn.naive_bayes import MultinomialNB\nclf = MultinomialNB().fit(X_train_tfidf, train.label)', 'from sklearn.feature_extraction.text import Coun...sform(train.text, test.text)\nX_train_counts.shape', 'from sklearn.feature_extraction.text import Tfid...fit_transform(X_train_counts)\nX_train_tfidf.shape', 'from sklearn.naive_bayes import MultinomialNB\nclf = MultinomialNB().fit(X_train_tfidf, train.label)', "from sklearn.model_selection import cross_val_sc...1)\nprint('CV Score is: '+ str(np.mean(cv_score)))", "from sklearn.model_selection import cross_val_sc...1)\nprint('CV Score is: '+ str(np.mean(cv_score)))", '#from sklearn.pipeline import Pipeline\nfrom skle...=5, random_state=42).fit(X_train_tfidf, df.label)', '#from sklearn.pipeline import Pipeline\nfrom skle... random_state=42).fit(X_train_tfidf, train.label)', '#from sklearn.pipeline import Pipeline\nfrom skle... random_state=42).fit(X_train_tfidf, train.label)', ...], 'MultinomialNB': <class 'sklearn.naive_bayes.MultinomialNB'>, 'Out': {5:                                                 ...n New York and front-runners...  REAL  NaN  NaN  , 6: (6320, 66855), 7: (6320, 66855), 9: (3999, 55009), 10: (3999, 55009), 12: (3999, 55009), 13: (3999, 55009), 80: (3999, 209), 84:                                                 ...n New York and front-runners...  REAL  NaN  NaN  , 85: (3999, 209), ...}, 'SGDClassifier': <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>, 'TfidfTransformer': <class 'sklearn.feature_extraction.text.TfidfTransformer'>, 'X_train_counts': <3999x1101591 sparse matrix of type '<class 'num... stored elements in Compressed Sparse Row format>, 'X_train_tfidf': <3999x1101591 sparse matrix of type '<class 'num... stored elements in Compressed Sparse Row format>, '_': (3999, 1101591), ...}
        self.user_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', 'import numpy as np\nimport pandas as pd', 'train = pd.read_csv(\'fake_or_real_news_training....ews_test.csv\', index_col="ID" , encoding=\'utf-8\')', 'df = pd.concat([train.text, test.text], axis=0)', 'train.info()', 'train.head()', 'from sklearn.feature_extraction.text import Coun...count_vect.fit_transform(df)\nX_train_counts.shape', 'from sklearn.feature_extraction.text import Tfid...fit_transform(X_train_counts)\nX_train_tfidf.shape', 'from sklearn.naive_bayes import MultinomialNB\nclf = MultinomialNB().fit(X_train_tfidf, train.label)', 'from sklearn.feature_extraction.text import Coun...ct.fit_transform(train.text)\nX_train_counts.shape', 'from sklearn.feature_extraction.text import Tfid...fit_transform(X_train_counts)\nX_train_tfidf.shape', 'from sklearn.naive_bayes import MultinomialNB\nclf = MultinomialNB().fit(X_train_tfidf, train.label)', 'from sklearn.feature_extraction.text import Coun...sform(train.text, test.text)\nX_train_counts.shape', 'from sklearn.feature_extraction.text import Tfid...fit_transform(X_train_counts)\nX_train_tfidf.shape', 'from sklearn.naive_bayes import MultinomialNB\nclf = MultinomialNB().fit(X_train_tfidf, train.label)', "from sklearn.model_selection import cross_val_sc...1)\nprint('CV Score is: '+ str(np.mean(cv_score)))", "from sklearn.model_selection import cross_val_sc...1)\nprint('CV Score is: '+ str(np.mean(cv_score)))", '#from sklearn.pipeline import Pipeline\nfrom skle...=5, random_state=42).fit(X_train_tfidf, df.label)', '#from sklearn.pipeline import Pipeline\nfrom skle... random_state=42).fit(X_train_tfidf, train.label)', '#from sklearn.pipeline import Pipeline\nfrom skle... random_state=42).fit(X_train_tfidf, train.label)', ...], 'MultinomialNB': <class 'sklearn.naive_bayes.MultinomialNB'>, 'Out': {5:                                                 ...n New York and front-runners...  REAL  NaN  NaN  , 6: (6320, 66855), 7: (6320, 66855), 9: (3999, 55009), 10: (3999, 55009), 12: (3999, 55009), 13: (3999, 55009), 80: (3999, 209), 84:                                                 ...n New York and front-runners...  REAL  NaN  NaN  , 85: (3999, 209), ...}, 'SGDClassifier': <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>, 'TfidfTransformer': <class 'sklearn.feature_extraction.text.TfidfTransformer'>, 'X_train_counts': <3999x1101591 sparse matrix of type '<class 'num... stored elements in Compressed Sparse Row format>, 'X_train_tfidf': <3999x1101591 sparse matrix of type '<class 'num... stored elements in Compressed Sparse Row format>, '_': (3999, 1101591), ...}
   2863             finally:
   2864                 # Reset our crash handler in place
   2865                 sys.excepthook = old_excepthook
   2866         except SystemExit as e:

...........................................................................
/Volumes/GoogleDrive/My Drive/term3/nlp/fake-news-detection/<ipython-input-276-2e5faaccddd0> in <module>()
      3     'vect__ngram_range': [(1, 1), (1, 2)],
      4     'tfidf__use_idf': (True, False),
      5     'clf-svm__alpha': (1e-2, 1e-3),}
      6 
      7 gs_clf_svm = GridSearchCV(svm_clf, params_svm, n_jobs=-1)
----> 8 gs_clf_svm = gs_clf_svm.fit(X_train_tfidf, train.label)
      9 gs_clf_svm.best_score_
     10 gs_clf_svm.best_params_

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self=GridSearchCV(cv=None, error_score='raise',
     ...ain_score='warn',
       scoring=None, verbose=0), X=<3999x1101591 sparse matrix of type '<class 'num... stored elements in Compressed Sparse Row format>, y=ID
8476     FAKE
10294    FAKE
3608     REAL
101...    FAKE
Name: label, Length: 3999, dtype: object, groups=None, **fit_params={})
    634                                   return_train_score=self.return_train_score,
    635                                   return_n_test_samples=True,
    636                                   return_times=True, return_parameters=False,
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
        cv.split = <bound method StratifiedKFold.split of Stratifie...ld(n_splits=3, random_state=None, shuffle=False)>
        X = <3999x1101591 sparse matrix of type '<class 'num... stored elements in Compressed Sparse Row format>
        y = ID
8476     FAKE
10294    FAKE
3608     REAL
101...    FAKE
Name: label, Length: 3999, dtype: object
        groups = None
    640 
    641         # if one choose to see train score, "out" will contain train score info
    642         if self.return_train_score:
    643             (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseSearchCV.fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Sun May 20 17:47:28 2018
PID: 12477           Python 3.6.3: /Users/javiergranda/anaconda3/bin/python
...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (SGDClassifier(alpha=0.01, average=False, class_w...ue,
       tol=None, verbose=0, warm_start=False), <3999x1101591 sparse matrix of type '<class 'num... stored elements in Compressed Sparse Row format>, ID
8476     FAKE
10294    FAKE
3608     REAL
101...    FAKE
Name: label, Length: 3999, dtype: object, {'score': <function _passthrough_scorer>}, array([1316, 1319, 1321, ..., 3996, 3997, 3998]), array([   0,    1,    2, ..., 3578, 3649, 3706]), 0, {'clf-svm__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (SGDClassifier(alpha=0.01, average=False, class_w...ue,
       tol=None, verbose=0, warm_start=False), <3999x1101591 sparse matrix of type '<class 'num... stored elements in Compressed Sparse Row format>, ID
8476     FAKE
10294    FAKE
3608     REAL
101...    FAKE
Name: label, Length: 3999, dtype: object, {'score': <function _passthrough_scorer>}, array([1316, 1319, 1321, ..., 3996, 3997, 3998]), array([   0,    1,    2, ..., 3578, 3649, 3706]), 0, {'clf-svm__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=SGDClassifier(alpha=0.01, average=False, class_w...ue,
       tol=None, verbose=0, warm_start=False), X=<3999x1101591 sparse matrix of type '<class 'num... stored elements in Compressed Sparse Row format>, y=ID
8476     FAKE
10294    FAKE
3608     REAL
101...    FAKE
Name: label, Length: 3999, dtype: object, scorer={'score': <function _passthrough_scorer>}, train=array([1316, 1319, 1321, ..., 3996, 3997, 3998]), test=array([   0,    1,    2, ..., 3578, 3649, 3706]), verbose=0, parameters={'clf-svm__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}, fit_params={}, return_train_score='warn', return_parameters=False, return_n_test_samples=True, return_times=True, error_score='raise')
    439                       for k, v in fit_params.items()])
    440 
    441     test_scores = {}
    442     train_scores = {}
    443     if parameters is not None:
--> 444         estimator.set_params(**parameters)
        estimator.set_params = <bound method BaseSGD.set_params of SGDClassifie...e,
       tol=None, verbose=0, warm_start=False)>
        parameters = {'clf-svm__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
    445 
    446     start_time = time.time()
    447 
    448     X_train, y_train = _safe_split(estimator, X, y, train)

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py in set_params(self=SGDClassifier(alpha=0.01, average=False, class_w...ue,
       tol=None, verbose=0, warm_start=False), *args=(), **kwargs={'clf-svm__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)})
     72         # current tests expect init to do parameter validation
     73         # but we are not allowed to set attributes
     74         self._validate_params(set_max_iter=False)
     75 
     76     def set_params(self, *args, **kwargs):
---> 77         super(BaseSGD, self).set_params(*args, **kwargs)
        self.set_params = <bound method BaseSGD.set_params of SGDClassifie...e,
       tol=None, verbose=0, warm_start=False)>
        args = ()
        kwargs = {'clf-svm__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
     78         self._validate_params(set_max_iter=False)
     79         return self
     80 
     81     @abstractmethod

...........................................................................
/Users/javiergranda/anaconda3/lib/python3.6/site-packages/sklearn/base.py in set_params(self=SGDClassifier(alpha=0.01, average=False, class_w...ue,
       tol=None, verbose=0, warm_start=False), **params={'clf-svm__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)})
    269             key, delim, sub_key = key.partition('__')
    270             if key not in valid_params:
    271                 raise ValueError('Invalid parameter %s for estimator %s. '
    272                                  'Check the list of available parameters '
    273                                  'with `estimator.get_params().keys()`.' %
--> 274                                  (key, self))
        key = 'clf-svm'
        self = SGDClassifier(alpha=0.01, average=False, class_w...ue,
       tol=None, verbose=0, warm_start=False)
    275 
    276             if delim:
    277                 nested_params[key][sub_key] = value
    278             else:

ValueError: Invalid parameter clf-svm for estimator SGDClassifier(alpha=0.01, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='perceptron', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False). Check the list of available parameters with `estimator.get_params().keys()`.
___________________________________________________________________________