## Classifier and preprocessing

In this notebook, the noironicos dataset will be treated, since ironicos's tweets are all ironic and we want a mixture of ironic and non ironic.

In [36]:
# General import and load data
from sklearn.model_selection import train_test_split
import numpy
import nltk
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
import re

# Needed for running
nltk.download('punkt')
nltk.download('stopwords')

# Import database
df=pd.read_csv('final_dataset.csv', encoding='utf-8', delimiter=",", header=0)
df.groupby('ironic').size()

# Delete rows containing nan
df=df.dropna(subset=['tweet'])


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/juanalvarez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juanalvarez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Before splitting database, a shuffling action will be performed since data is not randomized.
# That way the train and test splitting will be more balanced

df = df.sample(frac=1).reset_index(drop=True)

# Define X and Y
X = df['tweet'].values
y = df['ironic'].values.astype(int)
print(X[34])

"@icacabelos: @Jocoserio ?Vamos!!!!"a x el 10%%%!!!!!


In [3]:
df.groupby('ironic').size()

ironic
0    5444
1    5638
dtype: int64

### Train and test splitting

In [20]:

# Splitting
# Test set will be the 25% taken randomly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
X_dev, X_train, y_dev, y_train = train_test_split(X_train, y_train, test_size=0.5)
print(len(X_train))
print(len(X_dev))
print(len(X_test))
print(len(X))


4156
4155
2771
11082


## Lexical features
The lexical features analysis will be performed by using the twitter tokenizer provided by nltk library.
Important: This feature extractor is NOT used since tweets are considered to contain only one sentence


# Sample statistics using NLTK
# A transformer will be implemented

from nltk.tokenize import sent_tokenize, word_tokenize


class LexicalStats (BaseEstimator, TransformerMixin):
    """Extract lexical features from each document"""
    
    def number_sentences(self, doc):
        sentences = sent_tokenize(doc, language='spanish')
        return len(sentences)

    def fit(self, x, y=None):
        return self

    def transform(self, docs):
       
        return [{'length': len(doc),
                 'num_sentences': self.number_sentences(doc)}
                
                for doc in docs]

In [21]:
# A tokenizer will be defined
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
import string

def custom_tokenizer(words):
    tokens = word_tokenize(words.lower())
    stemmer = SnowballStemmer('spanish')
    lemmas = [stemmer.stem(t) for t in tokens]
    stoplist = stopwords.words('spanish')
    lemmas_clean = [w for w in lemmas if w not in stoplist]
    punctuation = set(string.punctuation)
    lemmas_punct = [w for w in lemmas_clean if  w not in punctuation]
    return lemmas_punct



## Syntactic features

ALOMEJOR HAY QUE QUITARLO

In [22]:
# We will use NLTK's tag set
from sklearn.base import BaseEstimator, TransformerMixin
from nltk import pos_tag, word_tokenize
import collections

# We can extract particular chunks (trozos, pedazos) from the sentence
# if we use a RegExpParser. See Syntactic Processing
def PosStats(BaseEstimator, TransformerMixin):
    
    def stats(self, doc):
        tokens = custom_tokenizer(doc)
        
        tagged = pos_tag(tokens, tagset = 'universal' )
        counts = collections.Counter(tag for word, tag in tagged)
        total = sum(counts.values())
        #copy tags so that we return always the same number of features
        pos_features = {'NOUN': 0, 'ADJ': 0, 'VERB': 0, 'ADV': 0, 'CONJ': 0, 
                        'ADP': 0, 'PRON':0, 'NUM': 0}
        
        pos_dic = dict((tag, float(count)/total) for tag,count in counts.items())
        for k in pos_dic:
            if k in pos_features:
                pos_features[k] = pos_dic[k]
        return pos_features
    
    def transform(self, docs, y=None):
        return [self.stats(doc) for doc in docs]
    
    def fit(self, docs, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
        

## Feature extraction Pipeline
The feature extraction will be carried out by using pipelines. The defined pipelines are selected in order to extract the desired features

In [23]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer


ngrams_featurizer = Pipeline([
  ('count_vectorizer',  CountVectorizer(ngram_range = (1, 2), encoding = 'ISO-8859-1', 
                                        tokenizer=custom_tokenizer)),
  ('tfidf_transformer', TfidfTransformer())
])

## Feature Union Pipeline
Now we define which features we want to extract, how to combine them and later apple machine learning in the resulting feature set.

In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.svm import SVC

def my_pipeline(clf):
    pipeline = Pipeline([
       ('features', FeatureUnion([
                    
                ('words', TfidfVectorizer(tokenizer=custom_tokenizer)),
                ('ngrams', ngrams_featurizer),
                    #('pos_stats', Pipeline([
                                #('pos_stats', PosStats()),
                                #('vectors', DictVectorizer())
                            #])),
                ('lda', Pipeline([ 
                             ('count', CountVectorizer(tokenizer=custom_tokenizer)),
                            ('lda',  LatentDirichletAllocation(n_components=45, max_iter=5, # Change ntopics
                                                       learning_method='online', 
                                                       learning_offset=50.,
                                                       random_state=0))
                         ])),
             ])),
       
    ('clf', clf)  # classifier
    ])
    return pipeline
    

## Multinomial NaiveBayes

In [27]:
from sklearn.naive_bayes import  MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
print("Size of training set: {}   size of test set: {}".format(X_train.shape[0], X_test.shape[0]))
model = MultinomialNB(alpha=.01)
modelNB = my_pipeline(model)
modelNB.fit(X_train, y_train)


Size of training set: 4156   size of test set: 2771


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...   transformer_weights=None)), ('clf', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))])

In [28]:
predicted1 = modelNB.predict(X_test)
expected = y_test

In [29]:
from sklearn import metrics
# Accuracy
metrics.accuracy_score(expected, predicted1)

0.82785997834716707

In [30]:
print(classification_report(expected, predicted1, digits=5))

             precision    recall  f1-score   support

          0    0.81552   0.83196   0.82366      1339
          1    0.83986   0.82402   0.83186      1432

avg / total    0.82810   0.82786   0.82790      2771



### SVC

In [31]:
from sklearn.svm import SVC
from sklearn import metrics

types_of_kernels = ['linear', 'rbf', 'poly']

kernel = types_of_kernels[0]
gamma = 3.0

# Create SVC model
model = SVC(kernel=kernel, probability=True, gamma=gamma)
modelSVC = my_pipeline(model)
modelSVC.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [32]:
predicted2 = modelSVC.predict(X_test)
expected = y_test
metrics.accuracy_score(expected, predicted2)
print(classification_report(expected, predicted2, digits=5))

             precision    recall  f1-score   support

          0    0.89529   0.89395   0.89462      1339
          1    0.90098   0.90223   0.90161      1432

avg / total    0.89823   0.89823   0.89823      2771



### Kneighbors Classifier

In [65]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=7, algorithm='ball_tree')
modelKnn = my_pipeline(model)
modelKnn.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...owski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform'))])

In [66]:
predicted3 = modelKnn.predict(X_test)
expected = y_test
metrics.accuracy_score(expected, predicted3)
print(classification_report(expected, predicted3, digits=5))

             precision    recall  f1-score   support

          0    0.74566   0.81712   0.77976      1367
          1    0.80361   0.72863   0.76429      1404

avg / total    0.77502   0.77228   0.77192      2771



### Logistic Regression classifier

In [33]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(n_jobs = -1)
modelLR = my_pipeline(model)
modelLR.fit(X_train, y_train)

  " = {}.".format(self.n_jobs))


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [34]:
predicted4 = modelLR.predict(X_test)
expected = y_test
metrics.accuracy_score(expected, predicted4)
print(classification_report(expected, predicted4, digits=5))

             precision    recall  f1-score   support

          0    0.90023   0.88275   0.89140      1339
          1    0.89232   0.90852   0.90035      1432

avg / total    0.89614   0.89607   0.89602      2771



## Optimize models
Tune parameters of previously defined models using Grid Search

In [38]:
modelNB.get_params().keys()

dict_keys(['features__ngrams__count_vectorizer__vocabulary', 'features__lda__steps', 'features__n_jobs', 'features__ngrams__count_vectorizer__max_df', 'features__lda__count__ngram_range', 'features__lda__count__stop_words', 'features__words__lowercase', 'features__ngrams__count_vectorizer__token_pattern', 'features__ngrams__count_vectorizer__analyzer', 'features__ngrams__count_vectorizer__preprocessor', 'features__words__decode_error', 'features__lda__lda__learning_method', 'features__ngrams__count_vectorizer__lowercase', 'features__ngrams__count_vectorizer__binary', 'features__lda', 'features__ngrams__count_vectorizer__decode_error', 'features__lda__lda__perp_tol', 'steps', 'features__words__dtype', 'features__ngrams__tfidf_transformer__norm', 'features__lda__count__vocabulary', 'features__words__token_pattern', 'features__lda__count__min_df', 'features__words__sublinear_tf', 'features__ngrams__count_vectorizer__stop_words', 'features__words__tokenizer', 'features__words__min_df', 'fe

### Multinomial NaiveBayes

In [37]:
from sklearn.model_selection import GridSearchCV
# Used alpha = .01
parametersNB = {'clf__alpha':numpy.linspace(0,2,20)[1:], 'features__lda__lda__n_components':numpy.linspace(1,45,20)[1:]}
scoresNB = ['precision']
for score in scoresNB:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    gs_NB = GridSearchCV(modelNB,parametersNB, n_jobs=-1, scoring='%s_macro' % score)
    gs_NB.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(gs_NB.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = gs_NB.cv_results_['mean_test_score']
    stds = gs_NB.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, gs_NB.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_dev, gs_NB.predict(X_dev)
    print(classification_report(y_true, y_pred))
    print()
    

# Tuning hyper-parameters for precision



JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/anaconda3/lib/python3.5/runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.5/site-packages/ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/anaconda3/lib/python3.5/runpy.py in _run_code(code=<code object <module> at 0x103c06660, file "/ana...3.5/site-packages/ipykernel_launcher.py", line 5>, run_globals={'__builtins__': <module 'builtins' (built-in)>, '__cached__': '/anaconda3/lib/python3.5/site-packages/__pycache__/ipykernel_launcher.cpython-35.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/anaconda3/lib/python3.5/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.5/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/anaconda3/lib/python3.5/site-packages/ipykernel/kernelapp.py'>, 'sys': <module 'sys' (built-in)>}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.5/site-packages/ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x103c06660, file "/ana...3.5/site-packages/ipykernel_launcher.py", line 5>
        run_globals = {'__builtins__': <module 'builtins' (built-in)>, '__cached__': '/anaconda3/lib/python3.5/site-packages/__pycache__/ipykernel_launcher.cpython-35.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/anaconda3/lib/python3.5/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.5/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/anaconda3/lib/python3.5/site-packages/ipykernel/kernelapp.py'>, 'sys': <module 'sys' (built-in)>}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/anaconda3/lib/python3.5/site-packages/ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
/anaconda3/lib/python3.5/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/anaconda3/lib/python3.5/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    492         if self.poller is not None:
    493             self.poller.start()
    494         self.kernel.start()
    495         self.io_loop = ioloop.IOLoop.current()
    496         try:
--> 497             self.io_loop.start()
        self.io_loop.start = <bound method BaseAsyncIOLoop.start of <tornado.platform.asyncio.AsyncIOMainLoop object>>
    498         except KeyboardInterrupt:
    499             pass
    500 
    501 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/anaconda3/lib/python3.5/site-packages/tornado/platform/asyncio.py in start(self=<tornado.platform.asyncio.AsyncIOMainLoop object>)
    127         except (RuntimeError, AssertionError):
    128             old_loop = None
    129         try:
    130             self._setup_logging()
    131             asyncio.set_event_loop(self.asyncio_loop)
--> 132             self.asyncio_loop.run_forever()
        self.asyncio_loop.run_forever = <bound method BaseEventLoop.run_forever of <_Uni...EventLoop running=True closed=False debug=False>>
    133         finally:
    134             asyncio.set_event_loop(old_loop)
    135 
    136     def stop(self):

...........................................................................
/anaconda3/lib/python3.5/asyncio/base_events.py in run_forever(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
    416             sys.set_asyncgen_hooks(firstiter=self._asyncgen_firstiter_hook,
    417                                    finalizer=self._asyncgen_finalizer_hook)
    418         try:
    419             events._set_running_loop(self)
    420             while True:
--> 421                 self._run_once()
        self._run_once = <bound method BaseEventLoop._run_once of <_UnixS...EventLoop running=True closed=False debug=False>>
    422                 if self._stopping:
    423                     break
    424         finally:
    425             self._stopping = False

...........................................................................
/anaconda3/lib/python3.5/asyncio/base_events.py in _run_once(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
   1420                         logger.warning('Executing %s took %.3f seconds',
   1421                                        _format_handle(handle), dt)
   1422                 finally:
   1423                     self._current_handle = None
   1424             else:
-> 1425                 handle._run()
        handle._run = <bound method Handle._run of <Handle BaseAsyncIOLoop._handle_events(15, 1)>>
   1426         handle = None  # Needed to break cycles when an exception occurs.
   1427 
   1428     def _set_coroutine_wrapper(self, enabled):
   1429         try:

...........................................................................
/anaconda3/lib/python3.5/asyncio/events.py in _run(self=<Handle BaseAsyncIOLoop._handle_events(15, 1)>)
    122             self._callback = None
    123             self._args = None
    124 
    125     def _run(self):
    126         try:
--> 127             self._callback(*self._args)
        self._callback = <bound method BaseAsyncIOLoop._handle_events of <tornado.platform.asyncio.AsyncIOMainLoop object>>
        self._args = (15, 1)
    128         except Exception as exc:
    129             cb = _format_callback_source(self._callback, self._args)
    130             msg = 'Exception in callback {}'.format(cb)
    131             context = {

...........................................................................
/anaconda3/lib/python3.5/site-packages/tornado/platform/asyncio.py in _handle_events(self=<tornado.platform.asyncio.AsyncIOMainLoop object>, fd=15, events=1)
    117             self.writers.remove(fd)
    118         del self.handlers[fd]
    119 
    120     def _handle_events(self, fd, events):
    121         fileobj, handler_func = self.handlers[fd]
--> 122         handler_func(fileobj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fileobj = <zmq.sugar.socket.Socket object>
        events = 1
    123 
    124     def start(self):
    125         try:
    126             old_loop = asyncio.get_event_loop()

...........................................................................
/anaconda3/lib/python3.5/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    295         # Fast path when there are no active contexts.
    296         def null_wrapper(*args, **kwargs):
    297             try:
    298                 current_state = _state.contexts
    299                 _state.contexts = cap_contexts[0]
--> 300                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    301             finally:
    302                 _state.contexts = current_state
    303         null_wrapper._wrapped = True
    304         return null_wrapper

...........................................................................
/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    445             return
    446         zmq_events = self.socket.EVENTS
    447         try:
    448             # dispatch events:
    449             if zmq_events & zmq.POLLIN and self.receiving():
--> 450                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    451                 if not self.socket:
    452                     return
    453             if zmq_events & zmq.POLLOUT and self.sending():
    454                 self._handle_send()

...........................................................................
/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    475             else:
    476                 raise
    477         else:
    478             if self._recv_callback:
    479                 callback = self._recv_callback
--> 480                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    481         
    482 
    483     def _handle_send(self):
    484         """Handle a send event."""

...........................................................................
/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    427         close our socket."""
    428         try:
    429             # Use a NullContext to ensure that all StackContexts are run
    430             # inside our blanket exception handler rather than outside.
    431             with stack_context.NullContext():
--> 432                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    433         except:
    434             gen_log.error("Uncaught exception in ZMQStream callback",
    435                           exc_info=True)
    436             # Re-raise the exception so that IOLoop.handle_callback_exception

...........................................................................
/anaconda3/lib/python3.5/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    295         # Fast path when there are no active contexts.
    296         def null_wrapper(*args, **kwargs):
    297             try:
    298                 current_state = _state.contexts
    299                 _state.contexts = cap_contexts[0]
--> 300                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    301             finally:
    302                 _state.contexts = current_state
    303         null_wrapper._wrapped = True
    304         return null_wrapper

...........................................................................
/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn.model_selection import GridSearchCV...fication_report(y_true, y_pred))\n    print()\n    ', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 11, 18, 12, 40, 58, 145254, tzinfo=tzutc()), 'msg_id': 'ab04f8537ace43d88291274f72840f14', 'msg_type': 'execute_request', 'session': '0614b16bf7de4819bbc9047b044d79ec', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'ab04f8537ace43d88291274f72840f14', 'msg_type': 'execute_request', 'parent_header': {}})
    228             self.log.warning("Unknown message type: %r", msg_type)
    229         else:
    230             self.log.debug("%s: %s", msg_type, msg)
    231             self.pre_handler_hook()
    232             try:
--> 233                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'0614b16bf7de4819bbc9047b044d79ec']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn.model_selection import GridSearchCV...fication_report(y_true, y_pred))\n    print()\n    ', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 11, 18, 12, 40, 58, 145254, tzinfo=tzutc()), 'msg_id': 'ab04f8537ace43d88291274f72840f14', 'msg_type': 'execute_request', 'session': '0614b16bf7de4819bbc9047b044d79ec', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'ab04f8537ace43d88291274f72840f14', 'msg_type': 'execute_request', 'parent_header': {}}
    234             except Exception:
    235                 self.log.error("Exception in message handler:", exc_info=True)
    236             finally:
    237                 self.post_handler_hook()

...........................................................................
/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'0614b16bf7de4819bbc9047b044d79ec'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn.model_selection import GridSearchCV...fication_report(y_true, y_pred))\n    print()\n    ', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 11, 18, 12, 40, 58, 145254, tzinfo=tzutc()), 'msg_id': 'ab04f8537ace43d88291274f72840f14', 'msg_type': 'execute_request', 'session': '0614b16bf7de4819bbc9047b044d79ec', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'ab04f8537ace43d88291274f72840f14', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
/anaconda3/lib/python3.5/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='from sklearn.model_selection import GridSearchCV...fication_report(y_true, y_pred))\n    print()\n    ', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    203 
    204         self._forward_input(allow_stdin)
    205 
    206         reply_content = {}
    207         try:
--> 208             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = 'from sklearn.model_selection import GridSearchCV...fication_report(y_true, y_pred))\n    print()\n    '
        store_history = True
        silent = False
    209         finally:
    210             self._restore_input()
    211 
    212         if res.error_before_exec is not None:

...........................................................................
/anaconda3/lib/python3.5/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=('from sklearn.model_selection import GridSearchCV...fication_report(y_true, y_pred))\n    print()\n    ',), **kwargs={'silent': False, 'store_history': True})
    532             )
    533         self.payload_manager.write_payload(payload)
    534 
    535     def run_cell(self, *args, **kwargs):
    536         self._last_traceback = None
--> 537         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ('from sklearn.model_selection import GridSearchCV...fication_report(y_true, y_pred))\n    print()\n    ',)
        kwargs = {'silent': False, 'store_history': True}
    538 
    539     def _showtraceback(self, etype, evalue, stb):
    540         # try to preserve ordering of tracebacks and print statements
    541         sys.stdout.flush()

...........................................................................
/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='from sklearn.model_selection import GridSearchCV...fication_report(y_true, y_pred))\n    print()\n    ', store_history=True, silent=False, shell_futures=True)
   2657         -------
   2658         result : :class:`ExecutionResult`
   2659         """
   2660         try:
   2661             result = self._run_cell(
-> 2662                 raw_cell, store_history, silent, shell_futures)
        raw_cell = 'from sklearn.model_selection import GridSearchCV...fication_report(y_true, y_pred))\n    print()\n    '
        store_history = True
        silent = False
        shell_futures = True
   2663         finally:
   2664             self.events.trigger('post_execute')
   2665             if not silent:
   2666                 self.events.trigger('post_run_cell', result)

...........................................................................
/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py in _run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='from sklearn.model_selection import GridSearchCV...fication_report(y_true, y_pred))\n    print()\n    ', store_history=True, silent=False, shell_futures=True)
   2780                 self.displayhook.exec_result = result
   2781 
   2782                 # Execute the user code
   2783                 interactivity = 'none' if silent else self.ast_node_interactivity
   2784                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2785                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2786                 
   2787                 self.last_execution_succeeded = not has_raised
   2788                 self.last_execution_result = result
   2789 

...........................................................................
/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.ImportFrom object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.For object>], cell_name='<ipython-input-37-1e7dfc82b7b9>', interactivity='none', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 1a0ceb2f98, execution...rue silent=False shell_futures=True> result=None>)
   2896             raise ValueError("Interactivity was %r" % interactivity)
   2897         try:
   2898             for i, node in enumerate(to_run_exec):
   2899                 mod = ast.Module([node])
   2900                 code = compiler(mod, cell_name, "exec")
-> 2901                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x1a0c7f76f0, file "<ipython-input-37-1e7dfc82b7b9>", line 5>
        result = <ExecutionResult object at 1a0ceb2f98, execution...rue silent=False shell_futures=True> result=None>
   2902                     return True
   2903 
   2904             for i, node in enumerate(to_run_interactive):
   2905                 mod = ast.Interactive([node])

...........................................................................
/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x1a0c7f76f0, file "<ipython-input-37-1e7dfc82b7b9>", line 5>, result=<ExecutionResult object at 1a0ceb2f98, execution...rue silent=False shell_futures=True> result=None>)
   2956         outflag = True  # happens in more places, so it's easier as default
   2957         try:
   2958             try:
   2959                 self.hooks.pre_run_code_hook()
   2960                 #rprint('Running code', repr(code_obj)) # dbg
-> 2961                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x1a0c7f76f0, file "<ipython-input-37-1e7dfc82b7b9>", line 5>
        self.user_global_ns = {'BaseEstimator': <class 'sklearn.base.BaseEstimator'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'DataFrame': <class 'pandas.core.frame.DataFrame'>, 'DictVectorizer': <class 'sklearn.feature_extraction.dict_vectorizer.DictVectorizer'>, 'FeatureUnion': <class 'sklearn.pipeline.FeatureUnion'>, 'FunctionTransformer': <class 'sklearn.preprocessing._function_transformer.FunctionTransformer'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', "# General import and load data\nfrom sklearn.mode...ows containing nan\ndf=df.dropna(subset=['tweet'])", "# Before splitting database, a shuffling action ...\ny = df['ironic'].values.astype(int)\nprint(X[34])", "df.groupby('ironic').size()", '\n# Splitting\n# Test set will be the 25% taken ra...test_split(X, y, test_size=0.25, random_state=33)', '# A tokenizer will be defined\nfrom sklearn.base ...if  w not in punctuation]\n    return lemmas_punct', '# We will use NLTK\'s tag set\nfrom sklearn.base i...in train and test"""\n        return self\n        ', "from sklearn.pipeline import Pipeline, FeatureUn...),\n  ('tfidf_transformer', TfidfTransformer())\n])", 'from sklearn.naive_bayes import MultinomialNB\nfr...lf)  # classifier\n    ])\n    return pipeline\n    ', '\n# Splitting\n# Test set will be the 25% taken ra...train_test_split(X_train, y_train, test_size=0.5)', '\n# Splitting\n# Test set will be the 25% taken ra...lit(X_train, y_train, test_size=0.5)\nprint(X_dev)', "\n# Splitting\n# Test set will be the 25% taken ra...len(X_train)','len(X_dev)','len(X_train)','len(X)", "\n# Splitting\n# Test set will be the 25% taken ra...en(X_train)','len(X_dev)','len(X_train)','len(X))", "\n# Splitting\n# Test set will be the 25% taken ra...rain)+','+len(X_dev)+','+len(X_train)+','+len(X))", '\n# Splitting\n# Test set will be the 25% taken ra...rain, y_train, test_size=0.5)\nprint(len(X_train))', '\n# Splitting\n# Test set will be the 25% taken ra...t_size=0.5)\nprint(len(X_train))\nprint(len(X_dev))', '\n# Splitting\n# Test set will be the 25% taken ra...len(X_train))\nprint(len(X_dev))\nprint(len(X_test)', '\n# Splitting\n# Test set will be the 25% taken ra...len(X_train))\nprint(len(X_dev))\nprint(len(X_test)', '\n# Splitting\n# Test set will be the 25% taken ra...en(X_train))\nprint(len(X_dev))\nprint(len(X_test))', '\n# Splitting\n# Test set will be the 25% taken ra...n))\nprint(len(X_dev))\nprint(len(X_test))\nprint(X)', ...], 'KFold': <class 'sklearn.cross_validation.KFold'>, 'LatentDirichletAllocation': <class 'sklearn.decomposition.online_lda.LatentDirichletAllocation'>, ...}
        self.user_ns = {'BaseEstimator': <class 'sklearn.base.BaseEstimator'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'DataFrame': <class 'pandas.core.frame.DataFrame'>, 'DictVectorizer': <class 'sklearn.feature_extraction.dict_vectorizer.DictVectorizer'>, 'FeatureUnion': <class 'sklearn.pipeline.FeatureUnion'>, 'FunctionTransformer': <class 'sklearn.preprocessing._function_transformer.FunctionTransformer'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', "# General import and load data\nfrom sklearn.mode...ows containing nan\ndf=df.dropna(subset=['tweet'])", "# Before splitting database, a shuffling action ...\ny = df['ironic'].values.astype(int)\nprint(X[34])", "df.groupby('ironic').size()", '\n# Splitting\n# Test set will be the 25% taken ra...test_split(X, y, test_size=0.25, random_state=33)', '# A tokenizer will be defined\nfrom sklearn.base ...if  w not in punctuation]\n    return lemmas_punct', '# We will use NLTK\'s tag set\nfrom sklearn.base i...in train and test"""\n        return self\n        ', "from sklearn.pipeline import Pipeline, FeatureUn...),\n  ('tfidf_transformer', TfidfTransformer())\n])", 'from sklearn.naive_bayes import MultinomialNB\nfr...lf)  # classifier\n    ])\n    return pipeline\n    ', '\n# Splitting\n# Test set will be the 25% taken ra...train_test_split(X_train, y_train, test_size=0.5)', '\n# Splitting\n# Test set will be the 25% taken ra...lit(X_train, y_train, test_size=0.5)\nprint(X_dev)', "\n# Splitting\n# Test set will be the 25% taken ra...len(X_train)','len(X_dev)','len(X_train)','len(X)", "\n# Splitting\n# Test set will be the 25% taken ra...en(X_train)','len(X_dev)','len(X_train)','len(X))", "\n# Splitting\n# Test set will be the 25% taken ra...rain)+','+len(X_dev)+','+len(X_train)+','+len(X))", '\n# Splitting\n# Test set will be the 25% taken ra...rain, y_train, test_size=0.5)\nprint(len(X_train))', '\n# Splitting\n# Test set will be the 25% taken ra...t_size=0.5)\nprint(len(X_train))\nprint(len(X_dev))', '\n# Splitting\n# Test set will be the 25% taken ra...len(X_train))\nprint(len(X_dev))\nprint(len(X_test)', '\n# Splitting\n# Test set will be the 25% taken ra...len(X_train))\nprint(len(X_dev))\nprint(len(X_test)', '\n# Splitting\n# Test set will be the 25% taken ra...en(X_train))\nprint(len(X_dev))\nprint(len(X_test))', '\n# Splitting\n# Test set will be the 25% taken ra...n))\nprint(len(X_dev))\nprint(len(X_test))\nprint(X)', ...], 'KFold': <class 'sklearn.cross_validation.KFold'>, 'LatentDirichletAllocation': <class 'sklearn.decomposition.online_lda.LatentDirichletAllocation'>, ...}
   2962             finally:
   2963                 # Reset our crash handler in place
   2964                 sys.excepthook = old_excepthook
   2965         except SystemExit as e:

...........................................................................
/Users/juanalvarez/Desktop/TFG/TFG/<ipython-input-37-1e7dfc82b7b9> in <module>()
      5 for score in scoresNB:
      6     print("# Tuning hyper-parameters for %s" % score)
      7     print()
      8 
      9     gs_NB = GridSearchCV(modelNB,parametersNB, n_jobs=-1, scoring='%s_macro' % score)
---> 10     gs_NB.fit(X_train, y_train)
     11 
     12     print("Best parameters set found on development set:")
     13     print()
     14     print(gs_NB.best_params_)

...........................................................................
/anaconda3/lib/python3.5/site-packages/sklearn/model_selection/_search.py in fit(self=GridSearchCV(cv=None, error_score='raise',
     ...rn',
       scoring='precision_macro', verbose=0), X=array([ 'Y si el proximo domingo a estas horas #...ad \n#SEP http://t.co/ZuDd6KRW8T'], dtype=object), y=array([0, 0, 0, ..., 1, 1, 1]), groups=None, **fit_params={})
    634                                   return_train_score=self.return_train_score,
    635                                   return_n_test_samples=True,
    636                                   return_times=True, return_parameters=False,
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
        cv.split = <bound method StratifiedKFold.split of Stratifie...ld(n_splits=3, random_state=None, shuffle=False)>
        X = array([ 'Y si el proximo domingo a estas horas #...ad \n#SEP http://t.co/ZuDd6KRW8T'], dtype=object)
        y = array([0, 0, 0, ..., 1, 1, 1])
        groups = None
    640 
    641         # if one choose to see train score, "out" will contain train score info
    642         if self.return_train_score:
    643             (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,

...........................................................................
/anaconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseSearchCV.fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Sun Nov 18 13:40:58 2018
PID: 3141                               Python 3.5.5: /anaconda3/bin/python
...........................................................................
/anaconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (Pipeline(memory=None,
     steps=[('features', F...(alpha=0.01, class_prior=None, fit_prior=True))]), array([ 'Y si el proximo domingo a estas horas #...ad \n#SEP http://t.co/ZuDd6KRW8T'], dtype=object), array([0, 0, 0, ..., 1, 1, 1]), {'score': make_scorer(precision_score, average=macro, pos_label=None)}, array([1386, 1387, 1388, ..., 4153, 4154, 4155]), array([   0,    1,    2, ..., 1383, 1384, 1385]), 0, {'clf__alpha': 0.10526315789473684, 'lda__n_components': 3.3157894736842106}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/anaconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (Pipeline(memory=None,
     steps=[('features', F...(alpha=0.01, class_prior=None, fit_prior=True))]), array([ 'Y si el proximo domingo a estas horas #...ad \n#SEP http://t.co/ZuDd6KRW8T'], dtype=object), array([0, 0, 0, ..., 1, 1, 1]), {'score': make_scorer(precision_score, average=macro, pos_label=None)}, array([1386, 1387, 1388, ..., 4153, 4154, 4155]), array([   0,    1,    2, ..., 1383, 1384, 1385]), 0, {'clf__alpha': 0.10526315789473684, 'lda__n_components': 3.3157894736842106})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/anaconda3/lib/python3.5/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=Pipeline(memory=None,
     steps=[('features', F...(alpha=0.01, class_prior=None, fit_prior=True))]), X=array([ 'Y si el proximo domingo a estas horas #...ad \n#SEP http://t.co/ZuDd6KRW8T'], dtype=object), y=array([0, 0, 0, ..., 1, 1, 1]), scorer={'score': make_scorer(precision_score, average=macro, pos_label=None)}, train=array([1386, 1387, 1388, ..., 4153, 4154, 4155]), test=array([   0,    1,    2, ..., 1383, 1384, 1385]), verbose=0, parameters={'clf__alpha': 0.10526315789473684, 'lda__n_components': 3.3157894736842106}, fit_params={}, return_train_score='warn', return_parameters=False, return_n_test_samples=True, return_times=True, error_score='raise')
    439                       for k, v in fit_params.items()])
    440 
    441     test_scores = {}
    442     train_scores = {}
    443     if parameters is not None:
--> 444         estimator.set_params(**parameters)
        estimator.set_params = <bound method Pipeline.set_params of Pipeline(me...alpha=0.01, class_prior=None, fit_prior=True))])>
        parameters = {'clf__alpha': 0.10526315789473684, 'lda__n_components': 3.3157894736842106}
    445 
    446     start_time = time.time()
    447 
    448     X_train, y_train = _safe_split(estimator, X, y, train)

...........................................................................
/anaconda3/lib/python3.5/site-packages/sklearn/pipeline.py in set_params(self=Pipeline(memory=None,
     steps=[('features', F...(alpha=0.01, class_prior=None, fit_prior=True))]), **kwargs={'clf__alpha': 0.10526315789473684, 'lda__n_components': 3.3157894736842106})
    137 
    138         Returns
    139         -------
    140         self
    141         """
--> 142         self._set_params('steps', **kwargs)
        self._set_params = <bound method _BaseComposition._set_params of Pi...alpha=0.01, class_prior=None, fit_prior=True))])>
        kwargs = {'clf__alpha': 0.10526315789473684, 'lda__n_components': 3.3157894736842106}
    143         return self
    144 
    145     def _validate_steps(self):
    146         names, estimators = zip(*self.steps)

...........................................................................
/anaconda3/lib/python3.5/site-packages/sklearn/utils/metaestimators.py in _set_params(self=Pipeline(memory=None,
     steps=[('features', F...(alpha=0.01, class_prior=None, fit_prior=True))]), attr='steps', **params={'clf__alpha': 0.10526315789473684, 'lda__n_components': 3.3157894736842106})
     44         names, _ = zip(*getattr(self, attr))
     45         for name in list(six.iterkeys(params)):
     46             if '__' not in name and name in names:
     47                 self._replace_estimator(attr, name, params.pop(name))
     48         # 3. Step parameters and other initilisation arguments
---> 49         super(_BaseComposition, self).set_params(**params)
        self.set_params = <bound method Pipeline.set_params of Pipeline(me...alpha=0.01, class_prior=None, fit_prior=True))])>
        params = {'clf__alpha': 0.10526315789473684, 'lda__n_components': 3.3157894736842106}
     50         return self
     51 
     52     def _replace_estimator(self, attr, name, new_val):
     53         # assumes `name` is a valid estimator name

...........................................................................
/anaconda3/lib/python3.5/site-packages/sklearn/base.py in set_params(self=Pipeline(memory=None,
     steps=[('features', F...(alpha=0.01, class_prior=None, fit_prior=True))]), **params={'clf__alpha': 0.10526315789473684, 'lda__n_components': 3.3157894736842106})
    269             key, delim, sub_key = key.partition('__')
    270             if key not in valid_params:
    271                 raise ValueError('Invalid parameter %s for estimator %s. '
    272                                  'Check the list of available parameters '
    273                                  'with `estimator.get_params().keys()`.' %
--> 274                                  (key, self))
        key = 'lda'
        self = Pipeline(memory=None,
     steps=[('features', F...(alpha=0.01, class_prior=None, fit_prior=True))])
    275 
    276             if delim:
    277                 nested_params[key][sub_key] = value
    278             else:

ValueError: Invalid parameter lda for estimator Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...   transformer_weights=None)), ('clf', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))]). Check the list of available parameters with `estimator.get_params().keys()`.
___________________________________________________________________________

gs_NB=gs_NB.fit(X_train,y_train)

print("Best Score with MultinomialNB: %s" % gs_NB.best_score_)
for param_name in sorted(parametersNB.keys()):
    print("%s: %r" % (param_name, gs_NB.best_params_[param_name]))

### SVC

#Optimize SVC


parametersSVC = {'clf__C':range(1,15),'clf__gamma': np.logspace(-6, -1, 10), 'clf__kernel': ('linear','rbf'),
                 'clf__probability':(True,False),}

gs_SVC = GridSearchCV(modelSVC, parametersSVC, n_jobs=-1)

gs_SVC = gs_SVC.fit(X_train, y_train)

 print("Best Score with SVC: %s" % gs_SVC.best_score_)
for param_name in sorted(parametersSVC.keys()):
    print("%s: %r" % (param_name, gs_SVC.best_params_[param_name]))

In [12]:
import numpy as np
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'clf__kernel': ['rbf'], 'clf__gamma': [1e-3, 1e-4],
                     'clf__C': [1, 10, 100, 1000]},
                    {'clf__kernel': ['linear'], 'clf__C': [1, 10, 100, 1000]}]
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(modelSVC, tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()


# Tuning hyper-parameters for precision



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best parameters set found on development set:

{'clf__C': 1, 'clf__kernel': 'linear'}

Grid scores on development set:

0.552 (+/-0.488) for {'clf__C': 1, 'clf__gamma': 0.001, 'clf__kernel': 'rbf'}
0.253 (+/-0.000) for {'clf__C': 1, 'clf__gamma': 0.0001, 'clf__kernel': 'rbf'}
0.826 (+/-0.039) for {'clf__C': 10, 'clf__gamma': 0.001, 'clf__kernel': 'rbf'}
0.552 (+/-0.488) for {'clf__C': 10, 'clf__gamma': 0.0001, 'clf__kernel': 'rbf'}
0.905 (+/-0.019) for {'clf__C': 100, 'clf__gamma': 0.001, 'clf__kernel': 'rbf'}
0.826 (+/-0.039) for {'clf__C': 100, 'clf__gamma': 0.0001, 'clf__kernel': 'rbf'}
0.907 (+/-0.021) for {'clf__C': 1000, 'clf__gamma': 0.001, 'clf__kernel': 'rbf'}
0.905 (+/-0.019) for {'clf__C': 1000, 'clf__gamma': 0.0001, 'clf__kernel': 'rbf'}
0.912 (+/-0.015) for {'clf__C': 1, 'clf__kernel': 'linear'}
0.906 (+/-0.019) for {'clf__C': 10, 'clf__kernel': 'linear'}
0.906 (+/-0.017) for {'clf__C': 100, 'clf__kernel': 'linear'}
0.906 (+/-0.017) for {'clf__C': 1000, 'clf__kernel': 'lin

In [20]:
modelKnn.get_params().keys()

dict_keys(['memory', 'steps', 'features', 'clf', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__words', 'features__ngrams', 'features__lda', 'features__words__analyzer', 'features__words__binary', 'features__words__decode_error', 'features__words__dtype', 'features__words__encoding', 'features__words__input', 'features__words__lowercase', 'features__words__max_df', 'features__words__max_features', 'features__words__min_df', 'features__words__ngram_range', 'features__words__norm', 'features__words__preprocessor', 'features__words__smooth_idf', 'features__words__stop_words', 'features__words__strip_accents', 'features__words__sublinear_tf', 'features__words__token_pattern', 'features__words__tokenizer', 'features__words__use_idf', 'features__words__vocabulary', 'features__ngrams__memory', 'features__ngrams__steps', 'features__ngrams__count_vectorizer', 'features__ngrams__tfidf_transformer', 'features__ngrams__count_vectorizer__analyzer', 'fe

### KNeighbors Classifier

parametersKN = {'clf__n_neighbors': range(1,15), 'clf__p':(1,2),'clf__algorithm':('ball_tree', 'kd_tree', 'brute')}

gs_KN = GridSearchCV(modelKnn, parametersKN, n_jobs=-1)

gs_KN = gs_KN.fit(X_train, y_train)

 print("Best Score with KN: %s" % gs_KN.best_score_)
for param_name in sorted(parametersKN.keys()):
    print("%s: %r" % (param_name, gs_KN.best_params_[param_name]))

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV


k__range = list(range(1, 31))


weight__options = ['uniform', 'distance']

tuned_parameters = [{'clf__n_neighbors': k__range,
                     'clf__weights': ['uniform', 'distance']},
                   ]
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(modelKnn, tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()



# Tuning hyper-parameters for precision



















Best parameters set found on development set:

{'clf__n_neighbors': 29, 'clf__weights': 'distance'}

Grid scores on development set:

0.701 (+/-0.038) for {'clf__n_neighbors': 1, 'clf__weights': 'uniform'}
0.701 (+/-0.038) for {'clf__n_neighbors': 1, 'clf__weights': 'distance'}
0.720 (+/-0.044) for {'clf__n_neighbors': 2, 'clf__weights': 'uniform'}
0.701 (+/-0.038) for {'clf__n_neighbors': 2, 'clf__weights': 'distance'}
0.725 (+/-0.073) for {'clf__n_neighbors': 3, 'clf__weights': 'uniform'}
0.728 (+/-0.069) for {'clf__n_neighbors': 3, 'clf__weights': 'distance'}
0.714 (+/-0.074) for {'clf__n_neighbors': 4, 'clf__weights': 'uniform'}
0.711 (+/-0.051) for {'clf__n_neighbors': 4, 'clf__weights': 'distance'}
0.751 (+/-0.005) for {'clf__n_neighbors': 5, 'clf__weights': 'uniform'}
0.764 (+/-0.030) for {'clf__n_neighbors': 5, 'clf__weights': 'distance'}
0.737 (+/-0.016) for {'clf__n_neighbors': 6, 'clf__weights': 'uniform'}
0.749 (+/-0.025) for {'clf__n_neighbors': 6, 'clf__weights': 'distanc











### LogisticRgression

parametersLR = {'clf__penalty': ['l1','l2'], 'clf__tol': [0.0001,0.001,0.01,0.1], 'clf__C': range(1,15)}

gs_LR = GridSearchCV(modelLR, parametersLR, n_jobs=-1)

gs_LR = gs_LR.fit(X_train, y_train)

 print("Best Score with LogisticRegression: %s" % gs_LR.best_score_)
for param_name in sorted(parametersLR.keys()):
    print("%s: %r" % (param_name, gs_LR.best_params_[param_name]))

In [None]:
tuned_parameters = [{'clf__penalty': ['l1','l2'], 'clf__tol': [0.0001,0.001,0.01,0.1], 'clf__C': range(1,15)}]

scoresLR = ['precision', 'recall']

for score in scoresLR:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    gs_LR = GridSearchCV(modelLR, tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    gs_LR.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(gs_KNN.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = gs_LR.cv_results_['mean_test_score']
    stds = gs_LR.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, gs_KNN.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, gs_LR.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

## Save optimal classifier in disk

In [41]:
# By looking at the output from the above code, the best classifier is the SVC.
import pickle
# Dump the trained classifier with Pickle
svm_pkl_filename = 'senpy/optimized_classifier.pkl'
# Open the file to save as pkl file
svm_model_pkl = open(svm_pkl_filename, 'wb')
pickle.dump(modelSVC, svm_model_pkl)
# Close the pickle instances
svm_model_pkl.close()


In [30]:
print(y_train)

[0 0 0 ..., 0 0 0]


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])