## Classifier and preprocessing

In this notebook, the noironicos dataset will be treated, since ironicos's tweets are all ironic and we want a mixture of ironic and non ironic.

In [1]:
# General import and load data
from sklearn.model_selection import train_test_split
import numpy
import nltk
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
import re

# Needed for running
nltk.download('punkt')
nltk.download('stopwords')

# Import database
df=pd.read_csv('final_dataset.csv', encoding='utf-8', delimiter=",", header=0)
df.groupby('ironic').size()

# Delete rows containing nan
df=df.dropna(subset=['tweet'])


[nltk_data] Error loading punkt: <urlopen error [Errno 101] Network is
[nltk_data]     unreachable>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     self signed certificate (_ssl.c:1045)>


In [2]:
# Before splitting database, a shuffling action will be performed since data is not randomized.
# That way the train and test splitting will be more balanced

df = df.sample(frac=1).reset_index(drop=True)

# Define X and Y
X = df['tweet'].values
y = df['ironic'].values.astype(int)
print(X[34])

PECES BARBA ?RT @lvicmartin: "@dexmith: @Buenafuente @ClaraZumo Sin barba pareces Berto."sin Berto pareces.... ?barba?


In [3]:
df.groupby('ironic').size()

ironic
0    5444
1    5638
dtype: int64

### Train and test splitting

In [4]:

# Splitting
# Test set will be the 25% taken randomly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
print(len(X_train))
print(len(X_test))
print(len(X))


8311
2771
11082


## Lexical features
The lexical features analysis will be performed by using the twitter tokenizer provided by nltk library.
Important: This feature extractor is NOT used since tweets are considered to contain only one sentence


# Sample statistics using NLTK
# A transformer will be implemented

from nltk.tokenize import sent_tokenize, word_tokenize


class LexicalStats (BaseEstimator, TransformerMixin):
    """Extract lexical features from each document"""
    
    def number_sentences(self, doc):
        sentences = sent_tokenize(doc, language='spanish')
        return len(sentences)

    def fit(self, x, y=None):
        return self

    def transform(self, docs):
       
        return [{'length': len(doc),
                 'num_sentences': self.number_sentences(doc)}
                
                for doc in docs]

In [5]:
# A tokenizer will be defined
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
import string

def custom_tokenizer(words):
    tokens = word_tokenize(words.lower())
    stemmer = SnowballStemmer('spanish')
    lemmas = [stemmer.stem(t) for t in tokens]
    stoplist = stopwords.words('spanish')
    lemmas_clean = [w for w in lemmas if w not in stoplist]
    punctuation = set(string.punctuation)
    lemmas_punct = [w for w in lemmas_clean if  w not in punctuation]
    return lemmas_punct



## Syntactic features

ALOMEJOR HAY QUE QUITARLO

In [6]:
# We will use NLTK's tag set
from sklearn.base import BaseEstimator, TransformerMixin
from nltk import pos_tag, word_tokenize
import collections

# We can extract particular chunks (trozos, pedazos) from the sentence
# if we use a RegExpParser. See Syntactic Processing
def PosStats(BaseEstimator, TransformerMixin):
    
    def stats(self, doc):
        tokens = custom_tokenizer(doc)
        
        tagged = pos_tag(tokens, tagset = 'universal' )
        counts = collections.Counter(tag for word, tag in tagged)
        total = sum(counts.values())
        #copy tags so that we return always the same number of features
        pos_features = {'NOUN': 0, 'ADJ': 0, 'VERB': 0, 'ADV': 0, 'CONJ': 0, 
                        'ADP': 0, 'PRON':0, 'NUM': 0}
        
        pos_dic = dict((tag, float(count)/total) for tag,count in counts.items())
        for k in pos_dic:
            if k in pos_features:
                pos_features[k] = pos_dic[k]
        return pos_features
    
    def transform(self, docs, y=None):
        return [self.stats(doc) for doc in docs]
    
    def fit(self, docs, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
        

## Feature extraction Pipeline
The feature extraction will be carried out by using pipelines. The defined pipelines are selected in order to extract the desired features

In [7]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer


ngrams_featurizer = Pipeline([
  ('count_vectorizer',  CountVectorizer(ngram_range = (1, 2), encoding = 'ISO-8859-1', 
                                        tokenizer=custom_tokenizer)),
  ('tfidf_transformer', TfidfTransformer())
])

## Feature Union Pipeline
Now we define which features we want to extract, how to combine them and later apple machine learning in the resulting feature set.

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.svm import SVC

def my_pipeline(clf):
    pipeline = Pipeline([
       ('features', FeatureUnion([
                    
                ('words', TfidfVectorizer(tokenizer=custom_tokenizer)),
                ('ngrams', ngrams_featurizer),
                    #('pos_stats', Pipeline([
                                #('pos_stats', PosStats()),
                                #('vectors', DictVectorizer())
                            #])),
                ('lda', Pipeline([ 
                             ('count', CountVectorizer(tokenizer=custom_tokenizer)),
                            ('lda',  LatentDirichletAllocation(n_components=45, max_iter=5, # Change ntopics
                                                       learning_method='online', 
                                                       learning_offset=50.,
                                                       random_state=0))
                         ])),
             ])),
       
    ('clf', clf)  # classifier
    ])
    return pipeline
    

## Multinomial NaiveBayes

In [9]:
from sklearn.naive_bayes import  MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
print("Size of training set: {}   size of test set: {}".format(X_train.shape[0], X_test.shape[0]))
model = MultinomialNB(alpha=.01)
modelNB = my_pipeline(model)
modelNB.fit(X_train, y_train)


Size of training set: 8311   size of test set: 2771


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ng...   transformer_weights=None)), ('clf', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))])

In [10]:
predicted1 = modelNB.predict(X_test)
expected = y_test

In [11]:
from sklearn import metrics
# Accuracy
metrics.accuracy_score(expected, predicted1)

0.81378563695416817

In [12]:
print(classification_report(expected, predicted1, digits=5))

             precision    recall  f1-score   support

          0    0.81733   0.78781   0.80230      1329
          1    0.81074   0.83773   0.82401      1442

avg / total    0.81390   0.81379   0.81360      2771



### SVC

In [9]:
from sklearn.svm import SVC
from sklearn import metrics

types_of_kernels = ['linear', 'rbf', 'poly']

kernel = types_of_kernels[0]

# Create SVC model
# C already optimized
model = SVC(kernel=kernel, C = 12.915496650148826)
modelSVC = my_pipeline(model)
modelSVC.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [10]:
predicted2 = modelSVC.predict(X_test)
expected = y_test
metrics.accuracy_score(expected, predicted2)
print(classification_report(expected, predicted2, digits=5))

             precision    recall  f1-score   support

          0    0.90325   0.88476   0.89391      1319
          1    0.89723   0.91391   0.90549      1452

avg / total    0.90009   0.90004   0.89998      2771



### Kneighbors Classifier

In [15]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=7, algorithm='ball_tree')
modelKnn = my_pipeline(model)
modelKnn.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...owski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform'))])

In [16]:
predicted3 = modelKnn.predict(X_test)
expected = y_test
metrics.accuracy_score(expected, predicted3)
print(classification_report(expected, predicted3, digits=5))

             precision    recall  f1-score   support

          0    0.78818   0.72235   0.75383      1329
          1    0.76240   0.82108   0.79065      1442

avg / total    0.77476   0.77373   0.77299      2771



### Logistic Regression classifier

In [9]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(n_jobs = -1)
modelLR = my_pipeline(model)
modelLR.fit(X_train, y_train)

  " = {}.".format(self.n_jobs))


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [10]:
predicted4 = modelLR.predict(X_test)
expected = y_test
metrics.accuracy_score(expected, predicted4)
print(classification_report(expected, predicted4, digits=5))

NameError: name 'metrics' is not defined

## Optimize models
Tune parameters of previously defined models using Grid Search

In [56]:
modelSVC.get_params().keys()

dict_keys(['features__words__decode_error', 'features__lda__lda__learning_decay', 'features__lda__count__max_df', 'features__words__vocabulary', 'features__lda__lda__max_iter', 'features__words__dtype', 'features__lda__lda__batch_size', 'clf__max_iter', 'features__lda__count__binary', 'features__lda__steps', 'clf__decision_function_shape', 'features__words__encoding', 'features__lda__lda__n_jobs', 'features__ngrams__count_vectorizer__tokenizer', 'features__words__preprocessor', 'features__words__norm', 'features__ngrams', 'features', 'features__lda__lda__doc_topic_prior', 'features__ngrams__count_vectorizer__decode_error', 'features__lda__lda__random_state', 'clf__tol', 'features__lda__count__min_df', 'features__ngrams__count_vectorizer__max_df', 'features__lda__lda__learning_method', 'features__ngrams__count_vectorizer__min_df', 'memory', 'features__ngrams__count_vectorizer__input', 'features__lda__memory', 'features__lda__count__vocabulary', 'features__words__input', 'features__lda__

### Multinomial NaiveBayes

In [25]:
from sklearn.model_selection import GridSearchCV
# Used alpha = .01
parametersNB = {'clf__alpha':numpy.linspace(0,2,20)[1:], 'features__lda__lda__n_components':[10, 15, 20, 25, 30]}

gs_NB = GridSearchCV(modelNB, parametersNB, n_jobs=-1)

In [18]:
gs_NB=gs_NB.fit(X_train,y_train)

In [19]:
print("Best Score with MultinomialNB: %s" % gs_NB.best_score_)
for param_name in sorted(parametersNB.keys()):
    print("%s: %r" % (param_name, gs_NB.best_params_[param_name]))

Best Score with MultinomialNB: 0.858999037536
clf__alpha: 1.8947368421052631
features__lda__lda__n_components: 15


In [26]:
modelNB.set_params(clf__alpha=gs_NB.best_params_['clf__alpha'], features__lda__lda__n_components=gs_NB.best_params_['features__lda__lda__n_components'], )

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

### SVC

#Optimize SVC


parametersSVC = {'clf__C':range(1,15),'clf__gamma': np.logspace(-6, -1, 10), 'clf__kernel': ('linear','rbf'),
                 'clf__probability':(True,False),}

gs_SVC = GridSearchCV(modelSVC, parametersSVC, n_jobs=-1)

gs_SVC = gs_SVC.fit(X_train, y_train)

 print("Best Score with SVC: %s" % gs_SVC.best_score_)
for param_name in sorted(parametersSVC.keys()):
    print("%s: %r" % (param_name, gs_SVC.best_params_[param_name]))

In [11]:
import numpy as np
from sklearn.model_selection import GridSearchCV

parametersSVC = {
    'features__lda__lda__n_components':np.linspace(30, 120, 10),
    'features__words__ngram_range':[(1,4), (1,5), (1,6), (1,7)]
} 
gs_SVC = GridSearchCV(modelSVC, parametersSVC, n_jobs=-1)



In [12]:
modelSVC.set_params(features__lda__lda__n_components=90)
gs_SVC = gs_SVC.fit(X_train, y_train)

  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, 

  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, 

  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, 

  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, init_var, (self._n_components, n_features))
  init_gamma, 

In [13]:
print("Best Score with SVC: %s" % gs_SVC.best_score_)
for param_name in sorted(parametersSVC.keys()):
    print("%s: %r" % (param_name, gs_SVC.best_params_[param_name]))

Best Score with SVC: 0.893166506256
features__lda__lda__n_components: 100.0
features__words__ngram_range: (1, 5)


In [33]:
modelSVC.set_params(clf__kernel=gs_SVC.best_params_['clf__kernel'],
                   clf__C=gs_SVC.best_params_['clf__C'],
                    features__lda__lda__n_components=gs_SVC.best_params_['features__lda__lda__n_components'])

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

### KNeighbors Classifier

parametersKN = {'clf__n_neighbors': range(1,15), 'clf__p':(1,2),'clf__algorithm':('ball_tree', 'kd_tree', 'brute')}

gs_KN = GridSearchCV(modelKnn, parametersKN, n_jobs=-1)

gs_KN = gs_KN.fit(X_train, y_train)

 print("Best Score with KN: %s" % gs_KN.best_score_)
for param_name in sorted(parametersKN.keys()):
    print("%s: %r" % (param_name, gs_KN.best_params_[param_name]))

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV


k__range = list(range(1, 31))


weight__options = ['uniform', 'distance']

tuned_parameters = [{'clf__n_neighbors': k__range,
                     'clf__weights': ['uniform', 'distance']},
                   ]
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(modelKnn, tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()



# Tuning hyper-parameters for precision



















Best parameters set found on development set:

{'clf__n_neighbors': 29, 'clf__weights': 'distance'}

Grid scores on development set:

0.701 (+/-0.038) for {'clf__n_neighbors': 1, 'clf__weights': 'uniform'}
0.701 (+/-0.038) for {'clf__n_neighbors': 1, 'clf__weights': 'distance'}
0.720 (+/-0.044) for {'clf__n_neighbors': 2, 'clf__weights': 'uniform'}
0.701 (+/-0.038) for {'clf__n_neighbors': 2, 'clf__weights': 'distance'}
0.725 (+/-0.073) for {'clf__n_neighbors': 3, 'clf__weights': 'uniform'}
0.728 (+/-0.069) for {'clf__n_neighbors': 3, 'clf__weights': 'distance'}
0.714 (+/-0.074) for {'clf__n_neighbors': 4, 'clf__weights': 'uniform'}
0.711 (+/-0.051) for {'clf__n_neighbors': 4, 'clf__weights': 'distance'}
0.751 (+/-0.005) for {'clf__n_neighbors': 5, 'clf__weights': 'uniform'}
0.764 (+/-0.030) for {'clf__n_neighbors': 5, 'clf__weights': 'distance'}
0.737 (+/-0.016) for {'clf__n_neighbors': 6, 'clf__weights': 'uniform'}
0.749 (+/-0.025) for {'clf__n_neighbors': 6, 'clf__weights': 'distanc











### LogisticRgression

parametersLR = {'clf__penalty': ['l1','l2'], 'clf__tol': [0.0001,0.001,0.01,0.1], 'clf__C': range(1,15)}

gs_LR = GridSearchCV(modelLR, parametersLR, n_jobs=-1)

gs_LR = gs_LR.fit(X_train, y_train)

 print("Best Score with LogisticRegression: %s" % gs_LR.best_score_)
for param_name in sorted(parametersLR.keys()):
    print("%s: %r" % (param_name, gs_LR.best_params_[param_name]))

In [24]:
modelLR.get_params().keys()

dict_keys(['features__words__decode_error', 'features__lda__lda__learning_decay', 'features__lda__count__max_df', 'features__words__vocabulary', 'features__lda__lda__max_iter', 'features__words__dtype', 'features__lda__lda__batch_size', 'clf__max_iter', 'features__lda__count__binary', 'features__lda__steps', 'clf__C', 'features__words__encoding', 'features__lda__lda__n_jobs', 'features__ngrams__count_vectorizer__tokenizer', 'features__words__preprocessor', 'clf__n_jobs', 'features__words__norm', 'features__ngrams', 'features', 'features__lda__lda__doc_topic_prior', 'features__ngrams__count_vectorizer__decode_error', 'clf__intercept_scaling', 'features__lda__lda__random_state', 'clf__tol', 'features__lda__count__min_df', 'features__ngrams__count_vectorizer__max_df', 'features__lda__lda__learning_method', 'features__ngrams__count_vectorizer__min_df', 'memory', 'features__ngrams__count_vectorizer__input', 'features__lda__memory', 'features__lda__count__vocabulary', 'features__words__input

In [37]:
from sklearn.model_selection import GridSearchCV
parametersLR = [{ 
                 'clf__C': [0.001, 0.1 ,1,5],
                 'features__lda__lda__n_components':[10, 15, 20, 25, 30]
}]
gs_LR = GridSearchCV(modelLR, parametersLR)


In [38]:
gs_LR = gs_LR.fit(X_train, y_train)

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))


In [46]:
print("Best Score with LR: %s" % gs_LR.best_score_)


Best Score with LR: 0.894850818094


In [47]:
print(gs_LR.best_params_)

{'features__lda__lda__n_components': 30, 'clf__C': 5}


In [48]:
modelLR.set_params( clf__C=gs_LR.best_params_['clf__C'],
                    features__lda__lda__n_components=gs_LR.best_params_['features__lda__lda__n_components'])

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

## Save optimal classifier in disk

In [41]:
# By looking at the output from the above code, the best classifier is the SVC.
import pickle
# Dump the trained classifier with Pickle
svm_pkl_filename = 'senpy/optimized_classifier.pkl'
# Open the file to save as pkl file
svm_model_pkl = open(svm_pkl_filename, 'wb')
pickle.dump(modelSVC, svm_model_pkl)
# Close the pickle instances
svm_model_pkl.close()


In [30]:
print(y_train)

[0 0 0 ..., 0 0 0]


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])