## Classifier and preprocessing

In this notebook, the noironicos dataset will be treated, since ironicos's tweets are all ironic and we want a mixture of ironic and non ironic.

In [1]:
# General import and load data
from sklearn.model_selection import train_test_split
import nltk
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
import re

# Needed for running
nltk.download('punkt')
nltk.download('stopwords')

# Import database
df=pd.read_csv('final_dataset.csv', encoding='utf-8', delimiter=",", header=0)
df.groupby('ironic').size()

# Delete rows containing nan
df=df.dropna(subset=['tweet'])


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/juanalvarez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juanalvarez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Before splitting database, a shuffling action will be performed since data is not randomized.
# That way the train and test splitting will be more balanced

df = df.sample(frac=1).reset_index(drop=True)

# Define X and Y
X = df['tweet'].values
y = df['ironic'].values.astype(int)
print(X[34])

Si @FNFFranco dejaron salir al abuelo de @PabloCasado_ , ¡¡¡tan rojo no sería!!! #PabloCasadoDimision #sarcasmo


### Train and test splitting

In [3]:

# Splitting
# Test set will be the 25% taken randomly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)


## Lexical features
The lexical features analysis will be performed by using the twitter tokenizer provided by nltk library.
Important: This feature extractor is NOT used since tweets are considered to contain only one sentence


# Sample statistics using NLTK
# A transformer will be implemented

from nltk.tokenize import sent_tokenize, word_tokenize


class LexicalStats (BaseEstimator, TransformerMixin):
    """Extract lexical features from each document"""
    
    def number_sentences(self, doc):
        sentences = sent_tokenize(doc, language='spanish')
        return len(sentences)

    def fit(self, x, y=None):
        return self

    def transform(self, docs):
       
        return [{'length': len(doc),
                 'num_sentences': self.number_sentences(doc)}
                
                for doc in docs]

In [4]:
# A tokenizer will be defined
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
import string

def custom_tokenizer(words):
    tokens = word_tokenize(words.lower())
    stemmer = SnowballStemmer('spanish')
    lemmas = [stemmer.stem(t) for t in tokens]
    stoplist = stopwords.words('spanish')
    lemmas_clean = [w for w in lemmas if w not in stoplist]
    punctuation = set(string.punctuation)
    lemmas_punct = [w for w in lemmas_clean if  w not in punctuation]
    return lemmas_punct



## Syntactic features

ALOMEJOR HAY QUE QUITARLO

In [5]:
# We will use NLTK's tag set
from sklearn.base import BaseEstimator, TransformerMixin
from nltk import pos_tag, word_tokenize
import collections

# We can extract particular chunks (trozos, pedazos) from the sentence
# if we use a RegExpParser. See Syntactic Processing
def PosStats(BaseEstimator, TransformerMixin):
    
    def stats(self, doc):
        tokens = custom_tokenizer(doc)
        
        tagged = pos_tag(tokens, tagset = 'universal' )
        counts = collections.Counter(tag for word, tag in tagged)
        total = sum(counts.values())
        #copy tags so that we return always the same number of features
        pos_features = {'NOUN': 0, 'ADJ': 0, 'VERB': 0, 'ADV': 0, 'CONJ': 0, 
                        'ADP': 0, 'PRON':0, 'NUM': 0}
        
        pos_dic = dict((tag, float(count)/total) for tag,count in counts.items())
        for k in pos_dic:
            if k in pos_features:
                pos_features[k] = pos_dic[k]
        return pos_features
    
    def transform(self, docs, y=None):
        return [self.stats(doc) for doc in docs]
    
    def fit(self, docs, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
        

## Feature extraction Pipeline
The feature extraction will be carried out by using pipelines. The defined pipelines are selected in order to extract the desired features

In [6]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer


ngrams_featurizer = Pipeline([
  ('count_vectorizer',  CountVectorizer(ngram_range = (1, 2), encoding = 'ISO-8859-1', 
                                        tokenizer=custom_tokenizer)),
  ('tfidf_transformer', TfidfTransformer())
])

## Feature Union Pipeline
Now we define which features we want to extract, how to combine them and later apple machine learning in the resulting feature set.

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.svm import SVC

def my_pipeline(clf):
    pipeline = Pipeline([
       ('features', FeatureUnion([
                    
                ('words', TfidfVectorizer(tokenizer=custom_tokenizer)),
                ('ngrams', ngrams_featurizer),
                    #('pos_stats', Pipeline([
                                #('pos_stats', PosStats()),
                                #('vectors', DictVectorizer())
                            #])),
                ('lda', Pipeline([ 
                             ('count', CountVectorizer(tokenizer=custom_tokenizer)),
                            ('lda',  LatentDirichletAllocation(n_components=45, max_iter=5, # Change ntopics
                                                       learning_method='online', 
                                                       learning_offset=50.,
                                                       random_state=0))
                         ])),
             ])),
       
    ('clf', clf)  # classifier
    ])
    return pipeline
    



## Multinomial NaiveBayes

In [59]:
from sklearn.naive_bayes import  MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
print("Size of training set: {}   size of test set: {}".format(X_train.shape[0], X_test.shape[0]))
model = MultinomialNB(alpha=.01)
modelNB = my_pipeline(model)
modelNB.fit(X_train, y_train)


Size of training set: 8311   size of test set: 2771


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...   transformer_weights=None)), ('clf', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))])

In [60]:
predicted1 = modelNB.predict(X_test)
expected = y_test

In [61]:
from sklearn import metrics
# Accuracy
metrics.accuracy_score(expected, predicted1)

0.89281847708408513

In [62]:
print(classification_report(expected, predicted1, digits=5))

             precision    recall  f1-score   support

          0    0.88160   0.90417   0.89274      1367
          1    0.90431   0.88177   0.89290      1404

avg / total    0.89311   0.89282   0.89282      2771



### SVC

In [8]:
from sklearn.svm import SVC
from sklearn import metrics

types_of_kernels = ['linear', 'rbf', 'poly']

kernel = types_of_kernels[0]
gamma = 3.0

# Create SVC model
model = SVC(kernel=kernel, probability=True, gamma=gamma)
modelSVC = my_pipeline(model)
modelSVC.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [9]:
predicted2 = modelSVC.predict(X_test)
expected = y_test
metrics.accuracy_score(expected, predicted2)
print(classification_report(expected, predicted2, digits=5))

             precision    recall  f1-score   support

          0    0.93354   0.90990   0.92157      1343
          1    0.91724   0.93908   0.92803      1428

avg / total    0.92514   0.92494   0.92490      2771



### Kneighbors Classifier

In [65]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=7, algorithm='ball_tree')
modelKnn = my_pipeline(model)
modelKnn.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...owski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform'))])

In [66]:
predicted3 = modelKnn.predict(X_test)
expected = y_test
metrics.accuracy_score(expected, predicted3)
print(classification_report(expected, predicted3, digits=5))

             precision    recall  f1-score   support

          0    0.74566   0.81712   0.77976      1367
          1    0.80361   0.72863   0.76429      1404

avg / total    0.77502   0.77228   0.77192      2771



### Logistic Regression classifier

In [67]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(n_jobs = -1)
modelLR = my_pipeline(model)
modelLR.fit(X_train, y_train)

  " = {}.".format(self.n_jobs))


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [68]:
predicted4 = modelLR.predict(X_test)
expected = y_test
metrics.accuracy_score(expected, predicted4)
print(classification_report(expected, predicted4, digits=5))

             precision    recall  f1-score   support

          0    0.92197   0.89027   0.90584      1367
          1    0.89662   0.92664   0.91138      1404

avg / total    0.90913   0.90870   0.90865      2771



## Optimize models
Tune parameters of previously defined models using Grid Search

### Multinomial NaiveBayes

In [28]:
from sklearn.model_selection import GridSearchCV
# Used alpha = .01
parametersNB = {'clf__alpha': [.0001,.001,.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]}
scoresNB = ['precision', 'recall']
for score in scoresNB:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    gs_NB = GridSearchCV(modelNB,parametersNB, n_jobs=-1, scoring='%s_macro' % score)
    gs_NB.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(gs_NB.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = gs_NB.cv_results_['mean_test_score']
    stds = gs_NB.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, gs_NB.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, gs_NB.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
    

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'clf__alpha': 1.0}

Grid scores on development set:

0.825 (+/-0.009) for {'clf__alpha': 0.0001}
0.830 (+/-0.010) for {'clf__alpha': 0.001}
0.842 (+/-0.009) for {'clf__alpha': 0.01}
0.860 (+/-0.015) for {'clf__alpha': 0.1}
0.867 (+/-0.012) for {'clf__alpha': 0.2}
0.870 (+/-0.007) for {'clf__alpha': 0.3}
0.872 (+/-0.007) for {'clf__alpha': 0.4}
0.874 (+/-0.005) for {'clf__alpha': 0.5}
0.875 (+/-0.004) for {'clf__alpha': 0.6}
0.876 (+/-0.004) for {'clf__alpha': 0.7}
0.877 (+/-0.001) for {'clf__alpha': 0.8}
0.877 (+/-0.003) for {'clf__alpha': 0.9}
0.877 (+/-0.004) for {'clf__alpha': 1.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       0.94      0.85      0.89      1335
          1       0.87      0.95      0.91      1436

avg / total       0.9

gs_NB=gs_NB.fit(X_train,y_train)

print("Best Score with MultinomialNB: %s" % gs_NB.best_score_)
for param_name in sorted(parametersNB.keys()):
    print("%s: %r" % (param_name, gs_NB.best_params_[param_name]))

### SVC

#Optimize SVC


parametersSVC = {'clf__C':range(1,15),'clf__gamma': np.logspace(-6, -1, 10), 'clf__kernel': ('linear','rbf'),
                 'clf__probability':(True,False),}

gs_SVC = GridSearchCV(modelSVC, parametersSVC, n_jobs=-1)

gs_SVC = gs_SVC.fit(X_train, y_train)

 print("Best Score with SVC: %s" % gs_SVC.best_score_)
for param_name in sorted(parametersSVC.keys()):
    print("%s: %r" % (param_name, gs_SVC.best_params_[param_name]))

In [12]:
import numpy as np
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'clf__kernel': ['rbf'], 'clf__gamma': [1e-3, 1e-4],
                     'clf__C': [1, 10, 100, 1000]},
                    {'clf__kernel': ['linear'], 'clf__C': [1, 10, 100, 1000]}]
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(modelSVC, tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()


# Tuning hyper-parameters for precision



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best parameters set found on development set:

{'clf__C': 1, 'clf__kernel': 'linear'}

Grid scores on development set:

0.552 (+/-0.488) for {'clf__C': 1, 'clf__gamma': 0.001, 'clf__kernel': 'rbf'}
0.253 (+/-0.000) for {'clf__C': 1, 'clf__gamma': 0.0001, 'clf__kernel': 'rbf'}
0.826 (+/-0.039) for {'clf__C': 10, 'clf__gamma': 0.001, 'clf__kernel': 'rbf'}
0.552 (+/-0.488) for {'clf__C': 10, 'clf__gamma': 0.0001, 'clf__kernel': 'rbf'}
0.905 (+/-0.019) for {'clf__C': 100, 'clf__gamma': 0.001, 'clf__kernel': 'rbf'}
0.826 (+/-0.039) for {'clf__C': 100, 'clf__gamma': 0.0001, 'clf__kernel': 'rbf'}
0.907 (+/-0.021) for {'clf__C': 1000, 'clf__gamma': 0.001, 'clf__kernel': 'rbf'}
0.905 (+/-0.019) for {'clf__C': 1000, 'clf__gamma': 0.0001, 'clf__kernel': 'rbf'}
0.912 (+/-0.015) for {'clf__C': 1, 'clf__kernel': 'linear'}
0.906 (+/-0.019) for {'clf__C': 10, 'clf__kernel': 'linear'}
0.906 (+/-0.017) for {'clf__C': 100, 'clf__kernel': 'linear'}
0.906 (+/-0.017) for {'clf__C': 1000, 'clf__kernel': 'lin

In [20]:
modelKnn.get_params().keys()

dict_keys(['memory', 'steps', 'features', 'clf', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__words', 'features__ngrams', 'features__lda', 'features__words__analyzer', 'features__words__binary', 'features__words__decode_error', 'features__words__dtype', 'features__words__encoding', 'features__words__input', 'features__words__lowercase', 'features__words__max_df', 'features__words__max_features', 'features__words__min_df', 'features__words__ngram_range', 'features__words__norm', 'features__words__preprocessor', 'features__words__smooth_idf', 'features__words__stop_words', 'features__words__strip_accents', 'features__words__sublinear_tf', 'features__words__token_pattern', 'features__words__tokenizer', 'features__words__use_idf', 'features__words__vocabulary', 'features__ngrams__memory', 'features__ngrams__steps', 'features__ngrams__count_vectorizer', 'features__ngrams__tfidf_transformer', 'features__ngrams__count_vectorizer__analyzer', 'fe

### KNeighbors Classifier

parametersKN = {'clf__n_neighbors': range(1,15), 'clf__p':(1,2),'clf__algorithm':('ball_tree', 'kd_tree', 'brute')}

gs_KN = GridSearchCV(modelKnn, parametersKN, n_jobs=-1)

gs_KN = gs_KN.fit(X_train, y_train)

 print("Best Score with KN: %s" % gs_KN.best_score_)
for param_name in sorted(parametersKN.keys()):
    print("%s: %r" % (param_name, gs_KN.best_params_[param_name]))

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV


k__range = list(range(1, 31))


weight__options = ['uniform', 'distance']

tuned_parameters = [{'clf__n_neighbors': k__range,
                     'clf__weights': ['uniform', 'distance']},
                   ]
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(modelKnn, tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()



# Tuning hyper-parameters for precision



















Best parameters set found on development set:

{'clf__n_neighbors': 29, 'clf__weights': 'distance'}

Grid scores on development set:

0.701 (+/-0.038) for {'clf__n_neighbors': 1, 'clf__weights': 'uniform'}
0.701 (+/-0.038) for {'clf__n_neighbors': 1, 'clf__weights': 'distance'}
0.720 (+/-0.044) for {'clf__n_neighbors': 2, 'clf__weights': 'uniform'}
0.701 (+/-0.038) for {'clf__n_neighbors': 2, 'clf__weights': 'distance'}
0.725 (+/-0.073) for {'clf__n_neighbors': 3, 'clf__weights': 'uniform'}
0.728 (+/-0.069) for {'clf__n_neighbors': 3, 'clf__weights': 'distance'}
0.714 (+/-0.074) for {'clf__n_neighbors': 4, 'clf__weights': 'uniform'}
0.711 (+/-0.051) for {'clf__n_neighbors': 4, 'clf__weights': 'distance'}
0.751 (+/-0.005) for {'clf__n_neighbors': 5, 'clf__weights': 'uniform'}
0.764 (+/-0.030) for {'clf__n_neighbors': 5, 'clf__weights': 'distance'}
0.737 (+/-0.016) for {'clf__n_neighbors': 6, 'clf__weights': 'uniform'}
0.749 (+/-0.025) for {'clf__n_neighbors': 6, 'clf__weights': 'distanc











### LogisticRgression

parametersLR = {'clf__penalty': ['l1','l2'], 'clf__tol': [0.0001,0.001,0.01,0.1], 'clf__C': range(1,15)}

gs_LR = GridSearchCV(modelLR, parametersLR, n_jobs=-1)

gs_LR = gs_LR.fit(X_train, y_train)

 print("Best Score with LogisticRegression: %s" % gs_LR.best_score_)
for param_name in sorted(parametersLR.keys()):
    print("%s: %r" % (param_name, gs_LR.best_params_[param_name]))

In [None]:
tuned_parameters = [{'clf__penalty': ['l1','l2'], 'clf__tol': [0.0001,0.001,0.01,0.1], 'clf__C': range(1,15)}]

scoresLR = ['precision', 'recall']

for score in scoresLR:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    gs_LR = GridSearchCV(modelLR, tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    gs_LR.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(gs_KNN.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = gs_LR.cv_results_['mean_test_score']
    stds = gs_LR.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, gs_KNN.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, gs_LR.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

## Save optimal classifier in disk

In [11]:
# By looking at the output from the above code, the best classifier is the SVC.
import pickle
# Dump the trained classifier with Pickle
svm_pkl_filename = 'senpy/senpy_data/optimized_classifier.pkl'
# Open the file to save as pkl file
svm_model_pkl = open(svm_pkl_filename, 'wb')
pickle.dump(modelSVC, svm_model_pkl)
# Close the pickle instances
svm_model_pkl.close()


In [30]:
print(y_train)

[0 0 0 ..., 0 0 0]


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_r...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])