In [1]:
# macro do ipython para rederizar o matplotlib inline
%matplotlib inline

# numpy é uma biblioteca de python que 
# nos permite fazer operações matriciais e vetoriais
# facilmente, e eficientemente (até um certo tamanho)
import numpy as np

import matplotlib.pyplot as plt

from pylab import rcParams
rcParams['figure.figsize'] = 15, 10

import time

import warnings
warnings.filterwarnings('ignore')

## Objetivo

Dar alguns exemplos de ensembles no contexto de classificação textual.

### Carregando conjunto de dados

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import  cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score

from sklearn.linear_model import Perceptron
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# carregando conjunto de dados
train_20ng = fetch_20newsgroups(subset='train', shuffle=True, remove=('headers', 'quotes', 'footers'))
test_20ng = fetch_20newsgroups(subset='test', shuffle=True, remove=('headers', 'quotes', 'footers'))

# Vamos criar o nosso fluxo de execução
pipe = make_pipeline(
    CountVectorizer(stop_words='english', min_df=2),
    TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True),
    Perceptron(random_state = 0)
)

pipe

Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english...n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False))])

#### Perceptron

In [3]:
cv = StratifiedKFold(n_splits=5)

valid_score = cross_val_score(pipe, train_20ng.data, train_20ng.target,
                              scoring='f1_macro', cv=cv, n_jobs=-1, verbose=1)
pipe.fit(train_20ng.data, train_20ng.target)

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    5.3s remaining:    8.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.7s finished


Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english...n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False))])

In [4]:
print(valid_score.mean())
print("Teste F1: %f" % (f1_score(test_20ng.target, pipe.predict(test_20ng.data), average='macro')))

0.693276526757
Teste F1: 0.618027


#### Bagging de Perceptrons

In [5]:
from sklearn.ensemble import BaggingClassifier

pipe = make_pipeline(
    CountVectorizer(stop_words='english', min_df=2),
    TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True),
    BaggingClassifier(base_estimator=Perceptron(random_state = 0), n_estimators=50, n_jobs=-1, oob_score=True)
)

valid_score = cross_val_score(pipe, train_20ng.data, train_20ng.target,
                              scoring='f1_macro', cv=cv, n_jobs=1, verbose=1)
pipe.fit(train_20ng.data, train_20ng.target)

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min finished


Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english...estimators=50, n_jobs=-1, oob_score=True,
         random_state=None, verbose=0, warm_start=False))])

In [6]:
print("Baggin de Perceptrons")
print("OOB Score: ", pipe.steps[-1][-1].oob_score_)
print("Valid. F1: ", valid_score.mean())
print("Teste F1: %f" % (f1_score(test_20ng.target, pipe.predict(test_20ng.data), average='macro')))

Baggin de Perceptrons
OOB Score:  0.764804666785
Valid. F1:  0.749541995122
Teste F1: 0.674578


#### Ensemble de Árvores de Decisão

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

pipe = make_pipeline(
    CountVectorizer(stop_words='english', min_df=2),
    TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True),
    DecisionTreeClassifier()
)

valid_score = cross_val_score(pipe, train_20ng.data, train_20ng.target,
                              scoring='f1_macro', cv=cv, n_jobs=-1, verbose=1)
pipe.fit(train_20ng.data, train_20ng.target)

print("Árvore de Decisão")
print("Valid. F1: ",valid_score.mean())
print("Teste F1: %f" % (f1_score(test_20ng.target, pipe.predict(test_20ng.data), average='macro')))

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   24.9s remaining:   37.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   25.4s finished


Árvore de Decisão
Valid. F1:  0.47155917936
Teste F1: 0.429698


In [8]:
pipe = make_pipeline(
    CountVectorizer(stop_words='english', min_df=2),
    TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True),
    RandomForestClassifier(n_jobs=-1, n_estimators=100, oob_score=True)
)

valid_score = cross_val_score(pipe, train_20ng.data, train_20ng.target,
                              scoring='f1_macro', cv=cv, n_jobs=-1, verbose=1)
pipe.fit(train_20ng.data, train_20ng.target)

print("Random Forest")
print("OOB Score: ", pipe.steps[-1][-1].oob_score_)
print("Valid. F1: ", valid_score.mean())
print("Teste F1: %f" % (f1_score(test_20ng.target, pipe.predict(test_20ng.data), average='macro')))

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   35.9s remaining:   53.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   36.5s finished


Random Forest
OOB Score:  0.65432207884
Valid. F1:  0.650516480409
Teste F1: 0.596571


In [None]:
pipe = make_pipeline(
    CountVectorizer(stop_words='english', min_df=2),
    TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True),
    BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=200, n_jobs=-1)
)

valid_score = cross_val_score(pipe, train_20ng.data, train_20ng.target,
                              scoring='f1_macro', cv=cv, n_jobs=1, verbose=1)
pipe.fit(train_20ng.data, train_20ng.target)

print("Bagging de Árvores")
print("Valid. F1: ", valid_score.mean())
print("Teste F1: %f" % (f1_score(test_20ng.target, pipe.predict(test_20ng.data), average='macro')))

## Voto majoritário

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# Vamos utilizar os melhores modelos que vimos até então.
# São eles: SVM, Bagging de Perceptrons e lreg
clfs = [('svm', LinearSVC(C=0.5, dual=True)),
        ('bag', BaggingClassifier(base_estimator=Perceptron(random_state = 0), n_estimators=50, n_jobs=-1)),
        ('lreg', LogisticRegression(C=0.297302, random_state=0, dual=True, n_jobs=-1))]

vote = VotingClassifier(estimators=clfs)

pipe = make_pipeline(
    CountVectorizer(stop_words='english', min_df=2),
    TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True),
    vote
)

valid_score = cross_val_score(pipe, train_20ng.data, train_20ng.target,
                              scoring='f1_macro', cv=cv, n_jobs=1, verbose=1)
pipe.fit(train_20ng.data, train_20ng.target)
print("Voto")
print("Valid. F1: ",valid_score.mean())
print("Teste F1: %f" % (f1_score(test_20ng.target, pipe.predict(test_20ng.data), average='macro')))

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   53.9s finished


Voto
Valid. F1:  0.754733592708
Teste F1: 0.685985
