# Dependências

In [1]:
import os
import re
import unicodedata
import random
from enum import Enum

import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

from sklearn.externals import joblib

from sklearn.exceptions import UndefinedMetricWarning
import warnings

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Funções auxiliares

Aqui são criados classes e métodos auxiliares para a tokenização e classificação dos documentos.
___

## Tokenização

- TODO: Adicionar `stemmer` e `lemmatizer`

Aqui são definidos duas classes, a primeira é um `Enum` que organiza os possíveis tipos de tokens. A segunda é um tokenizador, capaz de remover _stopwords_, utilizando o nltk, além de filtrar apenas os tipos de tokens desejados.

In [2]:
class ETokenType(Enum):
    """
    Enumerable class with all token's types.
    Update this enum every time a new regex group is added to WordTokenizer._token_pattern
    The order of the values must match with WordTokenizer._token_pattern regexes' order
    """
    EMAIL = 0
    URL = 1
    GLUED_TITLES = 2
    GLUED_WORD = 3
    GLUED_LOWER = 4
    TELEPHONE_CEP = 5
    VALUE = 6
    DATE = 7
    GLUED_VALUE = 8
    WORD = 9
    NON_WORD = 10

class WordTokenizer(object):

    # _token_pattern holds its state across instances of WordTokenizer
    # Every time a new regex group is added to _token_pattern, ETokenType must be updated
    # The order of the regexes' order must match with ETokenType values' order
    _token_pattern = r"""(?x)           # Set flag to allow verbose regexps
        ([\w\.-]+@[\w\.-]+(?:\.[\w]+)+) # E-mail regex
        | (                             # URL regex
            (?:http(?:s)?(?::)?(?:\\\\)?)?  # Optional http or https followed by optional : and //
            (?:[a-z0-9_-]+\.)?              # Optional domain
            [a-z0-9_-]+                     # host
            (?:\.
                (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
            )+
            (?::[0-9]+)?                    # Optional port
            (?!\w)(?:\/(?:[^\s\.,]|[\.,][^\s\.,])+)*(?![^\.,]$)  # Optional relative URI
        )
        | ([A-Z][a-z]+(?=\.?(?:[A-Z][A-Za-z]|\d)+)) # Capture titles glued to digits or other words
        | ([A-Z][A-Za-z]+(?=\.?(?:[A-Z][a-z]|\d)+)) # Capture words glued to digits or other words
        | ([a-z]+(?=\.?(?:[A-Z]|\d)+))              # Capture lower words glued to digits or captalized words
        | (         # Capture telephones and CEPs
            (?:         # Asserts telephones
                (?:(?:\(?\ *)\d{2,3}(?:\ *\))?)?    # Gets the DDD
                (?:\ *9\ *(?:\.|-|\/|\\)?)?         # Optional ninth digit
                (?!(?:1|2)\d{3})        # Negative lookahead to prevent from getting years
                \d{4}(?:\.|-|\/|\\)?        # First 4 telephone digits with optional separator
                \d{4}                       # Last 4 digits
            ) | (?:     # Asserts CEPs
                \d{2}(?:\.|-|\/|\\)?    # First two digits, followed by an optional separator
                \d{3}(?:\.|-|\/|\\)?    # Following three digits, followed by an optional separator
                \d{3}                   # Last three digits
            )   # Since the CEPs regex gets some telephones as false positives
        )       # both regexes are in same group
        | (             # Capture values (as in currencies, percentage, measures...)
            (?<![\d\.\/\\-])        # Negative lookbehind for digits or separators
            (?:(?:R?\$|€)(?:\ )*)?  # Currencies symbols
            (?!(?:1|2)\d{3})        # Negative lookahead to prevent from getting years
            \d+                     # Proper digits
            (?:
                (?:\.|,)            # Punctuation
                (?!(?:1|2)\d{3})    # Negative lookahead to prevent from getting years
                \d+                 # After punctuation digits
            )*
            (?:%|\w{1,3}\b)?        # Percentage or measures abbreviations
            # (?![\d\.\/\\-])         # Negative lookahead for digits or separators TODO: Fix it by 15%15%9999999999911111 199999999999999 12-1999 janeiro/2000 09/9/2000
        )
        | (         # Date regex
            # (?<![\d])   # Negative lookbehind for digits
            (?:(?:0?[1-9]|[1-2][0-9]|3[0-1])(?!\d)(?:\.|-|\/|\\))?    # Asserts the first of three parts of a date (optional)
            (?:(?:[A-Za-z_]+|0?[1-9]|[1-2][0-9]|3[0-1])(?!\d)(?:\.|-|\/|\\))?   # Asserts the second part, can be either a word or one to two digits (optional)
            (?:(?:(?:1|2)\d{3})|[0-9]{2})(?!\d)                       # Asserts the year
        )
        | (     # Capture (glued) values (as in currencies, percentage, measures...)
            (?:(?:R?\$|€)(?:\ )*)?  # Currencies symbols
            \d+                     # Proper digits
            (?:(?:\.|,)\d+)*        # Punctuation
            (?:%|\w{1,3}\b)?        # Percentage or measures abbreviations
        )       # This second search aims to get values that were glued to digits or separators
        | ((?:\w+\.?)*(?:\w+))   # Words and abbreviations with optional : at the end
        | ([^A-Za-z0-9\ \n])    # Every thing that is not a letter, a digit, space or line break
    """

    @property
    def token_pattern(self):
        """
        Read-only property. This property holds its state across instances of WordTokenizer.
        """
        return self._token_pattern

    @property
    def stopwords(self):
        # TODO: Set self.remove_stopwords setter to also set self._stopwords considering the lang
        #       Also set the lang setter to change self._stopwords accordingly
        """
        Read-only property. Returns the list of stopwords if and only if
        self._remove_stopwords is True
        """
        if self.remove_stopwords:
            if self._stopwords is None:
                self._stopwords = nltk.corpus.stopwords.words(self.lang)
            return self._stopwords

        return None

    def __init__(self, lang, remove_stopwords=False, lower_case=False, do_stemming=False):
        self.lang = lang
        self.remove_stopwords = remove_stopwords
        self.lower_case = lower_case
        self.do_stemming = do_stemming
        self._stopwords = None

        if self.remove_stopwords:
            self._stopwords = nltk.corpus.stopwords.words(lang)
        if do_stemming:
            self.stemmer = nltk.stem.RSLPStemmer()

    def _extract_text(self, html):
        ## Regexes for html pages splitting
        #  Remove script tags and its content
        SCRIPT_TAG_REGEX = re.compile(r'<script.+?>(.|\n)+?</script>')
        STYLE_TAG_REGEX = re.compile(r'<style.+?>(.|\n)+?</style>')
        # Remove remaining tags, leaving content
        HTML_TAGS_REGEX = re.compile(r'<[^>]*>')

        return HTML_TAGS_REGEX.sub(' ', SCRIPT_TAG_REGEX.sub(' ', html))

    def _shave_marks(self, text):
        """
        Removes all diacritic marks from the given string
        """
        if text is None:
            return ''

        norm_text = unicodedata.normalize('NFD', text)
        shaved = ''.join(char for char in norm_text if not unicodedata.combining(char))
        return unicodedata.normalize('NFC', shaved)

    def _tag_tokens(self, document_tokens):
        typed_tokens = []
        for match_group in document_tokens:
            typed_group = []
            for index, match in enumerate(match_group):
                if match:
                    typed_group.append((ETokenType(index), match))

            # if typed_group:
            assert len(typed_group) > 0, "Token with no match, probably missing parenthesis on regex"
            assert len(typed_group) == 1, "Multiple matches for a single token %r" % ' '.join(match_group)
            typed_tokens.append(typed_group[0])

        return typed_tokens

    def tokenize(self, html, ignored_token_types=[], min_token_size=2):
        """
        Tokenize a string by: e-mail, url, date, glued words, values, abbreviations, words and
        every thing that isn't a letter, digit, blank space or line break.

        Returning only tokens of desirable types
        """

        # Extract text from html document
        text = self._extract_text(html)

        # Remove diacritcs
        shaved_text = self._shave_marks(text)
        
        # Returns an array where every position has a tuple with one position to
        # every regex on token_pattern
        document_tokens = nltk.regexp_tokenize(shaved_text, self._token_pattern)

        # Transform the array of tuples into another array of tuples where
        # the first position is the token_type and the second is the token itself
        document_tokens = self._tag_tokens(document_tokens)

        # Filter token types
        document_tokens = [token for token_type, token in document_tokens
                           if token_type not in ignored_token_types]

        if self.remove_stopwords:
            # Keeps tokens that has at least one captalized letter (even if is a stopword)
            # Since only lower case words test the second condition, there is no need to lower the token
            document_tokens = [token for token in document_tokens
                               if not token.islower() or not token in self._stopwords]

        if self.lower_case:
            document_tokens = [token.lower() for token in document_tokens]

        document_tokens = [token.strip() for token in document_tokens if len(token.strip()) >= min_token_size]

        if self.do_stemming:
            document_tokens = [self.stemmer.stem(token) for token in document_tokens]
        
        return document_tokens

## Corpus

Aqui temos um método para executar a leitura e tokenização do _corpus_.

Aplicando o stemmer reduzimos aproximadamente 10000 features.

In [4]:
def read_corpus(corpus_dir, lang, ignored_token_types=[], min_token_size=2):
    """
    Read html files from the received directory.

    :param corpus_dir: corpus directory
    :return: {doc_name:[doc_terms]}
    """

    tokenizer = WordTokenizer(lang, remove_stopwords=True, lower_case=True, do_stemming=True)

    corpus = {}
    classes = []
    for path, subdirs, files in os.walk(corpus_dir):
        if subdirs:
            classes = subdirs

        cur_class = ''
        for class_ in classes:
            if class_ in path:
                cur_class = class_

        for file in files:
            try:
                html = open(path + '/' + file, mode='r', encoding='utf-8').read()
            except UnicodeDecodeError:
                print('Error reading file:', file)
                continue

            corpus[cur_class + '_' + file] = tokenizer.tokenize(
                html,
                ignored_token_types=ignored_token_types,
                min_token_size=min_token_size
            )

    print('Corpus loaded, document count:', len(corpus))

    return corpus

# Leitura dos dados

In [5]:
corpus = read_corpus(
    './pages',
    'portuguese',
    ignored_token_types=[
        ETokenType.EMAIL,
        ETokenType.URL,
        ETokenType.TELEPHONE_CEP,
        ETokenType.DATE,
        ETokenType.NON_WORD
    ],
    min_token_size=2
)

Error reading file: pocilga2.html
Corpus loaded, document count: 231


## Amostra dos documentos

In [6]:
for key in corpus:
    print('Preview of document', key + ':', corpus[key][:5])

Preview of document negative_pocilga11.html: ['the', 'leftov', 'resenh', 'livr', 'pocilg']
Preview of document negative_pocilga6.html: ['retrat', 'dorian', 'gray', 'resenh', 'pocilg']
Preview of document negative_cinemasim39.html: ['feel', 'pretty', '8211', 'cinem', 'sim']
Preview of document negative_pocilga4.html: ['meridi', 'sang', 'resenh', 'livr', 'pocilg']
Preview of document negative_cinemaemcasa7.html: ['outr', 'clip', 'mae', 'darren', 'aronofsky']
Preview of document negative_rapadura10.html: ['403', 'forbidden', '403', 'forbidden', 'nginx']
Preview of document negative_planocritico9.html: ['crit', 'doc', 'who', 'uma', 'questa']
Preview of document negative_omelete22.html: ['omelet', 'mai', 'port', 'notic', 'entreten']
Preview of document negative_rapadura8.html: ['extermin', 'futur', 'arnold', 'schwarzenegg', 'lind']
Preview of document negative_rapadura4.html: ['han', 'sol', 'uma', 'hist', 'st']
Preview of document negative_planoaberto0.html: ['function', 'h.hj', 'h.hj', 'fu

## Transformação dos dados

Primeiro separamos os documentos e seus respectivos ids, para utilizarmos os recursos do `scikit-learn` sem complicações, transformamos nossos documentos (lista de termos) em textos corridos (string única).

In [16]:
# Separate ids from documents
ids, documents = zip(*[(id_, ' '.join(document)) for id_, document in corpus.items()])

labels = []
for id_ in ids:
    labels.append(id_.split('_')[0])

labels = np.array(labels)
ids = np.array(ids)
documents = np.array(documents)

No total temos 131 exemplos de documentos não relevantes e 101 de documentos relevantes. Dado que um documento não relevante apresentou erro de leitura, com o intuito de balancear as classes, excluiremos 29 documentos aleatórios dentre os negativos

In [17]:
# Loop 29 times
for iteration in range(29):
    negative_indexes = np.where(labels == 'negative')[0]
    index = random.randint(0, len(negative_indexes))

    labels = np.delete(labels, index)
    ids = np.delete(ids, index)
    documents = np.delete(documents, index)

print('Total of elements:', len(labels))
print('Total of negative elements:', len(np.where(labels == 'negative')[0]))
print('Total of positive elements:', len(np.where(labels == 'positive')[0]))

Total of elements: 202
Total of negative elements: 101
Total of positive elements: 101


## Feature Selection

Aqui executaremos algumas estratégias para seleção de features. Antes de mais nada, criamos a matriz `term_document`, para tal utilizamos a classe `TfidfVectorizer` do pacote `scikit-learn`.

In [18]:
# Create 1 and 2-grams features
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Documents' order remains the same
term_document = vectorizer.fit_transform(documents)

# Preview of features: 11 documents, 11133 features
term_document

<202x101178 sparse matrix of type '<class 'numpy.float64'>'
	with 663778 stored elements in Compressed Sparse Row format>

### Chi2

Aqui aplicamos a técnica Chi2. Para tal utilizamos como número de features alvo, escolheremos 10% do conjunto inicial de features, equivalente a aproximadamente 10000.

In [19]:
# Fit transform mutual information
chi_term_document = SelectKBest(chi2, k=10000).fit_transform(term_document, labels)

chi_term_document.shape

(202, 10000)

### Mutual information

Novamente, selecionaremos aproximadamente 10% do total de features.

In [20]:
# Fit transform mutual information
mutual_term_document = SelectKBest(mutual_info_classif, k=10000).fit_transform(term_document, labels)

# Shape of transformed term-documents
mutual_term_document.shape

(202, 10000)

# Classificação

Aqui são empregados os seguintes algorítmos de classificação:
- Naïve	bayes
- Decision tree (J48)
- SVM (SMO)
- Logistic regression (logistic)
- Multilayer perceptron

É importante notar que para todos os classificadores, testaremos __três__ _datasets_, um com as originais considerando bigrams, outro com as features transformadas pelo método LSA e por último, um dataset com features transformadas pelo método de mutual information.

Antes disso, dividiremos os dados entre treinamento e teste.
___

## Divisão dos dados

Para termos melhor estimativa dos métodos utilizados, aqui definiremos um método para realizar um `K-Fold` nos dados, retornando a média final dos resultados.

In [21]:
def kfold_training(classifier, data, labels, k=5, r=15, verbose=0):
    kf = RepeatedKFold(n_splits=k, n_repeats=r)

    results = {
        'precision': [],
        'recall': [],
        'fscore': [],
        'accuracy': []
    }

    for train_indexes, test_indexes in kf.split(labels):
        train_data, test_data = data[train_indexes], data[test_indexes]
        train_labels, test_labels = labels[train_indexes], labels[test_indexes]

        # Train and test
        classifier.fit(train_data, train_labels)
        pred_labels = classifier.predict(test_data)

        # Calculate metrics
        precision, recall, fscore, _ = precision_recall_fscore_support(
            test_labels, pred_labels, average='weighted'
        )
        accuracy = accuracy_score(test_labels, pred_labels)

        # If verbose, print fold results
        if verbose > 1:
            print(
                classification_report(test_labels, pred_labels)
            )
            print('Accuracy:', accuracy)

        # Save metrics
        results['precision'] = precision
        results['recall'] = recall
        results['fscore'] = fscore
        results['accuracy'] = accuracy
    
    # If verbose, print final results
    if verbose > 0:
        print(
            'Precision mean:', results['precision'].mean(),
            'Recall mean:', results['recall'].mean(),
            'Fscore mean:', results['fscore'].mean(),
            'Accuracy mean:', results['accuracy'].mean(),
        )

    return results

## Naive Bayes

Aqui utilizamos o `MultinomialNB` pois ele aceita uma matriz esparça como input

In [36]:
# Can train with sparse matrix
nb = MultinomialNB()

kfold_training(nb, term_document, labels, k=5, r=30)

{'precision': 0.7,
 'recall': 0.6,
 'fscore': 0.5878787878787879,
 'accuracy': 0.6}

In [34]:
# Can train with sparse matrix
nb = MultinomialNB()

kfold_training(nb, chi_term_document, labels, k=5, r=30)

{'precision': 0.7646666666666666,
 'recall': 0.725,
 'fscore': 0.7213702074167191,
 'accuracy': 0.725}

In [38]:
# Can train with sparse matrix
nb = MultinomialNB()

kfold_training(nb, mutual_term_document, labels, k=5, r=30)

{'precision': 0.6549999999999999,
 'recall': 0.65,
 'fscore': 0.6508771929824562,
 'accuracy': 0.65}

## Decision Tree

In [22]:
dt = DecisionTreeClassifier()

kfold_training(dt, term_document, labels, k=5, r=30)

{'precision': 1.0, 'recall': 1.0, 'fscore': 1.0, 'accuracy': 1.0}

In [23]:
dt = DecisionTreeClassifier()

kfold_training(dt, chi_term_document, labels, k=5, r=30)

{'precision': 0.906060606060606,
 'recall': 0.9,
 'fscore': 0.9007672634271099,
 'accuracy': 0.9}

In [24]:
dt = DecisionTreeClassifier(random_state=0)

kfold_training(dt, mutual_term_document, labels, k=5, r=30)

{'precision': 1.0, 'recall': 1.0, 'fscore': 1.0, 'accuracy': 1.0}

## SVM

In [53]:
svclassifier = SVC(C=0.5, kernel='rbf', degree=8, gamma=0.01, probability=True)

kfold_training(svclassifier, term_document, labels, k=5, r=30)

{'precision': 0.6994301994301995,
 'recall': 0.675,
 'fscore': 0.6647324306898775,
 'accuracy': 0.675}

In [54]:
svclassifier = SVC(C=0.5, kernel='rbf', degree=8, gamma=0.01, probability=True)

kfold_training(svclassifier, chi_term_document, labels, k=5, r=30)

{'precision': 0.22562500000000002,
 'recall': 0.475,
 'fscore': 0.3059322033898305,
 'accuracy': 0.475}

In [55]:
svclassifier = SVC(C=0.5, kernel='rbf', degree=8, gamma=0.01, probability=True)

kfold_training(svclassifier, mutual_term_document, labels, k=5, r=30)

{'precision': 0.20249999999999999,
 'recall': 0.45,
 'fscore': 0.2793103448275862,
 'accuracy': 0.45}

## Logistic regression

In [56]:
# solvers: 'liblinear', 'sag', 'saga'
lg = LogisticRegression(random_state=0, solver='sag', multi_class='ovr')

kfold_training(lg, term_document, labels, k=5, r=30)

{'precision': 0.6470959595959596,
 'recall': 0.625,
 'fscore': 0.6261726078799249,
 'accuracy': 0.625}

In [57]:
# solvers: 'liblinear', 'sag', 'saga'
lg = LogisticRegression(random_state=0, solver='sag', multi_class='ovr')

kfold_training(lg, chi_term_document, labels, k=5, r=30)

{'precision': 0.8293103448275861,
 'recall': 0.725,
 'fscore': 0.7113475177304964,
 'accuracy': 0.725}

In [58]:
# solvers: 'liblinear', 'sag', 'saga'
lg = LogisticRegression(random_state=0, solver='sag', multi_class='ovr')

kfold_training(lg, mutual_term_document, labels, k=5, r=30)

{'precision': 0.593162393162393,
 'recall': 0.525,
 'fscore': 0.5151477058453803,
 'accuracy': 0.525}

## Multilayer perceptron

In [59]:
mlp = MLPClassifier()

kfold_training(mlp, term_document, labels, k=5, r=2)



{'precision': 0.8125, 'recall': 0.75, 'fscore': 0.75, 'accuracy': 0.75}

In [60]:
mlp = MLPClassifier()

kfold_training(mlp, chi_term_document, labels, k=5, r=2)



{'precision': 0.8, 'recall': 0.8, 'fscore': 0.8, 'accuracy': 0.8}

In [61]:
mlp = MLPClassifier()

kfold_training(mlp, mutual_term_document, labels, k=5, r=2)



{'precision': 0.8156010230179028,
 'recall': 0.775,
 'fscore': 0.783367139959432,
 'accuracy': 0.775}

## Análise

Dado os bons resultados do `DecisionTree`, testaremos o `RandomForest`, um ensemble de árvores de decisão

### Random Forest

In [64]:
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=200)

kfold_training(rf, term_document, labels, k=5, r=30)

{'precision': 0.8541353383458647,
 'recall': 0.825,
 'fscore': 0.8266499057196732,
 'accuracy': 0.825}

In [47]:
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=200)

kfold_training(rf, chi_term_document, labels, k=5, r=30)

{'precision': 0.9055137844611529,
 'recall': 0.9,
 'fscore': 0.9005050505050505,
 'accuracy': 0.9}

In [25]:
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=200)

kfold_training(rf, mutual_term_document, labels, k=5, r=30)

{'precision': 0.85, 'recall': 0.85, 'fscore': 0.85, 'accuracy': 0.85}

Vale a pena notar, que diferentes execuções resultarão em diferentes resultados, mesmo que o `random_state` seja declarado. Isso provavelmente ocorre pelas instancias escolhidas no k-fold. Para suprir tal característica, repetimos o 5-fold 30 vezes. A escolha de repetir 30 vezes vem do fato de que, na estatística, a partir de 30 exemplos, os dados seguem a distribuição normal (ou próximo dela). Mesmo utilizando este artifício, os resultados variaram.

A termo de informação, o melhor resultado encontrado foi de 97% para precisão, recall, fscore e acurácia.

Testes foram executados, e a melhor combinação de parâmetros foi quando utilizamos 100 estimadores com profundidade máxima de 10.

## Persistindo o modelo

In [26]:
# Persist model
joblib.dump(dt, 'decisiontree.joblib') 

['decisiontree.joblib']

In [52]:
# Create best classifier
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=200)

# Fit using all data available
rf.fit(mutual_term_document, labels)

# Persist model
joblib.dump(rf, 'randomforest.joblib') 

['randomforest.joblib']

Para carregar o modelo em memória use:

### Feature selector

In [27]:
# Persist model
joblib.dump(vectorizer, 'vectorizer.joblib') 

['vectorizer.joblib']

In [28]:
# Create feature selector
selector = SelectKBest(mutual_info_classif, k=10000)

# Fit using mutual information
selector.fit(term_document, labels)

# Persist model
joblib.dump(selector, 'featureselector.joblib') 

['featureselector.joblib']

### Pipeline:

In [29]:
dt = joblib.load('decisiontree.joblib')
vectorizer = joblib.load('vectorizer.joblib')
selector = joblib.load('featureselector.joblib')

In [72]:
new_document = 'aqui esta um documento novo nunca visto pela base ele foi previamente tokenizado e em seguida transformado numa string unica'

vec = vectorizer.transform([new_document])
new_vec = selector.transform(vec)
rf.predict(vec)

ValueError: Number of features of the model must match the input. Model n_features is 10616 and input n_features is 96004 