# Dependências

In [99]:
import os
import re
import unicodedata
from enum import Enum

import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

from sklearn.exceptions import UndefinedMetricWarning
import warnings

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Funções auxiliares

Aqui são criados classes e métodos auxiliares para a tokenização e classificação dos documentos.
___

## Tokenização

- TODO: Adicionar `stemmer` e `lemmatizer`

Aqui são definidos duas classes, a primeira é um `Enum` que organiza os possíveis tipos de tokens. A segunda é um tokenizador, capaz de remover _stopwords_, utilizando o nltk, além de filtrar apenas os tipos de tokens desejados.

In [25]:
class ETokenType(Enum):
    """
    Enumerable class with all token's types.
    Update this enum every time a new regex group is added to WordTokenizer._token_pattern
    The order of the values must match with WordTokenizer._token_pattern regexes' order
    """
    EMAIL = 0
    URL = 1
    GLUED_TITLES = 2
    GLUED_WORD = 3
    GLUED_LOWER = 4
    TELEPHONE_CEP = 5
    VALUE = 6
    DATE = 7
    GLUED_VALUE = 8
    WORD = 9
    NON_WORD = 10

class WordTokenizer(object):

    # _token_pattern holds its state across instances of WordTokenizer
    # Every time a new regex group is added to _token_pattern, ETokenType must be updated
    # The order of the regexes' order must match with ETokenType values' order
    _token_pattern = r"""(?x)           # Set flag to allow verbose regexps
        ([\w\.-]+@[\w\.-]+(?:\.[\w]+)+) # E-mail regex
        | (                             # URL regex
            (?:http(?:s)?(?::)?(?:\\\\)?)?  # Optional http or https followed by optional : and //
            (?:[a-z0-9_-]+\.)?              # Optional domain
            [a-z0-9_-]+                     # host
            (?:\.
                (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
            )+
            (?::[0-9]+)?                    # Optional port
            (?!\w)(?:\/(?:[^\s\.,]|[\.,][^\s\.,])+)*(?![^\.,]$)  # Optional relative URI
        )
        | ([A-Z][a-z]+(?=\.?(?:[A-Z][A-Za-z]|\d)+)) # Capture titles glued to digits or other words
        | ([A-Z][A-Za-z]+(?=\.?(?:[A-Z][a-z]|\d)+)) # Capture words glued to digits or other words
        | ([a-z]+(?=\.?(?:[A-Z]|\d)+))              # Capture lower words glued to digits or captalized words
        | (         # Capture telephones and CEPs
            (?:         # Asserts telephones
                (?:(?:\(?\ *)\d{2,3}(?:\ *\))?)?    # Gets the DDD
                (?:\ *9\ *(?:\.|-|\/|\\)?)?         # Optional ninth digit
                (?!(?:1|2)\d{3})        # Negative lookahead to prevent from getting years
                \d{4}(?:\.|-|\/|\\)?        # First 4 telephone digits with optional separator
                \d{4}                       # Last 4 digits
            ) | (?:     # Asserts CEPs
                \d{2}(?:\.|-|\/|\\)?    # First two digits, followed by an optional separator
                \d{3}(?:\.|-|\/|\\)?    # Following three digits, followed by an optional separator
                \d{3}                   # Last three digits
            )   # Since the CEPs regex gets some telephones as false positives
        )       # both regexes are in same group
        | (             # Capture values (as in currencies, percentage, measures...)
            (?<![\d\.\/\\-])        # Negative lookbehind for digits or separators
            (?:(?:R?\$|€)(?:\ )*)?  # Currencies symbols
            (?!(?:1|2)\d{3})        # Negative lookahead to prevent from getting years
            \d+                     # Proper digits
            (?:
                (?:\.|,)            # Punctuation
                (?!(?:1|2)\d{3})    # Negative lookahead to prevent from getting years
                \d+                 # After punctuation digits
            )*
            (?:%|\w{1,3}\b)?        # Percentage or measures abbreviations
            # (?![\d\.\/\\-])         # Negative lookahead for digits or separators TODO: Fix it by 15%15%9999999999911111 199999999999999 12-1999 janeiro/2000 09/9/2000
        )
        | (         # Date regex
            # (?<![\d])   # Negative lookbehind for digits
            (?:(?:0?[1-9]|[1-2][0-9]|3[0-1])(?!\d)(?:\.|-|\/|\\))?    # Asserts the first of three parts of a date (optional)
            (?:(?:[A-Za-z_]+|0?[1-9]|[1-2][0-9]|3[0-1])(?!\d)(?:\.|-|\/|\\))?   # Asserts the second part, can be either a word or one to two digits (optional)
            (?:(?:(?:1|2)\d{3})|[0-9]{2})(?!\d)                       # Asserts the year
        )
        | (     # Capture (glued) values (as in currencies, percentage, measures...)
            (?:(?:R?\$|€)(?:\ )*)?  # Currencies symbols
            \d+                     # Proper digits
            (?:(?:\.|,)\d+)*        # Punctuation
            (?:%|\w{1,3}\b)?        # Percentage or measures abbreviations
        )       # This second search aims to get values that were glued to digits or separators
        | ((?:\w+\.?)*(?:\w+))   # Words and abbreviations with optional : at the end
        | ([^A-Za-z0-9\ \n])    # Every thing that is not a letter, a digit, space or line break
    """

    @property
    def token_pattern(self):
        """
        Read-only property. This property holds its state across instances of WordTokenizer.
        """
        return self._token_pattern

    @property
    def stopwords(self):
        # TODO: Set self.remove_stopwords setter to also set self._stopwords considering the lang
        #       Also set the lang setter to change self._stopwords accordingly
        """
        Read-only property. Returns the list of stopwords if and only if
        self._remove_stopwords is True
        """
        if self.remove_stopwords:
            if self._stopwords is None:
                self._stopwords = nltk.corpus.stopwords.words(self.lang)
            return self._stopwords

        return None

    def __init__(self, lang, remove_stopwords=False, lower_case=False):
        self.lang = lang
        self.remove_stopwords = remove_stopwords
        self.lower_case = lower_case
        self._stopwords = None

        if self.remove_stopwords:
            self._stopwords = nltk.corpus.stopwords.words(lang)

    def _extract_text(self, html):
        ## Regexes for html pages splitting
        #  Remove script tags and its content
        SCRIPT_TAG_REGEX = re.compile(r'<script.+?>(.|\n)+?</script>')
        STYLE_TAG_REGEX = re.compile(r'<style.+?>(.|\n)+?</style>')
        # Remove remaining tags, leaving content
        HTML_TAGS_REGEX = re.compile(r'<[^>]*>')

        return HTML_TAGS_REGEX.sub(' ', SCRIPT_TAG_REGEX.sub(' ', html))

    def _shave_marks(self, text):
        """
        Removes all diacritic marks from the given string
        """
        if text is None:
            return ''

        norm_text = unicodedata.normalize('NFD', text)
        shaved = ''.join(char for char in norm_text if not unicodedata.combining(char))
        return unicodedata.normalize('NFC', shaved)

    def _tag_tokens(self, document_tokens):
        typed_tokens = []
        for match_group in document_tokens:
            typed_group = []
            for index, match in enumerate(match_group):
                if match:
                    typed_group.append((ETokenType(index), match))

            # if typed_group:
            assert len(typed_group) > 0, "Token with no match, probably missing parenthesis on regex"
            assert len(typed_group) == 1, "Multiple matches for a single token %r" % ' '.join(match_group)
            typed_tokens.append(typed_group[0])

        return typed_tokens

    def tokenize(self, html, ignored_token_types=[], min_token_size=2):
        """
        Tokenize a string by: e-mail, url, date, glued words, values, abbreviations, words and
        every thing that isn't a letter, digit, blank space or line break.

        Returning only tokens of desirable types
        """

        # Extract text from html document
        text = self._extract_text(html)

        # Remove diacritcs
        shaved_text = self._shave_marks(text)
        
        # Returns an array where every position has a tuple with one position to
        # every regex on token_pattern
        document_tokens = nltk.regexp_tokenize(shaved_text, self._token_pattern)

        # Transform the array of tuples into another array of tuples where
        # the first position is the token_type and the second is the token itself
        document_tokens = self._tag_tokens(document_tokens)

        # Filter token types
        document_tokens = [token for token_type, token in document_tokens
                           if token_type not in ignored_token_types]

        if self.remove_stopwords:
            # Keeps tokens that has at least one captalized letter (even if is a stopword)
            # Since only lower case words test the second condition, there is no need to lower the token
            document_tokens = [token for token in document_tokens
                               if not token.islower() or not token in self._stopwords]

        if self.lower_case:
            document_tokens = [token.lower() for token in document_tokens]

        document_tokens = [token.strip() for token in document_tokens if len(token.strip()) >= min_token_size]
        
        return document_tokens

## Corpus

Aqui temos um método para executar a leitura e tokenização do _corpus_

In [26]:
def read_corpus(corpus_dir, lang, ignored_token_types=[], min_token_size=2):
    """
    Read html files from the received directory.

    :param corpus_dir: corpus directory
    :return: {doc_name:[doc_terms]}
    """

    tokenizer = WordTokenizer(lang, remove_stopwords=True, lower_case=True)

    corpus = {}
    for _, _, files in os.walk(corpus_dir):
        for file in files:
            html = open(corpus_dir + '/' + file, mode='r', encoding='utf-8').read()
            corpus[file] = tokenizer.tokenize(
                html,
                ignored_token_types=ignored_token_types,
                min_token_size=min_token_size
            )

    print('Corpus loaded, document count:', len(corpus))

    return corpus

# Leitura dos dados

In [29]:
corpus = read_corpus(
    './pages',
    'portuguese',
    ignored_token_types=[
        ETokenType.EMAIL,
        ETokenType.URL,
        ETokenType.TELEPHONE_CEP,
        ETokenType.DATE,
        ETokenType.NON_WORD
    ],
    min_token_size=2
)

Corpus loaded, document count: 11


## Amostra dos documentos

In [52]:
for key in corpus:
    print('Preview of document', key + ':', corpus[key][:5])

Preview of document page_9.html: ['critica', 'cargo', 'filme', 'zumbi', 'muita']
Preview of document page_7.html: ['supremacia', 'bourne', 'verdadeiro', 'marco', 'filmes']
Preview of document page_4.html: ['critica', 'neve', 'negra', 'thriller', 'desinteressante']
Preview of document page_2.html: ['critica', 'piratas', 'caribe', 'vinganca', 'salazar']
Preview of document page_0.html: ['demonio', 'neon', 'estetica', 'apresenta', 'limites']
Preview of document page_6.html: ['filha', 'meu', 'melhor', 'amigo', 'sobre']
Preview of document page_5.html: ['critica', 'the', 'titan', 'ficcao', 'cientifica']
Preview of document page_10.html: ['critica', 'han', 'solo', 'uma', 'historia']
Preview of document page_3.html: ['harry', 'potter', 'enigma', 'principe', 'ousado']
Preview of document page_8.html: ['harry', 'potter', 'camara', 'secreta', 'divertido']
Preview of document page_1.html: ['kick', 'ass', 'quebrando', 'tudo', 'miscelanea']


## Transformação dos dados

Primeiro separamos os documentos e seus respectivos ids, para utilizarmos os recursos do `scikit-learn` sem complicações, transformamos nossos documentos (lista de termos) em textos corridos (string única).

In [64]:
# Separate ids from documents
ids, documents = zip(*[(id_, ' '.join(document)) for id_, document in corpus.items()])

ids = list(ids)
documents = list(documents)

Aqui criamos a matriz `term_document`, para tal utilizamos a classe `TfidfVectorizer` do pacote `scikit-learn`.

In [75]:
# Create 1 and 2-grams features
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Documents' order remains the same
features = vectorizer.fit_transform(documents)

# Preview of features: 11 documents, 11133 features
features

<11x11133 sparse matrix of type '<class 'numpy.float64'>'
	with 14896 stored elements in Compressed Sparse Row format>

### TODO

Manually classify pages and create a label list

In [90]:
labels = np.array([1] * len(ids))

# Classificação

Aqui são empregados os seguintes algorítmos de classificação:
- Naïve	bayes
- Decision tree (J48)
- SVM (SMO)
- Logistic regression (logistic)
- Multilayer perceptron

Antes disso, dividiremos os dados entre treinamento e teste

## Divisão dos dados

Para termos melhor estimativa dos métodos utilizados, aqui definiremos um método para realizar um `K-Fold` nos dados, retornando a média final dos resultados.

In [88]:
def kfold_training(classifier, data, labels, k=5, verbose=0):
    kf = KFold(n_splits=k, shuffle=True)

    results = {
        'precision': [],
        'recall': [],
        'fscore': [],
        'accuracy': []
    }

    for train_indexes, test_indexes in kf.split(labels):
        train_data, test_data = data[train_indexes], data[test_indexes]
        train_labels, test_labels = labels[train_indexes], labels[test_indexes]

        # Train and test
        classifier.fit(train_data, train_labels)
        pred_labels = classifier.predict(test_data)

        # Calculate metrics
        precision, recall, fscore, _ = precision_recall_fscore_support(
            test_labels, pred_labels, average='weighted'
        )
        accuracy = accuracy_score(test_labels, pred_labels)

        # If verbose, print fold results
        if verbose > 1:
            print(
                classification_report(test_labels, pred_labels)
            )
            print('Accuracy:', accuracy)

        # Save metrics
        results['precision'] = precision
        results['recall'] = recall
        results['fscore'] = fscore
        results['accuracy'] = accuracy
    
    # If verbose, print final results
    if verbose > 0:
        print(
            'Precision mean:', results['precision'].mean(),
            'Recall mean:', results['recall'].mean(),
            'Fscore mean:', results['fscore'].mean(),
            'Accuracy mean:', results['accuracy'].mean(),
        )

    return results

## Naive Bayes

Aqui utilizamos o `MultinomialNB` pois ele aceita uma matriz esparça como input

In [91]:
# Can train with sparse matrix
nb = MultinomialNB()

kfold_training(nb, features, labels)

  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -


{'precision': 1.0, 'recall': 1.0, 'fscore': 1.0, 'accuracy': 1.0}

## Decision Tree

In [94]:
dt = DecisionTreeClassifier()

kfold_training(dt, features, labels)

{'precision': 1.0, 'recall': 1.0, 'fscore': 1.0, 'accuracy': 1.0}

## SVM

In [97]:
svclassifier = SVC(C=0.5, kernel='rbf', degree=8, gamma=0.01, probability=True)

kfold_training(svclassifier, features, labels)

ValueError: The number of classes has to be greater than one; got 1 class

## Logistic regression

In [100]:
# solvers: 'liblinear', 'sag', 'saga'
lg = LogisticRegression(random_state=0, solver='sag', multi_class='ovr')

kfold_training(lg, features, labels)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

## Multilayer perceptron

In [101]:
mlp = MLPClassifier()

kfold_training(mlp, features, labels)

{'precision': 1.0, 'recall': 1.0, 'fscore': 1.0, 'accuracy': 1.0}